|
1 | 1 | # ========================== begin_copyright_notice ============================ |
2 | 2 | # |
3 | | -# Copyright (C) 2019-2024 Intel Corporation |
| 3 | +# Copyright (C) 2019-2025 Intel Corporation |
4 | 4 | # |
5 | 5 | # SPDX-License-Identifier: MIT |
6 | 6 | # |
|
1616 | 1616 | ### | U8 | 8 | 8-bit unsigned integer | |
1617 | 1617 | ### | BF16 | 9 | bfloat16 (S1E8M7) floating point | |
1618 | 1618 | ### | HF16 | 10 | half-precision (S1E5M10) floating point | |
| 1619 | +### | BF8 | 11 | bfloat8 (S1E5M2) floating point | |
1619 | 1620 | ### | TF32 | 12 | tensorfloat32 (S1E8M10) floating point | |
| 1621 | +### | HF8 | 14 | hfloat8 (S1E4M3) floating point | |
| 1622 | +### | E2M1 | 15 | 4-bit floating point (S1E2M1) | |
1620 | 1623 | ### +---------------+-------+-------------------------------------------------+ |
1621 | 1624 | ### |
1622 | 1625 | ### |
|
1674 | 1677 | "XeHP", "XeHPG", "XeLPGPlus" ], |
1675 | 1678 | }, |
1676 | 1679 |
|
| 1680 | + |
| 1681 | +### ``llvm.genx.bdpas.*`` : bdpas instruction (Dot Product Accumulate Systolic with block scaling) |
| 1682 | +### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 1683 | +### |
| 1684 | +### M = RepeatCount |
| 1685 | +### N = ExecSize = 16 |
| 1686 | +### K = SystolicDepth |
| 1687 | +### |
| 1688 | +### * arg0: accumulator (vLf32 or vLf16 or vLbf16, L=M*N) |
| 1689 | +### * arg1: B matrix (vLi32, L=K*N) |
| 1690 | +### * arg2: A matrix (vLi32, L=M*K) |
| 1691 | +### * arg3: B matrix block scale in E8M0 format (vXi8), X depends on the B matrix data type, see the table below |
| 1692 | +### * arg4: A matrix block scale in E8M0 format (vYi8), Y depends on the A matrix data type, see the table below |
| 1693 | +### * arg5: B matrix data type (i32) |
| 1694 | +### * arg6: A matrix data type (i32) |
| 1695 | +### * arg7: SystolicDepth (i32), must be a constant equal to 8 |
| 1696 | +### * arg8: RepeatCount (i32), must be a constant equal to 8 |
| 1697 | +### |
| 1698 | +### * Return value: result, (vLf32 or vLf16 or vLbf16, L=M*N) |
| 1699 | +### |
| 1700 | +### See the ``dpas2`` intrinsic description for the PrecisionType enum values. |
| 1701 | +### |
| 1702 | +### Supported data type combinations: |
| 1703 | +### +-----------+-------------+------------+------------+-----------------+-----+-----+ |
| 1704 | +### | Result | Accumulator | B matrix | A matrix | Ops Per Channel | X | Y | |
| 1705 | +### +-----------+-------------+------------+------------+-----------------+-----+-----+ |
| 1706 | +### | f32, hf16 | f32, hf16 | hf16 | hf16 | 2 | N | M | |
| 1707 | +### | f32, bf16 | f32, bf16 | bf16 | bf16 | 2 | N | M | |
| 1708 | +### | f32, bf16 | f32, bf16 | bf8, hf8 | bf8, hf8 | 4 | N | M | |
| 1709 | +### | f32, bf16 | f32, bf16 | e2m1 | e2m1 | 8 | 2*N | 2*M | |
| 1710 | +### +-----------+-------------+------------+------------+-----------------+-----+-----+ |
| 1711 | +### |
| 1712 | +### Note: i16 can be used as a placeholder for bf16 in the return value and accumulator. |
| 1713 | +### |
| 1714 | + "bdpas" : { "result" : "anyvector", |
| 1715 | + "arguments" : ["anyvector","anyvector","anyvector","anyvector","anyvector","int","int","int","int"], |
| 1716 | + "attributes" : "NoMem", |
| 1717 | + "platforms" : "Xe3P+", |
| 1718 | + }, |
| 1719 | + |
| 1720 | +### ``llvm.genx.mxfp.reduce.32x32.*`` : 32x32 block scale reduction routine |
| 1721 | +### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 1722 | +### |
| 1723 | +### * arg0: v1024i16 or v1024bf16 or v1024f16, 32x32 matrix |
| 1724 | +### |
| 1725 | +### * Return value: v32i16 or v32bf16 or v32f16, 32x1 vector of reduced values |
| 1726 | +### |
| 1727 | +### For each row of the input matrix the routine produces a single element: the maximum of |
| 1728 | +### the absolute values in that row. |
| 1729 | +### |
| 1730 | +### The order of elements in the output is the following: |
| 1731 | +### 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, |
| 1732 | +### 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31 |
| 1733 | +### |
| 1734 | + "mxfp_reduce_32x32" : { "result" : "anyvector", |
| 1735 | + "arguments" : ["anyvector"], |
| 1736 | + "attributes" : "NoMem", |
| 1737 | + "platforms" : "Xe3P+", |
| 1738 | + }, |
| 1739 | + |
| 1740 | +### ``llvm.genx.mxfp.linearize.*`` : routine to linearize the block scale vector |
| 1741 | +### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 1742 | +### |
| 1743 | +### * arg0: v32i16 or v32bf16 or v32f16, 32x1 vector of reduced values |
| 1744 | +### |
| 1745 | +### * Return value: v32i16 or v32bf16 or v32f16, 32x1 vector of linearized values |
| 1746 | +### |
| 1747 | +### |
| 1748 | + "mxfp_linearize" : { "result" : "anyvector", |
| 1749 | + "arguments" : [0], |
| 1750 | + "attributes" : "NoMem", |
| 1751 | + "platforms" : "Xe3P+", |
| 1752 | + }, |
| 1753 | + |
1677 | 1754 | ### ``llvm.genx.*dp4a*.<return type>.<vector type>.<vector type>.<vector type>`` : dp4a instruction (Dot Product 4 Accumulate) |
1678 | 1755 | ### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
1679 | 1756 | ### * ``llvm.genx.ssdp4a`` : result signed, operands signed |
|
1830 | 1907 | "attributes" : "NoMem" |
1831 | 1908 | }, |
1832 | 1909 |
|
| 1910 | +### ``llvm.genx.lfsr.*``: lfsr instructions |
| 1911 | +### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 1912 | +### The instruction is used to generate a pseudo-random number sequence. |
| 1913 | +### The algorithm is based on a linear feedback shift register (LFSR). |
| 1914 | +### |
| 1915 | +### * arg0: seed, vNi32 |
| 1916 | +### * arg1: polynomial, same type as arg0 |
| 1917 | +### * arg2: mode, i8, must be a compile-time constant |
| 1918 | +### |
| 1919 | +### * Return value: result, same type as arg0 |
| 1920 | +### |
| 1921 | +### The mode argument is an enum with the following values: |
| 1922 | +### * 0: b32 - input is a 32-bit value |
| 1923 | +### * 1: 2xb16 - input is a pair of 16-bit values |
| 1924 | +### * 2: 4xb8 - input is a quad of 8-bit values |
| 1925 | +### |
| 1926 | + "lfsr" : { "result" : "anyint", |
| 1927 | + "arguments" : [0, 0, "char"], |
| 1928 | + "attributes" : "NoMem", |
| 1929 | + "platforms" : "Xe3P+" |
| 1930 | + }, |
| 1931 | + |
| 1932 | + |
1833 | 1933 | ### srnd |
1834 | 1934 | ### ^^^^ |
1835 | 1935 | ### |
|
1861 | 1961 | }, |
1862 | 1962 |
|
1863 | 1963 |
|
| 1964 | +### ``llvm.genx.biased.rounding.hf8.*`` : biased rounding instruction |
| 1965 | +### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 1966 | +### |
| 1967 | +### * arg0: input, vNf16 or vNbf16 |
| 1968 | +### * arg1: bias, vNi8 |
| 1969 | +### |
| 1970 | +### * Return value: result, vNi8 |
| 1971 | +### |
| 1972 | +### Note: bf16 can be substituted with i16. |
| 1973 | +### |
| 1974 | + "biased_rounding_hf8" : { "result" : "anyint", |
| 1975 | + "arguments" : ["anyvector", 0], |
| 1976 | + "attributes" : "NoMem", |
| 1977 | + "platforms" : "Xe3P+" |
| 1978 | + }, |
| 1979 | + |
1864 | 1980 | ### bf_cvt |
1865 | 1981 | ### ^^^^^^ |
1866 | 1982 | ### |
|
1927 | 2043 | "platforms" : "Xe3+" |
1928 | 2044 | }, |
1929 | 2045 |
|
| 2046 | + |
| 2047 | +## ``llvm.genx.packed.4bit.upconvert.lut`` : packed fp4/int4 upconvert operation |
| 2048 | +## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 2049 | +## |
| 2050 | +## * arg0: lookup table, v16i32 vector. |
| 2051 | +## * arg1: input packed 4-bit data, vNi8 or vNi16 vector (overloaded), |
| 2052 | +## must be a region of <4;1,0> (for i8) or <2;1,0> (for i16). |
| 2053 | +## |
| 2054 | +## * Return value: packed 8 or 16-bit values, vNi32. |
| 2055 | +## |
| 2056 | +## Restrictions: |
| 2057 | +## * The only supported N values for this operation are 16 and 32. |
| 2058 | +## |
| 2059 | +## This intrinsic represents 4bit->8bit and 4bit->16bit upconvert through a lookup table |
| 2060 | + "packed_4bit_upconvert_lut" : { "result": "anyint", |
| 2061 | + "arguments": ["int16", "anyint"], |
| 2062 | + "attributes": "NoMem", |
| 2063 | + "platforms" : "Xe3P+", }, |
| 2064 | + |
| 2065 | +### ``llvm.genx.4bit.downconvert`` : fp4/int4 downconvert operation |
| 2066 | +### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 2067 | +### |
| 2068 | +### The intrinsic represents bf16->4bit and f16->4bit downscale operations. |
| 2069 | +### |
| 2070 | +### * arg0: src0, vNi32 |
| 2071 | +### * arg1: src1, vNi32 |
| 2072 | +### * arg2: bias, vNi32, ignored when rounding mode is "round to nearest even" |
| 2073 | +### * arg3: conversion type, i8, enum value, must be constant |
| 2074 | +### * arg4: packing mode, i8, enum value, must be constant |
| 2075 | +### * arg5: rounding mode, i8, enum value, must be constant |
| 2076 | +### |
| 2077 | +### * Return value: result, vNi32. The output values are packed into 32-bit |
| 2078 | +### integers according to the packing mode operand. |
| 2079 | +### |
| 2080 | +### The supported output data formats are the following: |
| 2081 | +### * E2M1 - 2-bit exponent, 1-bit mantissa and 1-bit sign floating point values |
| 2082 | +### * Int4 - 4-bit signed integer values |
| 2083 | +### |
| 2084 | +### The conversion type argument (arg3) is an enum with the following values: |
| 2085 | +### * 1: BFtoE2M1 - bfloat16 to E2M1 |
| 2086 | +### * 2: BFtoInt4 - bfloat16 to Int4 |
| 2087 | +### * 4: HFtoE2M1 - half precision to E2M1 |
| 2088 | +### * 5: HFtoInt4 - half precision to Int4 |
| 2089 | +### |
| 2090 | +### The packing mode argument (arg4) is an enum with the following values (lsb to msb): |
| 2091 | +### * 0: { downscale(src0[0:15]) | downscale(src0[16:31]) << 4, 0, downscale(src1[0:15]) | downscale(src1[16:31]) << 4, 0 } |
| 2092 | +### * 1: { downscale(src0[0:15]) | downscale(src1[0:15]) << 4, 0, downscale(src0[16:31]) | downscale(src1[16:31]) << 4, 0 } |
| 2093 | +### * 2: { 0, downscale(src0[0:15]) | downscale(src0[16:31]) << 4, 0, downscale(src1[0:15]) | downscale(src1[16:31]) << 4 } |
| 2094 | +### * 3: { 0, downscale(src0[0:15]) | downscale(src1[0:15]) << 4, 0, downscale(src0[16:31]) | downscale(src1[16:31]) << 4 } |
| 2095 | +### |
| 2096 | +### The rounding mode argument (arg5) is an enum with the following values: |
| 2097 | +### * 0: biased round |
| 2098 | +### * 1: round to nearest even |
| 2099 | +### |
| 2100 | + "4bit_downconvert" : { "result": "anyint", |
| 2101 | + "arguments": [0, 0, 0, "char", "char", "char"], |
| 2102 | + "attributes": "NoMem", |
| 2103 | + "platforms" : "Xe3P+", |
| 2104 | + }, |
| 2105 | + |
1930 | 2106 | ### ``llvm.genx.lsc.load.*.<return type if not void>.<any type>.<any type>`` : lsc_load instructions |
1931 | 2107 | ### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
1932 | 2108 | ### |
|
4717 | 4893 | "arguments" : ["char","char","anyvector","char","char","char","int","int","anyvector","anyvector"], |
4718 | 4894 | "attributes" : "NoWillReturn" |
4719 | 4895 | }, |
| 4896 | +## ``llvm.genx.raw.sendg`` : raw send generalized message |
| 4897 | +## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 4898 | +## |
| 4899 | +## * arg0: i16 dst size (in bytes) [MBC] |
| 4900 | +## * arg1: i1 IsConditional |
| 4901 | +## * arg2: i1 IsEOT |
| 4902 | +## * arg3: i8 SFID [MBC] |
| 4903 | +## * arg4: vNxi1 Predicate (overloaded) |
| 4904 | +## * arg5: vector src0 (overloaded) |
| 4905 | +## * arg6: i16 src0 size (in bytes) [MBC] |
| 4906 | +## * arg7: vector src1 (overloaded) |
| 4907 | +## * arg8: i16 src1 size (in bytes) [MBC] |
| 4908 | +## * arg9: i64 indirect descriptor 0 |
| 4909 | +## * arg10: i64 indirect descriptor 1 |
| 4910 | +## * arg11: i64 descriptor [MBC] |
| 4911 | +## * arg12: vector to take values for masked simd lanes from |
| 4912 | +## |
| 4913 | +## * Return value: vector dst (overloaded) |
| 4914 | +## |
| 4915 | + "raw_sendg": { "result" : "anyvector", |
| 4916 | + "arguments" : [ |
| 4917 | + "short", # arg0: dst size in bytes [MBC] |
| 4918 | + "bool", # arg1: IsConditional |
| 4919 | + "bool", # arg2: IsEOT |
| 4920 | + "char", # arg3: SFID [MBC] |
| 4921 | + "anyint", # arg4: predicate, vNxi1 (overloaded) |
| 4922 | + "anyvector", # arg5: src0 (overloaded) |
| 4923 | + "short", # arg6: src0 size in bytes [MBC] |
| 4924 | + "anyvector", # arg7: src1 (overloaded) |
| 4925 | + "short", # arg8: src1 size in bytes [MBC] |
| 4926 | + "long", # arg9: indirect descriptor 0 |
| 4927 | + "long", # arg10: indirect descriptor 1 |
| 4928 | + "long", # arg11: descriptor [MBC] |
| 4929 | + 0, # arg12: passthru, supplies values for masked simd lanes |
| 4930 | + ], |
| 4931 | + "attributes" : "NoWillReturn,SideEffects", |
| 4932 | + "platforms" : "Xe3P+" }, |
4720 | 4933 |
|
4721 | 4934 | ## --------------------------- |
4722 | 4935 | ### Video Analytics Intrinsics |
|
0 commit comments