Skip to content

Commit f8055b7

Browse files
vsemenov368sys-cmllvm
authored andcommitted
Add support for Xe3p devices
.
1 parent db261bc commit f8055b7

File tree

2 files changed

+215
-1
lines changed

2 files changed

+215
-1
lines changed

GenXIntrinsics/include/llvm/GenXIntrinsics/Intrinsic_definitions.py

Lines changed: 214 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# ========================== begin_copyright_notice ============================
22
#
3-
# Copyright (C) 2019-2024 Intel Corporation
3+
# Copyright (C) 2019-2025 Intel Corporation
44
#
55
# SPDX-License-Identifier: MIT
66
#
@@ -1616,7 +1616,10 @@
16161616
### | U8 | 8 | 8-bit unsigned integer |
16171617
### | BF16 | 9 | bfloat16 (S1E8M7) floating point |
16181618
### | HF16 | 10 | half-precision (S1E5M10) floating point |
1619+
### | BF8 | 11 | bfloat8 (S1E5M2) floating point |
16191620
### | TF32 | 12 | tensorfloat32 (S1E8M10) floating point |
1621+
### | HF8 | 14 | hfloat8 (S1E4M3) floating point |
1622+
### | E2M1 | 15 | 4-bit floating point (S1E2M1) |
16201623
### +---------------+-------+-------------------------------------------------+
16211624
###
16221625
###
@@ -1674,6 +1677,80 @@
16741677
"XeHP", "XeHPG", "XeLPGPlus" ],
16751678
},
16761679

1680+
1681+
### ``llvm.genx.bdpas.*`` : bdpas instruction (Dot Product Accumulate Systolic with block scaling)
1682+
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1683+
###
1684+
### M = RepeatCount
1685+
### N = ExecSize = 16
1686+
### K = SystolicDepth
1687+
###
1688+
### * arg0: accumulator (vLf32 or vLf16 or vLbf16, L=M*N)
1689+
### * arg1: B matrix (vLi32, L=K*N)
1690+
### * arg2: A matrix (vLi32, L=M*K)
1691+
### * arg3: B matrix block scale in E8M0 format (vXi8), X depends on the B matrix data type, see the table below
1692+
### * arg4: A matrix block scale in E8M0 format (vYi8), Y depends on the A matrix data type, see the table below
1693+
### * arg5: B matrix data type (i32)
1694+
### * arg6: A matrix data type (i32)
1695+
### * arg7: SystolicDepth (i32), must be constant equal to 8
1696+
### * arg8: RepeatCount (i32), must be constant equal to 8
1697+
###
1698+
### * Return value: result, (vLf32 or vLf16 or vLbf16, L=M*N)
1699+
###
1700+
### See the `dpas2` intrinsic description for the PrecisionType enum values.
1701+
###
1702+
### Supported data type combinations:
1703+
### +-----------+-------------+------------+------------+-----------------+-----+-----+
1704+
### | Result | Accumulator | B matrix | A matrix | Ops Per Channel | X | Y |
1705+
### +-----------+-------------+------------+------------+-----------------+-----+-----+
1706+
### | f32, hf16 | f32, hf16 | hf16 | hf16 | 2 | N | M |
1707+
### | f32, bf16 | f32, bf16 | bf16 | bf16 | 2 | N | M |
1708+
### | f32, bf16 | f32, bf16 | bf8, hf8 | bf8, hf8 | 4 | N | M |
1709+
### | f32, bf16 | f32, bf16 | e2m1 | e2m1 | 8 | 2*N | 2*M |
1710+
### +-----------+-------------+------------+------------+-----------------+-----+-----+
1711+
###
1712+
### Note: i16 can be used as a placeholder for bf16 in the return value and accumulator.
1713+
###
1714+
"bdpas" : { "result" : "anyvector",
1715+
"arguments" : ["anyvector","anyvector","anyvector","anyvector","anyvector","int","int","int","int"],
1716+
"attributes" : "NoMem",
1717+
"platforms" : "Xe3P+",
1718+
},
1719+
1720+
### ``llvm.genx.mxfp.reduce.32x32.*`` : 32x32 block scale reduction routine
1721+
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1722+
###
1723+
### * arg0: v1024i16 or v1024bf16 or v1024f16, 32x32 matrix
1724+
###
1725+
### * Return value: v32i16 or v32bf16 or v32f16, 32x1 vector of reduced values
1726+
###
1727+
### For each row of the input matrix, the routine produces a single element:
1728+
### the maximum of the absolute values in that row.
1729+
###
1730+
### The order of elements in the output is the following:
1731+
### 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
1732+
### 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31
1733+
###
1734+
"mxfp_reduce_32x32" : { "result" : "anyvector",
1735+
"arguments" : ["anyvector"],
1736+
"attributes" : "NoMem",
1737+
"platforms" : "Xe3P+",
1738+
},
1739+
1740+
### ``llvm.genx.mxfp.linearize.*`` : routine to linearize the block scale vector
1741+
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1742+
###
1743+
### * arg0: v32i16 or v32bf16 or v32f16, 32x1 vector of reduced values
1744+
###
1745+
### * Return value: v32i16 or v32bf16 or v32f16, 32x1 vector of linearized values
1746+
###
1747+
###
1748+
"mxfp_linearize" : { "result" : "anyvector",
1749+
"arguments" : [0],
1750+
"attributes" : "NoMem",
1751+
"platforms" : "Xe3P+",
1752+
},
1753+
16771754
### ``llvm.genx.*dp4a*.<return type>.<vector type>.<vector type>.<vector type>`` : dp4a instruction (Dot Product 4 Accumulate)
16781755
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
16791756
### * ``llvm.genx.ssdp4a`` : result signed, operands signed
@@ -1830,6 +1907,29 @@
18301907
"attributes" : "NoMem"
18311908
},
18321909

1910+
### ``llvm.genx.lfsr.*``: lfsr instructions
1911+
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1912+
### The instruction is used to generate a pseudo-random number sequence.
1913+
### The algorithm is based on a linear feedback shift register (LFSR).
1914+
###
1915+
### * arg0: seed, vNi32
1916+
### * arg1: polynomial, same type as arg0
1917+
### * arg2: mode, i8, must be compile-time constant
1918+
###
1919+
### * Return value: result, same type as arg0
1920+
###
1921+
### The mode argument is an enum with the following values:
1922+
### * 0: b32 - input is 32-bit value
1923+
### * 1: 2xb16 - input is a pair of 16-bit values
1924+
### * 2: 4xb8 - input is a quad of 8-bit values
1925+
###
1926+
"lfsr" : { "result" : "anyint",
1927+
"arguments" : [0, 0, "char"],
1928+
"attributes" : "NoMem",
1929+
"platforms" : "Xe3P+"
1930+
},
1931+
1932+
18331933
### srnd
18341934
### ^^^
18351935
###
@@ -1861,6 +1961,22 @@
18611961
},
18621962

18631963

1964+
### ``llvm.genx.biased.rounding.hf8.*`` : biased rounding instruction
1965+
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1966+
###
1967+
### * arg0: input, vNf16 or vNbf16
1968+
### * arg1: bias, vNi8
1969+
###
1970+
### * Return value: result, vNi8
1971+
###
1972+
### Note: bf16 can be substituted with i16.
1973+
###
1974+
"biased_rounding_hf8" : { "result" : "anyint",
1975+
"arguments" : ["anyvector", 0],
1976+
"attributes" : "NoMem",
1977+
"platforms" : "Xe3P+"
1978+
},
1979+
18641980
### bf_cvt
18651981
### ^^^^^^
18661982
###
@@ -1927,6 +2043,66 @@
19272043
"platforms" : "Xe3+"
19282044
},
19292045

2046+
2047+
## ``llvm.genx.packed.4bit.upconvert.lut`` : packed fp4/int4 upconvert operation
2048+
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2049+
##
2050+
## * arg0: lookup table, v16i32 vector.
2051+
## * arg1: input packed 4-bit data, vNi8 or vNi16 vector (overloaded),
2052+
## must be a region of <4;1,0> (for i8) or <2;1,0> (for i16).
2053+
##
2054+
## * Return value: packed 8 or 16-bit values, vNi32.
2055+
##
2056+
## Restrictions:
2057+
## * The only supported N values for this operation are 16 and 32.
2058+
##
2059+
## This intrinsic represents 4bit->8bit and 4bit->16bit upconvert through a lookup table
2060+
"packed_4bit_upconvert_lut" : { "result": "anyint",
2061+
"arguments": ["int16", "anyint"],
2062+
"attributes": "NoMem",
2063+
"platforms" : "Xe3P+", },
2064+
2065+
### ``llvm.genx.4bit.downconvert`` : fp4/int4 downconvert operation
2066+
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2067+
###
2068+
### The intrinsic represents bf16->4bit and f16->4bit downscale operations.
2069+
###
2070+
### * arg0: src0, vNi32
2071+
### * arg1: src1, vNi32
2072+
### * arg2: bias, vNi32, ignored when rounding mode is "round to nearest even"
2073+
### * arg3: conversion type, i8, enum value, must be constant
2074+
### * arg4: packing mode, i8, enum value, must be constant
2075+
### * arg5: rounding mode, i8, enum value, must be constant
2076+
###
2077+
### * Return value: result, vNi32. The output values are packed into 32-bit
2078+
### integers according to the packing mode operand.
2079+
###
2080+
### The supported output data formats are the following:
2081+
### * E2M1 - 2-bit exponent, 1-bit mantissa and 1-bit sign floating point values
2082+
### * Int4 - 4-bit signed integer values
2083+
###
2084+
### The conversion type argument (arg3) is an enum with the following values:
2085+
### * 1: BFtoE2M1 - bfloat16 to E2M1
2086+
### * 2: BFtoInt4 - bfloat16 to Int4
2087+
### * 4: HFtoE2M1 - half precision to E2M1
2088+
### * 5: HFtoInt4 - half precision to Int4
2089+
###
2090+
### The packing mode argument (arg4) is an enum with the following values (lsb to msb):
2091+
### * 0: { downscale(src0[0:15]) | downscale(src0[16:31]) << 4, 0, downscale(src1[0:15]) | downscale(src1[16:31]) << 4, 0 }
2092+
### * 1: { downscale(src0[0:15]) | downscale(src1[0:15]) << 4, 0, downscale(src0[16:31]) | downscale(src1[16:31]) << 4, 0 }
2093+
### * 2: { 0, downscale(src0[0:15]) | downscale(src0[16:31]) << 4, 0, downscale(src1[0:15]) | downscale(src1[16:31]) << 4 }
2094+
### * 3: { 0, downscale(src0[0:15]) | downscale(src1[0:15]) << 4, 0, downscale(src0[16:31]) | downscale(src1[16:31]) << 4 }
2095+
###
2096+
### The rounding mode argument (arg5) is an enum with the following values:
2097+
### * 0: biased round
2098+
### * 1: round to nearest even
2099+
###
2100+
"4bit_downconvert" : { "result": "anyint",
2101+
"arguments": [0, 0, 0, "char", "char", "char"],
2102+
"attributes": "NoMem",
2103+
"platforms" : "Xe3P+",
2104+
},
2105+
19302106
### ``llvm.genx.lsc.load.*.<return type if not void>.<any type>.<any type>`` : lsc_load instructions
19312107
### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
19322108
###
@@ -4717,6 +4893,43 @@
47174893
"arguments" : ["char","char","anyvector","char","char","char","int","int","anyvector","anyvector"],
47184894
"attributes" : "NoWillReturn"
47194895
},
4896+
## ``llvm.genx.raw.sendg`` : raw send generalized message
4897+
## ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4898+
##
4899+
## * arg0: i16 dst size (in bytes) [MBC]
4900+
## * arg1: i1 IsConditional
4901+
## * arg2: i1 IsEOT
4902+
## * arg3: i8 SFID [MBC]
4903+
## * arg4: vNxi1 Predicate (overloaded)
4904+
## * arg5: vector src0 (overloaded)
4905+
## * arg6: i16 src0 size (in bytes) [MBC]
4906+
## * arg7: vector src1 (overloaded)
4907+
## * arg8: i16 src1 size (in bytes) [MBC]
4908+
## * arg9: i64 indirect descriptor 0
4909+
## * arg10: i64 indirect descriptor 1
4910+
## * arg11: i64 descriptor [MBC]
4911+
## * arg12: passthru vector (same type as the return value) that supplies values for masked-off SIMD lanes
4912+
##
4913+
## * Return value: vector dst (overloaded)
4914+
##
4915+
"raw_sendg": { "result" : "anyvector",
4916+
"arguments" : [
4917+
"short", # dst size
4918+
"bool", # cond
4919+
"bool", # EOT
4920+
"char", # sfid
4921+
"anyint", # predicate
4922+
"anyvector", # src0
4923+
"short", # src0 size
4924+
"anyvector", # src1
4925+
"short", # src1 size
4926+
"long", # ind0
4927+
"long", # ind1
4928+
"long", # desc
4929+
0, # passthru
4930+
],
4931+
"attributes" : "NoWillReturn,SideEffects",
4932+
"platforms" : "Xe3P+" },
47204933

47214934
## ---------------------------
47224935
### Video Analytics Intrinsics

GenXIntrinsics/include/llvm/GenXIntrinsics/Intrinsics.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
"XeHPCVG",
100100
"Xe2",
101101
"Xe3",
102+
"Xe3P",
102103
]
103104

104105
def getAttributeList(Attrs):

0 commit comments

Comments
 (0)