@@ -1519,5 +1519,184 @@ void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegiste
}
}

+ //Based on the Intel IPsec multi-buffer implementation (intel-ipsec-mb on GitHub)
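+ //Computes a SHA-512 update with the SHA-512 NI instructions
+ //(sha512rnds2 / sha512msg1 / sha512msg2). arg_hash points to the eight
+ //64-bit state words, arg_msg to the input; in multi_block mode, 128-byte
+ //blocks are consumed until ofs passes limit.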
+ void MacroAssembler::sha512_update_ni_x1(Register arg_hash, Register arg_msg, Register ofs, Register limit, bool multi_block) {
+   Label done_hash, block_loop;
+   address K512_W = StubRoutines::x86::k512_W_addr();
+
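+   //ymm15 holds the byte-flip mask used to load the big-endian message words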
+   vbroadcasti128(xmm15, ExternalAddress(StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512()), Assembler::AVX_256bit, r10);
+
+   //load current hash value and transform
+   vmovdqu(xmm0, Address(arg_hash));
+   vmovdqu(xmm1, Address(arg_hash, 32));
+   //ymm0 = D C B A, ymm1 = H G F E
+   vperm2i128(xmm2, xmm0, xmm1, 0x20);
+   vperm2i128(xmm3, xmm0, xmm1, 0x31);
+   //ymm2 = F E B A, ymm3 = H G D C
+   vpermq(xmm13, xmm2, 0x1b, Assembler::AVX_256bit);
+   vpermq(xmm14, xmm3, 0x1b, Assembler::AVX_256bit);
+   //ymm13 = A B E F, ymm14 = C D G H
+
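+   //rax -> K512 round-constant table: 80 constants laid out as 20 groups of
+   //four, one 32-byte group per 4-round step below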
+   lea(rax, ExternalAddress(K512_W));
+   align(32);
+   bind(block_loop);
+   vmovdqu(xmm11, xmm13); //ABEF
+   vmovdqu(xmm12, xmm14); //CDGH
+
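+   //each 4-round group below: add four round constants to four message words,
+   //then run sha512rnds2 twice (two rounds per call), swapping the 128-bit
+   //halves of the constant-added words in between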
+   //R0 - R3
+   vmovdqu(xmm0, Address(arg_msg, 0 * 32));
+   vpshufb(xmm3, xmm0, xmm15, Assembler::AVX_256bit); //ymm0 / ymm3 = W[0..3]
+   vpaddq(xmm0, xmm3, Address(rax, 0 * 32), Assembler::AVX_256bit);
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+
+   //R4 - R7
+   vmovdqu(xmm0, Address(arg_msg, 1 * 32));
+   vpshufb(xmm4, xmm0, xmm15, Assembler::AVX_256bit); //ymm0 / ymm4 = W[4..7]
+   vpaddq(xmm0, xmm4, Address(rax, 1 * 32), Assembler::AVX_256bit);
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+   sha512msg1(xmm3, xmm4); //ymm3 = W[0..3] + S0(W[1..4])
+
+   //R8 - R11
+   vmovdqu(xmm0, Address(arg_msg, 2 * 32));
+   vpshufb(xmm5, xmm0, xmm15, Assembler::AVX_256bit); //ymm0 / ymm5 = W[8..11]
+   vpaddq(xmm0, xmm5, Address(rax, 2 * 32), Assembler::AVX_256bit);
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+   sha512msg1(xmm4, xmm5); //ymm4 = W[4..7] + S0(W[5..8])
+
+   //R12 - R15
+   vmovdqu(xmm0, Address(arg_msg, 3 * 32));
+   vpshufb(xmm6, xmm0, xmm15, Assembler::AVX_256bit); //ymm0 / ymm6 = W[12..15]
+   vpaddq(xmm0, xmm6, Address(rax, 3 * 32), Assembler::AVX_256bit);
+   vpermq(xmm8, xmm6, 0x1b, Assembler::AVX_256bit); //ymm8 = W[12] W[13] W[14] W[15]
+   vpermq(xmm9, xmm5, 0x39, Assembler::AVX_256bit); //ymm9 = W[8] W[11] W[10] W[9]
+   vpblendd(xmm8, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm8 = W[12] W[11] W[10] W[9]
+   vpaddq(xmm3, xmm3, xmm8, Assembler::AVX_256bit);
+   sha512msg2(xmm3, xmm6); //W[16..19] = xmm3 + W[9..12] + S1(W[14..17])
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+   sha512msg1(xmm5, xmm6); //ymm5 = W[8..11] + S0(W[9..12])
+
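+   //rounds 16-63: three 16-round iterations; the schedule registers ymm3-ymm6
+   //rotate roles every four rounds (comments show the first iteration's word
+   //indices)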
+   //R16 - R19, R32 - R35, R48 - R51
+   for (int i = 4, j = 3; j > 0; j--) {
+     vpaddq(xmm0, xmm3, Address(rax, i * 32), Assembler::AVX_256bit);
+     vpermq(xmm8, xmm3, 0x1b, Assembler::AVX_256bit); //ymm8 = W[16] W[17] W[18] W[19]
+     vpermq(xmm9, xmm6, 0x39, Assembler::AVX_256bit); //ymm9 = W[12] W[15] W[14] W[13]
+     vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm7 = W[16] W[15] W[14] W[13]
+     vpaddq(xmm4, xmm4, xmm7, Assembler::AVX_256bit); //ymm4 = W[4..7] + S0(W[5..8]) + W[13..16]
+     sha512msg2(xmm4, xmm3); //ymm4 += S1(W[14..17])
+     sha512rnds2(xmm12, xmm11, xmm0);
+     vperm2i128(xmm0, xmm0, xmm0, 0x01);
+     sha512rnds2(xmm11, xmm12, xmm0);
+     sha512msg1(xmm6, xmm3); //ymm6 = W[12..15] + S0(W[13..16])
+     i += 1;
+
+     //R20 - R23, R36 - R39, R52 - R55
+     vpaddq(xmm0, xmm4, Address(rax, i * 32), Assembler::AVX_256bit);
+     vpermq(xmm8, xmm4, 0x1b, Assembler::AVX_256bit); //ymm8 = W[20] W[21] W[22] W[23]
+     vpermq(xmm9, xmm3, 0x39, Assembler::AVX_256bit); //ymm9 = W[16] W[19] W[18] W[17]
+     vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm7 = W[20] W[19] W[18] W[17]
+     vpaddq(xmm5, xmm5, xmm7, Assembler::AVX_256bit); //ymm5 = W[8..11] + S0(W[9..12]) + W[17..20]
+     sha512msg2(xmm5, xmm4); //ymm5 += S1(W[18..21])
+     sha512rnds2(xmm12, xmm11, xmm0);
+     vperm2i128(xmm0, xmm0, xmm0, 0x01);
+     sha512rnds2(xmm11, xmm12, xmm0);
+     sha512msg1(xmm3, xmm4); //ymm3 = W[16..19] + S0(W[17..20])
+     i += 1;
+
+     //R24 - R27, R40 - R43, R56 - R59
+     vpaddq(xmm0, xmm5, Address(rax, i * 32), Assembler::AVX_256bit);
+     vpermq(xmm8, xmm5, 0x1b, Assembler::AVX_256bit); //ymm8 = W[24] W[25] W[26] W[27]
+     vpermq(xmm9, xmm4, 0x39, Assembler::AVX_256bit); //ymm9 = W[20] W[23] W[22] W[21]
+     vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm7 = W[24] W[23] W[22] W[21]
+     vpaddq(xmm6, xmm6, xmm7, Assembler::AVX_256bit); //ymm6 = W[12..15] + S0(W[13..16]) + W[21..24]
+     sha512msg2(xmm6, xmm5); //ymm6 += S1(W[22..25])
+     sha512rnds2(xmm12, xmm11, xmm0);
+     vperm2i128(xmm0, xmm0, xmm0, 0x01);
+     sha512rnds2(xmm11, xmm12, xmm0);
+     sha512msg1(xmm4, xmm5); //ymm4 = W[20..23] + S0(W[21..24])
+     i += 1;
+
+     //R28 - R31, R44 - R47, R60 - R63
+     vpaddq(xmm0, xmm6, Address(rax, i * 32), Assembler::AVX_256bit);
+     vpermq(xmm8, xmm6, 0x1b, Assembler::AVX_256bit); //ymm8 = W[28] W[29] W[30] W[31]
+     vpermq(xmm9, xmm5, 0x39, Assembler::AVX_256bit); //ymm9 = W[24] W[27] W[26] W[25]
+     vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm7 = W[28] W[27] W[26] W[25]
+     vpaddq(xmm3, xmm3, xmm7, Assembler::AVX_256bit); //ymm3 = W[16..19] + S0(W[17..20]) + W[25..28]
+     sha512msg2(xmm3, xmm6); //ymm3 += S1(W[26..29])
+     sha512rnds2(xmm12, xmm11, xmm0);
+     vperm2i128(xmm0, xmm0, xmm0, 0x01);
+     sha512rnds2(xmm11, xmm12, xmm0);
+     sha512msg1(xmm5, xmm6); //ymm5 = W[24..27] + S0(W[25..28])
+     i += 1;
+   }
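+
+   //rounds 64-79 are unrolled below; message-schedule expansion winds down,
+   //since no words beyond W[79] are needed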
+   //R64 - R67
+   vpaddq(xmm0, xmm3, Address(rax, 16 * 32), Assembler::AVX_256bit);
+   vpermq(xmm8, xmm3, 0x1b, Assembler::AVX_256bit); //ymm8 = W[64] W[65] W[66] W[67]
+   vpermq(xmm9, xmm6, 0x39, Assembler::AVX_256bit); //ymm9 = W[60] W[63] W[62] W[61]
+   vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm7 = W[64] W[63] W[62] W[61]
+   vpaddq(xmm4, xmm4, xmm7, Assembler::AVX_256bit); //ymm4 = W[52..55] + S0(W[53..56]) + W[61..64]
+   sha512msg2(xmm4, xmm3); //ymm4 += S1(W[62..65])
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+   sha512msg1(xmm6, xmm3); //ymm6 = W[60..63] + S0(W[61..64])
+
+   //R68 - R71
+   vpaddq(xmm0, xmm4, Address(rax, 17 * 32), Assembler::AVX_256bit);
+   vpermq(xmm8, xmm4, 0x1b, Assembler::AVX_256bit); //ymm8 = W[68] W[69] W[70] W[71]
+   vpermq(xmm9, xmm3, 0x39, Assembler::AVX_256bit); //ymm9 = W[64] W[67] W[66] W[65]
+   vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm7 = W[68] W[67] W[66] W[65]
+   vpaddq(xmm5, xmm5, xmm7, Assembler::AVX_256bit); //ymm5 = W[56..59] + S0(W[57..60]) + W[65..68]
+   sha512msg2(xmm5, xmm4); //ymm5 += S1(W[66..69])
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+
+   //R72 - R75
+   vpaddq(xmm0, xmm5, Address(rax, 18 * 32), Assembler::AVX_256bit);
+   vpermq(xmm8, xmm5, 0x1b, Assembler::AVX_256bit); //ymm8 = W[72] W[73] W[74] W[75]
+   vpermq(xmm9, xmm4, 0x39, Assembler::AVX_256bit); //ymm9 = W[68] W[71] W[70] W[69]
+   vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm7 = W[72] W[71] W[70] W[69]
+   vpaddq(xmm6, xmm6, xmm7, Assembler::AVX_256bit); //ymm6 = W[60..63] + S0(W[61..64]) + W[69..72]
+   sha512msg2(xmm6, xmm5); //ymm6 += S1(W[70..73])
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+
+   //R76 - R79
+   vpaddq(xmm0, xmm6, Address(rax, 19 * 32), Assembler::AVX_256bit);
+   sha512rnds2(xmm12, xmm11, xmm0);
+   vperm2i128(xmm0, xmm0, xmm0, 0x01);
+   sha512rnds2(xmm11, xmm12, xmm0);
+
+   //update hash value: add the compressed working state back into H (feed-forward)
+   vpaddq(xmm14, xmm14, xmm12, Assembler::AVX_256bit);
+   vpaddq(xmm13, xmm13, xmm11, Assembler::AVX_256bit);
+
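+   //in multi-block mode, step to the next 128-byte block and loop while
+   //ofs <= limit; the updated offset is left in rax as the return value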
+   if (multi_block) {
+     addptr(arg_msg, 4 * 32);
+     addptr(ofs, 128);
+     cmpptr(ofs, limit);
+     jcc(Assembler::belowEqual, block_loop);
+     movptr(rax, ofs); //return ofs
+   }
+
+   //store the hash value back in memory
+   //ymm13 = ABEF
+   //ymm14 = CDGH
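+   //invert the ABEF/CDGH transform applied at function entry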
+   vperm2i128(xmm1, xmm13, xmm14, 0x31);
+   vperm2i128(xmm2, xmm13, xmm14, 0x20);
+   vpermq(xmm1, xmm1, 0xb1, Assembler::AVX_256bit); //ymm1 = D C B A
+   vpermq(xmm2, xmm2, 0xb1, Assembler::AVX_256bit); //ymm2 = H G F E
+   vmovdqu(Address(arg_hash, 0 * 32), xmm1);
+   vmovdqu(Address(arg_hash, 1 * 32), xmm2);
+
+   bind(done_hash);
+ }
+
#endif //#ifdef _LP64