Skip to content

Win32: htonl/htons/ntohl/ntohs change slow winsock exports -> 1 CPU op/ins #23330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: blead
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions XSUB.h
Original file line number Diff line number Diff line change
Expand Up @@ -603,10 +603,12 @@ Rethrows a previously caught exception. See L<perlguts/"Exception Handling">.
# define signal PerlProc_signal
# define getpid PerlProc_getpid
# define gettimeofday PerlProc_gettimeofday
#ifndef PERL_MY_HOST_NET_BYTE_SWAP
# define htonl PerlSock_htonl
# define htons PerlSock_htons
# define ntohl PerlSock_ntohl
# define ntohs PerlSock_ntohs
#endif
# define accept PerlSock_accept
# define bind PerlSock_bind
# define connect PerlSock_connect
Expand Down
8 changes: 7 additions & 1 deletion embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -6468,7 +6468,13 @@ p |bool |get_win32_message_utf8ness \
|NULLOK const char *string
Teor |void |win32_croak_not_implemented \
|NN const char *fname
#else
# if !defined(PERL_MY_HOST_NET_BYTE_SWAP)
DTbo |u_long |win32_htonl |u_long hostlong
DTbo |u_short|win32_htons |u_short hostshort
DTbo |u_long |win32_ntohl |u_long netlong
DTbo |u_short|win32_ntohs |u_short netshort
# endif
#else /* if !defined(WIN32) */
p |bool |do_exec3 |NN const char *incmd \
|int fd \
|int do_report
Expand Down
19 changes: 19 additions & 0 deletions iperlsys.h
Original file line number Diff line number Diff line change
Expand Up @@ -1144,10 +1144,12 @@ struct IPerlProcInfo
/* PerlSock */
struct IPerlSock;
struct IPerlSockInfo;
#ifndef PERL_MY_HOST_NET_BYTE_SWAP
typedef u_long (*LPHtonl)(const struct IPerlSock**, u_long);
typedef u_short (*LPHtons)(const struct IPerlSock**, u_short);
typedef u_long (*LPNtohl)(const struct IPerlSock**, u_long);
typedef u_short (*LPNtohs)(const struct IPerlSock**, u_short);
#endif
typedef SOCKET (*LPAccept)(const struct IPerlSock**, SOCKET,
struct sockaddr*, int*);
typedef int (*LPBind)(const struct IPerlSock**, SOCKET,
Expand Down Expand Up @@ -1208,10 +1210,12 @@ typedef int (*LPClosesocket)(const struct IPerlSock**, SOCKET s);

struct IPerlSock
{
#ifndef PERL_MY_HOST_NET_BYTE_SWAP
LPHtonl pHtonl;
LPHtons pHtons;
LPNtohl pNtohl;
LPNtohs pNtohs;
#endif
LPAccept pAccept;
LPBind pBind;
LPConnect pConnect;
Expand Down Expand Up @@ -1262,6 +1266,19 @@ struct IPerlSockInfo
struct IPerlSock perlSockList;
};

#ifdef PERL_MY_HOST_NET_BYTE_SWAP
/* perl.h has provides a much more efficient inlined implementation of
htonl(), htons(), ntohl(), ntohs() compared to the "native" exported
extern linkage functions exported by ws2_32.dll. Both Mingw GCCs
and MSVCs headers, only offer exports from ws2_32.dll for those 4
tokens, without any alternative. Writing "r = htonl(n);" a C Windows
app is an improper anti-pattern. The official, correct, identifier is
RtlUlongByteSwap() on the Windows Platform. */
# define PerlSock_htonl(x) htonl(x)
# define PerlSock_htons(x) htons(x)
# define PerlSock_ntohl(x) ntohl(x)
# define PerlSock_ntohs(x) ntohs(x)
#else
# define PerlSock_htonl(x) \
((*(PL_Sock))->pHtonl)(PL_Sock, x)
# define PerlSock_htons(x) \
Expand All @@ -1270,6 +1287,8 @@ struct IPerlSockInfo
((*(PL_Sock))->pNtohl)(PL_Sock, x)
# define PerlSock_ntohs(x) \
((*(PL_Sock))->pNtohs)(PL_Sock, x)
#endif

# define PerlSock_accept(s, a, l) \
((*(PL_Sock))->pAccept)(PL_Sock, s, a, l)
# define PerlSock_bind(s, n, l) \
Expand Down
51 changes: 51 additions & 0 deletions mathoms.c
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,57 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen);
}

/* These 4 exported functions are unused/deprecated/mathoms. WinPerl hooks
and replaces the htonl(), ntohl(), etc symbols provided by Mingw GCC and
MSVC headers, which are exported extern "C" functions from ws2_32.dll,
with 1 CPU instruction big, inline intrinsics. ws2_32.dll's implementation
is 11-15 instructions long. Clang and GCC for WinOS correctly convert
expression "& << |" to 1 CPU instruction, MSVC build numbers released prior
to Fall 2023 don't do that 1 CPU opcode optimization.
Because of frozen public API src code compatibility and object code linker
reasons, neither (Mingw or MS SDK) Clang, Mingw GCC or MSVC compilers can
correct the day 1 1993 mistake that tokens htonl(), ntohl(), etc, are
external linkage symbols from ws2_32.dll,

cpangrep shows exactly 1 module uses WinPerl's win32_*() prefixed sockets API
byte order swappers.

cpangrep: win32_htonl|win32_htons|win32_ntohl|win32_ntohs
https://metacpan.org/release/Prima/source/win32/files.c#L750

For now, these symbols are still exported, incase they are linked by a
XS .dll, that has a TU/.o/.c that doesn't #include "perl.h" and
declared function win32_htonl() themselves or they are using GetProcAddress(). */

#undef win32_htonl
#undef win32_htons
#undef win32_ntohl
#undef win32_ntohs

u_long
win32_htonl(u_long hostlong)
{
return htonl(hostlong);
}

u_short
win32_htons(u_short hostshort)
{
return htons(hostshort);
}

u_long
win32_ntohl(u_long netlong)
{
return ntohl(netlong);
}

u_short
win32_ntohs(u_short netshort)
{
return ntohs(netshort);
}

GCC_DIAG_RESTORE

#endif /* NO_MATHOMS */
Expand Down
165 changes: 146 additions & 19 deletions perl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4571,23 +4571,61 @@ struct ptr_tbl {
struct ptr_tbl_ent *tbl_arena_end;
};

#if defined(htonl) && !defined(HAS_HTONL)
#define HAS_HTONL
#endif
#if defined(htons) && !defined(HAS_HTONS)
#define HAS_HTONS
#endif
#if defined(ntohl) && !defined(HAS_NTOHL)
#define HAS_NTOHL
#endif
#if defined(ntohs) && !defined(HAS_NTOHS)
#define HAS_NTOHS
/* Override the C compiler's built in byte order swapping functions and
implement our own BE/LE conversion functions. All modern CCs on all CPU archs
should be using an inline intrinsic, that maps to 1 CPU instructions, max 3.
Worst possible correct and incompetent implementation a C compiler can do:

libperl calls symbol htonl() through PLT/GOT to libc, libc then declares
stack 2 x unsigned char buf[4]; plus 2 calls through PLT/GOT to symbol
memcpy(); plus for( i=0; i<4; i++) {} loop.

Before defining PERL_MY_HOST_NET_BYTE_SWAP, remember to read the -O1 or -O2
optimized assembly code created by a production grade C compiler, to see
if there actually is a defect/flaw or not with the permutation of CC/CPU/OS
you are using. Don't accidentally turn 1 CPU op into 6 CPU ops, and document
that change as an "optimization".

The only known flawed CC is MSVC all versions and build numbers of
cl.exe/link.exe released to the pubic before Fall 2023. It has a 1 of 5
stars, if C token "htonl()" is used (perl uses Win32's PLT/GOT).
2 of 5 stars if "& << |" expression is used. Build numbers of MSVC released
during or after Fall 2023, produce correct and perfect machine code identical
to what GCC/Clang would emit. */

#ifdef PERL_MY_HOST_NET_BYTE_SWAP
# undef htonl
# undef HAS_HTONL
# undef htons
# undef HAS_HTONS
# undef ntohl
# undef HAS_NTOHL
# undef ntohl
# undef HAS_NTOHL
# undef ntohs
# undef HAS_NTOHS
# undef htonll
# undef HAS_HTONLL
#else
# if defined(htonl) && !defined(HAS_HTONL)
# define HAS_HTONL
# endif
# if defined(htons) && !defined(HAS_HTONS)
# define HAS_HTONS
# endif
# if defined(ntohl) && !defined(HAS_NTOHL)
# define HAS_NTOHL
# endif
# if defined(ntohs) && !defined(HAS_NTOHS)
# define HAS_NTOHS
# endif
#endif

#ifndef HAS_HTONL
#define HAS_HTONS
#define HAS_HTONL
#define HAS_NTOHS
#define HAS_NTOHL
# define HAS_HTONS
# define HAS_HTONL
# define HAS_NTOHS
# define HAS_NTOHL
# if (BYTEORDER & 0xffff) == 0x4321
/* Big endian system, so ntohl, ntohs, htonl and htons do not need to
re-order their values. However, to behave identically to the alternative
Expand All @@ -4604,21 +4642,65 @@ struct ptr_tbl {
that *declare* the various functions are still seen. If we declare our own
htonl etc they will clash with the declarations in the Win32 headers. */

# ifdef _MSC_VER
# pragma intrinsic(_byteswap_ulong)
# pragma intrinsic(_byteswap_ushort)
# endif

# if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(DEBUGGING))
PERL_STATIC_INLINE U32
my_swap32(const U32 x) {
# ifdef _MSC_VER
return _byteswap_ulong(x);
# else
return ((x & 0xFF) << 24) | ((x >> 24) & 0xFF)
| ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8);
# endif
}

PERL_STATIC_INLINE U16
my_swap16(const U16 x) {

# ifdef _MSC_VER
return _byteswap_ushort(x);
# else
return ((x & 0xFF) << 8) | ((x >> 8) & 0xFF);
# endif
}
# endif

/* all CCs except MSVC use the static inlines above, unoptimized MSVC Perl
built with -DDEBUGGING, also uses the statics above, to make single stepping
and breakpoints easier to use. MSVC's C/C++ front end parser does not
recogize the traditional "& << |" expression as a synonym for i386/x64/ARM's
byteswap CPU instruction. This MSVC bug was fixed in MSVC 2022 build number
19.37/17.7 released Aug 8 2023. All MSVC 2022 build numbers <= 19.36/17.6
have the bug. Explicitly tell MSVC to use the byte swap opcode solves the
problem. VC's _byteswap_ulong() is declared as an intrinsic function.
It will not cause multi-eval problems the way a macro would. So skip the
my_swap() wrappers.

TODO: add special casing for __bswap_32(), __builtin_bswap32(),
bswap_32(), cpu_to_be32(), swap32(), read_be32(), write_be32(), htobe32(),
OSSwapInt32(), and on April 1st, impliment RtlUlongByteSwap(). Remember to
the history and HW compatibility of each of these CC tokens before adding
them. If hello_world.c executes and the Desktop GUI works, the defaults
in Configure can't produce a binary that SIGILLs on 1 of 2 systems,
made 2 years apart, running the same OS version, after a copy paste or
cloud server deployment of a /usr/bin/perl file.
*/

# define htonl(x) my_swap32(x)
# define ntohl(x) my_swap32(x)
# define ntohs(x) my_swap16(x)
# define htons(x) my_swap16(x)
# if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(DEBUGGING))
# define htonl(x) my_swap32(x)
# define ntohl(x) my_swap32(x)
# define ntohs(x) my_swap16(x)
# define htons(x) my_swap16(x)
# else
# define htonl(x) _byteswap_ulong(x)
# define ntohl(x) _byteswap_ulong(x)
# define ntohs(x) _byteswap_ushort(x)
# define htons(x) _byteswap_ushort(x)
# endif
# else
# error "Unsupported byteorder"
/* The C pre-processor doesn't let us return the value of BYTEORDER as part of
Expand All @@ -4635,6 +4717,51 @@ my_swap16(const U16 x) {
# endif
#endif

#if defined(htonll) && !defined(HAS_HTONLL)
# define HAS_HTONLL
#endif

#ifndef HAS_HTONLL
# define HAS_HTONLL
# if (BYTEORDER & 0xffff) == 0x4321
# define ntohll(x) ((x)&0xFFFFFFFFFFFFFFFF)
# define htonll(x) ntohll(x)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't actually thing we need ntohll and htonll

Copy link
Contributor Author

@bulk88 bulk88 May 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://github.com/Perl/perl5/blob/blead/pp_pack.c#L146
https://github.com/Perl/perl5/blob/blead/pp_pack.c#L240
https://github.com/Perl/perl5/blob/blead/pp_pack.c#L3026
https://github.com/Perl/perl5/blob/blead/pp_pack.c#L2966

PP pack(), is doing the worst possible algorithm to swap byte order for 'Q'/U64.

https://github.com/Perl/perl5/blob/blead/ext/B/B.xs#L1608

B:: wants htonll at this line and is DIYing it.

https://github.com/Perl/perl5/blob/blead/cpan/Digest-SHA/src/sha.h#L91

Digest::SHA wants htonll at this line and is DIYing it.

https://github.com/Perl/perl5/blob/blead/dist/Storable/Storable.xs#L1059

Storable wants htonll at this line and is DIYing it.

https://github.com/Perl/perl5/blob/blead/ext/XS-APItest/APItest.xs#L7881

XS::APItest and P5P's default hash algo S_perl_siphash_seed_state() want it on BE CPUs

https://grep.metacpan.org/search?q=ntohll&qft=*.xs%2C*.c%2C*.h&qd=&qifl=

atleast 5 CPAN modules are DIYing it or using OS version or it, there are more, im using a narrow regexp

Copy link
Contributor Author

@bulk88 bulk88 May 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't actually thing we need ntohll and htonll

Whats your thoughts about evangelizing to CPAN authors identifiers my_swap64 or htonll or PerlSock_htonll?

_ # 1 and # 3 are P5P proprietary. # 2 is filling in missing POSIX APIs, where the perl VM impl, will get (good thing) accidental usage by CPAN XS authors, who dont know, dont care, or bare minimum care about P5P invented identifiers.

These XS authors are beginner or low effort or FT non-Perl C devs but low time people. So they will use max POSIX C/Linux C tokens that they are comfortable with where possible, vs the P5P invented "portable" C tokens that are an alien language from an extraterrestrial planet to them.

# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
# ifdef _MSC_VER
# pragma intrinsic(_byteswap_uint64)
# endif

# if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(DEBUGGING))
PERL_STATIC_INLINE U64
my_swap64(const U64 x) {
# ifdef _MSC_VER
return _byteswap_uint64(x);
# else
/* return ( ((x & 0xff00000000000000) >> 56) | ((x & 0x00ff000000000000) >> 40)
| ((x & 0x0000ff0000000000) >> 24) | ((x & 0x000000ff00000000) >> 8)
| ((x & 0x00000000ff000000) << 8) | ((x & 0x0000000000ff0000) << 24)
| ((x & 0x000000000000ff00) << 40) | ((x & 0x00000000000000ff) << 56));
return ((U64)htonl(x & 0xFFFFFFFF) << 32) | htonl(x >> 32); */
U64 r;
r = (x & 0x00000000FFFFFFFF) << 32 | (x & 0xFFFFFFFF00000000) >> 32;
r = (r & 0x0000FFFF0000FFFF) << 16 | (r & 0xFFFF0000FFFF0000) >> 16;
r = (r & 0x00FF00FF00FF00FF) << 8 | (r & 0xFF00FF00FF00FF00) >> 8;
return r;
# endif
}
# endif
# if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(DEBUGGING))
# define htonll(x) my_swap64(x)
# define ntohll(x) my_swap64(x)
# else
# define htonll(x) _byteswap_uint64(x)
# define ntohll(x) _byteswap_uint64(x)
# endif
# else
# error "Unsupported byteorder"
# endif
#endif

/*
* Little-endian byte order functions - 'v' for 'VAX', or 'reVerse'.
* -DWS
Expand Down
25 changes: 25 additions & 0 deletions proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading