55#include < cstdint>
66#include < cstring>
77#include < iterator>
8+ #include < type_traits>
89
910#include " float_common.h"
1011
12+ #ifdef FASTFLOAT_SSE2
13+ #include < emmintrin.h>
14+ #endif
15+
16+
1117namespace fast_float {
1218
19+ template <typename UC>
20+ fastfloat_really_inline constexpr bool has_simd_opt () {
21+ #ifdef FASTFLOAT_HAS_SIMD
22+ return std::is_same<UC, char16_t >::value;
23+ #else
24+ return false ;
25+ #endif
26+ }
27+
1328// Next function can be micro-optimized, but compilers are entirely
1429// able to optimize it well.
1530template <typename UC>
@@ -28,12 +43,14 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
2843 | (val & 0x00000000000000FF ) << 56 ;
2944}
3045
46+ // Read 8 UC into a u64. Truncates UC if not char.
47+ template <typename UC>
3148fastfloat_really_inline FASTFLOAT_CONSTEXPR20
32- uint64_t read_u64 (const char *chars) {
33- if (cpp20_and_in_constexpr ()) {
49+ uint64_t read8_to_u64 (const UC *chars) {
50+ if (cpp20_and_in_constexpr () || !std::is_same<UC, char >::value ) {
3451 uint64_t val = 0 ;
3552 for (int i = 0 ; i < 8 ; ++i) {
36- val |= uint64_t (*chars) << (i*8 );
53+ val |= uint64_t (uint8_t ( *chars) ) << (i*8 );
3754 ++chars;
3855 }
3956 return val;
@@ -47,6 +64,39 @@ uint64_t read_u64(const char *chars) {
4764 return val;
4865}
4966
67+ #ifdef FASTFLOAT_SSE2
68+
69+ fastfloat_really_inline
70+ uint64_t simd_read8_to_u64 (const __m128i data) {
71+ FASTFLOAT_SIMD_DISABLE_WARNINGS
72+ const __m128i packed = _mm_packus_epi16 (data, data);
73+ #ifdef FASTFLOAT_64BIT
74+ return uint64_t (_mm_cvtsi128_si64 (packed));
75+ #else
76+ uint64_t value;
77+ // Visual Studio + older versions of GCC don't support _mm_storeu_si64
78+ _mm_storel_epi64 (reinterpret_cast <__m128i*>(&value), packed);
79+ return value;
80+ #endif
81+ FASTFLOAT_SIMD_RESTORE_WARNINGS
82+ }
83+
84+ fastfloat_really_inline
85+ uint64_t simd_read8_to_u64 (const char16_t * chars) {
86+ FASTFLOAT_SIMD_DISABLE_WARNINGS
87+ return simd_read8_to_u64 (_mm_loadu_si128 (reinterpret_cast <const __m128i*>(chars)));
88+ FASTFLOAT_SIMD_RESTORE_WARNINGS
89+ }
90+
91+ #endif
92+
93+ // dummy for compile
94+ template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
95+ uint64_t simd_read8_to_u64 (UC const *) {
96+ return 0 ;
97+ }
98+
99+
50100fastfloat_really_inline FASTFLOAT_CONSTEXPR20
51101void write_u64 (uint8_t *chars, uint64_t val) {
52102 if (cpp20_and_in_constexpr ()) {
@@ -76,40 +126,80 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
76126 return uint32_t (val);
77127}
78128
79- fastfloat_really_inline constexpr
80- uint32_t parse_eight_digits_unrolled (const char16_t *) noexcept {
81- return 0 ;
82- }
83-
84- fastfloat_really_inline constexpr
85- uint32_t parse_eight_digits_unrolled (const char32_t *) noexcept {
86- return 0 ;
87- }
88129
130+ // Call this if chars are definitely 8 digits.
131+ template <typename UC>
89132fastfloat_really_inline FASTFLOAT_CONSTEXPR20
90- uint32_t parse_eight_digits_unrolled (const char *chars) noexcept {
91- return parse_eight_digits_unrolled (read_u64 (chars));
133+ uint32_t parse_eight_digits_unrolled (UC const * chars) noexcept {
134+ if (cpp20_and_in_constexpr () || !has_simd_opt<UC>()) {
135+ return parse_eight_digits_unrolled (read8_to_u64 (chars)); // truncation okay
136+ }
137+ return parse_eight_digits_unrolled (simd_read8_to_u64 (chars));
92138}
93139
140+
94141// credit @aqrit
95- fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast (uint64_t val) noexcept {
142+ fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast (uint64_t val) noexcept {
96143 return !((((val + 0x4646464646464646 ) | (val - 0x3030303030303030 )) &
97144 0x8080808080808080 ));
98145}
99146
100- fastfloat_really_inline constexpr
101- bool is_made_of_eight_digits_fast (const char16_t *) noexcept {
102- return false ;
147+
148+ #ifdef FASTFLOAT_HAS_SIMD
149+
150+ // Call this if chars might not be 8 digits.
151+ // Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled())
152+ // ensures we don't load SIMD registers twice.
153+ fastfloat_really_inline FASTFLOAT_CONSTEXPR20
154+ bool simd_parse_if_eight_digits_unrolled (const char16_t * chars, uint64_t & i) noexcept {
155+ if (cpp20_and_in_constexpr ()) {
156+ return false ;
157+ }
158+ #ifdef FASTFLOAT_SSE2
159+ FASTFLOAT_SIMD_DISABLE_WARNINGS
160+ const __m128i data = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(chars));
161+
162+ // (x - '0') <= 9
163+ // http://0x80.pl/articles/simd-parsing-int-sequences.html
164+ const __m128i t0 = _mm_add_epi16 (data, _mm_set1_epi16 (32720 ));
165+ const __m128i t1 = _mm_cmpgt_epi16 (t0, _mm_set1_epi16 (-32759 ));
166+
167+ if (_mm_movemask_epi8 (t1) == 0 ) {
168+ i = i * 100000000 + parse_eight_digits_unrolled (simd_read8_to_u64 (data));
169+ return true ;
170+ }
171+ else return false ;
172+ FASTFLOAT_SIMD_RESTORE_WARNINGS
173+ #endif
103174}
104175
105- fastfloat_really_inline constexpr
106- bool is_made_of_eight_digits_fast (const char32_t *) noexcept {
107- return false ;
176+ #endif
177+
178+ // dummy for compile
179+ template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
180+ uint64_t simd_parse_if_eight_digits_unrolled (UC const *, uint64_t &) {
181+ return 0 ;
182+ }
183+
184+
185+ template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char >::value)>
186+ fastfloat_really_inline FASTFLOAT_CONSTEXPR20
187+ void loop_parse_if_eight_digits (const UC*& p, const UC* const pend, uint64_t & i) {
188+ if (!has_simd_opt<UC>()) {
189+ return ;
190+ }
191+ while ((std::distance (p, pend) >= 8 ) && simd_parse_if_eight_digits_unrolled (p, i)) { // in rare cases, this will overflow, but that's ok
192+ p += 8 ;
193+ }
108194}
109195
110196fastfloat_really_inline FASTFLOAT_CONSTEXPR20
111- bool is_made_of_eight_digits_fast (const char *chars) noexcept {
112- return is_made_of_eight_digits_fast (read_u64 (chars));
197+ void loop_parse_if_eight_digits (const char *& p, const char * const pend, uint64_t & i) {
198+ // optimizes better than parse_if_eight_digits_unrolled() for UC = char.
199+ while ((std::distance (p, pend) >= 8 ) && is_made_of_eight_digits_fast (read8_to_u64 (p))) {
200+ i = i * 100000000 + parse_eight_digits_unrolled (read8_to_u64 (p)); // in rare cases, this will overflow, but that's ok
201+ p += 8 ;
202+ }
113203}
114204
115205template <typename UC>
@@ -124,8 +214,10 @@ struct parsed_number_string_t {
124214 span<const UC> integer{}; // non-nullable
125215 span<const UC> fraction{}; // nullable
126216};
127- using byte_span = span<char >;
217+
218+ using byte_span = span<const char >;
128219using parsed_number_string = parsed_number_string_t <char >;
220+
129221// Assuming that you use no more than 19 digits, this will
130222// parse an ASCII string.
131223template <typename UC>
@@ -171,12 +263,8 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
171263 UC const * before = p;
172264 // can occur at most twice without overflowing, but let it occur more, since
173265 // for integers with many digits, digit parsing is the primary bottleneck.
174- if (std::is_same<UC,char >::value) {
175- while ((std::distance (p, pend) >= 8 ) && is_made_of_eight_digits_fast (p)) {
176- i = i * 100000000 + parse_eight_digits_unrolled (p); // in rare cases, this will overflow, but that's ok
177- p += 8 ;
178- }
179- }
266+ loop_parse_if_eight_digits (p, pend, i);
267+
180268 while ((p != pend) && is_integer (*p)) {
181269 uint8_t digit = uint8_t (*p - UC (' 0' ));
182270 ++p;
@@ -241,29 +329,31 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
241329 if (*start == UC (' 0' )) { digit_count --; }
242330 start++;
243331 }
332+
244333 if (digit_count > 19 ) {
245334 answer.too_many_digits = true ;
246335 // Let us start again, this time, avoiding overflows.
247336 // We don't need to check if is_integer, since we use the
248337 // pre-tokenized spans from above.
249338 i = 0 ;
250339 p = answer.integer .ptr ;
251- UC const * int_end = p + answer.integer .len ();
252- const uint64_t minimal_nineteen_digit_integer{1000000000000000000 };
253- while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
340+ UC const * int_end = p + answer.integer .len ();
341+ const uint64_t minimal_nineteen_digit_integer{ 1000000000000000000 };
342+ while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
254343 i = i * 10 + uint64_t (*p - UC (' 0' ));
255344 ++p;
256345 }
257346 if (i >= minimal_nineteen_digit_integer) { // We have a big integers
258347 exponent = end_of_integer_part - p + exp_number;
259- } else { // We have a value with a fractional component.
260- p = answer.fraction .ptr ;
261- UC const * frac_end = p + answer.fraction .len ();
262- while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
263- i = i * 10 + uint64_t (*p - UC (' 0' ));
264- ++p;
265- }
266- exponent = answer.fraction .ptr - p + exp_number;
348+ }
349+ else { // We have a value with a fractional component.
350+ p = answer.fraction .ptr ;
351+ UC const * frac_end = p + answer.fraction .len ();
352+ while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
353+ i = i * 10 + uint64_t (*p - UC (' 0' ));
354+ ++p;
355+ }
356+ exponent = answer.fraction .ptr - p + exp_number;
267357 }
268358 // We have now corrected both exponent and i, to a truncated value
269359 }
0 commit comments