mirror of
				https://github.com/Drezil/imgui.git
				synced 2025-10-31 05:01:05 +01:00 
			
		
		
		
	Replace UTF-8 decoder with one based on branchless version by Christopher Wellons. (not branchless anymore tho)
Decoding performance increase ~30%
This commit is contained in:
		
							
								
								
									
										102
									
								
								imgui.cpp
									
									
									
									
									
								
							
							
						
						
									
										102
									
								
								imgui.cpp
									
									
									
									
									
								
							| @@ -1554,66 +1554,58 @@ void*   ImFileLoadToMemory(const char* filename, const char* mode, size_t* out_f | ||||
| //----------------------------------------------------------------------------- | ||||
|  | ||||
| // Convert UTF-8 to 32-bit character, process single character input. | ||||
| // Based on stb_from_utf8() from github.com/nothings/stb/ | ||||
| // A nearly-branchless UTF-8 decoder, based on work of Christopher Wellons (https://github.com/skeeto/branchless-utf8). | ||||
| // We handle UTF-8 decoding error by skipping forward. | ||||
| int ImTextCharFromUtf8(unsigned int* out_char, const char* in_text, const char* in_text_end) | ||||
| { | ||||
|     unsigned int c = (unsigned int)-1; | ||||
|     const unsigned char* str = (const unsigned char*)in_text; | ||||
|     if (!(*str & 0x80)) | ||||
|     static const char lengths[32] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 }; | ||||
|     static const int masks[]  = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 }; | ||||
|     static const uint32_t mins[] = { 0x400000, 0, 0x80, 0x800, 0x10000 }; | ||||
|     static const int shiftc[] = { 0, 18, 12, 6, 0 }; | ||||
|     static const int shifte[] = { 0, 6, 4, 2, 0 }; | ||||
|     int len = lengths[*(const unsigned char*)in_text >> 3]; | ||||
|     int wanted = len + !len; | ||||
|  | ||||
|     if (in_text_end == NULL) | ||||
|         in_text_end = in_text + wanted; // Max length, nulls will be taken into account. | ||||
|  | ||||
|     // Copy at most 'len' bytes, stop copying at 0 or past in_text_end. Branch predictor does a good job here, | ||||
|     // so it is fast even with excessive branching. | ||||
|     unsigned char s[4]; | ||||
|     s[0] = in_text + 0 < in_text_end ? in_text[0] : 0; | ||||
|     s[1] = in_text + 1 < in_text_end ? in_text[1] : 0; | ||||
|     s[2] = in_text + 2 < in_text_end ? in_text[2] : 0; | ||||
|     s[3] = in_text + 3 < in_text_end ? in_text[3] : 0; | ||||
|  | ||||
|     // Assume a four-byte character and load four bytes. Unused bits are shifted out. | ||||
|     *out_char  = (uint32_t)(s[0] & masks[len]) << 18; | ||||
|     *out_char |= (uint32_t)(s[1] & 0x3f) << 12; | ||||
|     *out_char |= (uint32_t)(s[2] & 0x3f) <<  6; | ||||
|     *out_char |= (uint32_t)(s[3] & 0x3f) <<  0; | ||||
|     *out_char >>= shiftc[len]; | ||||
|  | ||||
|     // Accumulate the various error conditions. | ||||
|     int e = 0; | ||||
|     e  = (*out_char < mins[len]) << 6; // non-canonical encoding | ||||
|     e |= ((*out_char >> 11) == 0x1b) << 7;  // surrogate half? | ||||
|     e |= (*out_char > IM_UNICODE_CODEPOINT_MAX) << 8;  // out of range? | ||||
|     e |= (s[1] & 0xc0) >> 2; | ||||
|     e |= (s[2] & 0xc0) >> 4; | ||||
|     e |= (s[3]       ) >> 6; | ||||
|     e ^= 0x2a; // top two bits of each tail byte correct? | ||||
|     e >>= shifte[len]; | ||||
|  | ||||
|     if (e) | ||||
|     { | ||||
|         c = (unsigned int)(*str++); | ||||
|         *out_char = c; | ||||
|         return 1; | ||||
|         // No bytes are consumed when *in_text == 0 || in_text == in_text_end. | ||||
|         // One byte is consumed in case of invalid first byte of in_text. | ||||
|         // All available bytes (at most `len` bytes) are consumed on incomplete/invalid second to last bytes. | ||||
|         // Invalid or incomplete input may consume less bytes than wanted, therefore every byte has to be inspected in s. | ||||
|         wanted = ImMin(wanted, !!s[0] + !!s[1] + !!s[2] + !!s[3]); | ||||
|         *out_char = IM_UNICODE_CODEPOINT_INVALID; | ||||
|     } | ||||
|     if ((*str & 0xe0) == 0xc0) | ||||
|     { | ||||
|         *out_char = IM_UNICODE_CODEPOINT_INVALID; // will be invalid but not end of string | ||||
|         if (in_text_end && in_text_end - (const char*)str < 2) return 1; | ||||
|         if (*str < 0xc2) return 2; | ||||
|         c = (unsigned int)((*str++ & 0x1f) << 6); | ||||
|         if ((*str & 0xc0) != 0x80) return 2; | ||||
|         c += (*str++ & 0x3f); | ||||
|         *out_char = c; | ||||
|         return 2; | ||||
|     } | ||||
|     if ((*str & 0xf0) == 0xe0) | ||||
|     { | ||||
|         *out_char = IM_UNICODE_CODEPOINT_INVALID; // will be invalid but not end of string | ||||
|         if (in_text_end && in_text_end - (const char*)str < 3) return 1; | ||||
|         if (*str == 0xe0 && (str[1] < 0xa0 || str[1] > 0xbf)) return 3; | ||||
|         if (*str == 0xed && str[1] > 0x9f) return 3; // str[1] < 0x80 is checked below | ||||
|         c = (unsigned int)((*str++ & 0x0f) << 12); | ||||
|         if ((*str & 0xc0) != 0x80) return 3; | ||||
|         c += (unsigned int)((*str++ & 0x3f) << 6); | ||||
|         if ((*str & 0xc0) != 0x80) return 3; | ||||
|         c += (*str++ & 0x3f); | ||||
|         *out_char = c; | ||||
|         return 3; | ||||
|     } | ||||
|     if ((*str & 0xf8) == 0xf0) | ||||
|     { | ||||
|         *out_char = IM_UNICODE_CODEPOINT_INVALID; // will be invalid but not end of string | ||||
|         if (in_text_end && in_text_end - (const char*)str < 4) return 1; | ||||
|         if (*str > 0xf4) return 4; | ||||
|         if (*str == 0xf0 && (str[1] < 0x90 || str[1] > 0xbf)) return 4; | ||||
|         if (*str == 0xf4 && str[1] > 0x8f) return 4; // str[1] < 0x80 is checked below | ||||
|         c = (unsigned int)((*str++ & 0x07) << 18); | ||||
|         if ((*str & 0xc0) != 0x80) return 4; | ||||
|         c += (unsigned int)((*str++ & 0x3f) << 12); | ||||
|         if ((*str & 0xc0) != 0x80) return 4; | ||||
|         c += (unsigned int)((*str++ & 0x3f) << 6); | ||||
|         if ((*str & 0xc0) != 0x80) return 4; | ||||
|         c += (*str++ & 0x3f); | ||||
|         // utf-8 encodings of values used in surrogate pairs are invalid | ||||
|         if ((c & 0xFFFFF800) == 0xD800) return 4; | ||||
|         // If codepoint does not fit in ImWchar, use replacement character U+FFFD instead | ||||
|         if (c > IM_UNICODE_CODEPOINT_MAX) c = IM_UNICODE_CODEPOINT_INVALID; | ||||
|         *out_char = c; | ||||
|         return 4; | ||||
|     } | ||||
|     *out_char = 0; | ||||
|     return 0; | ||||
|  | ||||
|     return wanted; | ||||
| } | ||||
|  | ||||
| int ImTextStrFromUtf8(ImWchar* buf, int buf_size, const char* in_text, const char* in_text_end, const char** in_text_remaining) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user