判断utf8 – 源码巴士

/// @brief Reports whether `str` is well-formed UTF-8 that contains at least
///        one multi-byte (non-ASCII) sequence.
///
/// The whole input must be well-formed UTF-8: correct lead/continuation byte
/// patterns, no truncated sequences, no overlong encodings, no UTF-16
/// surrogates (U+D800..U+DFFF) and nothing above U+10FFFF.
///
/// @param str Raw bytes to inspect.
/// @return `true` if every byte belongs to a well-formed sequence AND at least
///         one sequence is longer than one byte. `false` for an empty string,
///         for a string consisting only of 7-bit ASCII bytes, and for any
///         malformed input.
///
/// @note Pure-ASCII input deliberately yields `false`: such text is
///       indistinguishable from other ASCII-compatible encodings, so it does
///       not count as positively identified UTF-8 here.
bool is_valid_utf8(const std::string &str)
{
    if (str.empty()) {
        return false;
    }

    const size_t data_size = str.size();

    // Bytes must be inspected as unsigned values: with a signed `char`,
    // comparisons such as `b > 0xBF` or `b < 0x90` would give wrong results.
    const auto byte_at = [&str](size_t index) {
        return static_cast<unsigned char>(str[index]);
    };
    // A continuation byte has the bit pattern 0b10xxxxxx.
    const auto is_continuation = [&byte_at](size_t index) {
        return (byte_at(index) & 0xC0) == 0x80;
    };

    size_t ascii_num = 0; // number of single-byte (ASCII) code points seen
    size_t pos = 0;
    while (pos < data_size) {
        const unsigned char lead = byte_at(pos);
        const size_t remaining = data_size - pos;

        if (lead < 0x80) {
            // 0b0xxxxxxx -> 1-byte code point (ASCII).
            ++ascii_num;
            pos += 1;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // 0b110xxxxx -> 2-byte code point. Leads 0xC0 and 0xC1 are
            // excluded because they can only produce overlong encodings.
            if (remaining < 2 || !is_continuation(pos + 1)) {
                return false;
            }
            pos += 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // 0b1110xxxx -> 3-byte code point.
            if (remaining < 3 || !is_continuation(pos + 1) || !is_continuation(pos + 2)) {
                return false;
            }
            const unsigned char second = byte_at(pos + 1);
            // Lead 0xE0: second byte must be 0xA0..0xBF (else overlong).
            if (lead == 0xE0 && second < 0xA0) {
                return false;
            }
            // Lead 0xED: second byte must be 0x80..0x9F (else a surrogate).
            if (lead == 0xED && second > 0x9F) {
                return false;
            }
            pos += 3;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 0b11110xxx -> 4-byte code point (leads above 0xF4 would exceed
            // U+10FFFF and are rejected by the final branch).
            if (remaining < 4 || !is_continuation(pos + 1) || !is_continuation(pos + 2)
                || !is_continuation(pos + 3)) {
                return false;
            }
            const unsigned char second = byte_at(pos + 1);
            // Lead 0xF0: second byte must be 0x90..0xBF (else overlong).
            if (lead == 0xF0 && second < 0x90) {
                return false;
            }
            // Lead 0xF4: second byte must be 0x80..0x8F (else > U+10FFFF).
            if (lead == 0xF4 && second > 0x8F) {
                return false;
            }
            pos += 4;
        } else {
            // Stray continuation byte, overlong lead (0xC0/0xC1) or a lead
            // byte that is never valid in UTF-8 (0xF5..0xFF).
            return false;
        }
    }

    // Well-formed, but only counts as UTF-8 if something was non-ASCII.
    return ascii_num != data_size;
}

原文链接：https://blog.csdn.net/qq_53332653/article/details/110817920