C++判断字符串编码格式(ANSI\UTF16_LE\UTF16_BE\UTF8\UTF8_BOM)
/// Text encodings distinguished by the detector.
enum Encode { ANSI = 1, UTF16_LE, UTF16_BE, UTF8_BOM, UTF8 };

/// @brief Decide whether a BOM-less byte buffer is well-formed UTF-8.
///
/// The buffer is validated against the well-formed UTF-8 byte sequences of
/// RFC 3629 / the Unicode Standard: sequences are 1 to 4 bytes long, overlong
/// encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF are
/// rejected, and the bytes 0xC0, 0xC1 and 0xF5..0xFF are never accepted.
///
/// @param data Pointer to the bytes to inspect; only read when size > 0.
/// @param size Number of bytes available at @p data.
/// @return Encode::UTF8 when the whole buffer is well-formed UTF-8 and contains
///         at least one multi-byte sequence; Encode::ANSI otherwise (this
///         includes empty input, pure 7-bit ASCII, and truncated sequences).
static inline
Encode IsUtf8Data(const uint8_t* data, size_t size)
{
    bool sawMultiByte = false;   // pure ASCII is reported as ANSI, not UTF-8
    size_t pending = 0;          // continuation bytes still owed by the current sequence
    uint8_t lowest = 0x80;       // allowed range for the next continuation byte
    uint8_t highest = 0xBF;

    for (size_t i = 0; i < size; ++i)
    {
        const uint8_t ch = data[i];

        if (pending > 0)
        {
            if (ch < lowest || ch > highest)
            {
                return Encode::ANSI;
            }
            // Only the first continuation byte has a narrowed range.
            lowest = 0x80;
            highest = 0xBF;
            --pending;
            continue;
        }

        if (ch < 0x80)
        {
            continue;            // 7-bit ASCII
        }

        sawMultiByte = true;
        if (ch >= 0xC2 && ch <= 0xDF)
        {
            pending = 1;
        }
        else if (ch >= 0xE0 && ch <= 0xEF)
        {
            pending = 2;
            if (ch == 0xE0)
            {
                lowest = 0xA0;   // excludes overlong 3-byte forms
            }
            else if (ch == 0xED)
            {
                highest = 0x9F;  // excludes UTF-16 surrogates
            }
        }
        else if (ch >= 0xF0 && ch <= 0xF4)
        {
            pending = 3;
            if (ch == 0xF0)
            {
                lowest = 0x90;   // excludes overlong 4-byte forms
            }
            else if (ch == 0xF4)
            {
                highest = 0x8F;  // excludes values above U+10FFFF
            }
        }
        else
        {
            // Stray continuation byte (0x80..0xBF) or a byte that is never
            // valid in UTF-8 (0xC0, 0xC1, 0xF5..0xFF).
            return Encode::ANSI;
        }
    }

    if (pending > 0 || !sawMultiByte)
    {
        return Encode::ANSI;
    }
    return Encode::UTF8;
}
/// @brief Classify a byte buffer by its byte-order mark, falling back to a
///        content scan when no BOM is present.
///
/// Checks, in order: FF FE (UTF-16 LE), FE FF (UTF-16 BE), EF BB BF (UTF-8
/// with BOM). A buffer that consists of nothing but the BOM is still
/// recognised. Buffers without any of these prefixes are handed to
/// IsUtf8Data().
///
/// Only the first two bytes are examined for the UTF-16 marks, so a UTF-32 LE
/// BOM (FF FE 00 00) is reported as UTF16_LE.
///
/// @param data Pointer to the bytes to inspect; only read when size > 0.
/// @param size Number of bytes available at @p data.
/// @return The detected Encode value.
static inline
Encode DetectEncode(const uint8_t* data, size_t size)
{
    if (size >= 2 && data[0] == 0xFF && data[1] == 0xFE)
    {
        return Encode::UTF16_LE;
    }
    if (size >= 2 && data[0] == 0xFE && data[1] == 0xFF)
    {
        return Encode::UTF16_BE;
    }
    if (size >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
    {
        return Encode::UTF8_BOM;
    }
    return IsUtf8Data(data, size);
}
|
调用例子:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | auto s = FILE_READER(sv.begin()->c_str(), std::ios::binary);
switch (DetectEncode((const uint8_t*)s.data(), s.size()))
{
case ANSI:
break;
case UTF16_LE:
s.erase(s.begin());
s.erase(s.begin());
s = StringConvertUtils::Instance()->WToA(std::wstring((const wchar_t*)s.data(), s.length() / sizeof(wchar_t)));
break;
case UTF16_BE:
s.erase(s.begin());
s.erase(s.begin());
s = StringConvertUtils::Instance()->WToA(std::wstring((const wchar_t*)s.data(), s.length() / sizeof(wchar_t)));
break;
case UTF8_BOM:
s.erase(s.begin());
s.erase(s.begin());
s.erase(s.begin());
s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
break;
case UTF8:
s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
break;
default:
break;
}
|