C++逐字读字符串里面的字(中英文)

仅处理 UTF-8 编码的字符串,且每个字符最多由 4 个字节组成的情况。

#include <iostream>
#include <string>
#include <assert.h>
#include <vector>

/**
 * @brief Splits a NUL-terminated UTF-8 byte string into one std::string per
 *        encoded character (1 to 4 bytes each).
 *
 * The length of each character is taken from its lead byte (0xxxxxxx,
 * 110xxxxx, 1110xxxx, 11110xxx) and is only accepted when every following
 * byte of the sequence is present and has the continuation form 10xxxxxx.
 * Any byte that does not start such a sequence (stray continuation byte,
 * lead byte 0xF8..0xFF, truncated or interrupted sequence) is returned as
 * its own single-byte element, so malformed input never swallows the bytes
 * that follow it and never aborts the program.
 *
 * Only the byte structure is checked; overlong encodings and surrogate code
 * points are not rejected.
 *
 * For every structurally valid character a trace line
 * ("one character: ...", "two character: ...", ...) is written to std::cout.
 *
 * @param chars NUL-terminated byte string; a null pointer yields an empty
 *              result. The buffer is only read, never modified.
 * @return The characters of the input in order; concatenating the elements
 *         reproduces the input bytes exactly.
 */
std::vector <std::string> read_utf8_onebyone(char *chars) {
    std::vector<std::string> words;
    if (chars == nullptr) {
        // Constructing a std::string from a null pointer is undefined behaviour.
        return words;
    }

    // Trace labels indexed by (sequence length - 1).
    static const char *const labels[] = {
        "one character: ", "two character: ", "three character: ", "four character: "
    };

    const std::string input(chars);
    const std::string::size_type len = input.length();

    std::string::size_type i = 0;
    while (i < len) {
        // Work on the unsigned value so bit tests do not depend on char signedness.
        const unsigned char lead = static_cast<unsigned char>(input[i]);

        // Sequence length announced by the lead byte; stays 1 for ASCII and
        // for bytes that cannot start a sequence.
        std::string::size_type next = 1;
        if ((lead & 0xE0) == 0xC0) {
            next = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            next = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            next = 4;
        }

        // A single byte is valid only if it is ASCII; a multi-byte sequence
        // must fit in the remaining input (i < len, so len - i cannot wrap).
        bool well_formed = (next == 1) ? ((lead & 0x80) == 0x00)
                                       : (len - i >= next);
        for (std::string::size_type k = 1; well_formed && k < next; ++k) {
            well_formed = (static_cast<unsigned char>(input[i + k]) & 0xC0) == 0x80;
        }
        if (!well_formed) {
            // Emit just the offending byte and resynchronise on the next one.
            next = 1;
        }

        const std::string word = input.substr(i, next);
        if (well_formed) {
            // '\n' instead of std::endl: no need to flush after every character.
            std::cout << labels[next - 1] << word << '\n';
        }
        words.push_back(word);
        i += next;
    }
    return words;
}

版权声明:本文为CY_TEC原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。