UTF-8 Code Point Counting: Nibble Table vs Range Checks
Question 12 / 12 • Correct so far: 0 (0 answered)
Nibble Size Table
std::size_t countCodePoints(const std::string& text) {
static constexpr std::array<std::uint8_t, 16> kBytesFromHighNibble = {
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 3, 4
};
std::size_t i = 0;
std::size_t count = 0;
const std::size_t n = text.size();
while (i < n) {
const unsigned char b = static_cast<unsigned char>(text[i]);
i += kBytesFromHighNibble[b >> 4];
++count;
}
return count;
}
const std::size_t code_points = countCodePoints(TEST_TEXT); First Byte Ranges
std::size_t countCodePoints(const std::string& text) {
std::size_t i = 0;
std::size_t count = 0;
const std::size_t n = text.size();
while (i < n) {
const unsigned char b = static_cast<unsigned char>(text[i]);
if (b < 0x80) {
i += 1;
} else if (b < 0xE0) {
i += 2;
} else if (b < 0xF0) {
i += 3;
} else {
i += 4;
}
++count;
}
return count;
}
const std::size_t code_points = countCodePoints(TEST_TEXT); Shared test data (shared-setup)
constexpr std::size_t TEXT_SIZE = 64 * 1024;
static std::string TEST_TEXT;
struct AsciiBlobInit {
AsciiBlobInit() {
TEST_TEXT.resize(TEXT_SIZE);
std::mt19937 rng(1337);
std::uniform_int_distribution<int> dist(0, 99);
for (std::size_t i = 0; i < TEXT_SIZE; ++i) {
const int roll = dist(rng);
if (roll < 82) {
TEST_TEXT[i] = static_cast<char>('a' + (roll % 26));
} else if (roll < 90) {
TEST_TEXT[i] = ' ';
} else if (roll < 95) {
TEST_TEXT[i] = static_cast<char>('A' + (roll % 26));
} else {
TEST_TEXT[i] = static_cast<char>('0' + (roll % 10));
}
}
}
} _ascii_blob_init; Which snippet is faster?
Snippet B is faster on this dataset. The branch-based version benefits from highly predictable ASCII input, so branch prediction is very accurate. The nibble-table version is branch-free in the hot step, but it adds a per-byte table-load dependency before the index increment, which can limit throughput on this workload.
Benchmark results
| Snippet | CPU time / iteration | Speedup |
|---|---|---|
| First Byte Ranges | 17.8 us | 6.8× |
| Nibble Size Table | 122 us | 1.0× |
Explore the source
Open in Compiler ExplorerQuiz complete. You can return to the question list to restart and compare.