UTF-8 Code Point Counting: Nibble Table vs Range Checks

Question 12 / 12 Correct so far: 0 (0 answered)

Snippet A

Nibble Size Table

std::size_t countCodePoints(const std::string& text) {
    static constexpr std::array<std::uint8_t, 16> kBytesFromHighNibble = {
        1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 3, 4
    };

    std::size_t i = 0;
    std::size_t count = 0;
    const std::size_t n = text.size();

    while (i < n) {
        const unsigned char b = static_cast<unsigned char>(text[i]);
        i += kBytesFromHighNibble[b >> 4];
        ++count;
    }

    return count;
}

const std::size_t code_points = countCodePoints(TEST_TEXT);
Snippet B

First Byte Ranges

std::size_t countCodePoints(const std::string& text) {
    std::size_t i = 0;
    std::size_t count = 0;
    const std::size_t n = text.size();

    while (i < n) {
        const unsigned char b = static_cast<unsigned char>(text[i]);
        if (b < 0x80) {
            i += 1;
        } else if (b < 0xE0) {
            i += 2;
        } else if (b < 0xF0) {
            i += 3;
        } else {
            i += 4;
        }
        ++count;
    }

    return count;
}

const std::size_t code_points = countCodePoints(TEST_TEXT);
Shared test data (shared-setup)
constexpr std::size_t TEXT_SIZE = 64 * 1024;
static std::string TEST_TEXT;

struct AsciiBlobInit {
    AsciiBlobInit() {
        TEST_TEXT.resize(TEXT_SIZE);
        std::mt19937 rng(1337);
        std::uniform_int_distribution<int> dist(0, 99);

        for (std::size_t i = 0; i < TEXT_SIZE; ++i) {
            const int roll = dist(rng);
            if (roll < 82) {
                TEST_TEXT[i] = static_cast<char>('a' + (roll % 26));
            } else if (roll < 90) {
                TEST_TEXT[i] = ' ';
            } else if (roll < 95) {
                TEST_TEXT[i] = static_cast<char>('A' + (roll % 26));
            } else {
                TEST_TEXT[i] = static_cast<char>('0' + (roll % 10));
            }
        }
    }
} _ascii_blob_init;

Which snippet is faster?

Select the correct answer(s)