prism/util/pm_strpbrk.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

#include "yarp/util/yp_strpbrk.h"

// This is the slow path that does care about the encoding. It walks at most
// `maximum` bytes of `source` one encoded character at a time, checking only
// the first byte of each character against the null-terminated `charset`, so
// trailing bytes of a multi-byte character are never tested.
//
// Returns a pointer to the first matching byte, or NULL if the range is
// exhausted or the encoding's char_width callback reports a width of 0.
static inline const uint8_t *
yp_strpbrk_multi_byte(yp_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
    size_t index = 0;

    while (index < maximum) {
        uint8_t byte = source[index];

        // strchr considers the terminating null of charset to be part of the
        // string, so looking up a null byte would always report a match. Null
        // bytes in the source are ordinary content, so never treat them as a
        // member of the charset.
        if (byte != '\0' && strchr((const char *) charset, byte) != NULL) {
            return source + index;
        }

        // Ask the encoding how many bytes the character at this position
        // occupies, given the number of bytes remaining in the range.
        size_t width = parser->encoding.char_width(source + index, (ptrdiff_t) (maximum - index));

        // A width of 0 gives us no way to advance (presumably an invalid or
        // truncated sequence -- confirm against the encoding callbacks), so
        // give up on the search.
        if (width == 0) {
            return NULL;
        }

        index += width;
    }

    return NULL;
}

// This is the fast path that does not care about the encoding. It scans at
// most `maximum` bytes of `source` one byte at a time and returns a pointer to
// the first byte that is a member of the null-terminated `charset`, or NULL if
// no such byte exists in the range.
static inline const uint8_t *
yp_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t maximum) {
    for (size_t index = 0; index < maximum; index++) {
        uint8_t byte = source[index];

        // strchr considers the terminating null of charset to be part of the
        // string, so looking up a null byte would always report a match. Null
        // bytes in the source are ordinary content, so never treat them as a
        // member of the charset.
        if (byte != '\0' && strchr((const char *) charset, byte) != NULL) {
            return source + index;
        }
    }

    return NULL;
}

// A length-bounded replacement for the standard library's strpbrk.
//
// The standard function requires a null-terminated source string and has
// undefined behavior otherwise. yp_parse makes no promise that its input is
// null-terminated (which lets the extension hand it the result of a call to
// mmap), so this version never reads beyond `length` bytes of `source`
// instead of relying on a terminator.
//
// Ruby also permits null bytes inside strings, comments, regular expressions,
// etc., so a null byte in the source cannot be taken to mean end of input;
// only `length` bounds the search.
//
// Lastly, some encodings allow a byte from the charset to appear as the
// trailing byte of a multi-byte character. In Shift-JIS, for example, the
// backslash can be a trailing byte. When the parser's encoding has been
// changed to a multi-byte one, the search therefore takes the slower path
// that advances one whole character at a time; otherwise it scans byte by
// byte.
//
// Returns a pointer to the first byte found by the selected search, or NULL
// if `length` is not positive or nothing is found.
const uint8_t *
yp_strpbrk(yp_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
    // An empty or negatively sized range has nothing to search.
    if (length <= 0) {
        return NULL;
    }

    size_t maximum = (size_t) length;

    // Only pay for character-width lookups when they can matter.
    if (parser->encoding_changed && parser->encoding.multibyte) {
        return yp_strpbrk_multi_byte(parser, source, charset, maximum);
    }

    return yp_strpbrk_single_byte(source, charset, maximum);
}