summaryrefslogtreecommitdiffstats
path: root/contrib/restricted/google/utf8_range/utf8_range_neon.inc
blob: ab73754d8321b6bc16fc1f079364b1b863fe8b91 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#include <arm_neon.h>

/* This code is almost the same as the SSE implementation; please refer to
 * utf8-range-sse.inc for a detailed explanation.
 * The only difference is the range adjustment step, which is more
 * straightforward in the NEON code.
 */

/* Validates the bytes in [data, end) 16 at a time with NEON, then hands the
 * part the vector loop did not settle to utf8_range_ValidateUTF8Naive.
 *
 *   data_original    start of the whole buffer; only used to convert `data`
 *                    into an offset and to detect that nothing precedes it.
 *   data             first byte to examine.
 *   end              one past the last byte.
 *   return_position  zero: yes/no validation. Non-zero: the result is the
 *                    offset (from data_original) at which the naive
 *                    validator was resumed plus that validator's own result
 *                    (presumably the length of the valid prefix; the naive
 *                    validator is defined elsewhere).
 *
 * In yes/no mode the function returns 0 as soon as the accumulated vector
 * error mask is non-zero; otherwise it returns the naive validator's result
 * for the remaining tail.
 */
static FORCE_INLINE_ATTR size_t utf8_range_ValidateUTF8Simd(
    const char* data_original, const char* data, const char* end,
    int return_position) {
  /* Looked up by high nibble: 0 for 0x0~0xB, then 1, 1, 2, 3 for 0xC~0xF,
   * i.e. how many trailing bytes a lead byte in that nibble announces.
   */
  const uint8x16_t follow_count_tbl = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
  };
  /* Looked up by high nibble: bytes 0xC0~0xFF start out in range 8. */
  const uint8x16_t lead_range_tbl = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
  };
  /* Inclusive [min, max] bounds per range index:
   *   0      00..7F
   *   1~3    80..BF
   *   4      A0..BF      5      80..9F
   *   6      90..BF      7      80..8F
   *   8      C2..F4
   *   9~15   min FF / max 00, which no byte can satisfy.
   */
  const uint8x16_t lower_bound_tbl = {
      0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
      0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  };
  const uint8x16_t upper_bound_tbl = {
      0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
      0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  };
  /* Extra amount added to the range index of the byte that follows E0, ED,
   * F0 or F4. The array is stored interleaved because vld2q_u8 splits even
   * and odd elements into the two halves of the 32-entry table: the left
   * column below ends up at indices 0~15 (previous byte E0~EF), the right
   * column at indices 16~31 (previous byte F0~FF).
   */
  // clang-format off
  const uint8_t adjust_interleaved[] = {
    /* after E0 */ 2,     3, /* after F0 */
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     4, /* after F4 */
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
    /* after ED */ 3,     0,
                   0,     0,
                   0,     0,
  };
  // clang-format on
  const uint8x16x2_t adjust_tbl = vld2q_u8(adjust_interleaved);

  const uint8x16_t ones = vdupq_n_u8(1);
  const uint8x16_t twos = vdupq_n_u8(2);
  const uint8x16_t e0_bias = vdupq_n_u8(0xE0);

  /* State carried from one 16-byte block into the next. */
  uint8x16_t last_block = vdupq_n_u8(0);
  uint8x16_t last_follow = vdupq_n_u8(0);
  uint8x16_t error_mask = vdupq_n_u8(0);

  /* `break` leaves the loop without running the increment, so on an early
   * exit `data` still points at the offending block.
   */
  for (; end - data >= 16; data += 16) {
    const uint8x16_t block = vld1q_u8((const uint8_t*)data);
    const uint8x16_t nibble = vshrq_n_u8(block, 4);
    const uint8x16_t follow = vqtbl1q_u8(follow_count_tbl, nibble);

    /* Follow counts as seen one, two and three bytes later in the stream
     * (the tail of the previous block is shifted in), reduced with
     * saturation by 0, 1 and 2 respectively.
     */
    const uint8x16_t after1 = vextq_u8(last_follow, follow, 15);
    const uint8x16_t after2 =
        vqsubq_u8(vextq_u8(last_follow, follow, 14), ones);
    const uint8x16_t after3 =
        vqsubq_u8(vextq_u8(last_follow, follow, 13), twos);

    uint8x16_t range_idx = vqtbl1q_u8(lead_range_tbl, nibble);
    range_idx = vorrq_u8(range_idx, after1);
    range_idx = vorrq_u8(range_idx, after2);
    range_idx = vorrq_u8(range_idx, after3);

    /* Previous byte minus 0xE0 selects the adjustment: E0~FF map to table
     * slots 0~31, anything else wraps past 31 and the lookup yields 0.
     */
    const uint8x16_t prev_biased =
        vsubq_u8(vextq_u8(last_block, block, 15), e0_bias);
    range_idx = vaddq_u8(range_idx, vqtbl2q_u8(adjust_tbl, prev_biased));

    const uint8x16_t lower = vqtbl1q_u8(lower_bound_tbl, range_idx);
    const uint8x16_t upper = vqtbl1q_u8(upper_bound_tbl, range_idx);
    const uint8x16_t violations =
        vorrq_u8(vcltq_u8(block, lower), vcgtq_u8(block, upper));

    if (return_position) {
      /* Position mode: stop at the first block holding any violation. */
      error_mask = violations;
      if (vmaxvq_u32(vreinterpretq_u32_u8(error_mask))) {
        break;
      }
    } else {
      /* Yes/no mode: keep accumulating, test once after the loop. */
      error_mask = vorrq_u8(error_mask, violations);
    }

    last_block = block;
    last_follow = follow;
  }

  /* Position mode with nothing in front of `data`: there is no earlier byte
   * to step back over, so the naive validator takes everything.
   */
  if (return_position && data == data_original) {
    return utf8_range_ValidateUTF8Naive(data, end, return_position);
  }

  /* Rewind by the amount the helper derives from the last four bytes of the
   * most recently completed block before resuming with the naive validator.
   */
  const int32_t tail_word =
      vgetq_lane_s32(vreinterpretq_s32_u8(last_block), 3);
  data -= utf8_range_CodepointSkipBackwards(tail_word);

  if (return_position) {
    return (data - data_original) +
           utf8_range_ValidateUTF8Naive(data, end, return_position);
  }

  if (vmaxvq_u32(vreinterpretq_u32_u8(error_mask))) {
    return 0;
  }
  return utf8_range_ValidateUTF8Naive(data, end, return_position);
}