1 /* SPDX-License-Identifier: GPL-2.0-only */ 2 /* 3 * Copyright (c) 2013-2022, Arm Limited. 4 * 5 * Adapted from the original at: 6 * https://github.com/ARM-software/optimized-r 7 */ 8 9 #include <linux/linkage.h> 10 #include <asm/assembler.h> 11 12 /* Assumptions: 13 * 14 * ARMv8-a, AArch64. 15 * MTE compatible. 16 */ 17 18 #define L(label) .L ## label 19 20 #define REP8_01 0x0101010101010101 21 #define REP8_7f 0x7f7f7f7f7f7f7f7f 22 23 /* Parameters and result. */ 24 #define src1 x0 25 #define src2 x1 26 #define limit x2 27 #define result x0 28 29 /* Internal variables. */ 30 #define data1 x3 31 #define data1w w3 32 #define data2 x4 33 #define data2w w4 34 #define has_nul x5 35 #define diff x6 36 #define syndrome x7 37 #define tmp1 x8 38 #define tmp2 x9 39 #define tmp3 x10 40 #define zeroones x11 41 #define pos x12 42 #define mask x13 43 #define endloop x14 44 #define count mask 45 #define offset pos 46 #define neg_offset x15 47 48 /* Define endian dependent shift operations. 49 On big-endian early bytes are at MSB and on 50 LS_FW means shifting towards early bytes. 51 LS_BK means shifting towards later bytes. 52 */ 53 #ifdef __AARCH64EB__ 54 #define LS_FW lsl 55 #define LS_BK lsr 56 #else 57 #define LS_FW lsr 58 #define LS_BK lsl 59 #endif 60 61 SYM_FUNC_START(__pi_strncmp) 62 cbz limit, L(ret0) 63 eor tmp1, src1, src2 64 mov zeroones, #REP8_01 65 tst tmp1, #7 66 and count, src1, #7 67 b.ne L(misaligned8) 68 cbnz count, L(mutual_align) 69 70 /* NUL detection works on the principl 71 (=> (X - 1) & ~(X | 0x7f)) is non-z 72 can be done in parallel across the 73 .p2align 4 74 L(loop_aligned): 75 ldr data1, [src1], #8 76 ldr data2, [src2], #8 77 L(start_realigned): 78 subs limit, limit, #8 79 sub tmp1, data1, zeroones 80 orr tmp2, data1, #REP8_7f 81 eor diff, data1, data2 /* Non 82 csinv endloop, diff, xzr, hi /* Las 83 bics has_nul, tmp1, tmp2 /* Non 84 ccmp endloop, #0, #0, eq 85 b.eq L(loop_aligned) 86 /* End of main loop */ 87 88 L(full_check): 89 #ifndef __AARCH64EB__ 90 orr syndrome, diff, has_nul 91 add limit, limit, 8 /* Rewind limi 92 L(syndrome_check): 93 /* Limit was reached. Check if the NUL 94 is before the limit. */ 95 rev syndrome, syndrome 96 rev data1, data1 97 clz pos, syndrome 98 rev data2, data2 99 lsl data1, data1, pos 100 cmp limit, pos, lsr #3 101 lsl data2, data2, pos 102 /* But we need to zero-extend (char is 103 perform a signed 32-bit subtraction 104 lsr data1, data1, #56 105 sub result, data1, data2, lsr #56 106 csel result, result, xzr, hi 107 ret 108 #else 109 /* Not reached the limit, must have fo 110 tbz limit, #63, L(not_limit) 111 add tmp1, limit, 8 112 cbz limit, L(not_limit) 113 114 lsl limit, tmp1, #3 /* Bits -> byt 115 mov mask, #~0 116 lsr mask, mask, limit 117 bic data1, data1, mask 118 bic data2, data2, mask 119 120 /* Make sure that the NUL byte is mark 121 orr has_nul, has_nul, mask 122 123 L(not_limit): 124 /* For big-endian we cannot use the tr 125 as carry-propagation can corrupt th 126 bytes in the string contain 0x01. 127 /* However, if there is no NUL byte in 128 the result directly. We can't just 129 MSB might be significant. */ 130 cbnz has_nul, 1f 131 cmp data1, data2 132 cset result, ne 133 cneg result, result, lo 134 ret 135 1: 136 /* Re-compute the NUL-byte detection, 137 rev tmp3, data1 138 sub tmp1, tmp3, zeroones 139 orr tmp2, tmp3, #REP8_7f 140 bic has_nul, tmp1, tmp2 141 rev has_nul, has_nul 142 orr syndrome, diff, has_nul 143 clz pos, syndrome 144 /* The most-significant-non-zero bit o 145 first bit that is different, or the 146 Shifting left now will bring the cr 147 top bits. */ 148 L(end_quick): 149 lsl data1, data1, pos 150 lsl data2, data2, pos 151 /* But we need to zero-extend (char is 152 perform a signed 32-bit subtraction 153 lsr data1, data1, #56 154 sub result, data1, data2, lsr #56 155 ret 156 #endif 157 158 L(mutual_align): 159 /* Sources are mutually aligned, but a 160 alignment boundary. Round down the 161 the bytes that precede the start po 162 We also need to adjust the limit ca 163 overflowing if the limit is near UL 164 bic src1, src1, #7 165 bic src2, src2, #7 166 ldr data1, [src1], #8 167 neg tmp3, count, lsl #3 /* 64 168 ldr data2, [src2], #8 169 mov tmp2, #~0 170 LS_FW tmp2, tmp2, tmp3 /* Shi 171 /* Adjust the limit and ensure it does 172 adds limit, limit, count 173 csinv limit, limit, xzr, lo 174 orr data1, data1, tmp2 175 orr data2, data2, tmp2 176 b L(start_realigned) 177 178 .p2align 4 179 /* Don't bother with dwords for up to 180 L(misaligned8): 181 cmp limit, #16 182 b.hs L(try_misaligned_words) 183 184 L(byte_loop): 185 /* Perhaps we can do better than this. 186 ldrb data1w, [src1], #1 187 ldrb data2w, [src2], #1 188 subs limit, limit, #1 189 ccmp data1w, #1, #0, hi /* NZC 190 ccmp data1w, data2w, #0, cs /* NZC 191 b.eq L(byte_loop) 192 L(done): 193 sub result, data1, data2 194 ret 195 /* Align the SRC1 to a dword by doing 196 the dword loop. */ 197 L(try_misaligned_words): 198 cbz count, L(src1_aligned) 199 200 neg count, count 201 and count, count, #7 202 sub limit, limit, count 203 204 L(page_end_loop): 205 ldrb data1w, [src1], #1 206 ldrb data2w, [src2], #1 207 cmp data1w, #1 208 ccmp data1w, data2w, #0, cs /* NZC 209 b.ne L(done) 210 subs count, count, #1 211 b.hi L(page_end_loop) 212 213 /* The following diagram explains the 214 The bytes are shown in natural orde 215 reversed in the registers. The "x" 216 The "|" separates data that is load 217 src1 | a a a a a a a a | b b b 218 src2 | x x x x x a a a a a a 219 220 After shifting in each step, the da 221 STEP_A ST 222 data1 a a a a a a a a b b b 223 data2 a a a a a a a a b b b 224 225 The bytes with "0" are eliminated f 226 227 Align SRC2 down to 16 bytes. This w 228 time from SRC2. The comparison happ 229 the loop can exit, or read from SRC 230 L(src1_aligned): 231 /* Calculate offset from 8 byte alignm 232 need to mask offset since shifts ar 233 lsl offset, src2, #3 234 bic src2, src2, #0xf 235 mov mask, -1 236 neg neg_offset, offset 237 ldr data1, [src1], #8 238 ldp tmp1, tmp2, [src2], #16 239 LS_BK mask, mask, neg_offset 240 and neg_offset, neg_offset, #63 241 /* Skip the first compare if data in t 242 tbnz offset, 6, L(misaligned_mid_lo 243 244 L(loop_misaligned): 245 /* STEP_A: Compare full 8 bytes when t 246 LS_FW data2, tmp1, offset 247 LS_BK tmp1, tmp2, neg_offset 248 subs limit, limit, #8 249 orr data2, data2, tmp1 /* 8 b 250 sub has_nul, data1, zeroones 251 eor diff, data1, data2 /* Non 252 orr tmp3, data1, #REP8_7f 253 csinv endloop, diff, xzr, hi /* If 254 bic has_nul, has_nul, tmp3 /* Non 255 orr tmp3, endloop, has_nul 256 cbnz tmp3, L(full_check) 257 258 ldr data1, [src1], #8 259 L(misaligned_mid_loop): 260 /* STEP_B: Compare first part of data1 261 LS_FW data2, tmp2, offset 262 #ifdef __AARCH64EB__ 263 /* For big-endian we do a byte reverse 264 problem described above. This way we c 265 next step and also use syndrome value 266 rev tmp3, data1 267 #define data1_fixed tmp3 268 #else 269 #define data1_fixed data1 270 #endif 271 sub has_nul, data1_fixed, zeroones 272 orr tmp3, data1_fixed, #REP8_7f 273 eor diff, data2, data1 /* Non 274 bic has_nul, has_nul, tmp3 /* Non 275 #ifdef __AARCH64EB__ 276 rev has_nul, has_nul 277 #endif 278 cmp limit, neg_offset, lsr #3 279 orr syndrome, diff, has_nul 280 bic syndrome, syndrome, mask 281 csinv tmp3, syndrome, xzr, hi /* If 282 cbnz tmp3, L(syndrome_check) 283 284 /* STEP_C: Compare second part of data 285 ldp tmp1, tmp2, [src2], #16 286 cmp limit, #8 287 LS_BK data2, tmp1, neg_offset 288 eor diff, data2, data1 /* Non 289 orr syndrome, diff, has_nul 290 and syndrome, syndrome, mask 291 csinv tmp3, syndrome, xzr, hi /* If 292 cbnz tmp3, L(syndrome_check) 293 294 ldr data1, [src1], #8 295 sub limit, limit, #8 296 b L(loop_misaligned) 297 298 #ifdef __AARCH64EB__ 299 L(syndrome_check): 300 clz pos, syndrome 301 cmp pos, limit, lsl #3 302 b.lo L(end_quick) 303 #endif 304 305 L(ret0): 306 mov result, #0 307 ret 308 SYM_FUNC_END(__pi_strncmp) 309 SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp) 310 EXPORT_SYMBOL_NOKASAN(strncmp)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.