1 /* SPDX-License-Identifier: GPL-2.0-only */ !! 1 /* $Id: strncmp.S,v 1.2 1997/03/11 17:51:44 jj Exp $ 2 /* !! 2 * Sparc64 optimized strncmp code. 3 * Copyright (c) 2013-2022, Arm Limited. << 4 * 3 * 5 * Adapted from the original at: !! 4 * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 6 * https://github.com/ARM-software/optimized-r << 7 */ 5 */ 8 6 9 #include <linux/linkage.h> !! 7 #include <asm/asi.h> 10 #include <asm/assembler.h> << 11 8 12 /* Assumptions: !! 9 .text 13 * !! 10 .align 4 14 * ARMv8-a, AArch64. !! 11 .global __strncmp, strncmp 15 * MTE compatible. !! 12 __strncmp: 16 */ !! 13 strncmp: 17 !! 14 brlez,pn %o2, 3f 18 #define L(label) .L ## label !! 15 lduba [%o0] (ASI_PNF), %o3 19 << 20 #define REP8_01 0x0101010101010101 << 21 #define REP8_7f 0x7f7f7f7f7f7f7f7f << 22 << 23 /* Parameters and result. */ << 24 #define src1 x0 << 25 #define src2 x1 << 26 #define limit x2 << 27 #define result x0 << 28 << 29 /* Internal variables. */ << 30 #define data1 x3 << 31 #define data1w w3 << 32 #define data2 x4 << 33 #define data2w w4 << 34 #define has_nul x5 << 35 #define diff x6 << 36 #define syndrome x7 << 37 #define tmp1 x8 << 38 #define tmp2 x9 << 39 #define tmp3 x10 << 40 #define zeroones x11 << 41 #define pos x12 << 42 #define mask x13 << 43 #define endloop x14 << 44 #define count mask << 45 #define offset pos << 46 #define neg_offset x15 << 47 << 48 /* Define endian dependent shift operations. << 49 On big-endian early bytes are at MSB and on << 50 LS_FW means shifting towards early bytes. << 51 LS_BK means shifting towards later bytes. << 52 */ << 53 #ifdef __AARCH64EB__ << 54 #define LS_FW lsl << 55 #define LS_BK lsr << 56 #else << 57 #define LS_FW lsr << 58 #define LS_BK lsl << 59 #endif << 60 << 61 SYM_FUNC_START(__pi_strncmp) << 62 cbz limit, L(ret0) << 63 eor tmp1, src1, src2 << 64 mov zeroones, #REP8_01 << 65 tst tmp1, #7 << 66 and count, src1, #7 << 67 b.ne L(misaligned8) << 68 cbnz count, L(mutual_align) << 69 << 70 /* NUL detection works on the principl << 71 (=> (X - 1) & ~(X | 0x7f)) is non-z << 72 can be done in parallel across the << 73 .p2align 4 << 74 L(loop_aligned): << 75 ldr data1, [src1], #8 << 76 ldr data2, [src2], #8 << 77 L(start_realigned): << 78 subs limit, limit, #8 << 79 sub tmp1, data1, zeroones << 80 orr tmp2, data1, #REP8_7f << 81 eor diff, data1, data2 /* Non << 82 csinv endloop, diff, xzr, hi /* Las << 83 bics has_nul, tmp1, tmp2 /* Non << 84 ccmp endloop, #0, #0, eq << 85 b.eq L(loop_aligned) << 86 /* End of main loop */ << 87 << 88 L(full_check): << 89 #ifndef __AARCH64EB__ << 90 orr syndrome, diff, has_nul << 91 add limit, limit, 8 /* Rewind limi << 92 L(syndrome_check): << 93 /* Limit was reached. Check if the NUL << 94 is before the limit. */ << 95 rev syndrome, syndrome << 96 rev data1, data1 << 97 clz pos, syndrome << 98 rev data2, data2 << 99 lsl data1, data1, pos << 100 cmp limit, pos, lsr #3 << 101 lsl data2, data2, pos << 102 /* But we need to zero-extend (char is << 103 perform a signed 32-bit subtraction << 104 lsr data1, data1, #56 << 105 sub result, data1, data2, lsr #56 << 106 csel result, result, xzr, hi << 107 ret << 108 #else << 109 /* Not reached the limit, must have fo << 110 tbz limit, #63, L(not_limit) << 111 add tmp1, limit, 8 << 112 cbz limit, L(not_limit) << 113 << 114 lsl limit, tmp1, #3 /* Bits -> byt << 115 mov mask, #~0 << 116 lsr mask, mask, limit << 117 bic data1, data1, mask << 118 bic data2, data2, mask << 119 << 120 /* Make sure that the NUL byte is mark << 121 orr has_nul, has_nul, mask << 122 << 123 L(not_limit): << 124 /* For big-endian we cannot use the tr << 125 as carry-propagation can corrupt th << 126 bytes in the string contain 0x01. << 127 /* However, if there is no NUL byte in << 128 the result directly. We can't just << 129 MSB might be significant. */ << 130 cbnz has_nul, 1f << 131 cmp data1, data2 << 132 cset result, ne << 133 cneg result, result, lo << 134 ret << 135 1: 16 1: 136 /* Re-compute the NUL-byte detection, !! 17 add %o0, 1, %o0 137 rev tmp3, data1 !! 18 ldub [%o1], %o4 138 sub tmp1, tmp3, zeroones !! 19 brz,pn %o3, 2f 139 orr tmp2, tmp3, #REP8_7f !! 20 add %o1, 1, %o1 140 bic has_nul, tmp1, tmp2 !! 21 cmp %o3, %o4 141 rev has_nul, has_nul !! 22 bne,pn %icc, 2f 142 orr syndrome, diff, has_nul !! 23 subcc %o2, 1, %o2 143 clz pos, syndrome !! 24 bne,a,pt %xcc, 1b 144 /* The most-significant-non-zero bit o !! 25 ldub [%o0], %o3 145 first bit that is different, or the !! 26 2: 146 Shifting left now will bring the cr !! 27 retl 147 top bits. */ !! 28 sub %o3, %o4, %o0 148 L(end_quick): !! 29 3: 149 lsl data1, data1, pos !! 30 retl 150 lsl data2, data2, pos !! 31 clr %o0 151 /* But we need to zero-extend (char is << 152 perform a signed 32-bit subtraction << 153 lsr data1, data1, #56 << 154 sub result, data1, data2, lsr #56 << 155 ret << 156 #endif << 157 << 158 L(mutual_align): << 159 /* Sources are mutually aligned, but a << 160 alignment boundary. Round down the << 161 the bytes that precede the start po << 162 We also need to adjust the limit ca << 163 overflowing if the limit is near UL << 164 bic src1, src1, #7 << 165 bic src2, src2, #7 << 166 ldr data1, [src1], #8 << 167 neg tmp3, count, lsl #3 /* 64 << 168 ldr data2, [src2], #8 << 169 mov tmp2, #~0 << 170 LS_FW tmp2, tmp2, tmp3 /* Shi << 171 /* Adjust the limit and ensure it does << 172 adds limit, limit, count << 173 csinv limit, limit, xzr, lo << 174 orr data1, data1, tmp2 << 175 orr data2, data2, tmp2 << 176 b L(start_realigned) << 177 << 178 .p2align 4 << 179 /* Don't bother with dwords for up to << 180 L(misaligned8): << 181 cmp limit, #16 << 182 b.hs L(try_misaligned_words) << 183 << 184 L(byte_loop): << 185 /* Perhaps we can do better than this. << 186 ldrb data1w, [src1], #1 << 187 ldrb data2w, [src2], #1 << 188 subs limit, limit, #1 << 189 ccmp data1w, #1, #0, hi /* NZC << 190 ccmp data1w, data2w, #0, cs /* NZC << 191 b.eq L(byte_loop) << 192 L(done): << 193 sub result, data1, data2 << 194 ret << 195 /* Align the SRC1 to a dword by doing << 196 the dword loop. */ << 197 L(try_misaligned_words): << 198 cbz count, L(src1_aligned) << 199 << 200 neg count, count << 201 and count, count, #7 << 202 sub limit, limit, count << 203 << 204 L(page_end_loop): << 205 ldrb data1w, [src1], #1 << 206 ldrb data2w, [src2], #1 << 207 cmp data1w, #1 << 208 ccmp data1w, data2w, #0, cs /* NZC << 209 b.ne L(done) << 210 subs count, count, #1 << 211 b.hi L(page_end_loop) << 212 << 213 /* The following diagram explains the << 214 The bytes are shown in natural orde << 215 reversed in the registers. The "x" << 216 The "|" separates data that is load << 217 src1 | a a a a a a a a | b b b << 218 src2 | x x x x x a a a a a a << 219 << 220 After shifting in each step, the da << 221 STEP_A ST << 222 data1 a a a a a a a a b b b << 223 data2 a a a a a a a a b b b << 224 << 225 The bytes with "0" are eliminated f << 226 << 227 Align SRC2 down to 16 bytes. This w << 228 time from SRC2. The comparison happ << 229 the loop can exit, or read from SRC << 230 L(src1_aligned): << 231 /* Calculate offset from 8 byte alignm << 232 need to mask offset since shifts ar << 233 lsl offset, src2, #3 << 234 bic src2, src2, #0xf << 235 mov mask, -1 << 236 neg neg_offset, offset << 237 ldr data1, [src1], #8 << 238 ldp tmp1, tmp2, [src2], #16 << 239 LS_BK mask, mask, neg_offset << 240 and neg_offset, neg_offset, #63 << 241 /* Skip the first compare if data in t << 242 tbnz offset, 6, L(misaligned_mid_lo << 243 << 244 L(loop_misaligned): << 245 /* STEP_A: Compare full 8 bytes when t << 246 LS_FW data2, tmp1, offset << 247 LS_BK tmp1, tmp2, neg_offset << 248 subs limit, limit, #8 << 249 orr data2, data2, tmp1 /* 8 b << 250 sub has_nul, data1, zeroones << 251 eor diff, data1, data2 /* Non << 252 orr tmp3, data1, #REP8_7f << 253 csinv endloop, diff, xzr, hi /* If << 254 bic has_nul, has_nul, tmp3 /* Non << 255 orr tmp3, endloop, has_nul << 256 cbnz tmp3, L(full_check) << 257 << 258 ldr data1, [src1], #8 << 259 L(misaligned_mid_loop): << 260 /* STEP_B: Compare first part of data1 << 261 LS_FW data2, tmp2, offset << 262 #ifdef __AARCH64EB__ << 263 /* For big-endian we do a byte reverse << 264 problem described above. This way we c << 265 next step and also use syndrome value << 266 rev tmp3, data1 << 267 #define data1_fixed tmp3 << 268 #else << 269 #define data1_fixed data1 << 270 #endif << 271 sub has_nul, data1_fixed, zeroones << 272 orr tmp3, data1_fixed, #REP8_7f << 273 eor diff, data2, data1 /* Non << 274 bic has_nul, has_nul, tmp3 /* Non << 275 #ifdef __AARCH64EB__ << 276 rev has_nul, has_nul << 277 #endif << 278 cmp limit, neg_offset, lsr #3 << 279 orr syndrome, diff, has_nul << 280 bic syndrome, syndrome, mask << 281 csinv tmp3, syndrome, xzr, hi /* If << 282 cbnz tmp3, L(syndrome_check) << 283 << 284 /* STEP_C: Compare second part of data << 285 ldp tmp1, tmp2, [src2], #16 << 286 cmp limit, #8 << 287 LS_BK data2, tmp1, neg_offset << 288 eor diff, data2, data1 /* Non << 289 orr syndrome, diff, has_nul << 290 and syndrome, syndrome, mask << 291 csinv tmp3, syndrome, xzr, hi /* If << 292 cbnz tmp3, L(syndrome_check) << 293 << 294 ldr data1, [src1], #8 << 295 sub limit, limit, #8 << 296 b L(loop_misaligned) << 297 << 298 #ifdef __AARCH64EB__ << 299 L(syndrome_check): << 300 clz pos, syndrome << 301 cmp pos, limit, lsl #3 << 302 b.lo L(end_quick) << 303 #endif << 304 << 305 L(ret0): << 306 mov result, #0 << 307 ret << 308 SYM_FUNC_END(__pi_strncmp) << 309 SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp) << 310 EXPORT_SYMBOL_NOKASAN(strncmp) <<
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.