~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/arm64/lib/strncmp.S

Version: ~ [ linux-6.11-rc3 ] ~ [ linux-6.10.4 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.45 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.104 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.164 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.223 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.281 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.319 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0-only */
  2 /*
  3  * Copyright (c) 2013-2022, Arm Limited.
  4  *
  5  * Adapted from the original at:
  6  * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
  7  */
  8 
  9 #include <linux/linkage.h>
 10 #include <asm/assembler.h>
 11 
 12 /* Assumptions:
 13  *
 14  * ARMv8-a, AArch64.
 15  * MTE compatible.
 16  */
 17 
 18 #define L(label) .L ## label
 19 
 20 #define REP8_01 0x0101010101010101
 21 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 22 
 23 /* Parameters and result.  */
 24 #define src1            x0
 25 #define src2            x1
 26 #define limit           x2
 27 #define result          x0
 28 
 29 /* Internal variables.  */
 30 #define data1           x3
 31 #define data1w          w3
 32 #define data2           x4
 33 #define data2w          w4
 34 #define has_nul         x5
 35 #define diff            x6
 36 #define syndrome        x7
 37 #define tmp1            x8
 38 #define tmp2            x9
 39 #define tmp3            x10
 40 #define zeroones        x11
 41 #define pos             x12
 42 #define mask            x13
 43 #define endloop         x14
 44 #define count           mask
 45 #define offset          pos
 46 #define neg_offset      x15
 47 
 48 /* Define endian dependent shift operations.
 49    On big-endian early bytes are at MSB and on little-endian LSB.
 50    LS_FW means shifting towards early bytes.
 51    LS_BK means shifting towards later bytes.
 52    */
 53 #ifdef __AARCH64EB__
 54 #define LS_FW lsl
 55 #define LS_BK lsr
 56 #else
 57 #define LS_FW lsr
 58 #define LS_BK lsl
 59 #endif
 60 
 61 SYM_FUNC_START(__pi_strncmp)
 62         cbz     limit, L(ret0)
 63         eor     tmp1, src1, src2
 64         mov     zeroones, #REP8_01
 65         tst     tmp1, #7
 66         and     count, src1, #7
 67         b.ne    L(misaligned8)
 68         cbnz    count, L(mutual_align)
 69 
 70         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 71            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 72            can be done in parallel across the entire word.  */
 73         .p2align 4
 74 L(loop_aligned):
 75         ldr     data1, [src1], #8
 76         ldr     data2, [src2], #8
 77 L(start_realigned):
 78         subs    limit, limit, #8
 79         sub     tmp1, data1, zeroones
 80         orr     tmp2, data1, #REP8_7f
 81         eor     diff, data1, data2      /* Non-zero if differences found.  */
 82         csinv   endloop, diff, xzr, hi  /* Last Dword or differences.  */
 83         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
 84         ccmp    endloop, #0, #0, eq
 85         b.eq    L(loop_aligned)
 86         /* End of main loop */
 87 
 88 L(full_check):
 89 #ifndef __AARCH64EB__
 90         orr     syndrome, diff, has_nul
 91         add     limit, limit, 8 /* Rewind limit to before last subs. */
 92 L(syndrome_check):
 93         /* Limit was reached. Check if the NUL byte or the difference
 94            is before the limit. */
 95         rev     syndrome, syndrome
 96         rev     data1, data1
 97         clz     pos, syndrome
 98         rev     data2, data2
 99         lsl     data1, data1, pos
100         cmp     limit, pos, lsr #3
101         lsl     data2, data2, pos
102         /* But we need to zero-extend (char is unsigned) the value and then
103            perform a signed 32-bit subtraction.  */
104         lsr     data1, data1, #56
105         sub     result, data1, data2, lsr #56
106         csel result, result, xzr, hi
107         ret
108 #else
109         /* Not reached the limit, must have found the end or a diff.  */
110         tbz     limit, #63, L(not_limit)
111         add     tmp1, limit, 8
112         cbz     limit, L(not_limit)
113 
114         lsl     limit, tmp1, #3 /* Bits -> bytes.  */
115         mov     mask, #~0
116         lsr     mask, mask, limit
117         bic     data1, data1, mask
118         bic     data2, data2, mask
119 
120         /* Make sure that the NUL byte is marked in the syndrome.  */
121         orr     has_nul, has_nul, mask
122 
123 L(not_limit):
124         /* For big-endian we cannot use the trick with the syndrome value
125            as carry-propagation can corrupt the upper bits if the trailing
126            bytes in the string contain 0x01.  */
127         /* However, if there is no NUL byte in the dword, we can generate
128            the result directly.  We can't just subtract the bytes as the
129            MSB might be significant.  */
130         cbnz    has_nul, 1f
131         cmp     data1, data2
132         cset    result, ne
133         cneg    result, result, lo
134         ret
135 1:
136         /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
137         rev     tmp3, data1
138         sub     tmp1, tmp3, zeroones
139         orr     tmp2, tmp3, #REP8_7f
140         bic     has_nul, tmp1, tmp2
141         rev     has_nul, has_nul
142         orr     syndrome, diff, has_nul
143         clz     pos, syndrome
144         /* The most-significant-non-zero bit of the syndrome marks either the
145            first bit that is different, or the top bit of the first zero byte.
146            Shifting left now will bring the critical information into the
147            top bits.  */
148 L(end_quick):
149         lsl     data1, data1, pos
150         lsl     data2, data2, pos
151         /* But we need to zero-extend (char is unsigned) the value and then
152            perform a signed 32-bit subtraction.  */
153         lsr     data1, data1, #56
154         sub     result, data1, data2, lsr #56
155         ret
156 #endif
157 
158 L(mutual_align):
159         /* Sources are mutually aligned, but are not currently at an
160            alignment boundary.  Round down the addresses and then mask off
161            the bytes that precede the start point.
162            We also need to adjust the limit calculations, but without
163            overflowing if the limit is near ULONG_MAX.  */
164         bic     src1, src1, #7
165         bic     src2, src2, #7
166         ldr     data1, [src1], #8
167         neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
168         ldr     data2, [src2], #8
169         mov     tmp2, #~0
170         LS_FW   tmp2, tmp2, tmp3        /* Shift (count & 63).  */
171         /* Adjust the limit and ensure it doesn't overflow.  */
172         adds    limit, limit, count
173         csinv   limit, limit, xzr, lo
174         orr     data1, data1, tmp2
175         orr     data2, data2, tmp2
176         b       L(start_realigned)
177 
178         .p2align 4
179         /* Don't bother with dwords for up to 16 bytes.  */
180 L(misaligned8):
181         cmp     limit, #16
182         b.hs    L(try_misaligned_words)
183 
184 L(byte_loop):
185         /* Perhaps we can do better than this.  */
186         ldrb    data1w, [src1], #1
187         ldrb    data2w, [src2], #1
188         subs    limit, limit, #1
189         ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
190         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
191         b.eq    L(byte_loop)
192 L(done):
193         sub     result, data1, data2
194         ret
195         /* Align the SRC1 to a dword by doing a bytewise compare and then do
196            the dword loop.  */
197 L(try_misaligned_words):
198         cbz     count, L(src1_aligned)
199 
200         neg     count, count
201         and     count, count, #7
202         sub     limit, limit, count
203 
204 L(page_end_loop):
205         ldrb    data1w, [src1], #1
206         ldrb    data2w, [src2], #1
207         cmp     data1w, #1
208         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
209         b.ne    L(done)
210         subs    count, count, #1
211         b.hi    L(page_end_loop)
212 
213         /* The following diagram explains the comparison of misaligned strings.
214            The bytes are shown in natural order. For little-endian, it is
215            reversed in the registers. The "x" bytes are before the string.
216            The "|" separates data that is loaded at one time.
217            src1     | a a a a a a a a | b b b c c c c c | . . .
218            src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
219 
220            After shifting in each step, the data looks like this:
221                         STEP_A              STEP_B              STEP_C
222            data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
223            data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
224 
225            The bytes with "0" are eliminated from the syndrome via mask.
226 
227            Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
228            time from SRC2. The comparison happens in 3 steps. After each step
229            the loop can exit, or read from SRC1 or SRC2. */
230 L(src1_aligned):
231         /* Calculate offset from 8 byte alignment to string start in bits. No
232            need to mask offset since shifts are ignoring upper bits. */
233         lsl     offset, src2, #3
234         bic     src2, src2, #0xf
235         mov     mask, -1
236         neg     neg_offset, offset
237         ldr     data1, [src1], #8
238         ldp     tmp1, tmp2, [src2], #16
239         LS_BK   mask, mask, neg_offset
240         and     neg_offset, neg_offset, #63     /* Need actual value for cmp later. */
241         /* Skip the first compare if data in tmp1 is irrelevant. */
242         tbnz    offset, 6, L(misaligned_mid_loop)
243 
244 L(loop_misaligned):
245         /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
246         LS_FW   data2, tmp1, offset
247         LS_BK   tmp1, tmp2, neg_offset
248         subs    limit, limit, #8
249         orr     data2, data2, tmp1      /* 8 bytes from SRC2 combined from two regs.*/
250         sub     has_nul, data1, zeroones
251         eor     diff, data1, data2      /* Non-zero if differences found.  */
252         orr     tmp3, data1, #REP8_7f
253         csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
254         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
255         orr     tmp3, endloop, has_nul
256         cbnz    tmp3, L(full_check)
257 
258         ldr     data1, [src1], #8
259 L(misaligned_mid_loop):
260         /* STEP_B: Compare first part of data1 to second part of tmp2. */
261         LS_FW   data2, tmp2, offset
262 #ifdef __AARCH64EB__
263         /* For big-endian we do a byte reverse to avoid carry-propagation
264         problem described above. This way we can reuse the has_nul in the
265         next step and also use syndrome value trick at the end. */
266         rev     tmp3, data1
267         #define data1_fixed tmp3
268 #else
269         #define data1_fixed data1
270 #endif
271         sub     has_nul, data1_fixed, zeroones
272         orr     tmp3, data1_fixed, #REP8_7f
273         eor     diff, data2, data1      /* Non-zero if differences found.  */
274         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL terminator.  */
275 #ifdef __AARCH64EB__
276         rev     has_nul, has_nul
277 #endif
278         cmp     limit, neg_offset, lsr #3
279         orr     syndrome, diff, has_nul
280         bic     syndrome, syndrome, mask        /* Ignore later bytes. */
281         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
282         cbnz    tmp3, L(syndrome_check)
283 
284         /* STEP_C: Compare second part of data1 to first part of tmp1. */
285         ldp     tmp1, tmp2, [src2], #16
286         cmp     limit, #8
287         LS_BK   data2, tmp1, neg_offset
288         eor     diff, data2, data1      /* Non-zero if differences found.  */
289         orr     syndrome, diff, has_nul
290         and     syndrome, syndrome, mask        /* Ignore earlier bytes. */
291         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
292         cbnz    tmp3, L(syndrome_check)
293 
294         ldr     data1, [src1], #8
295         sub     limit, limit, #8
296         b       L(loop_misaligned)
297 
298 #ifdef  __AARCH64EB__
299 L(syndrome_check):
300         clz     pos, syndrome
301         cmp     pos, limit, lsl #3
302         b.lo    L(end_quick)
303 #endif
304 
305 L(ret0):
306         mov     result, #0
307         ret
308 SYM_FUNC_END(__pi_strncmp)
309 SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
310 EXPORT_SYMBOL_NOKASAN(strncmp)

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php