1 /* SPDX-License-Identifier: GPL-2.0-only */ !! 1 /* strlen.S: Sparc optimized strlen code 2 /* !! 2 * Hand optimized from GNU libc's strlen 3 * Copyright (c) 2013-2021, Arm Limited. !! 3 * Copyright (C) 1991,1996 Free Software Foundation 4 * !! 4 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 5 * Adapted from the original at: !! 5 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 6 * https://github.com/ARM-software/optimized-r << 7 */ 6 */ 8 7 9 #include <linux/linkage.h> !! 8 #include <asm/cprefix.h> 10 #include <asm/assembler.h> << 11 #include <asm/mte-def.h> << 12 9 13 /* Assumptions: !! 10 #define LO_MAGIC 0x01010101 14 * !! 11 #define HI_MAGIC 0x80808080 15 * ARMv8-a, AArch64, unaligned accesses, min p << 16 */ << 17 << 18 #define L(label) .L ## label << 19 << 20 /* Arguments and results. */ << 21 #define srcin x0 << 22 #define len x0 << 23 << 24 /* Locals and temporaries. */ << 25 #define src x1 << 26 #define data1 x2 << 27 #define data2 x3 << 28 #define has_nul1 x4 << 29 #define has_nul2 x5 << 30 #define tmp1 x4 << 31 #define tmp2 x5 << 32 #define tmp3 x6 << 33 #define tmp4 x7 << 34 #define zeroones x8 << 35 << 36 /* NUL detection works on the principl << 37 (=> (X - 1) & ~(X | 0x7f)) is non-z << 38 can be done in parallel across the << 39 (X - 1) & 0x80 is zero for non-NUL << 40 false hits for characters 129..255. << 41 << 42 #define REP8_01 0x0101010101010101 << 43 #define REP8_7f 0x7f7f7f7f7f7f7f7f << 44 #define REP8_80 0x8080808080808080 << 45 << 46 /* << 47 * When KASAN_HW_TAGS is in use, memory is che << 48 * (16-byte) granularity, and we must ensure t << 49 * alignment boundary. << 50 */ << 51 #ifdef CONFIG_KASAN_HW_TAGS << 52 #define MIN_PAGE_SIZE MTE_GRANULE_SIZE << 53 #else << 54 #define MIN_PAGE_SIZE 4096 << 55 #endif << 56 << 57 /* Since strings are short on average, << 58 of the string for a NUL character. << 59 safely we have to do a page cross c << 60 byte we calculate the length from t << 61 conditional select to reduce branch << 62 strlen will be repeatedly called on << 63 12 64 If the string is longer than 16 byt !! 13 0: 65 further page cross checks, and proc !! 14 ldub [%o0], %o5 66 using the fast NUL check. If we en !! 15 cmp %o5, 0 67 fallback to a second loop using the !! 16 be 1f 68 !! 17 add %o0, 1, %o0 69 If the page cross check fails, we r !! 18 andcc %o0, 3, %g0 70 address, remove any characters befo !! 19 be 4f 71 in the main loop using aligned load !! 20 or %o4, %lo(HI_MAGIC), %o3 72 page in the first 16 bytes are rare !! 21 ldub [%o0], %o5 73 16/MIN_PAGE_SIZE ~= 0.4%), this cas !! 22 cmp %o5, 0 74 !! 23 be 2f 75 AArch64 systems have a minimum page !! 24 add %o0, 1, %o0 76 checking for larger page sizes - th !! 25 andcc %o0, 3, %g0 77 page size is just not worth the ext !! 26 be 5f 78 the cases taking the slow path. No !! 27 sethi %hi(LO_MAGIC), %o4 79 whether the first fetch, which may !! 28 ldub [%o0], %o5 80 boundary. */ !! 29 cmp %o5, 0 81 !! 30 be 3f 82 SYM_FUNC_START(__pi_strlen) !! 31 add %o0, 1, %o0 83 and tmp1, srcin, MIN_PAGE_SIZE - 1 !! 32 b 8f 84 mov zeroones, REP8_01 !! 33 or %o4, %lo(LO_MAGIC), %o2 85 cmp tmp1, MIN_PAGE_SIZE - 16 << 86 b.gt L(page_cross) << 87 ldp data1, data2, [srcin] << 88 #ifdef __AARCH64EB__ << 89 /* For big-endian, carry propagation ( << 90 string is 0x01) means we cannot use << 91 Since we expect strings to be small << 92 byte-swap the data now so has_null1 << 93 rev data1, data1 << 94 rev data2, data2 << 95 #endif << 96 sub tmp1, data1, zeroones << 97 orr tmp2, data1, REP8_7f << 98 sub tmp3, data2, zeroones << 99 orr tmp4, data2, REP8_7f << 100 bics has_nul1, tmp1, tmp2 << 101 bic has_nul2, tmp3, tmp4 << 102 ccmp has_nul2, 0, 0, eq << 103 beq L(main_loop_entry) << 104 << 105 /* Enter with C = has_nul1 == 0. */ << 106 csel has_nul1, has_nul1, has_nul2, << 107 mov len, 8 << 108 rev has_nul1, has_nul1 << 109 clz tmp1, has_nul1 << 110 csel len, xzr, len, cc << 111 add len, len, tmp1, lsr 3 << 112 ret << 113 << 114 /* The inner loop processes 32 bytes p << 115 NUL check. If we encounter non-ASC << 116 loop with the accurate NUL check. << 117 .p2align 4 << 118 L(main_loop_entry): << 119 bic src, srcin, 15 << 120 sub src, src, 16 << 121 L(main_loop): << 122 ldp data1, data2, [src, 32]! << 123 L(page_cross_entry): << 124 sub tmp1, data1, zeroones << 125 sub tmp3, data2, zeroones << 126 orr tmp2, tmp1, tmp3 << 127 tst tmp2, zeroones, lsl 7 << 128 bne 1f << 129 ldp data1, data2, [src, 16] << 130 sub tmp1, data1, zeroones << 131 sub tmp3, data2, zeroones << 132 orr tmp2, tmp1, tmp3 << 133 tst tmp2, zeroones, lsl 7 << 134 beq L(main_loop) << 135 add src, src, 16 << 136 1: 34 1: 137 /* The fast check failed, so do the sl !! 35 retl 138 orr tmp2, data1, REP8_7f !! 36 mov 0, %o0 139 orr tmp4, data2, REP8_7f !! 37 2: 140 bics has_nul1, tmp1, tmp2 !! 38 retl 141 bic has_nul2, tmp3, tmp4 !! 39 mov 1, %o0 142 ccmp has_nul2, 0, 0, eq !! 40 3: 143 beq L(nonascii_loop) !! 41 retl 144 !! 42 mov 2, %o0 145 /* Enter with C = has_nul1 == 0. */ !! 43 146 L(tail): !! 44 .align 4 147 #ifdef __AARCH64EB__ !! 45 .global C_LABEL(strlen) 148 /* For big-endian, carry propagation ( !! 46 C_LABEL(strlen): 149 string is 0x01) means we cannot use !! 47 mov %o0, %o1 150 easiest way to get the correct byte !! 48 andcc %o0, 3, %g0 151 and calculate the syndrome a second !! 49 bne 0b 152 csel data1, data1, data2, cc !! 50 sethi %hi(HI_MAGIC), %o4 153 rev data1, data1 !! 51 or %o4, %lo(HI_MAGIC), %o3 154 sub tmp1, data1, zeroones !! 52 4: 155 orr tmp2, data1, REP8_7f !! 53 sethi %hi(LO_MAGIC), %o4 156 bic has_nul1, tmp1, tmp2 !! 54 5: 157 #else !! 55 or %o4, %lo(LO_MAGIC), %o2 158 csel has_nul1, has_nul1, has_nul2, !! 56 8: 159 #endif !! 57 ld [%o0], %o5 160 sub len, src, srcin !! 58 2: 161 rev has_nul1, has_nul1 !! 59 sub %o5, %o2, %o4 162 add tmp2, len, 8 !! 60 andcc %o4, %o3, %g0 163 clz tmp1, has_nul1 !! 61 be 8b 164 csel len, len, tmp2, cc !! 62 add %o0, 4, %o0 165 add len, len, tmp1, lsr 3 !! 63 166 ret !! 64 /* Check every byte. */ 167 !! 65 srl %o5, 24, %g5 168 L(nonascii_loop): !! 66 andcc %g5, 0xff, %g0 169 ldp data1, data2, [src, 16]! !! 67 be 1f 170 sub tmp1, data1, zeroones !! 68 add %o0, -4, %o4 171 orr tmp2, data1, REP8_7f !! 69 srl %o5, 16, %g5 172 sub tmp3, data2, zeroones !! 70 andcc %g5, 0xff, %g0 173 orr tmp4, data2, REP8_7f !! 71 be 1f 174 bics has_nul1, tmp1, tmp2 !! 72 add %o4, 1, %o4 175 bic has_nul2, tmp3, tmp4 !! 73 srl %o5, 8, %g5 176 ccmp has_nul2, 0, 0, eq !! 74 andcc %g5, 0xff, %g0 177 bne L(tail) !! 75 be 1f 178 ldp data1, data2, [src, 16]! !! 76 add %o4, 1, %o4 179 sub tmp1, data1, zeroones !! 77 andcc %o5, 0xff, %g0 180 orr tmp2, data1, REP8_7f !! 78 bne,a 2b 181 sub tmp3, data2, zeroones !! 79 ld [%o0], %o5 182 orr tmp4, data2, REP8_7f !! 80 add %o4, 1, %o4 183 bics has_nul1, tmp1, tmp2 !! 81 1: 184 bic has_nul2, tmp3, tmp4 !! 82 retl 185 ccmp has_nul2, 0, 0, eq !! 83 sub %o4, %o1, %o0 186 beq L(nonascii_loop) << 187 b L(tail) << 188 << 189 /* Load 16 bytes from [srcin & ~15] an << 190 srcin to 0x7f, so we ignore any NUL << 191 Then continue in the aligned loop. << 192 L(page_cross): << 193 bic src, srcin, 15 << 194 ldp data1, data2, [src] << 195 lsl tmp1, srcin, 3 << 196 mov tmp4, -1 << 197 #ifdef __AARCH64EB__ << 198 /* Big-endian. Early bytes are at MSB << 199 lsr tmp1, tmp4, tmp1 /* Shi << 200 #else << 201 /* Little-endian. Early bytes are at << 202 lsl tmp1, tmp4, tmp1 /* Shi << 203 #endif << 204 orr tmp1, tmp1, REP8_80 << 205 orn data1, data1, tmp1 << 206 orn tmp2, data2, tmp1 << 207 tst srcin, 8 << 208 csel data1, data1, tmp4, eq << 209 csel data2, data2, tmp2, eq << 210 b L(page_cross_entry) << 211 SYM_FUNC_END(__pi_strlen) << 212 SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen) << 213 EXPORT_SYMBOL_NOKASAN(strlen) <<
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.