1 /* SPDX-License-Identifier: GPL-2.0-only */ !! 1 /* strlen.S: Sparc64 optimized strlen code 2 /* !! 2 * Hand optimized from GNU libc's strlen 3 * Copyright (c) 2013-2021, Arm Limited. !! 3 * Copyright (C) 1991,1996 Free Software Foundation 4 * !! 4 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 5 * Adapted from the original at: !! 5 * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 6 * https://github.com/ARM-software/optimized-r << 7 */ 6 */ 8 7 9 #include <linux/linkage.h> !! 8 #define LO_MAGIC 0x01010101 10 #include <asm/assembler.h> !! 9 #define HI_MAGIC 0x80808080 11 #include <asm/mte-def.h> << 12 10 13 /* Assumptions: !! 11 .align 32 14 * !! 12 .global strlen, __strlen 15 * ARMv8-a, AArch64, unaligned accesses, min p !! 13 strlen: 16 */ !! 14 __strlen: 17 !! 15 mov %o0, %o1 18 #define L(label) .L ## label !! 16 andcc %o0, 3, %g0 19 !! 17 be,pt %icc, 9f 20 /* Arguments and results. */ !! 18 sethi %hi(HI_MAGIC), %o4 21 #define srcin x0 !! 19 ldub [%o0], %o5 22 #define len x0 !! 20 brz,pn %o5, 11f 23 !! 21 add %o0, 1, %o0 24 /* Locals and temporaries. */ !! 22 andcc %o0, 3, %g0 25 #define src x1 !! 23 be,pn %icc, 4f 26 #define data1 x2 !! 24 or %o4, %lo(HI_MAGIC), %o3 27 #define data2 x3 !! 25 ldub [%o0], %o5 28 #define has_nul1 x4 !! 26 brz,pn %o5, 12f 29 #define has_nul2 x5 !! 27 add %o0, 1, %o0 30 #define tmp1 x4 !! 28 andcc %o0, 3, %g0 31 #define tmp2 x5 !! 29 be,pt %icc, 5f 32 #define tmp3 x6 !! 30 sethi %hi(LO_MAGIC), %o4 33 #define tmp4 x7 !! 31 ldub [%o0], %o5 34 #define zeroones x8 !! 32 brz,pn %o5, 13f 35 !! 33 add %o0, 1, %o0 36 /* NUL detection works on the principl !! 34 ba,pt %icc, 8f 37 (=> (X - 1) & ~(X | 0x7f)) is non-z !! 35 or %o4, %lo(LO_MAGIC), %o2 38 can be done in parallel across the !! 36 9: 39 (X - 1) & 0x80 is zero for non-NUL !! 37 or %o4, %lo(HI_MAGIC), %o3 40 false hits for characters 129..255. !! 38 4: 41 !! 39 sethi %hi(LO_MAGIC), %o4 42 #define REP8_01 0x0101010101010101 !! 40 5: 43 #define REP8_7f 0x7f7f7f7f7f7f7f7f !! 41 or %o4, %lo(LO_MAGIC), %o2 44 #define REP8_80 0x8080808080808080 !! 42 8: 45 !! 43 ld [%o0], %o5 46 /* !! 44 2: 47 * When KASAN_HW_TAGS is in use, memory is che !! 45 sub %o5, %o2, %o4 48 * (16-byte) granularity, and we must ensure t !! 46 andcc %o4, %o3, %g0 49 * alignment boundary. !! 47 be,pt %icc, 8b 50 */ !! 48 add %o0, 4, %o0 51 #ifdef CONFIG_KASAN_HW_TAGS !! 49 52 #define MIN_PAGE_SIZE MTE_GRANULE_SIZE !! 50 /* Check every byte. */ 53 #else !! 51 srl %o5, 24, %g5 54 #define MIN_PAGE_SIZE 4096 !! 52 andcc %g5, 0xff, %g0 55 #endif !! 53 be,pn %icc, 1f 56 !! 54 add %o0, -4, %o4 57 /* Since strings are short on average, !! 55 srl %o5, 16, %g5 58 of the string for a NUL character. !! 56 andcc %g5, 0xff, %g0 59 safely we have to do a page cross c !! 57 be,pn %icc, 1f 60 byte we calculate the length from t !! 58 add %o4, 1, %o4 61 conditional select to reduce branch !! 59 srl %o5, 8, %g5 62 strlen will be repeatedly called on !! 60 andcc %g5, 0xff, %g0 63 !! 61 be,pn %icc, 1f 64 If the string is longer than 16 byt !! 62 add %o4, 1, %o4 65 further page cross checks, and proc !! 63 andcc %o5, 0xff, %g0 66 using the fast NUL check. If we en !! 64 bne,a,pt %icc, 2b 67 fallback to a second loop using the !! 65 ld [%o0], %o5 68 !! 66 add %o4, 1, %o4 69 If the page cross check fails, we r << 70 address, remove any characters befo << 71 in the main loop using aligned load << 72 page in the first 16 bytes are rare << 73 16/MIN_PAGE_SIZE ~= 0.4%), this cas << 74 << 75 AArch64 systems have a minimum page << 76 checking for larger page sizes - th << 77 page size is just not worth the ext << 78 the cases taking the slow path. No << 79 whether the first fetch, which may << 80 boundary. */ << 81 << 82 SYM_FUNC_START(__pi_strlen) << 83 and tmp1, srcin, MIN_PAGE_SIZE - 1 << 84 mov zeroones, REP8_01 << 85 cmp tmp1, MIN_PAGE_SIZE - 16 << 86 b.gt L(page_cross) << 87 ldp data1, data2, [srcin] << 88 #ifdef __AARCH64EB__ << 89 /* For big-endian, carry propagation ( << 90 string is 0x01) means we cannot use << 91 Since we expect strings to be small << 92 byte-swap the data now so has_null1 << 93 rev data1, data1 << 94 rev data2, data2 << 95 #endif << 96 sub tmp1, data1, zeroones << 97 orr tmp2, data1, REP8_7f << 98 sub tmp3, data2, zeroones << 99 orr tmp4, data2, REP8_7f << 100 bics has_nul1, tmp1, tmp2 << 101 bic has_nul2, tmp3, tmp4 << 102 ccmp has_nul2, 0, 0, eq << 103 beq L(main_loop_entry) << 104 << 105 /* Enter with C = has_nul1 == 0. */ << 106 csel has_nul1, has_nul1, has_nul2, << 107 mov len, 8 << 108 rev has_nul1, has_nul1 << 109 clz tmp1, has_nul1 << 110 csel len, xzr, len, cc << 111 add len, len, tmp1, lsr 3 << 112 ret << 113 << 114 /* The inner loop processes 32 bytes p << 115 NUL check. If we encounter non-ASC << 116 loop with the accurate NUL check. << 117 .p2align 4 << 118 L(main_loop_entry): << 119 bic src, srcin, 15 << 120 sub src, src, 16 << 121 L(main_loop): << 122 ldp data1, data2, [src, 32]! << 123 L(page_cross_entry): << 124 sub tmp1, data1, zeroones << 125 sub tmp3, data2, zeroones << 126 orr tmp2, tmp1, tmp3 << 127 tst tmp2, zeroones, lsl 7 << 128 bne 1f << 129 ldp data1, data2, [src, 16] << 130 sub tmp1, data1, zeroones << 131 sub tmp3, data2, zeroones << 132 orr tmp2, tmp1, tmp3 << 133 tst tmp2, zeroones, lsl 7 << 134 beq L(main_loop) << 135 add src, src, 16 << 136 1: 67 1: 137 /* The fast check failed, so do the sl !! 68 retl 138 orr tmp2, data1, REP8_7f !! 69 sub %o4, %o1, %o0 139 orr tmp4, data2, REP8_7f !! 70 11: 140 bics has_nul1, tmp1, tmp2 !! 71 retl 141 bic has_nul2, tmp3, tmp4 !! 72 mov 0, %o0 142 ccmp has_nul2, 0, 0, eq !! 73 12: 143 beq L(nonascii_loop) !! 74 retl 144 !! 75 mov 1, %o0 145 /* Enter with C = has_nul1 == 0. */ !! 76 13: 146 L(tail): !! 77 retl 147 #ifdef __AARCH64EB__ !! 78 mov 2, %o0 148 /* For big-endian, carry propagation ( << 149 string is 0x01) means we cannot use << 150 easiest way to get the correct byte << 151 and calculate the syndrome a second << 152 csel data1, data1, data2, cc << 153 rev data1, data1 << 154 sub tmp1, data1, zeroones << 155 orr tmp2, data1, REP8_7f << 156 bic has_nul1, tmp1, tmp2 << 157 #else << 158 csel has_nul1, has_nul1, has_nul2, << 159 #endif << 160 sub len, src, srcin << 161 rev has_nul1, has_nul1 << 162 add tmp2, len, 8 << 163 clz tmp1, has_nul1 << 164 csel len, len, tmp2, cc << 165 add len, len, tmp1, lsr 3 << 166 ret << 167 << 168 L(nonascii_loop): << 169 ldp data1, data2, [src, 16]! << 170 sub tmp1, data1, zeroones << 171 orr tmp2, data1, REP8_7f << 172 sub tmp3, data2, zeroones << 173 orr tmp4, data2, REP8_7f << 174 bics has_nul1, tmp1, tmp2 << 175 bic has_nul2, tmp3, tmp4 << 176 ccmp has_nul2, 0, 0, eq << 177 bne L(tail) << 178 ldp data1, data2, [src, 16]! << 179 sub tmp1, data1, zeroones << 180 orr tmp2, data1, REP8_7f << 181 sub tmp3, data2, zeroones << 182 orr tmp4, data2, REP8_7f << 183 bics has_nul1, tmp1, tmp2 << 184 bic has_nul2, tmp3, tmp4 << 185 ccmp has_nul2, 0, 0, eq << 186 beq L(nonascii_loop) << 187 b L(tail) << 188 << 189 /* Load 16 bytes from [srcin & ~15] an << 190 srcin to 0x7f, so we ignore any NUL << 191 Then continue in the aligned loop. << 192 L(page_cross): << 193 bic src, srcin, 15 << 194 ldp data1, data2, [src] << 195 lsl tmp1, srcin, 3 << 196 mov tmp4, -1 << 197 #ifdef __AARCH64EB__ << 198 /* Big-endian. Early bytes are at MSB << 199 lsr tmp1, tmp4, tmp1 /* Shi << 200 #else << 201 /* Little-endian. Early bytes are at << 202 lsl tmp1, tmp4, tmp1 /* Shi << 203 #endif << 204 orr tmp1, tmp1, REP8_80 << 205 orn data1, data1, tmp1 << 206 orn tmp2, data2, tmp1 << 207 tst srcin, 8 << 208 csel data1, data1, tmp4, eq << 209 csel data2, data2, tmp2, eq << 210 b L(page_cross_entry) << 211 SYM_FUNC_END(__pi_strlen) << 212 SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen) << 213 EXPORT_SYMBOL_NOKASAN(strlen) <<
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.