1 /* SPDX-License-Identifier: GPL-2.0-only */ !! 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* !! 2 /* strlen.S: Sparc optimized strlen code 3 * Copyright (c) 2013-2021, Arm Limited. !! 3 * Hand optimized from GNU libc's strlen 4 * !! 4 * Copyright (C) 1991,1996 Free Software Foundation 5 * Adapted from the original at: !! 5 * Copyright (C) 1996,2008 David S. Miller (davem@davemloft.net) 6 * https://github.com/ARM-software/optimized-r !! 6 * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 7 */ 7 */ 8 8 9 #include <linux/linkage.h> 9 #include <linux/linkage.h> 10 #include <asm/assembler.h> !! 10 #include <asm/asm.h> 11 #include <asm/mte-def.h> !! 11 #include <asm/export.h> 12 12 13 /* Assumptions: !! 13 #define LO_MAGIC 0x01010101 14 * !! 14 #define HI_MAGIC 0x80808080 15 * ARMv8-a, AArch64, unaligned accesses, min p << 16 */ << 17 << 18 #define L(label) .L ## label << 19 << 20 /* Arguments and results. */ << 21 #define srcin x0 << 22 #define len x0 << 23 << 24 /* Locals and temporaries. */ << 25 #define src x1 << 26 #define data1 x2 << 27 #define data2 x3 << 28 #define has_nul1 x4 << 29 #define has_nul2 x5 << 30 #define tmp1 x4 << 31 #define tmp2 x5 << 32 #define tmp3 x6 << 33 #define tmp4 x7 << 34 #define zeroones x8 << 35 << 36 /* NUL detection works on the principl << 37 (=> (X - 1) & ~(X | 0x7f)) is non-z << 38 can be done in parallel across the << 39 (X - 1) & 0x80 is zero for non-NUL << 40 false hits for characters 129..255. << 41 << 42 #define REP8_01 0x0101010101010101 << 43 #define REP8_7f 0x7f7f7f7f7f7f7f7f << 44 #define REP8_80 0x8080808080808080 << 45 << 46 /* << 47 * When KASAN_HW_TAGS is in use, memory is che << 48 * (16-byte) granularity, and we must ensure t << 49 * alignment boundary. << 50 */ << 51 #ifdef CONFIG_KASAN_HW_TAGS << 52 #define MIN_PAGE_SIZE MTE_GRANULE_SIZE << 53 #else << 54 #define MIN_PAGE_SIZE 4096 << 55 #endif << 56 << 57 /* Since strings are short on average, << 58 of the string for a NUL character. << 59 safely we have to do a page cross c << 60 byte we calculate the length from t << 61 conditional select to reduce branch << 62 strlen will be repeatedly called on << 63 15 64 If the string is longer than 16 byt !! 16 .text 65 further page cross checks, and proc !! 17 ENTRY(strlen) 66 using the fast NUL check. If we en !! 18 mov %o0, %o1 67 fallback to a second loop using the !! 19 andcc %o0, 3, %g0 68 !! 20 BRANCH32(be, pt, 9f) 69 If the page cross check fails, we r !! 21 sethi %hi(HI_MAGIC), %o4 70 address, remove any characters befo !! 22 ldub [%o0], %o5 71 in the main loop using aligned load !! 23 BRANCH_REG_ZERO(pn, %o5, 11f) 72 page in the first 16 bytes are rare !! 24 add %o0, 1, %o0 73 16/MIN_PAGE_SIZE ~= 0.4%), this cas !! 25 andcc %o0, 3, %g0 74 !! 26 BRANCH32(be, pn, 4f) 75 AArch64 systems have a minimum page !! 27 or %o4, %lo(HI_MAGIC), %o3 76 checking for larger page sizes - th !! 28 ldub [%o0], %o5 77 page size is just not worth the ext !! 29 BRANCH_REG_ZERO(pn, %o5, 12f) 78 the cases taking the slow path. No !! 30 add %o0, 1, %o0 79 whether the first fetch, which may !! 31 andcc %o0, 3, %g0 80 boundary. */ !! 32 BRANCH32(be, pt, 5f) 81 !! 33 sethi %hi(LO_MAGIC), %o4 82 SYM_FUNC_START(__pi_strlen) !! 34 ldub [%o0], %o5 83 and tmp1, srcin, MIN_PAGE_SIZE - 1 !! 35 BRANCH_REG_ZERO(pn, %o5, 13f) 84 mov zeroones, REP8_01 !! 36 add %o0, 1, %o0 85 cmp tmp1, MIN_PAGE_SIZE - 16 !! 37 BRANCH32(ba, pt, 8f) 86 b.gt L(page_cross) !! 38 or %o4, %lo(LO_MAGIC), %o2 87 ldp data1, data2, [srcin] !! 39 9: 88 #ifdef __AARCH64EB__ !! 40 or %o4, %lo(HI_MAGIC), %o3 89 /* For big-endian, carry propagation ( !! 41 4: 90 string is 0x01) means we cannot use !! 42 sethi %hi(LO_MAGIC), %o4 91 Since we expect strings to be small !! 43 5: 92 byte-swap the data now so has_null1 !! 44 or %o4, %lo(LO_MAGIC), %o2 93 rev data1, data1 !! 45 8: 94 rev data2, data2 !! 46 ld [%o0], %o5 95 #endif !! 47 2: 96 sub tmp1, data1, zeroones !! 48 sub %o5, %o2, %o4 97 orr tmp2, data1, REP8_7f !! 49 andcc %o4, %o3, %g0 98 sub tmp3, data2, zeroones !! 50 BRANCH32(be, pt, 8b) 99 orr tmp4, data2, REP8_7f !! 51 add %o0, 4, %o0 100 bics has_nul1, tmp1, tmp2 !! 52 101 bic has_nul2, tmp3, tmp4 !! 53 /* Check every byte. */ 102 ccmp has_nul2, 0, 0, eq !! 54 srl %o5, 24, %g7 103 beq L(main_loop_entry) !! 55 andcc %g7, 0xff, %g0 104 !! 56 BRANCH32(be, pn, 1f) 105 /* Enter with C = has_nul1 == 0. */ !! 57 add %o0, -4, %o4 106 csel has_nul1, has_nul1, has_nul2, !! 58 srl %o5, 16, %g7 107 mov len, 8 !! 59 andcc %g7, 0xff, %g0 108 rev has_nul1, has_nul1 !! 60 BRANCH32(be, pn, 1f) 109 clz tmp1, has_nul1 !! 61 add %o4, 1, %o4 110 csel len, xzr, len, cc !! 62 srl %o5, 8, %g7 111 add len, len, tmp1, lsr 3 !! 63 andcc %g7, 0xff, %g0 112 ret !! 64 BRANCH32(be, pn, 1f) 113 !! 65 add %o4, 1, %o4 114 /* The inner loop processes 32 bytes p !! 66 andcc %o5, 0xff, %g0 115 NUL check. If we encounter non-ASC !! 67 BRANCH32_ANNUL(bne, pt, 2b) 116 loop with the accurate NUL check. !! 68 ld [%o0], %o5 117 .p2align 4 !! 69 add %o4, 1, %o4 118 L(main_loop_entry): << 119 bic src, srcin, 15 << 120 sub src, src, 16 << 121 L(main_loop): << 122 ldp data1, data2, [src, 32]! << 123 L(page_cross_entry): << 124 sub tmp1, data1, zeroones << 125 sub tmp3, data2, zeroones << 126 orr tmp2, tmp1, tmp3 << 127 tst tmp2, zeroones, lsl 7 << 128 bne 1f << 129 ldp data1, data2, [src, 16] << 130 sub tmp1, data1, zeroones << 131 sub tmp3, data2, zeroones << 132 orr tmp2, tmp1, tmp3 << 133 tst tmp2, zeroones, lsl 7 << 134 beq L(main_loop) << 135 add src, src, 16 << 136 1: 70 1: 137 /* The fast check failed, so do the sl !! 71 retl 138 orr tmp2, data1, REP8_7f !! 72 sub %o4, %o1, %o0 139 orr tmp4, data2, REP8_7f !! 73 11: 140 bics has_nul1, tmp1, tmp2 !! 74 retl 141 bic has_nul2, tmp3, tmp4 !! 75 mov 0, %o0 142 ccmp has_nul2, 0, 0, eq !! 76 12: 143 beq L(nonascii_loop) !! 77 retl 144 !! 78 mov 1, %o0 145 /* Enter with C = has_nul1 == 0. */ !! 79 13: 146 L(tail): !! 80 retl 147 #ifdef __AARCH64EB__ !! 81 mov 2, %o0 148 /* For big-endian, carry propagation ( !! 82 ENDPROC(strlen) 149 string is 0x01) means we cannot use !! 83 EXPORT_SYMBOL(strlen) 150 easiest way to get the correct byte << 151 and calculate the syndrome a second << 152 csel data1, data1, data2, cc << 153 rev data1, data1 << 154 sub tmp1, data1, zeroones << 155 orr tmp2, data1, REP8_7f << 156 bic has_nul1, tmp1, tmp2 << 157 #else << 158 csel has_nul1, has_nul1, has_nul2, << 159 #endif << 160 sub len, src, srcin << 161 rev has_nul1, has_nul1 << 162 add tmp2, len, 8 << 163 clz tmp1, has_nul1 << 164 csel len, len, tmp2, cc << 165 add len, len, tmp1, lsr 3 << 166 ret << 167 << 168 L(nonascii_loop): << 169 ldp data1, data2, [src, 16]! << 170 sub tmp1, data1, zeroones << 171 orr tmp2, data1, REP8_7f << 172 sub tmp3, data2, zeroones << 173 orr tmp4, data2, REP8_7f << 174 bics has_nul1, tmp1, tmp2 << 175 bic has_nul2, tmp3, tmp4 << 176 ccmp has_nul2, 0, 0, eq << 177 bne L(tail) << 178 ldp data1, data2, [src, 16]! << 179 sub tmp1, data1, zeroones << 180 orr tmp2, data1, REP8_7f << 181 sub tmp3, data2, zeroones << 182 orr tmp4, data2, REP8_7f << 183 bics has_nul1, tmp1, tmp2 << 184 bic has_nul2, tmp3, tmp4 << 185 ccmp has_nul2, 0, 0, eq << 186 beq L(nonascii_loop) << 187 b L(tail) << 188 << 189 /* Load 16 bytes from [srcin & ~15] an << 190 srcin to 0x7f, so we ignore any NUL << 191 Then continue in the aligned loop. << 192 L(page_cross): << 193 bic src, srcin, 15 << 194 ldp data1, data2, [src] << 195 lsl tmp1, srcin, 3 << 196 mov tmp4, -1 << 197 #ifdef __AARCH64EB__ << 198 /* Big-endian. Early bytes are at MSB << 199 lsr tmp1, tmp4, tmp1 /* Shi << 200 #else << 201 /* Little-endian. Early bytes are at << 202 lsl tmp1, tmp4, tmp1 /* Shi << 203 #endif << 204 orr tmp1, tmp1, REP8_80 << 205 orn data1, data1, tmp1 << 206 orn tmp2, data2, tmp1 << 207 tst srcin, 8 << 208 csel data1, data1, tmp4, eq << 209 csel data2, data2, tmp2, eq << 210 b L(page_cross_entry) << 211 SYM_FUNC_END(__pi_strlen) << 212 SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen) << 213 EXPORT_SYMBOL_NOKASAN(strlen) <<
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.