/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/mte-def.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#define L(label) .L ## label

/* Arguments and results.  */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

	/* NUL detection works on the principle that (X - 1) & (~X & 0x80)
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero if X is zero, and
	   can be done in parallel across the entire word.  A faster check
	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
	   false hits for characters 129..255.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
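
	/* Worked example (an illustrative sketch added for clarity; the
	   values are hypothetical and not part of the upstream source):
	   with data1 = 0x4141004141414141 (little-endian, a NUL in byte 5),
	     tmp1 = data1 - REP8_01  = 0x403fff4040404040
	     tmp2 = data1 | REP8_7f  = 0x7f7f7f7f7f7f7f7f
	     has_nul1 = tmp1 & ~tmp2 = 0x0000800000000000
	   so exactly the NUL byte produces 0x80.  The faster check also
	   fires for bytes 0x81..0xff, e.g. (0x81 - 1) & 0x80 = 0x80, which
	   is why a fast-check hit is always confirmed with the accurate
	   check before a length is computed.  */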

/*
 * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
 * (16-byte) granularity, and we must ensure that no access straddles this
 * alignment boundary.
 */
#ifdef CONFIG_KASAN_HW_TAGS
#define MIN_PAGE_SIZE MTE_GRANULE_SIZE
#else
#define MIN_PAGE_SIZE 4096
#endif

	/* Since strings are short on average, we check the first 16 bytes
	   of the string for a NUL character.  In order to do an unaligned
	   ldp safely we have to do a page cross check first.  If there is
	   a NUL byte we calculate the length from the 2 8-byte words using
	   conditional select to reduce branch mispredictions (it is
	   unlikely strlen will be repeatedly called on strings with the
	   same length).

	   If the string is longer than 16 bytes, we align src so we don't
	   need further page cross checks, and process 32 bytes per
	   iteration using the fast NUL check.  If we encounter non-ASCII
	   characters, fall back to a second loop using the full NUL check.

	   If the page cross check fails, we read 16 bytes from an aligned
	   address, remove any characters before the string, and continue
	   in the main loop using aligned loads.  Since strings crossing a
	   page in the first 16 bytes are rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be
	   optimized.

	   AArch64 systems have a minimum page size of 4k.  We don't bother
	   checking for larger page sizes - the cost of setting up the
	   correct page size is just not worth the extra gain from a small
	   reduction in the cases taking the slow path.  Note that we only
	   care about whether the first fetch, which may be misaligned,
	   crosses a page boundary.  */

SYM_FUNC_START(__pi_strlen)
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	mov	zeroones, REP8_01
	cmp	tmp1, MIN_PAGE_SIZE - 16
	b.gt	L(page_cross)
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop_entry)

	/* Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	rev	has_nul1, has_nul1
	clz	tmp1, has_nul1
	csel	len, xzr, len, cc
	add	len, len, tmp1, lsr 3
	ret
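
	/* Worked example of the exit path above (an illustrative sketch
	   added for clarity; the values are hypothetical): if data1 holds
	   "AAAAA" with a NUL at byte offset 5, has_nul1 is
	   0x0000800000000000 and C is clear, so len is selected as 0 and
	   the has_nul1 syndrome is kept; rev gives 0x0000000000800000,
	   clz gives 40, and 40 >> 3 = 5, so len = 0 + 5.  A NUL in data2
	   instead leaves C set, so len starts at 8 and the syndrome comes
	   from has_nul2.  */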

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check.  If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check.  */
	.p2align 4
L(main_loop_entry):
	bic	src, srcin, 15
	sub	src, src, 16
L(main_loop):
	ldp	data1, data2, [src, 32]!
L(page_cross_entry):
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	bne	1f
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	add	src, src, 16
1:
	/* The fast check failed, so do the slower, accurate NUL check.  */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0.  */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	sub	len, src, srcin
	rev	has_nul1, has_nul1
	add	tmp2, len, 8
	clz	tmp1, has_nul1
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

	/* Load 16 bytes from [srcin & ~15] and force the bytes that
	   precede srcin to 0x7f, so we ignore any NUL bytes before the
	   string.  Then continue in the aligned loop.  */
L(page_cross):
	bic	src, srcin, 15
	ldp	data1, data2, [src]
	lsl	tmp1, srcin, 3
	mov	tmp4, -1
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	tmp1, tmp1, REP8_80
	orn	data1, data1, tmp1
	orn	tmp2, data2, tmp1
	tst	srcin, 8
	csel	data1, data1, tmp4, eq
	csel	data2, data2, tmp2, eq
	b	L(page_cross_entry)
SYM_FUNC_END(__pi_strlen)
SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
EXPORT_SYMBOL_NOKASAN(strlen)