~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/arm64/lib/strlen.S

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

Diff markup

Differences between /arch/arm64/lib/strlen.S (Version linux-6.12-rc7) and /arch/alpha/lib/strlen.S (Version linux-4.17.19)


  1 /* SPDX-License-Identifier: GPL-2.0-only */    !!   1 /* SPDX-License-Identifier: GPL-2.0 */
  2 /*                                                  2 /*
  3  * Copyright (c) 2013-2021, Arm Limited.       !!   3  * strlen.S (c) 1995 David Mosberger (davidm@cs.arizona.edu)
  4  *                                                  4  *
  5  * Adapted from the original at:               !!   5  * Finds length of a 0-terminated string.  Optimized for the
  6  * https://github.com/ARM-software/optimized-r !!   6  * Alpha architecture:
  7  */                                            << 
  8                                                << 
  9 #include <linux/linkage.h>                     << 
 10 #include <asm/assembler.h>                     << 
 11 #include <asm/mte-def.h>                       << 
 12                                                << 
 13 /* Assumptions:                                << 
 14  *                                                  7  *
 15  * ARMv8-a, AArch64, unaligned accesses, min p !!   8  *      - memory accessed as aligned quadwords only
                                                   >>   9  *      - uses cmpbge to compare 8 bytes in parallel
                                                   >>  10  *      - does binary search to find 0 byte in last
                                                   >>  11  *        quadword (HAKMEM needed 12 instructions to
                                                   >>  12  *        do this instead of the 9 instructions that
                                                   >>  13  *        binary search needs).
 16  */                                                14  */
                                                   >>  15 #include <asm/export.h>
                                                   >>  16         .set noreorder
                                                   >>  17         .set noat
                                                   >>  18 
                                                   >>  19         .align 3
                                                   >>  20 
                                                   >>  21         .globl  strlen
                                                   >>  22         .ent    strlen
                                                   >>  23 
                                                   >>  24 strlen:                         # in: $16 = string address; out: $0 = length up to the first 0 byte
                                                   >>  25         ldq_u   $1, 0($16)      # load first quadword ($16  may be misaligned)
                                                   >>  26         lda     $2, -1($31)     # $2 = -1 (all bits set)
                                                   >>  27         insqh   $2, $16, $2     # $2 = 0xff in each byte that precedes the string start
                                                   >>  28         andnot  $16, 7, $0      # $0 = $16 rounded down to an 8-byte boundary
                                                   >>  29         or      $2, $1, $1      # make the bytes before the string non-zero so they cannot match
                                                   >>  30         cmpbge  $31, $1, $2     # $2  <- bitmask: bit i == 1 <==> i-th byte == 0
                                                   >>  31         bne     $2, found       # a 0 byte is already in the first quadword
                                                   >>  32 
                                                   >>  33 loop:   ldq     $1, 8($0)       # load the next aligned quadword
                                                   >>  34         addq    $0, 8, $0       # addr += 8
                                                   >>  35         nop                     # helps dual issue last two insns
                                                   >>  36         cmpbge  $31, $1, $2     # same zero-byte bitmask as above
                                                   >>  37         beq     $2, loop        # no 0 byte in this quadword: keep scanning
                                                   >>  38 
                                                   >>  39 found:  blbs    $2, done        # make aligned case fast
                                                   >>  40         negq    $2, $3          # $3 = -$2
                                                   >>  41         and     $2, $3, $2      # keep only the lowest set bit (first 0 byte)
                                                   >>  42 
                                                   >>  43         and     $2, 0x0f, $1    # binary search: is the bit among bits 0-3?
                                                   >>  44         addq    $0, 4, $3       # candidate address: $0 + 4
                                                   >>  45         cmoveq  $1, $3, $0      # if not, the 0 byte is in bytes 4-7: addr += 4
                                                   >>  46 
                                                   >>  47         and     $2, 0x33, $1    # is the bit among bits 0,1,4,5?
                                                   >>  48         addq    $0, 2, $3       # candidate address: $0 + 2
                                                   >>  49         cmoveq  $1, $3, $0      # if not, addr += 2
                                                   >>  50 
                                                   >>  51         and     $2, 0x55, $1    # is the bit at an even position?
                                                   >>  52         addq    $0, 1, $3       # candidate address: $0 + 1
                                                   >>  53         cmoveq  $1, $3, $0      # if not, addr += 1
 17                                                    54 
 18 #define L(label) .L ## label                   !!  55 done:   subq    $0, $16, $0
                                                   >>  56         ret     $31, ($26)      # return through the address held in $26
 19                                                    57 
 20 /* Arguments and results.  */                  !!  58         .end    strlen
 21 #define srcin           x0                     !!  59         EXPORT_SYMBOL(strlen)
 22 #define len             x0                     << 
 23                                                << 
 24 /* Locals and temporaries.  */                 << 
 25 #define src             x1                     << 
 26 #define data1           x2                     << 
 27 #define data2           x3                     << 
 28 #define has_nul1        x4                     << 
 29 #define has_nul2        x5                     << 
 30 #define tmp1            x4                     << 
 31 #define tmp2            x5                     << 
 32 #define tmp3            x6                     << 
 33 #define tmp4            x7                     << 
 34 #define zeroones        x8                     << 
 35                                                << 
 36         /* NUL detection works on the principl << 
 37            (=> (X - 1) & ~(X | 0x7f)) is non-z << 
 38            can be done in parallel across the  << 
 39            (X - 1) & 0x80 is zero for non-NUL  << 
 40            false hits for characters 129..255. << 
 41                                                << 
 42 #define REP8_01 0x0101010101010101             << 
 43 #define REP8_7f 0x7f7f7f7f7f7f7f7f             << 
 44 #define REP8_80 0x8080808080808080             << 
 45                                                << 
 46 /*                                             << 
 47  * When KASAN_HW_TAGS is in use, memory is che << 
 48  * (16-byte) granularity, and we must ensure t << 
 49  * alignment boundary.                         << 
 50  */                                            << 
 51 #ifdef CONFIG_KASAN_HW_TAGS                    << 
 52 #define MIN_PAGE_SIZE MTE_GRANULE_SIZE         << 
 53 #else                                          << 
 54 #define MIN_PAGE_SIZE 4096                     << 
 55 #endif                                         << 
 56                                                << 
 57         /* Since strings are short on average, << 
 58            of the string for a NUL character.  << 
 59            safely we have to do a page cross c << 
 60            byte we calculate the length from t << 
 61            conditional select to reduce branch << 
 62            strlen will be repeatedly called on << 
 63                                                << 
 64            If the string is longer than 16 byt << 
 65            further page cross checks, and proc << 
 66            using the fast NUL check.  If we en << 
 67            fallback to a second loop using the << 
 68                                                << 
 69            If the page cross check fails, we r << 
 70            address, remove any characters befo << 
 71            in the main loop using aligned load << 
 72            page in the first 16 bytes are rare << 
 73            16/MIN_PAGE_SIZE ~= 0.4%), this cas << 
 74                                                << 
 75            AArch64 systems have a minimum page << 
 76            checking for larger page sizes - th << 
 77            page size is just not worth the ext << 
 78            the cases taking the slow path.  No << 
 79            whether the first fetch, which may  << 
 80            boundary.  */                       << 
 81                                                << 
 82 SYM_FUNC_START(__pi_strlen)                    << 
 83         and     tmp1, srcin, MIN_PAGE_SIZE - 1 << 
 84         mov     zeroones, REP8_01              << 
 85         cmp     tmp1, MIN_PAGE_SIZE - 16       << 
 86         b.gt    L(page_cross)                  << 
 87         ldp     data1, data2, [srcin]          << 
 88 #ifdef __AARCH64EB__                           << 
 89         /* For big-endian, carry propagation ( << 
 90            string is 0x01) means we cannot use << 
 91            Since we expect strings to be small << 
 92            byte-swap the data now so has_null1 << 
 93         rev     data1, data1                   << 
 94         rev     data2, data2                   << 
 95 #endif                                         << 
 96         sub     tmp1, data1, zeroones          << 
 97         orr     tmp2, data1, REP8_7f           << 
 98         sub     tmp3, data2, zeroones          << 
 99         orr     tmp4, data2, REP8_7f           << 
100         bics    has_nul1, tmp1, tmp2           << 
101         bic     has_nul2, tmp3, tmp4           << 
102         ccmp    has_nul2, 0, 0, eq             << 
103         beq     L(main_loop_entry)             << 
104                                                << 
105         /* Enter with C = has_nul1 == 0.  */   << 
106         csel    has_nul1, has_nul1, has_nul2,  << 
107         mov     len, 8                         << 
108         rev     has_nul1, has_nul1             << 
109         clz     tmp1, has_nul1                 << 
110         csel    len, xzr, len, cc              << 
111         add     len, len, tmp1, lsr 3          << 
112         ret                                    << 
113                                                << 
114         /* The inner loop processes 32 bytes p << 
115            NUL check.  If we encounter non-ASC << 
116            loop with the accurate NUL check.   << 
117         .p2align 4                             << 
118 L(main_loop_entry):                            << 
119         bic     src, srcin, 15                 << 
120         sub     src, src, 16                   << 
121 L(main_loop):                                  << 
122         ldp     data1, data2, [src, 32]!       << 
123 L(page_cross_entry):                           << 
124         sub     tmp1, data1, zeroones          << 
125         sub     tmp3, data2, zeroones          << 
126         orr     tmp2, tmp1, tmp3               << 
127         tst     tmp2, zeroones, lsl 7          << 
128         bne     1f                             << 
129         ldp     data1, data2, [src, 16]        << 
130         sub     tmp1, data1, zeroones          << 
131         sub     tmp3, data2, zeroones          << 
132         orr     tmp2, tmp1, tmp3               << 
133         tst     tmp2, zeroones, lsl 7          << 
134         beq     L(main_loop)                   << 
135         add     src, src, 16                   << 
136 1:                                             << 
137         /* The fast check failed, so do the sl << 
138         orr     tmp2, data1, REP8_7f           << 
139         orr     tmp4, data2, REP8_7f           << 
140         bics    has_nul1, tmp1, tmp2           << 
141         bic     has_nul2, tmp3, tmp4           << 
142         ccmp    has_nul2, 0, 0, eq             << 
143         beq     L(nonascii_loop)               << 
144                                                << 
145         /* Enter with C = has_nul1 == 0.  */   << 
146 L(tail):                                       << 
147 #ifdef __AARCH64EB__                           << 
148         /* For big-endian, carry propagation ( << 
149            string is 0x01) means we cannot use << 
150            easiest way to get the correct byte << 
151            and calculate the syndrome a second << 
152         csel    data1, data1, data2, cc        << 
153         rev     data1, data1                   << 
154         sub     tmp1, data1, zeroones          << 
155         orr     tmp2, data1, REP8_7f           << 
156         bic     has_nul1, tmp1, tmp2           << 
157 #else                                          << 
158         csel    has_nul1, has_nul1, has_nul2,  << 
159 #endif                                         << 
160         sub     len, src, srcin                << 
161         rev     has_nul1, has_nul1             << 
162         add     tmp2, len, 8                   << 
163         clz     tmp1, has_nul1                 << 
164         csel    len, len, tmp2, cc             << 
165         add     len, len, tmp1, lsr 3          << 
166         ret                                    << 
167                                                << 
168 L(nonascii_loop):                              << 
169         ldp     data1, data2, [src, 16]!       << 
170         sub     tmp1, data1, zeroones          << 
171         orr     tmp2, data1, REP8_7f           << 
172         sub     tmp3, data2, zeroones          << 
173         orr     tmp4, data2, REP8_7f           << 
174         bics    has_nul1, tmp1, tmp2           << 
175         bic     has_nul2, tmp3, tmp4           << 
176         ccmp    has_nul2, 0, 0, eq             << 
177         bne     L(tail)                        << 
178         ldp     data1, data2, [src, 16]!       << 
179         sub     tmp1, data1, zeroones          << 
180         orr     tmp2, data1, REP8_7f           << 
181         sub     tmp3, data2, zeroones          << 
182         orr     tmp4, data2, REP8_7f           << 
183         bics    has_nul1, tmp1, tmp2           << 
184         bic     has_nul2, tmp3, tmp4           << 
185         ccmp    has_nul2, 0, 0, eq             << 
186         beq     L(nonascii_loop)               << 
187         b       L(tail)                        << 
188                                                << 
189         /* Load 16 bytes from [srcin & ~15] an << 
190            srcin to 0x7f, so we ignore any NUL << 
191            Then continue in the aligned loop.  << 
192 L(page_cross):                                 << 
193         bic     src, srcin, 15                 << 
194         ldp     data1, data2, [src]            << 
195         lsl     tmp1, srcin, 3                 << 
196         mov     tmp4, -1                       << 
197 #ifdef __AARCH64EB__                           << 
198         /* Big-endian.  Early bytes are at MSB << 
199         lsr     tmp1, tmp4, tmp1        /* Shi << 
200 #else                                          << 
201         /* Little-endian.  Early bytes are at  << 
202         lsl     tmp1, tmp4, tmp1        /* Shi << 
203 #endif                                         << 
204         orr     tmp1, tmp1, REP8_80            << 
205         orn     data1, data1, tmp1             << 
206         orn     tmp2, data2, tmp1              << 
207         tst     srcin, 8                       << 
208         csel    data1, data1, tmp4, eq         << 
209         csel    data2, data2, tmp2, eq         << 
210         b       L(page_cross_entry)            << 
211 SYM_FUNC_END(__pi_strlen)                      << 
212 SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)       << 
213 EXPORT_SYMBOL_NOKASAN(strlen)                  << 
                                                      

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php