~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/sparc/lib/M7memset.S

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * M7memset.S: SPARC M7 optimized memset.
  3  *
  4  * Copyright (c) 2016, Oracle and/or its affiliates.  All rights reserved.
  5  */
  6 
  7 /*
  8  * M7memset.S: M7 optimized memset.
  9  *
 10  * char *memset(sp, c, n)
 11  *
 12  * Set an array of n chars starting at sp to the character c.
 13  * Return sp.
 14  *
 15  * Fast assembler language version of the following C-program for memset
 16  * which represents the `standard' for the C-library.
 17  *
 18  *      void *
 19  *      memset(void *sp1, int c, size_t n)
 20  *      {
 21  *          if (n != 0) {
 22  *              char *sp = sp1;
 23  *              do {
 24  *                  *sp++ = (char)c;
 25  *              } while (--n != 0);
 26  *          }
 27  *          return (sp1);
 28  *      }
 29  *
 30  * The algorithm is as follows :
 31  *
 32  *      For small stores of 7 or fewer bytes, individual bytes will be stored.
 33  *
 34  *      For less than 32 bytes stores, align the address on 4 byte boundary.
 35  *      Then store as many 4-byte chunks, followed by trailing bytes.
 36  *
 37  *      For sizes of 32 bytes or more, align the address on 8 byte boundary.
 38  *      if (count >= 64) {
 39  *              store 8-bytes chunks to align the address on 64 byte boundary
 40  *              if (value to be set is zero && count >= MIN_ZERO) {
 41  *                      Using BIS stores, set the first long word of each
 42  *                      64-byte cache line to zero which will also clear the
 43  *                      other seven long words of the cache line.
 44  *              }
 45  *              else if (count >= MIN_LOOP) {
 46  *                      Using BIS stores, set the first long word of each of
 47  *                      ST_CHUNK cache lines (64 bytes each) before the main
 48  *                      loop is entered.
 49  *                      In the main loop, continue pre-setting the first long
 50  *                      word of each cache line ST_CHUNK lines in advance while
 51  *                      setting the other seven long words (56 bytes) of each
 52  *                      cache line until fewer than ST_CHUNK*64 bytes remain.
 53  *                      Then set the remaining seven long words of each cache
 54  *                      line that has already had its first long word set.
 55  *              }
 56  *              store remaining data in 64-byte chunks until less than
 57  *              64 bytes remain.
 58  *       }
 59  *       Store as many 8-byte chunks, followed by trailing bytes.
 60  *
 61  * BIS = Block Init Store
 62  *   Doing the advance store of the first element of the cache line
 63  *   initiates the displacement of a cache line while only using a single
 64  *   instruction in the pipeline. That avoids various pipeline delays,
 65  *   such as filling the miss buffer. The performance effect is
 66  *   similar to prefetching for normal stores.
 67  *   The special case for zero fills runs faster and uses fewer instruction
 68  *   cycles than the normal memset loop.
 69  *
 70  * We only use BIS for memsets of at least MIN_LOOP bytes (MIN_ZERO for zero
 71  * fills) because a sequence of BIS stores must be followed by a membar
 72  * #StoreStore. The BIS benefit must be balanced against the membar cost.
 73  */
 74 
 75 /*
 76  * ASI_STBI_P marks the cache line as "least recently used"
 77  * which means if many threads are active, it has a high chance
 78  * of being pushed out of the cache between the first initializing
 79  * store and the final stores.
 80  * Thus, we use ASI_STBIMRU_P which marks the cache line as
 81  * "most recently used" for all but the last store to the cache line.
 82  */
 83 
 84 #include <asm/asi.h>
 85 #include <asm/page.h>
 86 
 87 #define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
 88 #define ASI_STBIMRU_P   ASI_ST_BLKINIT_MRU_P
 89 
 90 
 91 #define ST_CHUNK        24   /* multiple of 4 due to loop unrolling */
 92 #define MIN_LOOP        16320
 93 #define MIN_ZERO        512
 94 
 95         .section        ".text"
 96         .align          32
 97 
 98 /*
 99  * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
100  * (can create a more optimized version later.)
101  */
102         .globl          M7clear_page
103         .globl          M7clear_user_page
104 M7clear_page:           /* clear_page(dest): %o0 = dest */
105 M7clear_user_page:      /* same entry point as M7clear_page; only %o0 is read */
106         set     PAGE_SIZE, %o1          /* %o1 = byte count, the register M7bzero reads as its size */
107         /* fall through into bzero code (no branch or return is issued here) */
108 
109         .size           M7clear_page,.-M7clear_page
110         .size           M7clear_user_page,.-M7clear_user_page
111 
112 /*
113  * Define bzero(dest, n) as memset(dest, 0, n)
114  * (can create a more optimized version later.)
115  */
116         .globl          M7bzero
117 M7bzero:                /* bzero(dest, size): %o0 = dest, %o1 = size */
118         mov     %o1, %o2                /* %o2 = size, the count register M7memset reads */
119         mov     0, %o1                  /* %o1 = 0, the fill value M7memset reads */
120         /* fall through into memset code (no branch or return is issued here) */
121 
122         .size           M7bzero,.-M7bzero
123 
124         .global         M7memset
125         .type           M7memset, #function
126         .register       %g3, #scratch   /* %g3 is used below as a scratch register */
127 M7memset:                               /* memset(sp1, c, n): %o0 = sp1, %o1 = c, %o2 = n; %o0 is returned unchanged */
128         mov     %o0, %o5                ! copy sp1 before using it
129         cmp     %o2, 7                  ! if small counts, just write bytes
130         bleu,pn %xcc, .wrchar           /* n <= 7 (unsigned): byte loop only */
131          and     %o1, 0xff, %o1          ! o1 is (char)c
132 
133         sll     %o1, 8, %o3             /* %o3 = c shifted up one byte */
134         or      %o1, %o3, %o1           ! now o1 has 2 bytes of c
135         sll     %o1, 16, %o3            /* %o3 = 2-byte pattern shifted up two bytes */
136         cmp     %o2, 32                 /* fewer than 32 bytes (unsigned)? */
137         blu,pn  %xcc, .wdalign          /* yes: take the 4-byte word path */
138          or      %o1, %o3, %o1           ! now o1 has 4 bytes of c
139 
140         sllx    %o1, 32, %o3            /* %o3 = 4-byte pattern shifted up four bytes */
141         or      %o1, %o3, %o1           ! now o1 has 8 bytes of c
142 
143 .dbalign:                               /* n >= 32: byte stores until %o5 is 8-byte aligned */
144         andcc   %o5, 7, %o3             ! is sp1 aligned on a 8 byte bound?
145         bz,pt   %xcc, .blkalign         ! already long word aligned
146          sub     %o3, 8, %o3             ! -(bytes till long word aligned)
147 
148         add     %o2, %o3, %o2           ! update o2 with new count
149         ! Set -(%o3) bytes till sp1 long word aligned
150 1:      stb     %o1, [%o5]              ! there is at least 1 byte to set
151         inccc   %o3                     ! byte clearing loop
152         bl,pt   %xcc, 1b                /* loop while %o3 is still negative */
153          inc     %o5                    /* delay slot: advance the write pointer */
154 
155         ! Now sp1 is long word aligned (sp1 is found in %o5)
156 .blkalign:
157         cmp     %o2, 64                 ! check if there are 64 bytes to set
158         blu,pn  %xcc, .wrshort          /* fewer than 64: skip all block stores */
159          mov     %o2, %o3               /* delay slot: %o3 = count, the register .wrshort counts down */
160 
161         andcc   %o5, 63, %o3            ! is sp1 block aligned?
162         bz,pt   %xcc, .blkwr            ! now block aligned
163          sub     %o3, 64, %o3            ! o3 is -(bytes till block aligned)
164         add     %o2, %o3, %o2           ! o2 is the remainder
165 
166         ! Store -(%o3) bytes till dst is block (64 byte) aligned.
167         ! Use long word stores.
168         ! Recall that dst is already long word aligned
169 1:
170         addcc   %o3, 8, %o3             /* step toward zero; sets the flags tested by bl */
171         stx     %o1, [%o5]              /* store one 8-byte pattern */
172         bl,pt   %xcc, 1b                /* loop while %o3 is still negative */
173          add     %o5, 8, %o5            /* delay slot: advance the write pointer */
174 
175         ! Now sp1 is block aligned
176 .blkwr:
177         andn    %o2, 63, %o4            ! calculate size of blocks in bytes
178         brz,pn  %o1, .wrzero            ! special case if c == 0
179          and     %o2, 63, %o3            ! %o3 = bytes left after blk stores.
180 
181         set     MIN_LOOP, %g1           /* threshold for the compare below */
182         cmp     %o4, %g1                ! check there are enough bytes to set
183         blu,pn  %xcc, .short_set        ! to justify cost of membar
184                                         ! must be > pre-cleared lines
185          nop                            /* delay slot */
186 
187         ! initial cache-clearing stores
188         ! get store pipeline moving
189         rd      %asi, %g3               ! save %asi to be restored later
190         wr     %g0, ASI_STBIMRU_P, %asi /* stxa ... %asi below now uses ASI_STBIMRU_P */
191 
192         ! Primary memset loop for large memsets
193 .wr_loop:
194         sub     %o5, 8, %o5             ! adjust %o5 for ASI store alignment
195         mov     ST_CHUNK, %g1           /* %g1 = number of lines to pre-touch */
196 .wr_loop_start:                         /* store the first long word of each of ST_CHUNK lines */
197         stxa    %o1, [%o5+8]%asi        /* first long word of line 0 (%o5 is biased by -8) */
198         subcc   %g1, 4, %g1             /* four lines per iteration; flags tested by bgu */
199         stxa    %o1, [%o5+8+64]%asi     /* first long word of line 1 */
200         add     %o5, 256, %o5           /* advance four 64-byte lines */
201         stxa    %o1, [%o5+8-128]%asi    /* first long word of line 2 */
202         bgu     %xcc, .wr_loop_start    /* loop while %g1 > 0 */
203          stxa    %o1, [%o5+8-64]%asi    /* delay slot: first long word of line 3 */
204 
205         sub     %o5, ST_CHUNK*64, %o5   ! reset %o5
206         mov     ST_CHUNK, %g1           /* %g1 = number of lines to complete */
207 
208 .wr_loop_rest:                          /* store long words 1..7 of each pre-touched line */
209         stxa    %o1, [%o5+8+8]%asi      /* long word 1 */
210         sub     %o4, 64, %o4            /* one line fewer left in the block byte count */
211         stxa    %o1, [%o5+16+8]%asi     /* long word 2 */
212         subcc   %g1, 1, %g1             /* count down lines; flags tested by bgu */
213         stxa    %o1, [%o5+24+8]%asi     /* long word 3 */
214         stxa    %o1, [%o5+32+8]%asi     /* long word 4 */
215         stxa    %o1, [%o5+40+8]%asi     /* long word 5 */
216         add     %o5, 64, %o5            /* advance to the next line */
217         stxa    %o1, [%o5-8]%asi        /* long word 6 of the line just left */
218         bgu     %xcc, .wr_loop_rest     /* loop while %g1 > 0 */
219          stxa    %o1, [%o5]ASI_STBI_P   /* delay slot: long word 7, the last store to the line, uses ASI_STBI_P */
220 
221         ! If more than ST_CHUNK*64 bytes remain to set, continue
222         ! setting the first long word of each cache line in advance
223         ! to keep the store pipeline moving.
224 
225         cmp     %o4, ST_CHUNK*64        /* at least ST_CHUNK lines left? */
226         bge,pt  %xcc, .wr_loop_start    /* yes: pre-touch another chunk */
227          mov     ST_CHUNK, %g1          /* delay slot: reload the chunk counter */
228 
229         brz,a,pn %o4, .asi_done         /* no block bytes left: finish (annulled delay slot runs only when taken) */
230          add     %o5, 8, %o5             ! restore %o5 offset
231 
232 .wr_loop_small:                         /* fewer than ST_CHUNK lines left: write all eight long words of each line */
233         stxa    %o1, [%o5+8]%asi        /* long word 0 */
234         stxa    %o1, [%o5+8+8]%asi      /* long word 1 */
235         stxa    %o1, [%o5+16+8]%asi     /* long word 2 */
236         stxa    %o1, [%o5+24+8]%asi     /* long word 3 */
237         stxa    %o1, [%o5+32+8]%asi     /* long word 4 */
238         subcc   %o4, 64, %o4            /* count down block bytes; flags tested by bgu */
239         stxa    %o1, [%o5+40+8]%asi     /* long word 5 */
240         add     %o5, 64, %o5            /* advance to the next line */
241         stxa    %o1, [%o5-8]%asi        /* long word 6 of the line just left */
242         bgu,pt  %xcc, .wr_loop_small    /* loop while block bytes remain */
243          stxa    %o1, [%o5]ASI_STBI_P   /* delay slot: long word 7, stored with ASI_STBI_P */
244 
245         ba      .asi_done               /* block stores finished */
246          add     %o5, 8, %o5             ! restore %o5 offset
247 
248         ! Special case loop for zero fill memsets
249         ! For each 64 byte cache line, single STBI to first element
250         ! clears line
251 .wrzero:
252         cmp     %o4, MIN_ZERO           ! check if enough bytes to set
253                                         ! to pay %asi + membar cost
254         blu     %xcc, .short_set        /* too few: use the plain stx loop instead */
255          nop                            /* delay slot */
256         sub     %o4, 256, %o4           /* pre-bias the count so the loop can test it with bge */
257 
258 .wrzero_loop:                           /* four lines (256 bytes) per iteration */
259         mov     64, %g3                 /* %g3 = offset of line 1 from %o5 */
260         stxa    %o1, [%o5]ASI_STBI_P    /* line 0 */
261         subcc   %o4, 256, %o4           /* count down; flags tested by bge */
262         stxa    %o1, [%o5+%g3]ASI_STBI_P /* line 1 */
263         add     %o5, 256, %o5           /* advance four 64-byte lines */
264         sub     %g3, 192, %g3           /* %g3 = -128: line 2 relative to the advanced %o5 */
265         stxa    %o1, [%o5+%g3]ASI_STBI_P /* line 2 */
266         add %g3, 64, %g3                /* %g3 = -64: line 3 relative to the advanced %o5 */
267         bge,pt  %xcc, .wrzero_loop      /* loop while another 256 bytes remain after this pass */
268          stxa    %o1, [%o5+%g3]ASI_STBI_P /* delay slot: line 3 */
269         add     %o4, 256, %o4           /* undo the bias: %o4 = block bytes still to set */
270 
271         brz,pn  %o4, .bsi_done          /* none left: go straight to the membar */
272          nop                            /* delay slot */
273 
274 .wrzero_small:                          /* remaining lines, one store per 64-byte line */
275         stxa    %o1, [%o5]ASI_STBI_P    /* store to the first long word of the line */
276         subcc   %o4, 64, %o4            /* count down block bytes; flags tested by bgu */
277         bgu,pt  %xcc, .wrzero_small     /* loop while block bytes remain */
278          add     %o5, 64, %o5           /* delay slot: advance to the next line */
279         ba,a    .bsi_done               /* skips .asi_done: this path never wrote %asi and reused %g3 as scratch */
280 
281 .asi_done:
282         wr      %g3, 0x0, %asi          ! restored saved %asi
283 .bsi_done:
284         membar  #StoreStore             ! required by use of Block Store Init
285 
286 .short_set:                             /* plain 64-byte blocks; %o4 is 0 here when a BIS loop already ran */
287         cmp     %o4, 64                 ! check if 64 bytes to set
288         blu     %xcc, 5f                /* no whole block left */
289          nop                            /* delay slot */
290 4:                                      ! set final blocks of 64 bytes
291         stx     %o1, [%o5]              /* long word 0 */
292         stx     %o1, [%o5+8]            /* long word 1 */
293         stx     %o1, [%o5+16]           /* long word 2 */
294         stx     %o1, [%o5+24]           /* long word 3 */
295         subcc   %o4, 64, %o4            /* count down block bytes; flags tested by bgu */
296         stx     %o1, [%o5+32]           /* long word 4 */
297         stx     %o1, [%o5+40]           /* long word 5 */
298         add     %o5, 64, %o5            /* advance to the next block */
299         stx     %o1, [%o5-16]           /* long word 6 of the block just left */
300         bgu,pt  %xcc, 4b                /* loop while block bytes remain */
301          stx     %o1, [%o5-8]           /* delay slot: long word 7 of the block just left */
302 
303 5:
304         ! Set the remaining long words
305 .wrshort:                               /* %o3 = bytes left (under 64), %o2 = count whose low 3 bits are the trailing bytes */
306         subcc   %o3, 8, %o3             ! Can we store any long words?
307         blu,pn  %xcc, .wrchars          /* fewer than 8 left: only trailing bytes */
308          and     %o2, 7, %o2             ! calc bytes left after long words
309 6:
310         subcc   %o3, 8, %o3             /* is there room for another long word after this one? */
311         stx     %o1, [%o5]              ! store the long words
312         bgeu,pt %xcc, 6b                /* loop while the subtraction did not borrow */
313          add     %o5, 8, %o5            /* delay slot: advance the write pointer */
314 
315 .wrchars:                               ! check for extra chars
316         brnz    %o2, .wrfin             /* trailing bytes remain: use the byte loop */
317          nop                            /* delay slot */
318         retl                            /* done; %o0 still holds sp1 */
319          nop                            /* delay slot */
320 
321 .wdalign:                               /* 8 <= n < 32: %o1 holds the 4-byte pattern */
322         andcc   %o5, 3, %o3             ! is sp1 aligned on a word boundary
323         bz,pn   %xcc, .wrword           /* aligned: start the word stores */
324          andn    %o2, 3, %o3             ! create word sized count in %o3
325 
326         dec     %o2                     ! decrement count
327         stb     %o1, [%o5]              ! clear a byte
328         b       .wdalign                /* re-test the alignment */
329          inc     %o5                     ! next byte
330 
331 .wrword:
332         subcc   %o3, 4, %o3             /* count down word bytes; flags tested by bnz */
333         st      %o1, [%o5]              ! 4-byte writing loop
334         bnz,pt  %xcc, .wrword           /* loop until %o3 reaches zero */
335          add     %o5, 4, %o5            /* delay slot: advance the write pointer */
336 
337         and     %o2, 3, %o2             ! leftover count, if any
338 
339 .wrchar:
340         ! Set the remaining bytes, if any
341         brz     %o2, .exit              /* nothing left to store */
342          nop                            /* delay slot */
343 .wrfin:
344         deccc   %o2                     /* count down bytes; flags tested by bgu */
345         stb     %o1, [%o5]              /* store one byte */
346         bgu,pt  %xcc, .wrfin            /* loop while bytes remain */
347          inc     %o5                    /* delay slot: advance the write pointer */
348 .exit:
349         retl                            ! %o0 was preserved
350          nop                            /* delay slot */
351 
352         .size           M7memset,.-M7memset

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php