Linux/arch/powerpc/lib/checksum_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
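/*
 * For reference, a rough C sketch of the computation (illustrative only;
 * fold_64_to_32 is a hypothetical helper standing in for the code at
 * .Lcsum_finish, and the asm below works in larger chunks, keeping the
 * running carry in XER[CA] via adde):
 *
 *	u64 acc = sum;
 *	const u16 *p = buff;
 *	while (len >= 2) {
 *		acc += *p++;		// asm: adde, carry kept in XER[CA]
 *		len -= 2;
 *	}
 *	return fold_64_to_32(acc);	// see .Lcsum_finish
 */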
_GLOBAL(__csum_partial)
        addic   r0,r5,0                 /* clear carry */

        srdi.   r6,r4,3                 /* less than 8 bytes? */
        beq     .Lcsum_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd-
         * aligned addresses should be rare and would require more work
         * to calculate the correct checksum, we ignore that case and
         * take the potential slowdown of unaligned loads.
         */
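        /*
         * The rldicl. rotates the address right one bit and masks to two
         * bits, i.e. r6 = (buff >> 1) & 3: the halfword offset within a
         * doubleword. Zero means the buffer is already 8-byte aligned;
         * otherwise the 4 - r6 halfword loads below step up to the next
         * doubleword boundary (bit 0 is assumed clear, per the comment
         * above).
         */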
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcsum_aligned

        li      r7,4
        sub     r6,r7,r6
        mtctr   r6

1:
        lhz     r6,0(r3)                /* align to doubleword */
        subi    r4,r4,2
        addi    r3,r3,2
        adde    r0,r0,r6
        bdnz    1b

.Lcsum_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
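        /*
         * len >= 128 at this point, so len/64 >= 2. The ctr below is set
         * to len/64 - 1 iterations because the final 64-byte limb is
         * handled by the straight-line code after the bdnz (the "exit
         * limb"), with its first loads already issued by the last loop
         * iteration.
         */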
        srdi.   r6,r4,7
        beq     .Lcsum_tail_doublewords         /* len < 128 */

        srdi    r6,r4,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        ld      r6,0(r3)
        ld      r9,8(r3)

        ld      r10,16(r3)
        ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back-to-back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
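        /* Eight adde per 64-byte iteration x 2 cycles each = that bound. */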
        .align 5
2:
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10

        adde    r0,r0,r11

        adde    r0,r0,r12

        adde    r0,r0,r14

        adde    r0,r0,r15
        ld      r6,0(r3)
        ld      r9,8(r3)

        adde    r0,r0,r16
        ld      r10,16(r3)
        ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        adde    r0,r0,r11
        adde    r0,r0,r12
        adde    r0,r0,r14
        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r4,r4,63

.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r4,3
        beq     .Lcsum_tail_word

        mtctr   r6
3:
        ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        bdnz    3b

        andi.   r4,r4,7

.Lcsum_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r4,2
        beq     .Lcsum_tail_halfword

        lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        subi    r4,r4,4

.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r4,1
        beq     .Lcsum_tail_byte

        lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        subi    r4,r4,2

.Lcsum_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r4,1
        beq     .Lcsum_finish

        lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif

.Lcsum_finish:
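        /*
         * Fold the 64-bit accumulator to 32 bits: the rotate by 32 swaps
         * the halves, so the add computes hi + lo in both words, and the
         * carry out of the low word becomes the 1's complement end-around
         * carry in the high word, which srdi then extracts.
         */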
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr
EXPORT_SYMBOL(__csum_partial)


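/*
 * Exception-table annotations for the user-copy variant below. A load or
 * store tagged with one of these macros gets an EX_TABLE entry mapping its
 * address to a fixup label: "source"/"dest" faults land on .Lerror, which
 * must first pop the stack frame used by the unrolled loop, while
 * "srcnr"/"dstnr" ("no restore") are used outside that frame and branch
 * straight to .Lerror_nr.
 */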
        .macro srcnr
100:
        EX_TABLE(100b,.Lerror_nr)
        .endm

        .macro source
150:
        EX_TABLE(150b,.Lerror)
        .endm

        .macro dstnr
200:
        EX_TABLE(200b,.Lerror_nr)
        .endm

        .macro dest
250:
        EX_TABLE(250b,.Lerror)
        .endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
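/*
 * Roughly, in C (an illustrative sketch; fold_64_to_32 is a hypothetical
 * helper, and the exception-table fixups above are what the real routine
 * adds on top of this):
 *
 *	u64 acc = 0xffffffff;
 *	const u16 *s = src;
 *	u16 *d = dst;
 *	while (len >= 2) {
 *		u16 w = *s++;		// may fault -> return 0
 *		acc += w;		// asm: adde, carry kept in XER[CA]
 *		*d++ = w;		// may fault -> return 0
 *		len -= 2;
 *	}
 *	return fold_64_to_32(acc);	// see .Lcopy_finish
 */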
_GLOBAL(csum_partial_copy_generic)
        li      r6,-1
        addic   r0,r6,0                 /* clear carry */

        srdi.   r6,r5,3                 /* less than 8 bytes? */
        beq     .Lcopy_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd-
         * aligned addresses should be rare and would require more work
         * to calculate the correct checksum, we ignore that case and
         * take the potential slowdown of unaligned loads.
         *
         * If the source and destination are misaligned relative to each
         * other, we only align the source. This keeps things simple.
         */
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcopy_aligned

        li      r9,4
        sub     r6,r9,r6
        mtctr   r6

1:
srcnr;  lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        bdnz    1b

.Lcopy_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r5,7
        beq     .Lcopy_tail_doublewords         /* len < 128 */

        srdi    r6,r5,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

source; ld      r6,0(r3)
source; ld      r9,8(r3)

source; ld      r10,16(r3)
source; ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back-to-back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
        .align 5
2:
        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
source; ld      r6,0(r3)
source; ld      r9,8(r3)

        adde    r0,r0,r16
source; ld      r10,16(r3)
source; ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r5,r5,63

.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r5,3
        beq     .Lcopy_tail_word

        mtctr   r6
3:
srcnr;  ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
dstnr;  std     r6,0(r4)
        addi    r4,r4,8
        bdnz    3b

        andi.   r5,r5,7

.Lcopy_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r5,2
        beq     .Lcopy_tail_halfword

srcnr;  lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
dstnr;  stw     r6,0(r4)
        addi    r4,r4,4
        subi    r5,r5,4

.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r5,1
        beq     .Lcopy_tail_byte

srcnr;  lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        subi    r5,r5,2

.Lcopy_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r5,1
        beq     .Lcopy_finish

srcnr;  lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif
dstnr;  stb     r6,0(r4)

.Lcopy_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr

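/*
 * Fixup targets for the exception table entries above: faults from
 * "source"/"dest" accesses inside the unrolled loop arrive at .Lerror
 * with the stack frame still allocated, so the non-volatiles are
 * restored before falling through to the common return-0 path.
 */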
.Lerror:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Lerror_nr:
        li      r3,0
        blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *                         const struct in6_addr *daddr,
 *                         __u32 len, __u8 proto, __wsum sum)
 */

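/*
 * 1's complement sum over the IPv6 pseudo-header: both 16-byte addresses
 * as 64-bit chunks, plus len + proto and the incoming sum. On little-
 * endian, the rotldi below swaps even and odd byte lanes of len + proto
 * so it lines up with the byte-reversed address loads; for a 1's
 * complement sum only that byte parity matters, not a full byte swap.
 */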
_GLOBAL(csum_ipv6_magic)
        ld      r8, 0(r3)
        ld      r9, 8(r3)
        add     r5, r5, r6
        addc    r0, r8, r9
        ld      r10, 0(r4)
        ld      r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
        rotldi  r5, r5, 8
#endif
        adde    r0, r0, r10
        add     r5, r5, r7
        adde    r0, r0, r11
        adde    r0, r0, r5
        addze   r0, r0
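        /*
         * Fold 64 -> 32 bits as in __csum_partial above, then 32 -> 16 the
         * same way with a halfword rotate; the final rlwinm extracts the
         * folded 16 bits after the complement required for __sum16.
         */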
        rotldi  r3, r0, 32              /* fold two 32 bit halves together */
        add     r3, r0, r3
        srdi    r0, r3, 32
        rotlwi  r3, r0, 16              /* fold two 16 bit halves together */
        add     r3, r0, r3
        not     r3, r3
        rlwinm  r3, r3, 16, 16, 31
        blr
EXPORT_SYMBOL(csum_ipv6_magic)
