Skip to content

Commit

Permalink
runtime: optimize the function memmove on loong64
Browse files Browse the repository at this point in the history
benchmarck on 3A6000:
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                                 │      old      │                 new                  │
                                 │    sec/op     │    sec/op     vs base                │
Memmove/0                           0.6003n ± 0%   0.6003n ± 0%        ~ (p=0.487 n=20)
Memmove/1                            4.402n ± 0%    2.815n ± 0%  -36.05% (p=0.000 n=20)
Memmove/2                            5.202n ± 0%    3.202n ± 0%  -38.45% (p=0.000 n=20)
Memmove/3                            6.003n ± 0%    2.820n ± 0%  -53.02% (p=0.000 n=20)
Memmove/4                            6.803n ± 0%    3.202n ± 0%  -52.93% (p=0.000 n=20)
Memmove/5                            7.604n ± 0%    3.202n ± 0%  -57.89% (p=0.000 n=20)
Memmove/6                            8.404n ± 0%    3.202n ± 0%  -61.90% (p=0.000 n=20)
Memmove/7                            9.204n ± 0%    3.202n ± 0%  -65.21% (p=0.000 n=20)
Memmove/8                            4.802n ± 0%    3.602n ± 0%  -24.99% (p=0.000 n=20)
Memmove/9                            6.003n ± 0%    3.202n ± 0%  -46.66% (p=0.000 n=20)
Memmove/10                           6.803n ± 0%    3.202n ± 0%  -52.93% (p=0.000 n=20)
Memmove/11                           7.604n ± 0%    3.202n ± 0%  -57.89% (p=0.000 n=20)
Memmove/12                           8.404n ± 0%    3.202n ± 0%  -61.90% (p=0.000 n=20)
Memmove/13                           9.204n ± 0%    3.202n ± 0%  -65.21% (p=0.000 n=20)
Memmove/14                          10.000n ± 0%    3.202n ± 0%  -67.98% (p=0.000 n=20)
Memmove/15                          10.810n ± 0%    3.202n ± 0%  -70.38% (p=0.000 n=20)
Memmove/16                           6.003n ± 0%    3.202n ± 0%  -46.66% (p=0.000 n=20)
Memmove/32                           7.604n ± 0%    3.602n ± 0%  -52.63% (p=0.000 n=20)
Memmove/64                          10.810n ± 0%    4.402n ± 0%  -59.28% (p=0.000 n=20)
Memmove/128                         17.210n ± 0%    8.004n ± 0%  -53.49% (p=0.000 n=20)
Memmove/256                          30.41n ± 0%    10.81n ± 0%  -64.45% (p=0.000 n=20)
Memmove/512                          56.03n ± 0%    17.81n ± 0%  -68.21% (p=0.000 n=20)
Memmove/1024                        107.30n ± 0%    30.62n ± 0%  -71.46% (p=0.000 n=20)
Memmove/2048                        209.70n ± 0%    56.23n ± 0%  -73.19% (p=0.000 n=20)
Memmove/4096                         414.6n ± 0%    107.5n ± 0%  -74.07% (p=0.000 n=20)
MemmoveOverlap/32                    8.404n ± 0%    4.402n ± 0%  -47.62% (p=0.000 n=20)
MemmoveOverlap/64                   11.610n ± 0%    5.003n ± 0%  -56.91% (p=0.000 n=20)
MemmoveOverlap/128                  18.010n ± 0%    9.005n ± 0%  -50.00% (p=0.000 n=20)
MemmoveOverlap/256                   31.22n ± 0%    12.41n ± 0%  -60.25% (p=0.000 n=20)
MemmoveOverlap/512                   56.83n ± 0%    19.08n ± 0%  -66.43% (p=0.000 n=20)
MemmoveOverlap/1024                 108.10n ± 0%    32.00n ± 0%  -70.40% (p=0.000 n=20)
MemmoveOverlap/2048                 210.50n ± 0%    57.94n ± 0%  -72.48% (p=0.000 n=20)
MemmoveOverlap/4096                  415.4n ± 0%    108.9n ± 0%  -73.78% (p=0.000 n=20)
MemmoveUnalignedDst/0                2.448n ± 0%    2.942n ± 0%  +20.16% (p=0.000 n=20)
MemmoveUnalignedDst/1                4.802n ± 0%    3.202n ± 0%  -33.32% (p=0.000 n=20)
MemmoveUnalignedDst/2                5.603n ± 0%    3.602n ± 0%  -35.71% (p=0.000 n=20)
MemmoveUnalignedDst/3                6.403n ± 0%    3.202n ± 0%  -49.99% (p=0.000 n=20)
MemmoveUnalignedDst/4                7.203n ± 0%    3.602n ± 0%  -49.99% (p=0.000 n=20)
MemmoveUnalignedDst/5                8.004n ± 0%    3.602n ± 0%  -55.00% (p=0.000 n=20)
MemmoveUnalignedDst/6                8.804n ± 0%    3.602n ± 0%  -59.09% (p=0.000 n=20)
MemmoveUnalignedDst/7                9.605n ± 0%    3.602n ± 0%  -62.50% (p=0.000 n=20)
MemmoveUnalignedDst/8               10.400n ± 0%    4.002n ± 0%  -61.52% (p=0.000 n=20)
MemmoveUnalignedDst/9               11.210n ± 0%    3.802n ± 0%  -66.08% (p=0.000 n=20)
MemmoveUnalignedDst/10              12.010n ± 0%    3.802n ± 0%  -68.34% (p=0.000 n=20)
MemmoveUnalignedDst/11              12.810n ± 0%    3.802n ± 0%  -70.32% (p=0.000 n=20)
MemmoveUnalignedDst/12              13.610n ± 0%    3.802n ± 0%  -72.06% (p=0.000 n=20)
MemmoveUnalignedDst/13              14.410n ± 0%    3.802n ± 0%  -73.62% (p=0.000 n=20)
MemmoveUnalignedDst/14              15.210n ± 0%    3.802n ± 0%  -75.00% (p=0.000 n=20)
MemmoveUnalignedDst/15              16.010n ± 0%    3.802n ± 0%  -76.25% (p=0.000 n=20)
MemmoveUnalignedDst/16              17.210n ± 0%    3.802n ± 0%  -77.91% (p=0.000 n=20)
MemmoveUnalignedDst/32              30.020n ± 0%    4.202n ± 0%  -86.00% (p=0.000 n=20)
MemmoveUnalignedDst/64              56.030n ± 0%    6.804n ± 0%  -87.86% (p=0.000 n=20)
MemmoveUnalignedDst/128             106.90n ± 0%    13.61n ± 0%  -87.27% (p=0.000 n=20)
MemmoveUnalignedDst/256             209.50n ± 0%    17.07n ± 1%  -91.85% (p=0.000 n=20)
MemmoveUnalignedDst/512             414.60n ± 0%    24.95n ± 0%  -93.98% (p=0.000 n=20)
MemmoveUnalignedDst/1024            828.40n ± 0%    42.82n ± 0%  -94.83% (p=0.000 n=20)
MemmoveUnalignedDst/2048           1648.00n ± 0%    78.04n ± 0%  -95.26% (p=0.000 n=20)
MemmoveUnalignedDst/4096            3287.0n ± 0%    148.4n ± 0%  -95.49% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/32       30.810n ± 0%    5.603n ± 0%  -81.81% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/64       56.430n ± 0%    7.604n ± 0%  -86.52% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/128     107.700n ± 0%    9.812n ± 0%  -90.89% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/256      210.10n ± 0%    13.50n ± 0%  -93.57% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/512      415.00n ± 0%    21.21n ± 0%  -94.89% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/1024     828.80n ± 0%    41.02n ± 0%  -95.05% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/2048    1648.00n ± 0%    80.23n ± 0%  -95.13% (p=0.000 n=20)
MemmoveUnalignedDstOverlap/4096     3288.0n ± 0%    162.4n ± 0%  -95.06% (p=0.000 n=20)
MemmoveUnalignedSrc/0                2.468n ± 1%    2.913n ± 0%  +18.01% (p=0.000 n=20)
MemmoveUnalignedSrc/1                4.802n ± 0%    3.202n ± 0%  -33.32% (p=0.000 n=20)
MemmoveUnalignedSrc/2                5.603n ± 0%    3.603n ± 0%  -35.70% (p=0.000 n=20)
MemmoveUnalignedSrc/3                6.403n ± 0%    3.207n ± 0%  -49.91% (p=0.000 n=20)
MemmoveUnalignedSrc/4                7.203n ± 0%    3.603n ± 0%  -49.98% (p=0.000 n=20)
MemmoveUnalignedSrc/5                8.004n ± 0%    3.602n ± 0%  -55.00% (p=0.000 n=20)
MemmoveUnalignedSrc/6                8.804n ± 0%    3.602n ± 0%  -59.09% (p=0.000 n=20)
MemmoveUnalignedSrc/7                9.605n ± 0%    3.602n ± 0%  -62.50% (p=0.000 n=20)
MemmoveUnalignedSrc/8               10.410n ± 0%    4.002n ± 0%  -61.56% (p=0.000 n=20)
MemmoveUnalignedSrc/9               11.210n ± 0%    3.802n ± 0%  -66.08% (p=0.000 n=20)
MemmoveUnalignedSrc/10              12.010n ± 0%    3.802n ± 0%  -68.34% (p=0.000 n=20)
MemmoveUnalignedSrc/11              12.810n ± 0%    3.802n ± 0%  -70.32% (p=0.000 n=20)
MemmoveUnalignedSrc/12              13.610n ± 0%    3.802n ± 0%  -72.06% (p=0.000 n=20)
MemmoveUnalignedSrc/13              14.410n ± 0%    3.802n ± 0%  -73.62% (p=0.000 n=20)
MemmoveUnalignedSrc/14              15.210n ± 0%    3.802n ± 0%  -75.00% (p=0.000 n=20)
MemmoveUnalignedSrc/15              16.010n ± 0%    3.802n ± 0%  -76.25% (p=0.000 n=20)
MemmoveUnalignedSrc/16              16.810n ± 0%    3.802n ± 0%  -77.38% (p=0.000 n=20)
MemmoveUnalignedSrc/32              30.410n ± 0%    4.301n ± 0%  -85.86% (p=0.000 n=20)
MemmoveUnalignedSrc/64              55.630n ± 0%    5.203n ± 0%  -90.65% (p=0.000 n=20)
MemmoveUnalignedSrc/128            107.300n ± 0%    8.805n ± 0%  -91.79% (p=0.000 n=20)
MemmoveUnalignedSrc/256             209.50n ± 0%    12.41n ± 6%  -94.08% (p=0.000 n=20)
MemmoveUnalignedSrc/512             414.20n ± 0%    20.41n ± 0%  -95.07% (p=0.000 n=20)
MemmoveUnalignedSrc/1024            828.00n ± 0%    36.92n ± 0%  -95.54% (p=0.000 n=20)
MemmoveUnalignedSrc/2048           1648.00n ± 0%    71.41n ± 0%  -95.67% (p=0.000 n=20)
MemmoveUnalignedSrc/4096            3287.0n ± 0%    132.2n ± 0%  -95.98% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_0        7.203n ± 0%    5.002n ± 0%  -30.56% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_0        7.604n ± 0%    5.002n ± 0%  -34.22% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_1       13.210n ± 0%    5.002n ± 0%  -62.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_1       13.210n ± 0%    5.002n ± 0%  -62.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_4       13.210n ± 0%    5.002n ± 0%  -62.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_4       13.610n ± 0%    5.002n ± 0%  -63.24% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_16_7       12.810n ± 0%    5.002n ± 0%  -60.95% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_16_7       13.610n ± 0%    5.002n ± 0%  -63.24% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_0       12.010n ± 0%    7.191n ± 0%  -40.12% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_0       12.410n ± 0%    7.194n ± 0%  -42.03% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_1       18.410n ± 0%    7.604n ± 0%  -58.70% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_1       18.410n ± 0%    7.604n ± 0%  -58.70% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_4       18.410n ± 0%    7.604n ± 0%  -58.70% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_4       18.810n ± 0%    7.604n ± 0%  -59.57% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_64_7       18.010n ± 0%    7.604n ± 0%  -57.78% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_64_7       18.810n ± 0%    7.604n ± 0%  -59.57% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_0       31.62n ± 0%    14.19n ± 0%  -55.12% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_0       32.02n ± 0%    13.61n ± 0%  -57.50% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_1       38.02n ± 0%    18.20n ± 0%  -52.13% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_1       38.02n ± 0%    18.41n ± 0%  -51.58% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_4       38.02n ± 0%    17.21n ± 0%  -54.73% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_4       38.42n ± 0%    16.81n ± 0%  -56.25% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_256_7       37.62n ± 0%    15.61n ± 0%  -58.51% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_256_7       38.42n ± 0%    15.01n ± 0%  -60.93% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_0      415.8n ± 0%    111.1n ± 0%  -73.28% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_0      416.2n ± 0%    110.5n ± 0%  -73.45% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_1      422.2n ± 0%    114.3n ± 0%  -72.93% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_1      422.2n ± 0%    114.7n ± 0%  -72.83% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_4      422.2n ± 0%    113.3n ± 0%  -73.16% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_4      422.6n ± 0%    113.1n ± 0%  -73.24% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_4096_7      421.8n ± 0%    111.7n ± 0%  -73.52% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_4096_7      422.6n ± 0%    111.7n ± 0%  -73.57% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_0     6.568µ ± 0%    4.869µ ± 0%  -25.88% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_0     6.568µ ± 0%    5.009µ ± 0%  -23.74% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_1     6.574µ ± 0%    4.743µ ± 0%  -27.85% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_1     6.574µ ± 0%    4.770µ ± 0%  -27.44% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_4     6.574µ ± 0%    4.758µ ± 0%  -27.63% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_4     6.574µ ± 0%    4.768µ ± 0%  -27.48% (p=0.000 n=20)
MemmoveUnalignedSrcDst/f_65536_7     6.574µ ± 0%    4.757µ ± 0%  -27.64% (p=0.000 n=20)
MemmoveUnalignedSrcDst/b_65536_7     6.574µ ± 0%    4.583µ ± 0%  -30.29% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/32       30.410n ± 0%    6.804n ± 0%  -77.63% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/64        56.03n ± 0%    10.01n ± 0%  -82.13% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/128      107.30n ± 0%    14.01n ± 0%  -86.94% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/256      209.70n ± 0%    13.43n ± 1%  -93.60% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/512      414.60n ± 0%    22.23n ± 0%  -94.64% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/1024     828.40n ± 0%    37.62n ± 0%  -95.46% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/2048    1648.00n ± 0%    68.04n ± 0%  -95.87% (p=0.000 n=20)
MemmoveUnalignedSrcOverlap/4096     3287.0n ± 0%    128.9n ± 0%  -96.08% (p=0.000 n=20)
geomean                              48.94n         13.58n       -72.26%

The relevant performance improved by 72.26%.

Change-Id: If2d3e09c3d687e733e6ff2c50feb8d6a8eb7e63b
Reviewed-on: https://go-review.googlesource.com/c/go/+/589537
Reviewed-by: Tim King <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
Reviewed-by: Qiqi Huang <[email protected]>
Reviewed-by: Meidan Li <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: abner chenc <[email protected]>
Auto-Submit: Tim King <[email protected]>
  • Loading branch information
sophie-zhao authored and gopherbot committed Sep 11, 2024
1 parent eb97560 commit 547baaf
Showing 1 changed file with 251 additions and 84 deletions.
335 changes: 251 additions & 84 deletions src/runtime/memmove_loong64.s
Original file line number Diff line number Diff line change
Expand Up @@ -6,99 +6,266 @@

// See memmove Go doc for important implementation constraints.

// Register map
//
// to R4
// from R5
// n(aka count) R6
// to-end R7
// from-end R8
// data R11-R18
// tmp R9

// Algorithm:
//
// Memory alignment check is only performed for copy size greater
// than 64 bytes to minimize overhead.
//
// when copy size <= 64 bytes, jump to label tail, according to the
// copy size to select the appropriate case and copy directly.
// Based on the common memory access instructions of loong64, the
// currently implemented cases are:
// move_0, move_1, move_2, move_3, move_4, move_5through7, move_8,
// move_9through16, move_17through32, move_33through64
//
// when copy size > 64 bytes, use the destination-aligned copying,
// adopt the following strategy to copy in 3 parts:
// 1. Head: do the memory alignment
// 2. Body: a 64-byte loop structure
// 3. Tail: processing of the remaining part (<= 64 bytes)
//
// forward:
//
// Dst NewDst Dstend
// | |<----count after correction---->|
// |<-------------count before correction---------->|
// |<--8-(Dst&7)-->| |<---64 bytes--->|
// +------------------------------------------------+
// | Head | Body | Tail |
// +---------------+---------------+----------------+
// NewDst = Dst - (Dst & 7) + 8
// count = count - 8 + (Dst & 7)
// Src = Src - (Dst & 7) + 8
//
// backward:
//
// Dst NewDstend Dstend
// |<-----count after correction------>| |
// |<------------count before correction--------------->|
// |<---64 bytes--->| |<---Dstend&7--->|
// +----------------------------------------------------+
// | Tail | Body | Head |
// +----------------+------------------+----------------+
// NewDstend = Dstend - (Dstend & 7)
// count = count - (Dstend & 7)
// Srcend = Srcend - (Dstend & 7)

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
BNE R6, check
RET
BEQ R4, R5, move_0
BEQ R6, move_0

check:
SGTU R4, R5, R7
BNE R7, backward
ADDV R4, R6, R7 // to-end pointer
ADDV R5, R6, R8 // from-end pointer

ADDV R4, R6, R9 // end pointer
tail:
//copy size <= 64 bytes, copy directly, not check aligned

// if the two pointers are not of same alignments, do byte copying
SUBVU R5, R4, R7
AND $7, R7
BNE R7, out
// < 2 bytes
SGTU $2, R6, R9
BNE R9, move_1

// if less than 8 bytes, do byte copying
SGTU $8, R6, R7
BNE R7, out
// < 3 bytes
SGTU $3, R6, R9
BNE R9, move_2

// do one byte at a time until 8-aligned
AND $7, R4, R8
BEQ R8, words
MOVB (R5), R7
ADDV $1, R5
MOVB R7, (R4)
ADDV $1, R4
JMP -6(PC)

words:
// do 8 bytes at a time if there is room
ADDV $-7, R9, R6 // R6 is end pointer-7

PCALIGN $16
SGTU R6, R4, R8
BEQ R8, out
MOVV (R5), R7
ADDV $8, R5
MOVV R7, (R4)
ADDV $8, R4
JMP -6(PC)

out:
BEQ R4, R9, done
MOVB (R5), R7
// < 4 bytes
SGTU $4, R6, R9
BNE R9, move_3

// < 5 bytes
SGTU $5, R6, R9
BNE R9, move_4

// >= 5 bytes and < 8 bytes
SGTU $8, R6, R9
BNE R9, move_5through7

// < 9 bytes
SGTU $9, R6, R9
BNE R9, move_8

// >= 9 bytes and < 17 bytes
SGTU $17, R6, R9
BNE R9, move_9through16

// >= 17 bytes and < 33 bytes
SGTU $33, R6, R9
BNE R9, move_17through32

// >= 33 bytes and < 65 bytes
SGTU $65, R6, R9
BNE R9, move_33through64

// if (dst > src) && (dst < src + count), regarded as memory
// overlap, jump to backward
// else, jump to forward
BGEU R5, R4, forward
ADDV R5, R6, R10
BLTU R4, R10, backward

forward:
AND $7, R4, R9 // dst & 7
BEQ R9, body
head:
MOVV $8, R10
SUBV R9, R10 // head = 8 - (dst & 7)
MOVB (R5), R11
SUBV $1, R10
ADDV $1, R5
MOVB R7, (R4)
MOVB R11, (R4)
ADDV $1, R4
JMP -5(PC)
done:
RET
BNE R10, -5(PC)
ADDV R9, R6
ADDV $-8, R6 // newcount = count + (dst & 7) - 8
// if newcount < 65 bytes, use move_33through64 to copy is enough
SGTU $65, R6, R9
BNE R9, move_33through64

body:
MOVV (R5), R11
MOVV 8(R5), R12
MOVV 16(R5), R13
MOVV 24(R5), R14
MOVV 32(R5), R15
MOVV 40(R5), R16
MOVV 48(R5), R17
MOVV 56(R5), R18
MOVV R11, (R4)
MOVV R12, 8(R4)
MOVV R13, 16(R4)
MOVV R14, 24(R4)
MOVV R15, 32(R4)
MOVV R16, 40(R4)
MOVV R17, 48(R4)
MOVV R18, 56(R4)
ADDV $-64, R6
ADDV $64, R4
ADDV $64, R5
SGTU $64, R6, R9
// if the remaining part >= 64 bytes, jmp to body
BEQ R9, body
// if the remaining part == 0 bytes, use move_0 to return
BEQ R6, move_0
// if the remaining part in (0, 63] bytes, jmp to tail
JMP tail

// The backward copy algorithm is the same as the forward copy,
// except for the direction.
backward:
ADDV R6, R5 // from-end pointer
ADDV R4, R6, R9 // to-end pointer

// if the two pointers are not of same alignments, do byte copying
SUBVU R9, R5, R7
AND $7, R7
BNE R7, out1

// if less than 8 bytes, do byte copying
SGTU $8, R6, R7
BNE R7, out1

// do one byte at a time until 8-aligned
AND $7, R9, R8
BEQ R8, words1
ADDV $-1, R5
MOVB (R5), R7
ADDV $-1, R9
MOVB R7, (R9)
JMP -6(PC)

words1:
// do 8 bytes at a time if there is room
ADDV $7, R4, R6 // R6 is start pointer+7

PCALIGN $16
SGTU R9, R6, R8
BEQ R8, out1
ADDV $-8, R5
MOVV (R5), R7
ADDV $-8, R9
MOVV R7, (R9)
JMP -6(PC)

out1:
BEQ R4, R9, done1
ADDV $-1, R5
MOVB (R5), R7
ADDV $-1, R9
MOVB R7, (R9)
JMP -5(PC)
done1:
AND $7, R7, R9 // dstend & 7
BEQ R9, b_body
b_head:
MOVV -8(R8), R11
SUBV R9, R6 // newcount = count - (dstend & 7)
SUBV R9, R8 // newsrcend = srcend - (dstend & 7)
MOVV -8(R8), R12
MOVV R11, -8(R7)
SUBV R9, R7 // newdstend = dstend - (dstend & 7)
MOVV R12, -8(R7)
SUBV $8, R6
SUBV $8, R7
SUBV $8, R8
SGTU $65, R6, R9
BNE R9, move_33through64

b_body:
MOVV -8(R8), R11
MOVV -16(R8), R12
MOVV -24(R8), R13
MOVV -32(R8), R14
MOVV -40(R8), R15
MOVV -48(R8), R16
MOVV -56(R8), R17
MOVV -64(R8), R18
MOVV R11, -8(R7)
MOVV R12, -16(R7)
MOVV R13, -24(R7)
MOVV R14, -32(R7)
MOVV R15, -40(R7)
MOVV R16, -48(R7)
MOVV R17, -56(R7)
MOVV R18, -64(R7)
ADDV $-64, R6
ADDV $-64, R7
ADDV $-64, R8
SGTU $64, R6, R9
BEQ R9, b_body
BEQ R6, move_0
JMP tail

move_0:
RET

move_1:
MOVB (R5), R11
MOVB R11, (R4)
RET
move_2:
MOVH (R5), R11
MOVH R11, (R4)
RET
move_3:
MOVH (R5), R11
MOVB -1(R8), R12
MOVH R11, (R4)
MOVB R12, -1(R7)
RET
move_4:
MOVW (R5), R11
MOVW R11, (R4)
RET
move_5through7:
MOVW (R5), R11
MOVW -4(R8), R12
MOVW R11, (R4)
MOVW R12, -4(R7)
RET
move_8:
MOVV (R5), R11
MOVV R11, (R4)
RET
move_9through16:
MOVV (R5), R11
MOVV -8(R8), R12
MOVV R11, (R4)
MOVV R12, -8(R7)
RET
move_17through32:
MOVV (R5), R11
MOVV 8(R5), R12
MOVV -16(R8), R13
MOVV -8(R8), R14
MOVV R11, (R4)
MOVV R12, 8(R4)
MOVV R13, -16(R7)
MOVV R14, -8(R7)
RET
move_33through64:
MOVV (R5), R11
MOVV 8(R5), R12
MOVV 16(R5), R13
MOVV 24(R5), R14
MOVV -32(R8), R15
MOVV -24(R8), R16
MOVV -16(R8), R17
MOVV -8(R8), R18
MOVV R11, (R4)
MOVV R12, 8(R4)
MOVV R13, 16(R4)
MOVV R14, 24(R4)
MOVV R15, -32(R7)
MOVV R16, -24(R7)
MOVV R17, -16(R7)
MOVV R18, -8(R7)
RET

0 comments on commit 547baaf

Please sign in to comment.