/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.

3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define unit_size 4
#define DISP64(ind,disp) (ind*unit_size*64+disp)
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)

/**********************************************************************************************
* Macros for N=8 and M=16
**********************************************************************************************/

.macro LOAD8x16_1
  LOAD8x16 1
.endm

.macro LOAD8x16_0
  LOAD8x16 0
.endm

.macro KERNEL8x16_L1_L4  Index,IsLast
  KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4  OffsetA,OffsetB, Index,IsLast
  KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4_2  OffsetA,OffsetB, Index,IsLast
  KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4_3  OffsetA,OffsetB, Index,IsLast
  KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL8x16_I1_L2_3  OffsetA,OffsetB, Index,IsLast
  KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL8x16_I2_L4_2  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I2_L4_3  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro Zero8X16
  xxlxor vs32, vs32, vs32
  xxlxor vs33, vs33, vs33
  xxlxor vs34, vs34, vs34
  xxlxor vs35, vs35, vs35
  xxlxor vs36, vs36, vs36
  xxlxor vs37, vs37, vs37
  xxlxor vs38, vs38, vs38
  xxlxor vs39, vs39, vs39
  xxlxor vs40, vs40, vs40
  xxlxor vs41, vs41, vs41
  xxlxor vs42, vs42, vs42
  xxlxor vs43, vs43, vs43
  xxlxor vs44, vs44, vs44
  xxlxor vs45, vs45, vs45
  xxlxor vs46, vs46, vs46
  xxlxor vs47, vs47, vs47
xxlxor vs48, vs48, vs48 xxlxor vs49, vs49, vs49 xxlxor vs50, vs50, vs50 xxlxor vs51, vs51, vs51 xxlxor vs52, vs52, vs52 xxlxor vs53, vs53, vs53 xxlxor vs54, vs54, vs54 xxlxor vs55, vs55, vs55 xxlxor vs56, vs56, vs56 xxlxor vs57, vs57, vs57 xxlxor vs58, vs58, vs58 xxlxor vs59, vs59, vs59 xxlxor vs60, vs60, vs60 xxlxor vs61, vs61, vs61 xxlxor vs62, vs62, vs62 xxlxor vs63, vs63, vs63 .endm .macro LOAD8x16 Zero lxv vs24, 0(BO) lxv vs28, 16(BO) lxv vs0, 0(AO) lxv vs1, 16(AO) lxv vs2, 32(AO) lxv vs3, 48(AO) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 .if \Zero==1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 xxlxor vs36, vs36, vs36 xxlxor vs37, vs37, vs37 xxlxor vs38, vs38, vs38 xxlxor vs39, vs39, vs39 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs42, vs42, vs42 xxlxor vs43, vs43, vs43 xxlxor vs44, vs44, vs44 xxlxor vs45, vs45, vs45 xxlxor vs46, vs46, vs46 xxlxor vs47, vs47, vs47 xxlxor vs48, vs48, vs48 xxlxor vs49, vs49, vs49 xxlxor vs50, vs50, vs50 xxlxor vs51, vs51, vs51 xxlxor vs52, vs52, vs52 xxlxor vs53, vs53, vs53 xxlxor vs54, vs54, vs54 xxlxor vs55, vs55, vs55 xxlxor vs56, vs56, vs56 xxlxor vs57, vs57, vs57 xxlxor vs58, vs58, vs58 xxlxor vs59, vs59, vs59 xxlxor vs60, vs60, vs60 xxlxor vs61, vs61, vs61 xxlxor vs62, vs62, vs62 xxlxor vs63, vs63, vs63 .endif .endm .macro END8x16_NORMAL END8x16 0, AO, BO, 64,32 .endm .macro END8x16 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs34, vs2,vs24 xvmulsp vs35, vs3,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 xvmulsp vs38, vs2,vs25 xvmulsp vs39, vs3,vs25 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs42, vs2,vs26 xvmulsp vs43, vs3,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 xvmulsp vs46, vs2,vs27 xvmulsp vs47, vs3,vs27 xvmulsp vs48, vs0,vs28 xvmulsp vs49, vs1,vs28 xvmulsp vs50, vs2,vs28 xvmulsp vs51, vs3,vs28 xvmulsp vs52, vs0,vs29 xvmulsp vs53, vs1,vs29 xvmulsp vs54, vs2,vs29 xvmulsp vs55, vs3,vs29 xvmulsp vs56, vs0,vs30 xvmulsp vs57, vs1,vs30 xvmulsp vs58, vs2,vs30 xvmulsp vs59, vs3,vs30 xvmulsp vs60, vs0,vs31 xvmulsp vs61, vs1,vs31 xvmulsp vs62, vs2,vs31 xvmulsp vs63, vs3,vs31 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs50, vs2,vs28 xvmaddasp vs51, vs3,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs54, vs2,vs29 xvmaddasp vs55, vs3,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs58, vs2,vs30 xvmaddasp vs59, vs3,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 xvmaddasp vs62, vs2,vs31 xvmaddasp vs63, vs3,vs31 .endif .endm .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) lxv 
vs7, DISP64(\Index,48+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs50, vs2,vs28 xvmaddasp vs51, vs3,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs54, vs2,vs29 xvmaddasp vs55, vs3,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs58, vs2,vs30 xvmaddasp vs59, vs3,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 xvmaddasp vs62, vs2,vs31 xvmaddasp vs63, vs3,vs31 lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs34, vs6,vs8 xvmaddasp vs35, vs7,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xvmaddasp vs38, vs6,vs9 xvmaddasp vs39, vs7,vs9 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs42, vs6,vs10 xvmaddasp vs43, vs7,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs46, vs6,vs11 xvmaddasp vs47, vs7,vs11 xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 xvmaddasp vs50, vs6,vs12 xvmaddasp vs51, vs7,vs12 xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 xvmaddasp vs54, vs6,vs13 xvmaddasp vs55, vs7,vs13 xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs58, vs6,vs14 xvmaddasp vs59, vs7,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 xvmaddasp vs62, vs6,vs15 xvmaddasp vs63, vs7,vs15 lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs50, vs2,vs28 xvmaddasp vs51, vs3,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs54, vs2,vs29 xvmaddasp vs55, vs3,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs58, vs2,vs30 xvmaddasp vs59, vs3,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 xvmaddasp vs62, vs2,vs31 xvmaddasp vs63, vs3,vs31 .if \Complete==0 lxv vs24, 
DISP32(\Index,96+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) .else addi \BREG, \BREG, DISP32(\Index,128) addi \AREG, \AREG, DISP64(\Index,256) .endif .endif xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs34, vs6,vs8 xvmaddasp vs35, vs7,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xvmaddasp vs38, vs6,vs9 xvmaddasp vs39, vs7,vs9 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 .endif xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs42, vs6,vs10 xvmaddasp vs43, vs7,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs46, vs6,vs11 xvmaddasp vs47, vs7,vs11 xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 xvmaddasp vs50, vs6,vs12 xvmaddasp vs51, vs7,vs12 xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 xvmaddasp vs54, vs6,vs13 xvmaddasp vs55, vs7,vs13 xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs58, vs6,vs14 xvmaddasp vs59, vs7,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 xvmaddasp vs62, vs6,vs15 xvmaddasp vs63, vs7,vs15 .endm .macro KERNEL8x16 First LOAD8x16 0 END8x16 \First, AO, BO, 64,32 .endm .macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs34, vs2,vs24 xvmulsp vs35, vs3,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 xvmulsp vs38, vs2,vs25 xvmulsp vs39, vs3,vs25 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 .endif xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 .if \First==1 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs42, vs2,vs26 xvmulsp vs43, vs3,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 xvmulsp vs46, vs2,vs27 xvmulsp vs47, vs3,vs27 xvmulsp vs48, vs0,vs28 xvmulsp vs49, vs1,vs28 xvmulsp vs50, vs2,vs28 xvmulsp vs51, vs3,vs28 xvmulsp vs52, vs0,vs29 xvmulsp vs53, vs1,vs29 xvmulsp vs54, vs2,vs29 xvmulsp vs55, vs3,vs29 xvmulsp vs56, vs0,vs30 xvmulsp vs57, vs1,vs30 xvmulsp vs58, vs2,vs30 xvmulsp vs59, vs3,vs30 xvmulsp vs60, vs0,vs31 xvmulsp vs61, vs1,vs31 xvmulsp vs62, vs2,vs31 xvmulsp vs63, vs3,vs31 .else xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs50, vs2,vs28 xvmaddasp vs51, vs3,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs54, vs2,vs29 xvmaddasp vs55, vs3,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs58, vs2,vs30 xvmaddasp vs59, 
vs3,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 xvmaddasp vs62, vs2,vs31 xvmaddasp vs63, vs3,vs31 .endif .if \Complete==0 lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) .else addi \BREG, \BREG, DISP16(\Index,64) addi \AREG, \AREG, DISP32(\Index,128) .endif .endif .if \First==1 xvmulsp vs32, vs4,vs8 xvmulsp vs33, vs5,vs8 xvmulsp vs34, vs6,vs8 xvmulsp vs35, vs7,vs8 xvmulsp vs36, vs4,vs9 xvmulsp vs37, vs5,vs9 xvmulsp vs38, vs6,vs9 xvmulsp vs39, vs7,vs9 .else xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs34, vs6,vs8 xvmaddasp vs35, vs7,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xvmaddasp vs38, vs6,vs9 xvmaddasp vs39, vs7,vs9 .endif .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 .endif .if \First==1 xvmulsp vs40, vs4,vs10 xvmulsp vs41, vs5,vs10 xvmulsp vs42, vs6,vs10 xvmulsp vs43, vs7,vs10 xvmulsp vs44, vs4,vs11 xvmulsp vs45, vs5,vs11 xvmulsp vs46, vs6,vs11 xvmulsp vs47, vs7,vs11 xvmulsp vs48, vs4,vs12 xvmulsp vs49, vs5,vs12 xvmulsp vs50, vs6,vs12 xvmulsp vs51, vs7,vs12 xvmulsp vs52, vs4,vs13 xvmulsp vs53, vs5,vs13 xvmulsp vs54, vs6,vs13 xvmulsp vs55, vs7,vs13 xvmulsp vs56, vs4,vs14 xvmulsp vs57, vs5,vs14 xvmulsp vs58, vs6,vs14 xvmulsp vs59, vs7,vs14 xvmulsp vs60, vs4,vs15 xvmulsp vs61, vs5,vs15 xvmulsp vs62, vs6,vs15 xvmulsp vs63, vs7,vs15 .else xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs42, vs6,vs10 xvmaddasp vs43, vs7,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs46, vs6,vs11 xvmaddasp vs47, vs7,vs11 xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 xvmaddasp vs50, vs6,vs12 xvmaddasp vs51, vs7,vs12 xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 xvmaddasp vs54, vs6,vs13 xvmaddasp vs55, vs7,vs13 xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs58, vs6,vs14 xvmaddasp vs59, vs7,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 xvmaddasp vs62, vs6,vs15 xvmaddasp vs63, vs7,vs15 .endif .endm .macro SAVE8x16 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 add T4, T2, T10 add T5, T3, T10 add T6, T4, T10 add T7, T5, T10 /* permute to restore butterfly rank 1 updateto normal promoted one */ /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ xxmrglw vs8, vs32, vs44 xxmrglw vs10, vs36, vs40 xxmrghw vs1, vs32, vs44 xxmrghw vs0, vs36, vs40 xxmrglw vs12, vs33, vs45 xxmrglw vs14, vs37, vs41 xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 xxmrglw vs16, vs34, vs46 xxmrglw vs18, vs38, vs42 xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxmrghw vs4, vs38, vs42 xxmrghw vs5, vs34, vs46 xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxmrglw vs24, vs35, vs47 xxmrglw vs26, vs39, vs43 xxlor vs17, vs16, vs16 xxlor vs19, vs18, vs18 xxmrghw vs30, vs39, vs43 xxmrghw vs31, vs35, vs47 xxperm vs8, vs0, 
save_permute_1 xxperm vs10, vs1, save_permute_1 xxperm vs9, vs0, save_permute_2 xxperm vs11, vs1, save_permute_2 #ifndef TRMMKERNEL lxv vs32, 0(CO) lxv vs33, 16(CO) lxv vs34, 32(CO) lxv vs35, 48(CO) #endif xxlor vs25, vs24, vs24 xxlor vs27, vs26, vs26 #ifndef TRMMKERNEL lxv vs36, 0(T1) lxv vs37, 16(T1) lxv vs38, 32(T1) lxv vs39, 48(T1) #endif #ifndef TRMMKERNEL lxv vs40, 0(T2) lxv vs41, 16(T2) lxv vs42, 32(T2) lxv vs43, 48(T2) #endif #ifndef TRMMKERNEL lxv vs44, 0(T3) lxv vs45, 16(T3) lxv vs46, 32(T3) lxv vs47, 48(T3) #endif xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 xxperm vs16, vs4, save_permute_1 xxperm vs18, vs5, save_permute_1 xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 xxperm vs24, vs30, save_permute_1 xxperm vs26, vs31, save_permute_1 xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 /* multiply add normal way */ #ifdef TRMMKERNEL xvmulsp vs32, vs8, alpha_r xvmulsp vs33, vs12, alpha_r xvmulsp vs34, vs16, alpha_r xvmulsp vs35, vs24, alpha_r xvmulsp vs36, vs9, alpha_r xvmulsp vs37, vs13, alpha_r xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r #else xvmaddasp vs32, vs8, alpha_r xvmaddasp vs33, vs12, alpha_r xvmaddasp vs34, vs16, alpha_r xvmaddasp vs35, vs24, alpha_r xvmaddasp vs36, vs9, alpha_r xvmaddasp vs37, vs13, alpha_r xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r xvmulsp vs41, vs14, alpha_r xvmulsp vs42, vs18, alpha_r xvmulsp vs43, vs26, alpha_r xvmulsp vs44, vs11, alpha_r xvmulsp vs45, vs15, alpha_r xvmulsp vs46, vs19, alpha_r xvmulsp vs47, vs27, alpha_r #else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r xvmaddasp vs42, vs18, alpha_r xvmaddasp vs43, vs26, alpha_r xvmaddasp vs44, vs11, alpha_r xvmaddasp vs45, vs15, alpha_r xvmaddasp vs46, vs19, alpha_r xvmaddasp vs47, vs27, alpha_r #endif stxv vs32, 0(CO) stxv vs33, 16(CO) stxv vs34, 32(CO) stxv vs35, 48(CO) stxv vs36, 0(T1) stxv vs37, 16(T1) stxv vs38, 32(T1) stxv vs39, 48(T1) stxv vs40, 0(T2) stxv vs41, 16(T2) stxv vs42, 32(T2) stxv vs43, 48(T2) stxv vs44, 0(T3) stxv vs45, 16(T3) stxv vs46, 32(T3) stxv vs47, 48(T3) /*****the same with the second 8X8 ****/ #ifndef TRMMKERNEL lxv vs32, 0(T4) lxv vs33, 16(T4) lxv vs34, 32(T4) lxv vs35, 48(T4) lxv vs36, 0(T5) lxv vs37, 16(T5) lxv vs38,32(T5) lxv vs39, 48(T5) #endif xxmrglw vs8, vs48, vs60 xxmrglw vs10, vs52, vs56 xxmrghw vs1, vs48, vs60 xxmrghw vs0, vs52, vs56 xxmrglw vs12, vs49, vs61 xxmrglw vs14, vs53, vs57 #ifndef TRMMKERNEL lxv vs40, 0(T6) lxv vs41, 16(T6) lxv vs42, 32(T6) lxv vs43, 48(T6) lxv vs44, 0(T7) lxv vs45, 16(T7) lxv vs46, 32(T7) lxv vs47, 48(T7) #endif xxmrghw vs2, vs53, vs57 xxmrghw vs3, vs49, vs61 xxmrglw vs16, vs50, vs62 xxmrglw vs18, vs54, vs58 xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxmrghw vs4, vs54, vs58 xxmrghw vs5, vs50, vs62 xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxmrglw vs24, vs51, vs63 xxmrglw vs26, vs55, vs59 xxlor vs17, vs16, vs16 xxlor vs19, vs18, vs18 xxmrghw vs30, vs55, vs59 xxmrghw vs31, vs51, vs63 xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 xxperm vs9, vs0, save_permute_2 xxperm vs11, vs1, save_permute_2 xxlor vs25, vs24, vs24 xxlor vs27, vs26, vs26 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 xxperm vs16, vs4, save_permute_1 xxperm vs18, vs5, save_permute_1 xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, 
save_permute_2 xxperm vs24, vs30, save_permute_1 xxperm vs26, vs31, save_permute_1 xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 #ifdef TRMMKERNEL xvmulsp vs32, vs8, alpha_r xvmulsp vs33, vs12, alpha_r xvmulsp vs34, vs16, alpha_r xvmulsp vs35, vs24, alpha_r xvmulsp vs36, vs9, alpha_r xvmulsp vs37, vs13, alpha_r xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r #else xvmaddasp vs32, vs8, alpha_r xvmaddasp vs33, vs12, alpha_r xvmaddasp vs34, vs16, alpha_r xvmaddasp vs35, vs24, alpha_r xvmaddasp vs36, vs9, alpha_r xvmaddasp vs37, vs13, alpha_r xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif stxv vs32, 0(T4) stxv vs33, 16(T4) stxv vs34, 32(T4) stxv vs35, 48(T4) stxv vs36, 0(T5) stxv vs37, 16(T5) stxv vs38, 32(T5) stxv vs39, 48(T5) #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r xvmulsp vs41, vs14, alpha_r xvmulsp vs42, vs18, alpha_r xvmulsp vs43, vs26, alpha_r xvmulsp vs44, vs11, alpha_r xvmulsp vs45, vs15, alpha_r xvmulsp vs46, vs19, alpha_r xvmulsp vs47, vs27, alpha_r #else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r xvmaddasp vs42, vs18, alpha_r xvmaddasp vs43, vs26, alpha_r xvmaddasp vs44, vs11, alpha_r xvmaddasp vs45, vs15, alpha_r xvmaddasp vs46, vs19, alpha_r xvmaddasp vs47, vs27, alpha_r #endif stxv vs40, 0(T6) stxv vs41, 16(T6) stxv vs42, 32(T6) stxv vs43, 48(T6) stxv vs44, 0(T7) stxv vs45, 16(T7) stxv vs46, 32(T7) stxv vs47, 48(T7) addi CO,CO,64 .endm /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ .macro LOAD8x8_1 LOAD8x8 1 .endm .macro LOAD8x8_0 LOAD8x8 0 .endm .macro KERNEL8x8_L1_L4 Index,IsLast KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 .endm .macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro END8x8_NORMAL END8x8 0, AO, BO, 32,32 .endm .macro Zero8X8 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs36, vs36, vs36 xxlxor vs37, vs37, vs37 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs44, vs44, vs44 xxlxor vs45, vs45, vs45 xxlxor vs48, vs48, vs48 xxlxor vs49, vs49, vs49 xxlxor vs52, vs52, vs52 xxlxor vs53, vs53, vs53 xxlxor vs56, vs56, vs56 xxlxor vs57, vs57, vs57 xxlxor vs60, vs60, vs60 xxlxor vs61, vs61, vs61 .endm .macro LOAD8x8 Zero lxv vs24, 0(BO) lxv vs28, 16(BO) lxv vs0, 0(AO) lxv vs1, 16(AO) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 .if \Zero==1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs36, vs36, vs36 xxlxor vs37, vs37, vs37 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs44, vs44, vs44 xxlxor vs45, vs45, vs45 xxlxor vs48, vs48, vs48 xxlxor vs49, vs49, vs49 xxlxor vs52, 
vs52, vs52 xxlxor vs53, vs53, vs53 xxlxor vs56, vs56, vs56 xxlxor vs57, vs57, vs57 xxlxor vs60, vs60, vs60 xxlxor vs61, vs61, vs61 .endif .endm .macro END8x8 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 xvmulsp vs48, vs0,vs28 xvmulsp vs49, vs1,vs28 xvmulsp vs52, vs0,vs29 xvmulsp vs53, vs1,vs29 xvmulsp vs56, vs0,vs30 xvmulsp vs57, vs1,vs30 xvmulsp vs60, vs0,vs31 xvmulsp vs61, vs1,vs31 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 .endif .endm .macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 .if \Complete==0 lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) lxv vs0, 
DISP32(\Index,96+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) .else addi \BREG, \BREG, DISP32(\Index,128) addi \AREG, \AREG, DISP32(\Index,128) .endif .endif xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 .endif xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 .endm .macro KERNEL8x8 First LOAD8x8 0 END8x8 \First, AO, BO, 32,32 .endm .macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 .endif xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 .if \First==1 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 xvmulsp vs48, vs0,vs28 xvmulsp vs49, vs1,vs28 xvmulsp vs52, vs0,vs29 xvmulsp vs53, vs1,vs29 xvmulsp vs56, vs0,vs30 xvmulsp vs57, vs1,vs30 xvmulsp vs60, vs0,vs31 xvmulsp vs61, vs1,vs31 .else xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 .endif .if \Complete==0 lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) .else addi \BREG, \BREG, DISP16(\Index,64) addi \AREG, \AREG, DISP16(\Index,64) .endif .endif .if \First==1 xvmulsp vs32, vs4,vs8 xvmulsp vs33, vs5,vs8 xvmulsp vs36, vs4,vs9 xvmulsp vs37, vs5,vs9 .else xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 .endif .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 .endif .if \First==1 xvmulsp vs40, vs4,vs10 xvmulsp vs41, vs5,vs10 xvmulsp vs44, vs4,vs11 xvmulsp vs45, vs5,vs11 xvmulsp vs48, vs4,vs12 xvmulsp vs49, vs5,vs12 xvmulsp vs52, vs4,vs13 xvmulsp vs53, vs5,vs13 xvmulsp vs56, vs4,vs14 xvmulsp vs57, vs5,vs14 xvmulsp vs60, vs4,vs15 xvmulsp vs61, vs5,vs15 .else xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, 
vs5,vs13 xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 .endif .endm .macro SAVE8x8 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 add T4, T2, T10 add T5, T3, T10 add T6, T4, T10 add T7, T5, T10 #ifndef TRMMKERNEL lxv vs34, 0(CO) lxv vs35, 16(CO) lxv vs38, 0(T1) lxv vs39, 16(T1) lxv vs42, 0(T2) lxv vs43, 16(T2) lxv vs46, 0(T3) lxv vs47, 16(T3) lxv vs50, 0(T4) lxv vs51, 16(T4) lxv vs54, 0(T5) lxv vs55, 16(T5) lxv vs58, 0(T6) lxv vs59, 16(T6) lxv vs62, 0(T7) lxv vs63, 16(T7) #endif xxmrglw vs8, vs32, vs44 xxmrglw vs10, vs36, vs40 xxmrghw vs1, vs32, vs44 xxmrghw vs0, vs36, vs40 xxmrglw vs12, vs33, vs45 xxmrglw vs14, vs37, vs41 xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 xxperm vs9, vs0, save_permute_2 xxperm vs11, vs1, save_permute_2 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 /* multiply add normal way */ #ifdef TRMMKERNEL xvmulsp vs34, vs8, alpha_r xvmulsp vs35, vs12, alpha_r xvmulsp vs38, vs9, alpha_r xvmulsp vs39, vs13, alpha_r xvmulsp vs42, vs10, alpha_r xvmulsp vs43, vs14, alpha_r xvmulsp vs46, vs11, alpha_r xvmulsp vs47, vs15, alpha_r #else xvmaddasp vs34, vs8, alpha_r xvmaddasp vs35, vs12, alpha_r xvmaddasp vs38, vs9, alpha_r xvmaddasp vs39, vs13, alpha_r xvmaddasp vs42, vs10, alpha_r xvmaddasp vs43, vs14, alpha_r xvmaddasp vs46, vs11, alpha_r xvmaddasp vs47, vs15, alpha_r #endif xxmrglw vs8, vs48, vs60 xxmrglw vs10, vs52, vs56 xxmrghw vs1, vs48, vs60 xxmrghw vs0, vs52, vs56 stxv vs34, 0(CO) stxv vs35, 16(CO) xxmrglw vs12, vs49, vs61 xxmrglw vs14, vs53, vs57 stxv vs38, 0(T1) stxv vs39, 16(T1) xxmrghw vs2, vs53, vs57 xxmrghw vs3, vs49, vs61 stxv vs42, 0(T2) stxv vs43, 16(T2) xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 stxv vs46, 0(T3) stxv vs47, 16(T3) xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 xxperm vs9, vs0, save_permute_2 xxperm vs11, vs1, save_permute_2 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 #ifdef TRMMKERNEL xvmulsp vs50, vs8, alpha_r xvmulsp vs51, vs12, alpha_r xvmulsp vs54, vs9, alpha_r xvmulsp vs55, vs13, alpha_r xvmulsp vs58, vs10, alpha_r xvmulsp vs59, vs14, alpha_r xvmulsp vs62, vs11, alpha_r xvmulsp vs63, vs15, alpha_r #else xvmaddasp vs50, vs8, alpha_r xvmaddasp vs51, vs12, alpha_r xvmaddasp vs54, vs9, alpha_r xvmaddasp vs55, vs13, alpha_r xvmaddasp vs58, vs10, alpha_r xvmaddasp vs59, vs14, alpha_r xvmaddasp vs62, vs11, alpha_r xvmaddasp vs63, vs15, alpha_r #endif stxv vs50, 0(T4) stxv vs51, 16(T4) stxv vs54, 0(T5) stxv vs55, 16(T5) stxv vs58, 0(T6) stxv vs59, 16(T6) stxv vs62, 0(T7) stxv vs63, 16(T7) addi CO,CO,32 .endm /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ .macro LOAD8x4_1 LOAD8x4 1 .endm .macro LOAD8x4_0 LOAD8x4 0 .endm .macro KERNEL8x4_L1_L4 Index,IsLast KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 .endm .macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast KERNEL8x4_L1_L4_I AO,BO, 
\OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro Zero8X4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 xxlxor vs48, vs48, vs48 xxlxor vs49, vs49, vs49 xxlxor vs50, vs50, vs50 xxlxor vs51, vs51, vs51 .endm .macro LOAD8x4 Zero lxv vs0, 0(AO) lxv vs24, 0(BO) lxv vs25, 16(BO) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 .if \Zero==1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 xxlxor vs48, vs48, vs48 xxlxor vs49, vs49, vs49 xxlxor vs50, vs50, vs50 xxlxor vs51, vs51, vs51 .endif .endm .macro END8x4_NORMAL END8x4 0, AO, BO, 16,32 .endm .macro END8x4 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif .if \First==1 xvmulsp vs32, vs24, vs0 xvmulsp vs33, vs24, vs1 xvmulsp vs34, vs24, vs2 xvmulsp vs35, vs24, vs3 xvmulsp vs48, vs25, vs0 xvmulsp vs49, vs25, vs1 xvmulsp vs50, vs25, vs2 xvmulsp vs51, vs25, vs3 .else xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 xvmaddasp vs48, vs25, vs0 xvmaddasp vs49, vs25, vs1 xvmaddasp vs50, vs25, vs2 xvmaddasp vs51, vs25, vs3 .endif .endm .macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) xxperm vs6, vs4, permute_mask xxpermdi vs5, vs4, vs4,2 xxpermdi vs7, vs6, vs6,2 xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 xvmaddasp vs48, vs25, vs0 xvmaddasp vs49, vs25, vs1 xvmaddasp vs50, vs25, vs2 xvmaddasp vs51, vs25, vs3 lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 xvmaddasp vs32, vs26, vs4 xvmaddasp vs33, vs26, vs5 xvmaddasp vs34, vs26, vs6 xvmaddasp vs35, vs26, vs7 xvmaddasp vs48, vs27, vs4 xvmaddasp vs49, vs27, vs5 xvmaddasp vs50, vs27, vs6 xvmaddasp vs51, vs27, vs7 lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) xxperm vs6, vs4, permute_mask xxpermdi vs5, vs4, vs4,2 xxpermdi vs7, vs6, vs6,2 xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 xvmaddasp vs48, vs25, vs0 xvmaddasp vs49, vs25, vs1 xvmaddasp vs50, vs25, vs2 xvmaddasp vs51, vs25, vs3 .if \Complete==0 lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 .endif xvmaddasp vs32, vs26, vs4 xvmaddasp vs33, vs26, vs5 xvmaddasp vs34, vs26, vs6 xvmaddasp vs35, vs26, vs7 xvmaddasp vs48, vs27, vs4 xvmaddasp vs49, vs27, vs5 xvmaddasp vs50, vs27, vs6 xvmaddasp 
vs51, vs27, vs7 .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) .else addi \AREG, \AREG, DISP16(\Index,64) addi \BREG, \BREG, DISP32(\Index,128) .endif .endif .endm .macro KERNEL8x4 First LOAD8x4 0 END8x4 \First, AO, BO, 16,32 .endm .macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) xxperm vs6, vs4, permute_mask xxpermdi vs5, vs4, vs4,2 xxpermdi vs7, vs6, vs6,2 .if \First==1 xvmulsp vs32, vs24, vs0 xvmulsp vs33, vs24, vs1 xvmulsp vs34, vs24, vs2 xvmulsp vs35, vs24, vs3 xvmulsp vs48, vs25, vs0 xvmulsp vs49, vs25, vs1 xvmulsp vs50, vs25, vs2 xvmulsp vs51, vs25, vs3 .else xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 xvmaddasp vs48, vs25, vs0 xvmaddasp vs49, vs25, vs1 xvmaddasp vs50, vs25, vs2 xvmaddasp vs51, vs25, vs3 .endif .if \Complete==0 lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 .endif .if \First==1 xvmulsp vs32, vs26, vs4 xvmulsp vs33, vs26, vs5 xvmulsp vs34, vs26, vs6 xvmulsp vs35, vs26, vs7 xvmulsp vs48, vs27, vs4 xvmulsp vs49, vs27, vs5 xvmulsp vs50, vs27, vs6 xvmulsp vs51, vs27, vs7 .else xvmaddasp vs32, vs26, vs4 xvmaddasp vs33, vs26, vs5 xvmaddasp vs34, vs26, vs6 xvmaddasp vs35, vs26, vs7 xvmaddasp vs48, vs27, vs4 xvmaddasp vs49, vs27, vs5 xvmaddasp vs50, vs27, vs6 xvmaddasp vs51, vs27, vs7 .endif .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) .else addi \AREG, \AREG, DISP8(\Index,32) addi \BREG, \BREG, DISP16(\Index,64) .endif .endif .endm .macro SAVE8x4 slwi T10, LDC , 1 add T1, CO, LDC #if !defined(TRMMKERNEL) lxv vs36, 0(CO) lxv vs37, 0(T1) #endif add T2, CO, T10 add T3, T1, T10 #if !defined(TRMMKERNEL) lxv vs38, 0(T2) lxv vs39, 0(T3) #endif add T4, T2, T10 add T5, T3, T10 #if !defined(TRMMKERNEL) lxv vs40, 0(T4) lxv vs41, 0(T5) #endif add T6, T4, T10 add T7, T5, T10 #if !defined(TRMMKERNEL) lxv vs42, 0(T6) lxv vs43, 0(T7) #endif xxmrglw vs0, vs35,vs32 xxmrglw vs1, vs34,vs33 xxmrglw vs4, vs32,vs35 xxmrglw vs5, vs33,vs34 xxmrghw vs2, vs35,vs32 xxmrghw vs3, vs34,vs33 xxmrghw vs6, vs32,vs35 xxmrghw vs7, vs33,vs34 xxmrgld vs24, vs1, vs0 xxmrghd vs25,vs5,vs4 xxmrgld vs26, vs2, vs3 xxmrghd vs27,vs6,vs7 xxmrglw vs0, vs51,vs48 xxmrglw vs1, vs50,vs49 xxmrglw vs4, vs48,vs51 xxmrglw vs5, vs49,vs50 xxmrghw vs2, vs51,vs48 xxmrghw vs3, vs50,vs49 xxmrghw vs6, vs48,vs51 xxmrghw vs7, vs49,vs50 xxmrgld vs28, vs1, vs0 xxmrghd vs29,vs5,vs4 xxmrgld vs30, vs2, vs3 xxmrghd vs31,vs6,vs7 #if defined(TRMMKERNEL) xvmulsp vs36, vs24, alpha_r xvmulsp vs37, vs25, alpha_r xvmulsp vs38, vs26, alpha_r xvmulsp vs39, vs27, alpha_r xvmulsp vs40, vs28, alpha_r xvmulsp vs41, vs29, alpha_r xvmulsp vs42, vs30, alpha_r xvmulsp vs43, vs31, alpha_r #else xvmaddasp vs36, vs24, alpha_r xvmaddasp vs37, vs25, alpha_r xvmaddasp vs38, vs26, alpha_r xvmaddasp vs39, vs27, alpha_r xvmaddasp vs40, vs28, alpha_r xvmaddasp vs41, vs29, alpha_r xvmaddasp vs42, vs30, alpha_r xvmaddasp vs43, vs31, alpha_r #endif stxv vs36, 0(CO) stxv vs37, 0(T1) stxv vs38, 0(T2) stxv vs39, 0(T3) stxv vs40, 0(T4) stxv vs41, 0(T5) stxv vs42, 0(T6) stxv vs43, 0(T7) addi CO,CO,16 .endm 
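/*
 * Note on the N=8, M=2 kernels in the section below: each K step forms a
 * 2x8 outer-product update. The two packed A values are broadcast with
 * xxspltw from the lxsd/lxv load, multiplied against eight packed B values
 * (two lxv loads, vs26/vs27), and accumulated into vs0..vs3; SAVE8x2 then
 * applies alpha and writes the block back to C.
 *
 * A minimal, illustrative scalar model of one K step (hypothetical C helper,
 * not part of this file; vector lane ordering is ignored):
 *
 *     static void sgemm_2x8_k_step(float c[2][8], const float a[2],
 *                                  const float b[8])
 *     {
 *         for (int m = 0; m < 2; m++)
 *             for (int n = 0; n < 8; n++)
 *                 c[m][n] += a[m] * b[n];
 *     }
 */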
/********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ .macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero8x2 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2, vs2, vs2 xxlxor vs3, vs3, vs3 .endm .macro KERNEL8x2 KERNEL8x2_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 xvmulsp vs2, vs26, vs9 xvmulsp vs3, vs27, vs9 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs26, vs9 xvmaddasp vs3, vs27, vs9 .endif addi \AREG, \AREG, DISP2(\Index,8) addi \BREG, \BREG, DISP8(\Index,32) .endm .macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) xxspltw vs8, vs4, 2 xxspltw vs9, vs4, 3 xxspltw vs10, vs4, 0 xxspltw vs11, vs4, 1 .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 xvmulsp vs2, vs26, vs9 xvmulsp vs3, vs27, vs9 xvmulsp vs0, vs28, vs10 xvmulsp vs1, vs29, vs10 xvmulsp vs2, vs28, vs11 xvmulsp vs3, vs29, vs11 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs26, vs9 xvmaddasp vs3, vs27, vs9 xvmaddasp vs0, vs28, vs10 xvmaddasp vs1, vs29, vs10 xvmaddasp vs2, vs28, vs11 xvmaddasp vs3, vs29, vs11 .endif .if \IsLast==1 addi \AREG, \AREG, DISP4(\Index,16) addi \BREG, \BREG, DISP16(\Index,64) .endif .endm .macro SAVE8x2 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 add T4, T2, T10 add T5, T3, T10 add T6, T4, T10 add T7, T5, T10 /*convert alpha_r for multiply*/ xscvspdp vs4,alpha_r /* v0 corresponds to vs32, do not forget*/ #if !defined(TRMMKERNEL) lxssp v0,0(CO) lxssp v1,4(CO) lxssp v2,0(T1) lxssp v3,4(T1) lxssp v4,0(T2) lxssp v5,4(T2) lxssp v6,0(T3) lxssp v7,4(T3) lxssp v8,0(T4) lxssp v9,4(T4) lxssp v10,0(T5) lxssp v11,4(T5) lxssp v12,0(T6) lxssp v13,4(T6) lxssp v14,0(T7) lxssp v15,4(T7) #endif xscvspdp vs5, vs2 xxspltw vs6, vs2, 1 xxspltw vs7, vs2, 2 xxspltw vs8, vs2, 3 xscvspdp vs6,vs6 xscvspdp vs7,vs7 xscvspdp vs8,vs8 xscvspdp vs24, vs0 xxspltw vs25, vs0, 1 xxspltw vs26, vs0, 2 xxspltw vs27, vs0, 3 xscvspdp vs25,vs25 xscvspdp vs26,vs26 xscvspdp vs27,vs27 xscvspdp vs9, vs3 xxspltw vs10, vs3, 1 xxspltw vs11, vs3, 2 xxspltw vs12, vs3, 3 xscvspdp vs10,vs10 xscvspdp vs11,vs11 xscvspdp vs12,vs12 xscvspdp vs28, vs1 xxspltw vs29, vs1, 1 xxspltw vs30, vs1, 2 xxspltw vs31, vs1, 3 xscvspdp vs29,vs29 xscvspdp vs30,vs30 xscvspdp vs31,vs31 #if defined(TRMMKERNEL) xsmuldp vs32,vs8, vs4 xsmuldp vs33,vs27, vs4 xsmuldp vs34,vs7, vs4 xsmuldp vs35,vs26, vs4 xsmuldp vs36,vs6, vs4 xsmuldp vs37,vs25, vs4 xsmuldp vs38,vs5, vs4 xsmuldp vs39,vs24, vs4 xsmuldp vs40,vs12, vs4 xsmuldp vs41,vs31, vs4 xsmuldp vs42,vs11, vs4 xsmuldp vs43,vs30, vs4 xsmuldp vs44,vs10, vs4 xsmuldp vs45,vs29, vs4 xsmuldp vs46,vs9, vs4 xsmuldp vs47,vs28, vs4 #else xsmaddadp vs32,vs8, vs4 xsmaddadp vs33,vs27, vs4 xsmaddadp vs34,vs7, vs4 xsmaddadp vs35,vs26, vs4 xsmaddadp vs36,vs6, vs4 xsmaddadp vs37,vs25, vs4 xsmaddadp vs38,vs5, vs4 xsmaddadp vs39,vs24, vs4 
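/* second group: the eight results that SAVE8x2 stores to T4..T7, scaled by alpha in double precision */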
xsmaddadp vs40,vs12, vs4 xsmaddadp vs41,vs31, vs4 xsmaddadp vs42,vs11, vs4 xsmaddadp vs43,vs30, vs4 xsmaddadp vs44,vs10, vs4 xsmaddadp vs45,vs29, vs4 xsmaddadp vs46,vs9, vs4 xsmaddadp vs47,vs28, vs4 #endif stxssp v0,0(CO) stxssp v1,4(CO) stxssp v2,0(T1) stxssp v3,4(T1) stxssp v4,0(T2) stxssp v5,4(T2) stxssp v6,0(T3) stxssp v7,4(T3) stxssp v8,0(T4) stxssp v9,4(T4) stxssp v10,0(T5) stxssp v11,4(T5) stxssp v12,0(T6) stxssp v13,4(T6) stxssp v14,0(T7) stxssp v15,4(T7) addi CO,CO,8 .endm /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ .macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero8x1 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 .endm .macro KERNEL8x1 KERNEL8x1_1 AO,BO, 0 .endm .macro KERNEL8x1_2 KERNEL8x1_2_1 AO,BO, 0 .endm .macro KERNEL8x1_1 AREG,BREG,First lxvwsx vs8, 0, \AREG lxv vs26, 0(\BREG) lxv vs27, 16(\BREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 .endif addi \AREG, \AREG, 4 addi \BREG, \BREG, 32 .endm .macro KERNEL8x1_2_1 AREG,BREG,First lxsd v4, 0(\AREG) lxv vs26, 0(\BREG) lxv vs27, 16(\BREG) lxv vs28, 32(\BREG) lxv vs29, 48(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 xvmulsp vs0, vs28, vs9 xvmulsp vs1, vs29, vs9 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs0, vs28, vs9 xvmaddasp vs1, vs29, vs9 .endif addi \AREG, \AREG, 8 addi \BREG, \BREG, 64 .endm .macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) xxspltw vs8, vs4, 3 xxspltw vs9, vs4, 2 xxspltw vs10, vs4, 1 xxspltw vs11, vs4, 0 lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 xvmulsp vs0, vs28, vs9 xvmulsp vs1, vs29, vs9 xvmulsp vs0, vs30, vs10 xvmulsp vs1, vs31, vs10 xvmulsp vs0, vs32, vs11 xvmulsp vs1, vs33, vs11 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs0, vs28, vs9 xvmaddasp vs1, vs29, vs9 xvmaddasp vs0, vs30, vs10 xvmaddasp vs1, vs31, vs10 xvmaddasp vs0, vs32, vs11 xvmaddasp vs1, vs33, vs11 .endif .if \IsLast==1 addi \AREG, \AREG, DISP4(\Index,16) addi \BREG, \BREG, DISP32(\Index,128) .endif .endm .macro SAVE8x1 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 add T4, T2, T10 add T5, T3, T10 add T6, T4, T10 add T7, T5, T10 /*convert alpha_r for multiply*/ xscvspdp vs4,alpha_r /* v0 corresponds to vs32, do not forget*/ #if !defined(TRMMKERNEL) lxssp v0,0(CO) lxssp v2,0(T1) lxssp v4,0(T2) lxssp v6,0(T3) lxssp v8,0(T4) lxssp v10,0(T5) lxssp v12,0(T6) lxssp v14,0(T7) #endif xscvspdp vs24, vs0 xxspltw vs25, vs0, 1 xxspltw vs26, vs0, 2 xxspltw vs27, vs0, 3 xscvspdp vs25,vs25 xscvspdp vs26,vs26 xscvspdp vs27,vs27 xscvspdp vs28, vs1 xxspltw vs29, vs1, 1 xxspltw vs30, vs1, 2 xxspltw vs31, vs1, 3 xscvspdp vs29,vs29 xscvspdp vs30,vs30 xscvspdp vs31,vs31 #if defined(TRMMKERNEL) xsmuldp vs32,vs27, vs4 xsmuldp vs34,vs26, vs4 xsmuldp vs36,vs25, vs4 xsmuldp vs38,vs24, vs4 
xsmuldp vs40,vs31, vs4 xsmuldp vs42,vs30, vs4 xsmuldp vs44,vs29, vs4 xsmuldp vs46,vs28, vs4 #else xsmaddadp vs32,vs27, vs4 xsmaddadp vs34,vs26, vs4 xsmaddadp vs36,vs25, vs4 xsmaddadp vs38,vs24, vs4 xsmaddadp vs40,vs31, vs4 xsmaddadp vs42,vs30, vs4 xsmaddadp vs44,vs29, vs4 xsmaddadp vs46,vs28, vs4 #endif stxssp v0,0(CO) stxssp v2,0(T1) stxssp v4,0(T2) stxssp v6,0(T3) stxssp v8,0(T4) stxssp v10,0(T5) stxssp v12,0(T6) stxssp v14,0(T7) addi CO,CO,4 .endm /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ .macro LOAD4x16_1 LOAD4x16 1 .endm .macro LOAD4x16_0 LOAD4x16 0 .endm .macro KERNEL4x16_L1_L4 Index,IsLast KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 .endm .macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro Zero4X16 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 xxlxor vs36, vs36, vs36 xxlxor vs37, vs37, vs37 xxlxor vs38, vs38, vs38 xxlxor vs39, vs39, vs39 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs42, vs42, vs42 xxlxor vs43, vs43, vs43 xxlxor vs44, vs44, vs44 xxlxor vs45, vs45, vs45 xxlxor vs46, vs46, vs46 xxlxor vs47, vs47, vs47 .endm .macro LOAD4x16 Zero lxv vs24, 0(BO) lxv vs0, 0(AO) lxv vs1, 16(AO) lxv vs2, 32(AO) lxv vs3, 48(AO) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs27, vs26, vs26,2 .if \Zero==1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 xxlxor vs36, vs36, vs36 xxlxor vs37, vs37, vs37 xxlxor vs38, vs38, vs38 xxlxor vs39, vs39, vs39 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs42, vs42, vs42 xxlxor vs43, vs43, vs43 xxlxor vs44, vs44, vs44 xxlxor vs45, vs45, vs45 xxlxor vs46, vs46, vs46 xxlxor vs47, vs47, vs47 .endif .endm .macro END4x16_NORMAL END4x16 0, AO, BO, 64,16 .endm .macro END4x16 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs34, vs2,vs24 xvmulsp vs35, vs3,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 xvmulsp vs38, vs2,vs25 xvmulsp vs39, vs3,vs25 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs42, vs2,vs26 xvmulsp vs43, vs3,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 xvmulsp vs46, vs2,vs27 xvmulsp vs47, vs3,vs27 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, 
vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 .endif .endm .macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxpermdi vs9, vs8, vs8,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 xxpermdi vs11, vs10, vs10,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs34, vs6,vs8 xvmaddasp vs35, vs7,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xvmaddasp vs38, vs6,vs9 xvmaddasp vs39, vs7,vs9 xxpermdi vs27, vs26, vs26,2 xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs42, vs6,vs10 xvmaddasp vs43, vs7,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs46, vs6,vs11 xvmaddasp vs47, vs7,vs11 lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxpermdi vs9, vs8, vs8,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 xxpermdi vs11, vs10, vs10,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 .if \Complete==0 lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) .else addi \BREG, \BREG, DISP16(\Index,64) addi \AREG, \AREG, DISP64(\Index,256) .endif .endif xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs34, vs6,vs8 xvmaddasp vs35, vs7,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xvmaddasp vs38, vs6,vs9 xvmaddasp vs39, vs7,vs9 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 .endif xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs42, vs6,vs10 xvmaddasp vs43, vs7,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs46, vs6,vs11 xvmaddasp vs47, vs7,vs11 .endm .macro KERNEL4x16 First LOAD4x16 0 END4x16 \First, AO, BO, 64,16 .endm .macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) lxv vs6, 
DISP32(\Index,32+\OffsetA)(\AREG) lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxpermdi vs9, vs8, vs8,2 .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs34, vs2,vs24 xvmulsp vs35, vs3,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 xvmulsp vs38, vs2,vs25 xvmulsp vs39, vs3,vs25 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs34, vs2,vs24 xvmaddasp vs35, vs3,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs38, vs2,vs25 xvmaddasp vs39, vs3,vs25 .endif xxpermdi vs11, vs10, vs10,2 .if \First==1 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs42, vs2,vs26 xvmulsp vs43, vs3,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 xvmulsp vs46, vs2,vs27 xvmulsp vs47, vs3,vs27 .else xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs42, vs2,vs26 xvmaddasp vs43, vs3,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 xvmaddasp vs46, vs2,vs27 xvmaddasp vs47, vs3,vs27 .endif .if \Complete==0 lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) .else addi \BREG, \BREG, DISP8(\Index,32) addi \AREG, \AREG, DISP32(\Index,128) .endif .endif .if \First==1 xvmulsp vs32, vs4,vs8 xvmulsp vs33, vs5,vs8 xvmulsp vs34, vs6,vs8 xvmulsp vs35, vs7,vs8 xvmulsp vs36, vs4,vs9 xvmulsp vs37, vs5,vs9 xvmulsp vs38, vs6,vs9 xvmulsp vs39, vs7,vs9 .else xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs34, vs6,vs8 xvmaddasp vs35, vs7,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xvmaddasp vs38, vs6,vs9 xvmaddasp vs39, vs7,vs9 .endif .if \Complete==0 xxpermdi vs27, vs26, vs26,2 .endif .if \First==1 xvmulsp vs40, vs4,vs10 xvmulsp vs41, vs5,vs10 xvmulsp vs42, vs6,vs10 xvmulsp vs43, vs7,vs10 xvmulsp vs44, vs4,vs11 xvmulsp vs45, vs5,vs11 xvmulsp vs46, vs6,vs11 xvmulsp vs47, vs7,vs11 .else xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs42, vs6,vs10 xvmaddasp vs43, vs7,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 xvmaddasp vs46, vs6,vs11 xvmaddasp vs47, vs7,vs11 .endif .endm .macro SAVE4x16 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 xxmrglw vs8, vs32, vs44 xxmrglw vs10, vs36, vs40 xxmrghw vs1, vs32, vs44 xxmrghw vs0, vs36, vs40 xxmrglw vs12, vs33, vs45 xxmrglw vs14, vs37, vs41 xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 xxmrglw vs16, vs34, vs46 xxmrglw vs18, vs38, vs42 xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxmrghw vs4, vs38, vs42 xxmrghw vs5, vs34, vs46 xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxmrglw vs24, vs35, vs47 xxmrglw vs26, vs39, vs43 xxlor vs17, vs16, vs16 xxlor vs19, vs18, vs18 xxmrghw vs30, vs39, vs43 xxmrghw vs31, vs35, vs47 xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 xxperm vs9, vs0, save_permute_2 xxperm vs11, vs1, save_permute_2 #ifndef TRMMKERNEL lxv vs32, 0(CO) lxv vs33, 16(CO) lxv vs34, 32(CO) lxv vs35, 48(CO) #endif xxlor vs25, vs24, vs24 xxlor vs27, vs26, vs26 #ifndef TRMMKERNEL lxv vs36, 0(T1) lxv vs37, 16(T1) lxv vs38, 32(T1) lxv vs39, 48(T1) #endif #ifndef TRMMKERNEL lxv vs40, 0(T2) lxv vs41, 16(T2) lxv vs42, 32(T2) lxv vs43, 48(T2) #endif #ifndef TRMMKERNEL lxv vs44, 0(T3) lxv vs45, 16(T3) lxv vs46, 32(T3) lxv vs47, 48(T3) #endif xxperm vs12, vs2, 
save_permute_1 xxperm vs14, vs3, save_permute_1 xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 xxperm vs16, vs4, save_permute_1 xxperm vs18, vs5, save_permute_1 xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 xxperm vs24, vs30, save_permute_1 xxperm vs26, vs31, save_permute_1 xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 /* multiply add normal way */ #ifdef TRMMKERNEL xvmulsp vs32, vs8, alpha_r xvmulsp vs33, vs12, alpha_r xvmulsp vs34, vs16, alpha_r xvmulsp vs35, vs24, alpha_r xvmulsp vs36, vs9, alpha_r xvmulsp vs37, vs13, alpha_r xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r #else xvmaddasp vs32, vs8, alpha_r xvmaddasp vs33, vs12, alpha_r xvmaddasp vs34, vs16, alpha_r xvmaddasp vs35, vs24, alpha_r xvmaddasp vs36, vs9, alpha_r xvmaddasp vs37, vs13, alpha_r xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r xvmulsp vs41, vs14, alpha_r xvmulsp vs42, vs18, alpha_r xvmulsp vs43, vs26, alpha_r xvmulsp vs44, vs11, alpha_r xvmulsp vs45, vs15, alpha_r xvmulsp vs46, vs19, alpha_r xvmulsp vs47, vs27, alpha_r #else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r xvmaddasp vs42, vs18, alpha_r xvmaddasp vs43, vs26, alpha_r xvmaddasp vs44, vs11, alpha_r xvmaddasp vs45, vs15, alpha_r xvmaddasp vs46, vs19, alpha_r xvmaddasp vs47, vs27, alpha_r #endif stxv vs32, 0(CO) stxv vs33, 16(CO) stxv vs34, 32(CO) stxv vs35, 48(CO) stxv vs36, 0(T1) stxv vs37, 16(T1) stxv vs38, 32(T1) stxv vs39, 48(T1) stxv vs40, 0(T2) stxv vs41, 16(T2) stxv vs42, 32(T2) stxv vs43, 48(T2) stxv vs44, 0(T3) stxv vs45, 16(T3) stxv vs46, 32(T3) stxv vs47, 48(T3) addi CO,CO,64 .endm /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro LOAD4x8_1 LOAD4x8 1 .endm .macro LOAD4x8_0 LOAD4x8 0 .endm .macro KERNEL4x8_L1_L4 Index,IsLast KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 .endm .macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro END4x8_NORMAL END4x8 0, AO, BO, 32,16 .endm .macro Zero4X8 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs36, vs36, vs36 xxlxor vs37, vs37, vs37 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs44, vs44, vs44 xxlxor vs45, vs45, vs45 .endm .macro LOAD4x8 Zero lxv vs24, 0(BO) lxv vs0, 0(AO) lxv vs1, 16(AO) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs27, vs26, vs26,2 .if \Zero==1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs36, vs36, vs36 xxlxor vs37, vs37, vs37 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs44, vs44, vs44 xxlxor vs45, vs45, vs45 .endif .endm .macro END4x8 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, 
\BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 .endif .endm .macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxpermdi vs9, vs8, vs8,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xxpermdi vs11, vs10, vs10,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 xxpermdi vs27, vs26, vs26,2 xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxpermdi vs9, vs8, vs8,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 xxpermdi vs11, vs10, vs10,2 xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 .if \Complete==0 lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 .endif .if \IsLast==1 .if \Complete==1 addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) .else addi \BREG, \BREG, DISP16(\Index,64) addi \AREG, \AREG, DISP32(\Index,128) .endif .endif xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 .endif xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 .endm .macro KERNEL4x8 First LOAD4x8 0 END4x8 \First, AO, BO, 32,16 .endm .macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) xxperm vs10, vs8, permute_mask xxpermdi vs9, vs8, vs8,2 .if \First==1 xvmulsp vs32, vs0,vs24 xvmulsp vs33, vs1,vs24 xvmulsp vs36, vs0,vs25 xvmulsp vs37, vs1,vs25 .else xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 .endif xxpermdi vs11, vs10, vs10,2 .if \First==1 xvmulsp vs40, vs0,vs26 xvmulsp vs41, vs1,vs26 xvmulsp vs44, vs0,vs27 xvmulsp vs45, vs1,vs27 .else xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 .endif .if \Complete==0 lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) xxperm vs26, vs24, permute_mask xxpermdi vs25, vs24, vs24,2 .endif .if \IsLast==1 .if 
\Complete==1 addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) .else addi \BREG, \BREG, DISP8(\Index,32) addi \AREG, \AREG, DISP16(\Index,64) .endif .endif .if \First==1 xvmulsp vs32, vs4,vs8 xvmulsp vs33, vs5,vs8 xvmulsp vs36, vs4,vs9 xvmulsp vs37, vs5,vs9 .else xvmaddasp vs32, vs4,vs8 xvmaddasp vs33, vs5,vs8 xvmaddasp vs36, vs4,vs9 xvmaddasp vs37, vs5,vs9 .endif .if \Complete==0 xxpermdi vs27, vs26, vs26,2 .endif .if \First==1 xvmulsp vs40, vs4,vs10 xvmulsp vs41, vs5,vs10 xvmulsp vs44, vs4,vs11 xvmulsp vs45, vs5,vs11 .else xvmaddasp vs40, vs4,vs10 xvmaddasp vs41, vs5,vs10 xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 .endif .endm .macro SAVE4x8 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 #ifndef TRMMKERNEL lxv vs34, 0(CO) lxv vs35, 16(CO) lxv vs38, 0(T1) lxv vs39, 16(T1) lxv vs42, 0(T2) lxv vs43, 16(T2) lxv vs46, 0(T3) lxv vs47, 16(T3) #endif xxmrglw vs8, vs32, vs44 xxmrglw vs10, vs36, vs40 xxmrghw vs1, vs32, vs44 xxmrghw vs0, vs36, vs40 xxmrglw vs12, vs33, vs45 xxmrglw vs14, vs37, vs41 xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 xxperm vs9, vs0, save_permute_2 xxperm vs11, vs1, save_permute_2 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 /* multiply add normal way */ #ifdef TRMMKERNEL xvmulsp vs34, vs8, alpha_r xvmulsp vs35, vs12, alpha_r xvmulsp vs38, vs9, alpha_r xvmulsp vs39, vs13, alpha_r xvmulsp vs42, vs10, alpha_r xvmulsp vs43, vs14, alpha_r xvmulsp vs46, vs11, alpha_r xvmulsp vs47, vs15, alpha_r #else xvmaddasp vs34, vs8, alpha_r xvmaddasp vs35, vs12, alpha_r xvmaddasp vs38, vs9, alpha_r xvmaddasp vs39, vs13, alpha_r xvmaddasp vs42, vs10, alpha_r xvmaddasp vs43, vs14, alpha_r xvmaddasp vs46, vs11, alpha_r xvmaddasp vs47, vs15, alpha_r #endif stxv vs34, 0(CO) stxv vs35, 16(CO) stxv vs38, 0(T1) stxv vs39, 16(T1) stxv vs42, 0(T2) stxv vs43, 16(T2) stxv vs46, 0(T3) stxv vs47, 16(T3) addi CO,CO,32 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro LOAD4x4_1 LOAD4x4 1 .endm .macro LOAD4x4_0 LOAD4x4 0 .endm .macro KERNEL4x4_L1_L4 Index,IsLast KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 .endm .macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro Zero4X4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 .endm .macro LOAD4x4 Zero lxv vs0, 0(AO) lxv vs24, 0(BO) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 .if \Zero==1 xxlxor vs32, 
vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 .endif .endm .macro END4x4_NORMAL END4x4 0, AO, BO, 16,16 .endm .macro END4x4 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif .if \First==1 xvmulsp vs32, vs24, vs0 xvmulsp vs33, vs24, vs1 xvmulsp vs34, vs24, vs2 xvmulsp vs35, vs24, vs3 .else xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 .endif .endm .macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) xxperm vs6, vs4, permute_mask xxpermdi vs5, vs4, vs4,2 xxpermdi vs7, vs6, vs6,2 xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 xvmaddasp vs32, vs26, vs4 xvmaddasp vs33, vs26, vs5 xvmaddasp vs34, vs26, vs6 xvmaddasp vs35, vs26, vs7 lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) xxperm vs6, vs4, permute_mask xxpermdi vs5, vs4, vs4,2 xxpermdi vs7, vs6, vs6,2 xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 .if \Complete==0 lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 .endif xvmaddasp vs32, vs26, vs4 xvmaddasp vs33, vs26, vs5 xvmaddasp vs34, vs26, vs6 xvmaddasp vs35, vs26, vs7 .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) .else addi \AREG, \AREG, DISP16(\Index,64) addi \BREG, \BREG, DISP16(\Index,64) .endif .endif .endm .macro KERNEL4x4 First LOAD4x4 0 END4x4 \First, AO, BO, 16,16 .endm .macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) xxperm vs6, vs4, permute_mask xxpermdi vs5, vs4, vs4,2 xxpermdi vs7, vs6, vs6,2 .if \First==1 xvmulsp vs32, vs24, vs0 xvmulsp vs33, vs24, vs1 xvmulsp vs34, vs24, vs2 xvmulsp vs35, vs24, vs3 .else xvmaddasp vs32, vs24, vs0 xvmaddasp vs33, vs24, vs1 xvmaddasp vs34, vs24, vs2 xvmaddasp vs35, vs24, vs3 .endif .if \Complete==0 lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) xxperm vs2, vs0, permute_mask xxpermdi vs1, vs0, vs0,2 xxpermdi vs3, vs2, vs2,2 .endif .if \First==1 xvmulsp vs32, vs26, vs4 xvmulsp vs33, vs26, vs5 xvmulsp vs34, vs26, vs6 xvmulsp vs35, vs26, vs7 .else xvmaddasp vs32, vs26, vs4 xvmaddasp vs33, vs26, vs5 xvmaddasp vs34, vs26, vs6 xvmaddasp vs35, vs26, vs7 .endif .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) .else addi \AREG, \AREG, DISP8(\Index,32) addi \BREG, \BREG, DISP8(\Index,32) .endif .endif .endm .macro SAVE4x4 slwi T10, LDC , 1 add T1, CO, LDC #if !defined(TRMMKERNEL) lxv vs36, 0(CO) lxv vs37, 0(T1) #endif add T2, CO, T10 add T3, T1, T10 #if !defined(TRMMKERNEL) lxv vs38, 0(T2) lxv vs39, 0(T3) #endif xxmrglw vs0, vs35,vs32 xxmrglw vs1, vs34,vs33 xxmrglw vs4, vs32,vs35 xxmrglw vs5, vs33,vs34 xxmrghw vs2, vs35,vs32 xxmrghw vs3, vs34,vs33 xxmrghw vs6, vs32,vs35 xxmrghw vs7, vs33,vs34 xxmrgld vs24, vs1, vs0 
xxmrghd vs25,vs5,vs4 xxmrgld vs26, vs2, vs3 xxmrghd vs27,vs6,vs7 #if defined(TRMMKERNEL) xvmulsp vs36, vs24, alpha_r xvmulsp vs37, vs25, alpha_r xvmulsp vs38, vs26, alpha_r xvmulsp vs39, vs27, alpha_r #else xvmaddasp vs36, vs24, alpha_r xvmaddasp vs37, vs25, alpha_r xvmaddasp vs38, vs26, alpha_r xvmaddasp vs39, vs27, alpha_r #endif stxv vs36, 0(CO) stxv vs37, 0(T1) stxv vs38, 0(T2) stxv vs39, 0(T3) addi CO,CO,16 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero4x2 xxlxor vs0, vs0, vs0 xxlxor vs2, vs2, vs2 .endm .macro KERNEL4x2 KERNEL4x2_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs2, vs26, vs9 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs2, vs26, vs9 .endif addi \AREG, \AREG, DISP2(\Index,8) addi \BREG, \BREG, DISP4(\Index,16) .endm .macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) xxspltw vs8, vs4, 2 xxspltw vs9, vs4, 3 xxspltw vs10, vs4, 0 xxspltw vs11, vs4, 1 .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs2, vs26, vs9 xvmulsp vs0, vs28, vs10 xvmulsp vs2, vs28, vs11 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs2, vs26, vs9 xvmaddasp vs0, vs28, vs10 xvmaddasp vs2, vs28, vs11 .endif .if \IsLast==1 addi \AREG, \AREG, DISP4(\Index,16) addi \BREG, \BREG, DISP8(\Index,32) .endif .endm .macro SAVE4x2 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 /*convert alpha_r for multiply*/ xscvspdp vs4,alpha_r /* v0 corresponds to vs32, do not forget*/ #if !defined(TRMMKERNEL) lxssp v0,0(CO) lxssp v1,4(CO) lxssp v2,0(T1) lxssp v3,4(T1) lxssp v4,0(T2) lxssp v5,4(T2) lxssp v6,0(T3) lxssp v7,4(T3) #endif xscvspdp vs5, vs2 xxspltw vs6, vs2, 1 xxspltw vs7, vs2, 2 xxspltw vs8, vs2, 3 xscvspdp vs6,vs6 xscvspdp vs7,vs7 xscvspdp vs8,vs8 xscvspdp vs24, vs0 xxspltw vs25, vs0, 1 xxspltw vs26, vs0, 2 xxspltw vs27, vs0, 3 xscvspdp vs25,vs25 xscvspdp vs26,vs26 xscvspdp vs27,vs27 #if defined(TRMMKERNEL) xsmuldp vs32,vs8, vs4 xsmuldp vs33,vs27, vs4 xsmuldp vs34,vs7, vs4 xsmuldp vs35,vs26, vs4 xsmuldp vs36,vs6, vs4 xsmuldp vs37,vs25, vs4 xsmuldp vs38,vs5, vs4 xsmuldp vs39,vs24, vs4 #else xsmaddadp vs32,vs8, vs4 xsmaddadp vs33,vs27, vs4 xsmaddadp vs34,vs7, vs4 xsmaddadp vs35,vs26, vs4 xsmaddadp vs36,vs6, vs4 xsmaddadp vs37,vs25, vs4 xsmaddadp vs38,vs5, vs4 xsmaddadp vs39,vs24, vs4 #endif stxssp v0,0(CO) stxssp v1,4(CO) stxssp v2,0(T1) stxssp v3,4(T1) stxssp v4,0(T2) stxssp v5,4(T2) stxssp v6,0(T3) stxssp v7,4(T3) addi CO,CO,8 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero4x1 xxlxor vs0, vs0, vs0 .endm .macro KERNEL4x1 KERNEL4x1_1 AO,BO, 0 .endm .macro KERNEL4x1_2 KERNEL4x1_2_1 AO,BO, 0 .endm .macro KERNEL4x1_1 AREG,BREG,First lxvwsx vs8, 0, \AREG lxv vs26, 0(\BREG) .if 
\First==1 xvmulsp vs0, vs26, vs8 .else xvmaddasp vs0, vs26, vs8 .endif addi \AREG, \AREG, 4 addi \BREG, \BREG, 16 .endm .macro KERNEL4x1_2_1 AREG,BREG,First lxsd v4, 0(\AREG) lxv vs26, 0(\BREG) lxv vs28, 16(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs0, vs28, vs9 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs0, vs28, vs9 .endif addi \AREG, \AREG, 8 addi \BREG, \BREG, 32 .endm .macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) xxspltw vs8, vs4, 3 xxspltw vs9, vs4, 2 xxspltw vs10, vs4, 1 xxspltw vs11, vs4, 0 lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs0, vs28, vs9 xvmulsp vs0, vs30, vs10 xvmulsp vs0, vs32, vs11 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs0, vs28, vs9 xvmaddasp vs0, vs30, vs10 xvmaddasp vs0, vs32, vs11 .endif .if \IsLast==1 addi \AREG, \AREG, DISP4(\Index,16) addi \BREG, \BREG, DISP16(\Index,64) .endif .endm .macro SAVE4x1 slwi T10, LDC , 1 add T1, CO, LDC add T2, CO, T10 add T3, T1, T10 /*convert alpha_r for multiply*/ xscvspdp vs4,alpha_r /* v0 corresponds to vs32, do not forget*/ #if !defined(TRMMKERNEL) lxssp v0,0(CO) lxssp v2,0(T1) lxssp v4,0(T2) lxssp v6,0(T3) #endif xscvspdp vs24, vs0 xxspltw vs25, vs0, 1 xxspltw vs26, vs0, 2 xxspltw vs27, vs0, 3 xscvspdp vs25,vs25 xscvspdp vs26,vs26 xscvspdp vs27,vs27 #if defined(TRMMKERNEL) xsmuldp vs32,vs27, vs4 xsmuldp vs34,vs26, vs4 xsmuldp vs36,vs25, vs4 xsmuldp vs38,vs24, vs4 #else xsmaddadp vs32,vs27, vs4 xsmaddadp vs34,vs26, vs4 xsmaddadp vs36,vs25, vs4 xsmaddadp vs38,vs24, vs4 #endif stxssp v0,0(CO) stxssp v2,0(T1) stxssp v4,0(T2) stxssp v6,0(T3) addi CO,CO,4 .endm /****************************N=2 section*****************/ .macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero2x16 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2, vs2, vs2 xxlxor vs3, vs3, vs3 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 .endm .macro KERNEL2x16 KERNEL2x16_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 xvmulsp vs2, vs28, vs8 xvmulsp vs3, vs29, vs8 xvmulsp vs4, vs26, vs9 xvmulsp vs5, vs27, vs9 xvmulsp vs6, vs28, vs9 xvmulsp vs7, vs29, vs9 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs28, vs8 xvmaddasp vs3, vs29, vs8 xvmaddasp vs4, vs26, vs9 xvmaddasp vs5, vs27, vs9 xvmaddasp vs6, vs28, vs9 xvmaddasp vs7, vs29, vs9 .endif addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP16(\Index,64) .endm .macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) lxv vs17, 
DISP64(\Index,64+ 16+\OffsetA)(\AREG) lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) xxspltw vs8, vs38, 3 xxspltw vs9, vs38, 2 xxspltw vs10, vs38, 1 xxspltw vs11, vs38, 0 xxspltw vs12, vs39, 3 xxspltw vs13, vs39, 2 xxspltw vs14, vs39, 1 xxspltw vs15, vs39, 0 xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs28, vs8 xvmaddasp vs3, vs29, vs8 xvmaddasp vs4, vs26, vs9 xvmaddasp vs5, vs27, vs9 xvmaddasp vs6, vs28, vs9 xvmaddasp vs7, vs29, vs9 xvmaddasp vs0, vs16, vs10 xvmaddasp vs1, vs17, vs10 xvmaddasp vs2, vs18, vs10 xvmaddasp vs3, vs19, vs10 xvmaddasp vs4, vs16, vs11 xvmaddasp vs5, vs17, vs11 xvmaddasp vs6, vs18, vs11 xvmaddasp vs7, vs19, vs11 xvmaddasp vs0, vs30, vs12 xvmaddasp vs1, vs31, vs12 xvmaddasp vs2, vs32, vs12 xvmaddasp vs3, vs33, vs12 xvmaddasp vs4, vs30, vs13 xvmaddasp vs5, vs31, vs13 xvmaddasp vs6, vs32, vs13 xvmaddasp vs7, vs33, vs13 xvmaddasp vs0, vs34, vs14 xvmaddasp vs1, vs35, vs14 xvmaddasp vs2, vs36, vs14 xvmaddasp vs3, vs37, vs14 xvmaddasp vs4, vs34, vs15 xvmaddasp vs5, vs35, vs15 xvmaddasp vs6, vs36, vs15 xvmaddasp vs7, vs37, vs15 .if \IsLast==1 addi \BREG, \BREG, DISP8(\Index,32) addi \AREG, \AREG, DISP64(\Index,256) .endif .endm .macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 3 xxspltw vs9, vs36, 2 xxspltw vs10, vs36, 1 xxspltw vs11, vs36, 0 lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs28, vs8 xvmaddasp vs3, vs29, vs8 xvmaddasp vs4, vs26, vs9 xvmaddasp vs5, vs27, vs9 xvmaddasp vs6, vs28, vs9 xvmaddasp vs7, vs29, vs9 xvmaddasp vs0, vs16, vs10 xvmaddasp vs1, vs17, vs10 xvmaddasp vs2, vs18, vs10 xvmaddasp vs3, vs19, vs10 xvmaddasp vs4, vs16, vs11 xvmaddasp vs5, vs17, vs11 xvmaddasp vs6, vs18, vs11 xvmaddasp vs7, vs19, vs11 .if \IsLast==1 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP32(\Index,128) .endif .endm .macro SAVE2x16 #ifndef TRMMKERNEL lxv vs16, 0(CO) lxv vs17, 16(CO) lxv vs18, 32(CO) lxv vs19, 48(CO) #endif add T1, CO, LDC #ifndef TRMMKERNEL lxv vs26, 0(T1) lxv vs27, 16(T1) lxv vs28, 32(T1) lxv vs29, 48(T1) #endif #if defined(TRMMKERNEL) xvmulsp vs16, vs0, alpha_r xvmulsp vs17, vs1, alpha_r xvmulsp vs18, vs2, alpha_r xvmulsp vs19, vs3, alpha_r xvmulsp vs26, vs4, alpha_r xvmulsp vs27, vs5, alpha_r xvmulsp vs28, vs6, alpha_r xvmulsp vs29, vs7, alpha_r #else xvmaddasp vs16, vs0, alpha_r xvmaddasp vs17, vs1, alpha_r xvmaddasp vs18, vs2, alpha_r xvmaddasp vs19, vs3, alpha_r xvmaddasp vs26, vs4, alpha_r xvmaddasp vs27, vs5, alpha_r xvmaddasp vs28, vs6, alpha_r xvmaddasp vs29, vs7, alpha_r #endif stxv vs16, 0(CO) stxv vs17, 16(CO) stxv vs18, 32(CO) stxv vs19, 48(CO) stxv vs26, 0(T1) stxv vs27, 16(T1) stxv vs28, 32(T1) stxv vs29, 48(T1) 
addi CO,CO,64 .endm /* M=8 N=2 */ .macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero2x8 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 .endm .macro KERNEL2x8 KERNEL2x8_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 xvmulsp vs4, vs26, vs9 xvmulsp vs5, vs27, vs9 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs4, vs26, vs9 xvmaddasp vs5, vs27, vs9 .endif addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP8(\Index,32) .endm .macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) xxspltw vs8, vs38, 3 xxspltw vs9, vs38, 2 xxspltw vs10, vs38, 1 xxspltw vs11, vs38, 0 xxspltw vs12, vs39, 3 xxspltw vs13, vs39, 2 xxspltw vs14, vs39, 1 xxspltw vs15, vs39, 0 xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs4, vs26, vs9 xvmaddasp vs5, vs27, vs9 xvmaddasp vs0, vs16, vs10 xvmaddasp vs1, vs17, vs10 xvmaddasp vs4, vs16, vs11 xvmaddasp vs5, vs17, vs11 xvmaddasp vs0, vs30, vs12 xvmaddasp vs1, vs31, vs12 xvmaddasp vs4, vs30, vs13 xvmaddasp vs5, vs31, vs13 xvmaddasp vs0, vs34, vs14 xvmaddasp vs1, vs35, vs14 xvmaddasp vs4, vs34, vs15 xvmaddasp vs5, vs35, vs15 .if \IsLast==1 addi \BREG, \BREG, DISP8(\Index,32) addi \AREG, \AREG, DISP32(\Index,128) .endif .endm .macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 3 xxspltw vs9, vs36, 2 xxspltw vs10, vs36, 1 xxspltw vs11, vs36, 0 lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs4, vs26, vs9 xvmaddasp vs5, vs27, vs9 xvmaddasp vs0, vs16, vs10 xvmaddasp vs1, vs17, vs10 xvmaddasp vs4, vs16, vs11 xvmaddasp vs5, vs17, vs11 .if \IsLast==1 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP16(\Index,64) .endif .endm .macro SAVE2x8 #ifndef TRMMKERNEL lxv vs16, 0(CO) lxv vs17, 16(CO) #endif add T1, CO, LDC #ifndef TRMMKERNEL lxv vs26, 0(T1) lxv vs27, 16(T1) #endif #if defined(TRMMKERNEL) xvmulsp vs16, vs0, alpha_r xvmulsp vs17, vs1, alpha_r xvmulsp vs26, vs4, alpha_r xvmulsp vs27, vs5, alpha_r #else xvmaddasp vs16, vs0, alpha_r xvmaddasp vs17, vs1, alpha_r xvmaddasp vs26, vs4, alpha_r xvmaddasp vs27, vs5, alpha_r #endif stxv vs16, 0(CO) stxv vs17, 16(CO) stxv vs26, 0(T1) stxv vs27, 16(T1) addi CO,CO,32 .endm /*M=4*/ .macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm /* we will aggregate on save vs0 +vs4 vs11+vs5 */ .macro Zero2x4 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs4, 
vs4, vs4 xxlxor vs5, vs5, vs5 .endm .macro KERNEL2x4 KERNEL2x4_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs26, vs9 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs26, vs9 .endif addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP4(\Index,16) .endm .macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) xxspltw vs8, vs38, 3 xxspltw vs9, vs38, 2 xxspltw vs10, vs38, 1 xxspltw vs11, vs38, 0 xxspltw vs12, vs39, 3 xxspltw vs13, vs39, 2 xxspltw vs14, vs39, 1 xxspltw vs15, vs39, 0 xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs26, vs9 xvmaddasp vs4, vs16, vs10 xvmaddasp vs5, vs16, vs11 xvmaddasp vs0, vs30, vs12 xvmaddasp vs1, vs30, vs13 xvmaddasp vs4, vs34, vs14 xvmaddasp vs5, vs34, vs15 .if \IsLast==1 addi \BREG, \BREG, DISP8(\Index,32) addi \AREG, \AREG, DISP16(\Index,64) .endif .endm .macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 3 xxspltw vs9, vs36, 2 xxspltw vs10, vs36, 1 xxspltw vs11, vs36, 0 lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs26, vs9 xvmaddasp vs4, vs16, vs10 xvmaddasp vs5, vs16, vs11 .if \IsLast==1 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP8(\Index,32) .endif .endm .macro SAVE2x4 #ifndef TRMMKERNEL lxv vs16, 0(CO) #endif add T1, CO, LDC #ifndef TRMMKERNEL lxv vs26, 0(T1) #endif /*aggregate vectors*/ xvaddsp vs0,vs0,vs4 xvaddsp vs1,vs1,vs5 #if defined(TRMMKERNEL) xvmulsp vs16, vs0, alpha_r xvmulsp vs26, vs1, alpha_r #else xvmaddasp vs16, vs0, alpha_r xvmaddasp vs26, vs1, alpha_r #endif stxv vs16, 0(CO) stxv vs26, 0(T1) addi CO,CO,16 .endm /* M=2 N=2 we will have an inner permute action; before, permute was reversing 3,2,1,0, now it will inner-reverse 1,0,3,2 */ .macro SWITCH_PERMUTE_INNER xxpermdi permute_mask, permute_mask, permute_mask,2 .endm .macro Zero2x2 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 SWITCH_PERMUTE_INNER .endm .macro KERNEL2x2 KERNEL2x2_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) xxperm vs9, vs36, permute_mask lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) .if \First==1 xvmulsp vs0, vs37, vs36 xvmulsp vs1, vs37, vs9 .else xvmaddasp vs0, vs37, vs36 xvmaddasp vs1, vs37, vs9 .endif addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP2(\Index,8) .endm .macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) xxperm vs9, vs8, permute_mask xxperm vs11, vs10, permute_mask xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs26, vs9 xvmaddasp vs0, vs16, vs10
xvmaddasp vs1, vs16, vs11 .if \IsLast==1 addi \BREG, \BREG, DISP8(\Index,32) addi \AREG, \AREG, DISP8(\Index,32) .endif .endm .macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) xxperm vs9, vs8, permute_mask xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs26, vs9 .if \IsLast==1 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP4(\Index,16) .endif .endm .macro SAVE2x2 #ifndef TRMMKERNEL lxsd v4 , 0(CO) #endif add T1, CO, LDC #ifndef TRMMKERNEL lxsd v5 , 0(T1) #endif /*aggregate vectors*/ xxpermdi vs4,vs0,vs0,2 xxpermdi vs5,vs1,vs1,2 xvaddsp vs0,vs0,vs4 xvaddsp vs1,vs1,vs5 /* */ /* let's correct the order to {00,10} and {01,11} from {00,11} {01,10} */ xxperm vs1,vs1, permute_mask xxmrghw vs2 ,vs1,vs0 xxpermdi vs2,vs2,vs2,2 xxmrghw vs3 ,vs0,vs1 #if defined(TRMMKERNEL) xvmulsp vs36, vs2, alpha_r xvmulsp vs37, vs3, alpha_r #else xvmaddasp vs36, vs2, alpha_r xvmaddasp vs37, vs3, alpha_r #endif /**** store last two words*/ stxsd v4, 0(CO) stxsd v5, 0(T1) addi CO,CO,8 .endm /*--------------------------- M=1 N=2 */ .macro Zero2x1 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2,vs2,vs2 xxlxor vs3,vs3,vs3 .endm .macro KERNEL2x1 KERNEL2x1_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm /* we will calculate one iteration alone, then add it to the batched ones */ .macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) .if \First==1 xvmulsp vs2, vs37, vs35 xvmulsp vs3, vs37, vs36 .else xsmaddadp vs2, vs37, vs35 xsmaddadp vs3, vs37, vs36 .endif addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP1(\Index,4) .endm .macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) xxmrglw vs5, vs26,vs26 xxmrghw vs6, vs26,vs26 xvmaddasp vs0, vs8, vs5 xvmaddasp vs1, vs10, vs6 .if \IsLast==1 addi \BREG, \BREG, DISP8(\Index,32) addi \AREG, \AREG, DISP4(\Index,16) .endif .endm .macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) xsmaddadp vs2, vs37, vs35 xsmaddadp vs3, vs37, vs36 xsmaddadp vs2, vs38, vs39 xsmaddadp vs3, vs38, vs40 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP2(\Index,8) .endm .macro SAVE2x1 #ifndef TRMMKERNEL lxssp v4 , 0(CO) #endif add T1, CO, LDC #ifndef TRMMKERNEL lxssp v5 , 0(T1) #endif /*convert alpha_r for multiply*/ xscvspdp vs16,alpha_r /*aggregate vectors from 2x1_4 */ xxpermdi vs4,vs0,vs0,2 xxpermdi vs5,vs1,vs1,2 xvaddsp vs0,vs0,vs4 xvaddsp vs1,vs1,vs5 xvaddsp vs0,vs0,vs1 /*aggregate 2x1_2 and 2x1_1 into the 2x1_4 sum*/ xscvspdp vs5, vs0 xxspltw vs6, vs0, 1 xscvspdp vs6,vs6 xsadddp vs2,vs2,vs6 xsadddp vs3,vs3,vs5 /**** store last two words*/ #if defined(TRMMKERNEL) xsmuldp vs36,vs2, vs16 xsmuldp vs37,vs3, vs16 #else xsmaddadp vs36,vs2, vs16 xsmaddadp vs37,vs3, vs16 #endif stxssp v4, 0(CO) stxssp v5, 0(T1) addi CO,CO,4 .endm /****************************N=1 section*****************/
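/* Reference sketch (not part of the build): a plain-C model of what the N=1 micro kernels below compute for a generic row count m and a packed A block; the names ref_kernel_1xM, m and k_len are illustrative only. The partial accumulators produced by the unrolled k loop are combined in the SAVE1x* macros before alpha is applied.

   static void ref_kernel_1xM(int m, int k_len, const float *A, const float *B, float *C, float alpha)
   {
       for (int i = 0; i < m; i++) {
           float acc = 0.0f;
           for (int k = 0; k < k_len; k++)
               acc += A[(long)k * m + i] * B[k];    // KERNEL1xM accumulation
   #ifdef TRMMKERNEL
           C[i] = alpha * acc;                      // SAVE1xM, TRMM path
   #else
           C[i] = alpha * acc + C[i];               // SAVE1xM, GEMM path
   #endif
       }
   }
*/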
.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero1x16 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2, vs2, vs2 xxlxor vs3, vs3, vs3 .endm .macro KERNEL1x16 KERNEL1x16_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) xscvdpspn vs36,vs36 xxspltw vs8, vs36, 0 lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 xvmulsp vs2, vs28, vs8 xvmulsp vs3, vs29, vs8 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs28, vs8 xvmaddasp vs3, vs29, vs8 .endif addi \BREG, \BREG, DISP1(\Index,4) addi \AREG, \AREG, DISP16(\Index,64) .endm .macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) xxspltw vs8, vs38, 3 xxspltw vs9, vs38, 2 lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) xxspltw vs10, vs38, 1 xxspltw vs11, vs38, 0 xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs28, vs8 xvmaddasp vs3, vs29, vs8 xvmaddasp vs0, vs16, vs9 xvmaddasp vs1, vs17, vs9 xvmaddasp vs2, vs18, vs9 xvmaddasp vs3, vs19, vs9 xvmaddasp vs0, vs30, vs10 xvmaddasp vs1, vs31, vs10 xvmaddasp vs2, vs32, vs10 xvmaddasp vs3, vs33, vs10 xvmaddasp vs0, vs34, vs11 xvmaddasp vs1, vs35, vs11 xvmaddasp vs2, vs36, vs11 xvmaddasp vs3, vs37, vs11 .if \IsLast==1 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP64(\Index,256) .endif .endm .macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs28, vs8 xvmaddasp vs3, vs29, vs8 xvmaddasp vs0, vs16, vs9 xvmaddasp vs1, vs17, vs9 xvmaddasp vs2, vs18, vs9 xvmaddasp vs3, vs19, vs9 .if \IsLast==1 addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP32(\Index,128) .endif .endm .macro SAVE1x16 #ifndef TRMMKERNEL lxv vs16, 0(CO) lxv vs17, 16(CO) lxv vs18, 32(CO) lxv vs19, 48(CO) #endif #if defined(TRMMKERNEL) xvmulsp vs16, vs0, alpha_r xvmulsp vs17, vs1, alpha_r xvmulsp vs18, vs2, alpha_r xvmulsp vs19, vs3, alpha_r #else xvmaddasp vs16, vs0, 
alpha_r xvmaddasp vs17, vs1, alpha_r xvmaddasp vs18, vs2, alpha_r xvmaddasp vs19, vs3, alpha_r #endif stxv vs16, 0(CO) stxv vs17, 16(CO) stxv vs18, 32(CO) stxv vs19, 48(CO) addi CO,CO,64 .endm /* M=8 N=1 */ .macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero1x8 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2, vs2, vs2 xxlxor vs3, vs3, vs3 .endm .macro KERNEL1x8 KERNEL1x8_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) xscvdpspn vs36,vs36 xxspltw vs8, vs36, 0 lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) .if \First==1 xvmulsp vs0, vs26, vs8 xvmulsp vs1, vs27, vs8 .else xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 .endif addi \BREG, \BREG, DISP1(\Index,4) addi \AREG, \AREG, DISP8(\Index,32) .endm .macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) xxspltw vs8, vs38, 3 xxspltw vs9, vs38, 2 lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) xxspltw vs10, vs38, 1 xxspltw vs11, vs38, 0 xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs16, vs9 xvmaddasp vs3, vs17, vs9 xvmaddasp vs0, vs30, vs10 xvmaddasp vs1, vs31, vs10 xvmaddasp vs2, vs34, vs11 xvmaddasp vs3, vs35, vs11 .if \IsLast==1 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP32(\Index,128) .endif .endm .macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs8 xvmaddasp vs2, vs16, vs9 xvmaddasp vs3, vs17, vs9 .if \IsLast==1 addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP16(\Index,64) .endif .endm .macro SAVE1x8 #ifndef TRMMKERNEL lxv vs16, 0(CO) lxv vs17, 16(CO) #endif /* aggregate vs0 vs2 and vs1 vs3*/ xvaddsp vs0,vs0,vs2 xvaddsp vs1,vs1,vs3 #if defined(TRMMKERNEL) xvmulsp vs16, vs0, alpha_r xvmulsp vs17, vs1, alpha_r #else xvmaddasp vs16, vs0, alpha_r xvmaddasp vs17, vs1, alpha_r #endif stxv vs16, 0(CO) stxv vs17, 16(CO) addi CO,CO,32 .endm /*M=4*/ .macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro Zero1x4 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2, vs2, vs2 xxlxor vs3, vs3, vs3 .endm .macro KERNEL1x4 KERNEL1x4_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) xscvdpspn vs36,vs36 xxspltw vs8, vs36, 0 lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) .if \First==1 xvmulsp vs0, vs26, vs8 .else xvmaddasp vs0, vs26, vs8 .endif addi \BREG, \BREG, DISP1(\Index,4) addi \AREG, \AREG, DISP4(\Index,16) .endm .macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv 
vs38, DISP4(\Index, 0+\OffsetB)(\BREG) lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) xxspltw vs8, vs38, 3 xxspltw vs9, vs38, 2 lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) xxspltw vs10, vs38, 1 xxspltw vs11, vs38, 0 xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs9 xvmaddasp vs2, vs30, vs10 xvmaddasp vs3, vs31, vs11 .if \IsLast==1 addi \BREG, \BREG, DISP4(\Index,16) addi \AREG, \AREG, DISP16(\Index,64) .endif .endm .macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) xxspltw vs8, vs36, 1 xxspltw vs9, vs36, 0 lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) xvmaddasp vs0, vs26, vs8 xvmaddasp vs1, vs27, vs9 .if \IsLast==1 addi \BREG, \BREG, DISP2(\Index,8) addi \AREG, \AREG, DISP8(\Index,32) .endif .endm .macro SAVE1x4 #ifndef TRMMKERNEL lxv vs16, 0(CO) #endif /* aggregate */ xvaddsp vs0,vs0,vs2 xvaddsp vs1,vs1,vs3 xvaddsp vs0,vs1,vs0 #if defined(TRMMKERNEL) xvmulsp vs16, vs0, alpha_r #else xvmaddasp vs16, vs0, alpha_r #endif stxv vs16, 0(CO) addi CO,CO,16 .endm /* M=2 N=1*/ .macro Zero1x2 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2,vs2,vs2 xxlxor vs3,vs3,vs3 .endm .macro KERNEL1x2 KERNEL1x2_1 AO,BO, 0, 0,0,0 .endm .macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm /* we will calculate 1 alone then will add it to batched ones */ .macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) .if \First==1 xvmuldp vs2, vs37, vs35 xvmuldp vs3, vs37, vs36 .else xsmaddadp vs2, vs37, vs35 xsmaddadp vs3, vs37, vs36 .endif addi \AREG, \AREG, DISP2(\Index,8) addi \BREG, \BREG, DISP1(\Index,4) .endm .macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) xxmrglw vs5, vs26,vs26 xxmrghw vs6, vs26,vs26 xvmaddasp vs0, vs8, vs5 xvmaddasp vs1, vs10, vs6 .if \IsLast==1 addi \AREG, \AREG, DISP8(\Index,32) addi \BREG, \BREG, DISP4(\Index,16) .endif .endm .macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) xsmaddadp vs2, vs37, vs35 xsmaddadp vs3, vs37, vs36 xsmaddadp vs2, vs38, vs39 xsmaddadp vs3, vs38, vs40 addi \AREG, \AREG, DISP4(\Index,16) addi \BREG, \BREG, DISP2(\Index,8) .endm .macro SAVE1x2 #ifndef TRMMKERNEL lxssp v4 , 0(CO) lxssp v5 , 4(CO) #endif /*convert alpha_r for multiply*/ xscvspdp vs16,alpha_r /*aggregate vectors 1x2_4 */ xxpermdi vs4,vs0,vs0,2 xxpermdi vs5,vs1,vs1,2 xvaddsp vs0,vs0,vs4 xvaddsp vs1,vs1,vs5 xvaddsp vs0,vs0,vs1 /*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ xscvspdp vs5, vs0 xxspltw vs6, vs0, 1 xscvspdp vs6,vs6 xsadddp vs2,vs2,vs6 xsadddp vs3,vs3,vs5 /**** store last two words*/ #if defined(TRMMKERNEL) xsmuldp vs36,vs2, vs16 xsmuldp vs37,vs3, vs16 #else xsmaddadp vs36,vs2, vs16 xsmaddadp vs37,vs3, vs16 #endif stxssp v4, 0(CO) stxssp v5, 4(CO) addi CO,CO,8 .endm /*///////////////// N=1 M=1 //////////////////*/ 
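/* Reference sketch (illustrative only, not part of the build): the N=1 M=1 kernel below is a dot product over k. The unrolled paths keep several vector partial sums (vs0..vs3) plus the scalar accumulator vs4 fed by KERNEL1x1_1, and SAVE1x1 reduces them to one value before applying alpha. The name ref_kernel_1x1 and the fixed 4-way split are assumptions for illustration; the real unroll factors are 16/8/4/2/1 as in the macros below.

   static float ref_kernel_1x1(int k_len, const float *A, const float *B, float c, float alpha)
   {
       float part[4] = {0.0f, 0.0f, 0.0f, 0.0f};    // models the vector partial sums vs0..vs3
       float tail = 0.0f;                           // models the scalar accumulator vs4
       int k = 0;
       for (; k + 4 <= k_len; k += 4)               // batched iterations
           for (int j = 0; j < 4; j++)
               part[j] += A[k + j] * B[k + j];
       for (; k < k_len; k++)                       // leftovers handled one at a time (KERNEL1x1_1)
           tail += A[k] * B[k];
       float acc = part[0] + part[1] + part[2] + part[3] + tail;   // SAVE1x1 reduction
   #ifdef TRMMKERNEL
       return alpha * acc;                          // SAVE1x1, TRMM path
   #else
       return alpha * acc + c;                      // SAVE1x1, GEMM path
   #endif
   }
*/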
.macro Zero1x1 xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxlxor vs2, vs2,vs2 xxlxor vs3,vs3,vs3 xxlxor vs4,vs4,vs4 .endm .macro KERNEL1x1 KERNEL1x1_1 AO,BO, 1, 0,0,0 .endm .macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm .macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast .endm /* we will calculate 1 alone ( FIRST==1 to zero vs4) */ .macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) .if \First==1 xvmuldp vs4, vs37, vs35 .else xsmaddadp vs4, vs37, vs35 .endif addi \AREG, \AREG, DISP1(\Index,4) addi \BREG, \BREG, DISP1(\Index,4) .endm .macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) xvmaddasp vs0, vs8, vs26 xvmaddasp vs1, vs9, vs16 xvmaddasp vs2, vs10, vs17 xvmaddasp vs3, vs11, vs18 .if \IsLast==1 addi \AREG, \AREG, DISP16(\Index,64) addi \BREG, \BREG, DISP16(\Index,64) .endif .endm .macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) xvmaddasp vs0, vs8, vs26 xvmaddasp vs1, vs9, vs16 .if \IsLast==1 addi \AREG, \AREG, DISP8(\Index,32) addi \BREG, \BREG, DISP8(\Index,32) .endif .endm .macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) xvmaddasp vs0, vs8, vs26 .if \IsLast==1 addi \AREG, \AREG, DISP4(\Index,16) addi \BREG, \BREG, DISP4(\Index,16) .endif .endm .macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) xvmaddasp vs0, vs36, vs37 addi \AREG, \AREG, DISP2(\Index,8) addi \BREG, \BREG, DISP2(\Index,8) .endm .macro SAVE1x1 #ifndef TRMMKERNEL lxssp v4 , 0(CO) #endif /*convert alpha_r for multiply*/ xscvspdp vs16,alpha_r /*aggregate vectors */ xvaddsp vs0,vs0,vs1 xvaddsp vs2,vs2,vs3 xvaddsp vs0,vs0,vs2 xxpermdi vs7,vs0,vs0,2 xvaddsp vs0,vs0,vs7 /*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ xscvspdp vs5, vs0 xxspltw vs6, vs0, 1 xscvspdp vs6,vs6 xsadddp vs7,vs5,vs6 xsadddp vs4,vs4,vs7 /**** store last two words*/ #if defined(TRMMKERNEL) xsmuldp vs36,vs4, vs16 #else xsmaddadp vs36,vs4, vs16 #endif stxssp v4, 0(CO) addi CO,CO,4 .endm /****************************TRMM POINTER REFRESH MACROSES*************************/ .macro SHIFT_REG REG1,REG2,SHIFT_VAL .if \SHIFT_VAL==16 slwi \REG1, \REG2, 6 .elseif \SHIFT_VAL==8 slwi \REG1, \REG2, 5 .elseif \SHIFT_VAL==4 slwi \REG1, \REG2, 4 .elseif \SHIFT_VAL==2 slwi \REG1, \REG2, 3 .elseif \SHIFT_VAL==1 slwi \REG1, \REG2, 2 .endif .endm /* //#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) // ptrbb = bb; // #else // ptrba += off*16; // ptrbb = bb + off*2; // #endif */ .macro 
REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /* ptrbb = bb;*/ mr \PTR_B,\B_VAL /* refresh BPOINT */ #else /* // ptrba = ptrba + off*C_A; // ptrbb = bb + off*C_B; */ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ add \PTR_B, \B_VAL , T4 /* Add values to BO */ add \PTR_A, \PTR_A, T2 /* Add values to AO */ #endif .endm /* // #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) // temp = bk-off; // #elif defined(LEFT) // temp = off+16; // number of values in A // #else // temp = off+2; // number of values in B // #endif */ .macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) /* temp = bk-off;*/ sub \TEMP_BK,\BK_VAL,\OFF_VAL #elif defined(LEFT) /* temp = off+INCR_A; // number of values in A */ addi \TEMP_BK, \OFF_VAL, \INCR_A #else /* temp = off+INCR_B // number of values in B*/ addi \TEMP_BK,\OFF_VAL, \INCR_B #endif .endm /* // #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) // temp = bk - off; // #ifdef LEFT // temp -= 16; // number of values in A // #else // temp -= 2; // number of values in B // #endif // ptrba += temp*16; // ptrbb += temp*2; // #endif // #ifdef LEFT // off += 16; // number of values in A // #endif */ .macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /*temp = bk - off;*/ sub \TEMP_BK,\BK_VAL,\OFF_VAL #ifdef LEFT /*temp -= C_A; // number of values in A*/ addi \TEMP_BK,\TEMP_BK,-\C_A #else /*temp -= C_B; // number of values in B*/ addi \TEMP_BK,\TEMP_BK,-\C_B #endif /*ptrba += temp*C_A; ptrbb += temp*C_B;*/ SHIFT_REG T4,\TEMP_BK,\C_A SHIFT_REG T2,\TEMP_BK,\C_B add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ add \PTR_B, \PTR_B,T2 #endif #ifdef LEFT /*off += C_A; // number of values in A*/ addi \OFF_VAL,\OFF_VAL,\C_A #endif .endm
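/* Reference sketch (illustrative only, not part of the build): how the three TRMM helpers above are meant to cooperate around one C_A x C_B tile, restated in plain C. SHIFT_REG only converts an element count into a byte offset (count times the value count times sizeof(float)) via a left shift; here ordinary float pointer arithmetic plays that role. The function name trmm_refresh_example and the variables off, bk, ptrba, ptrbb are assumptions mirroring the commented driver code above.

   static void trmm_refresh_example(float **ptrba, float **ptrbb, float *bb, long *off, long bk, int C_A, int C_B)
   {
       long temp;
       // REFRESH_POINTERS
   #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
       *ptrbb = bb;
   #else
       *ptrba += (*off) * C_A;                     // SHIFT_REG T2, OFF_VAL, C_A
       *ptrbb  = bb + (*off) * C_B;                // SHIFT_REG T4, OFF_VAL, C_B
   #endif
       // REFRESH_TEMP_BK: trip count for the k loop of this tile
   #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
       temp = bk - *off;
   #elif defined(LEFT)
       temp = *off + C_A;                          // number of values in A
   #else
       temp = *off + C_B;                          // number of values in B
   #endif
       (void)temp;                                 // ... the micro kernel runs for temp iterations here ...
       // REFRESH_AFTER_SAVE
   #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
       temp = bk - *off;
   #ifdef LEFT
       temp -= C_A;                                // number of values in A
   #else
       temp -= C_B;                                // number of values in B
   #endif
       *ptrba += temp * C_A;
       *ptrbb += temp * C_B;
   #endif
   #ifdef LEFT
       *off += C_A;                                // number of values in A
   #endif
   }
*/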