##define REALNAME gemm 
#define ASSEMBLER
#include "common.h"

/*
 * FETCH is the prefetch mnemonic used throughout this file. It expands to a
 * plain `ld`; every use in the visible part of the file targets $0, so the
 * loaded value is discarded.
 */
#define FETCH	ld
/* Number of bytes the prologue reserves by lowering $sp (register spill area). */
#define	STACKSIZE	192
/*
 * gsLQC1 / gsSQC1 emit one instruction as a raw .word instead of a mnemonic
 * (major opcode 0x32 and 0x3A respectively).
 *   base   - general register NUMBER holding the address (see R12, R13)
 *   fq, ft - FP register NUMBERS (see F0..F31)
 *   offset - placed in the bits starting at bit 6
 * Call sites annotate gsLQC1 as filling a register pair, e.g.
 * gsLQC1(R13, F9, F8, 0) is tagged "B1 B2" ($f8 and $f9).
 * NOTE(review): presumably the Loongson quad-word FP load/store with the
 * offset counted in 16-byte units (offsets 0..7 are consumed before AO is
 * advanced by 8*4*SIZE, assuming SIZE is 4 bytes) - confirm against the
 * Loongson-3A manual. gsSQC1 is not used in the visible part of this file.
 */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)


#####	Parameter registers	####
/*
 * Incoming arguments. M, N, K are the tile loop bounds; A, B, C are the
 * operand pointers. LDC is loaded by the prologue from 0($sp) rather than
 * arriving in a register.
 */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$9
#define B	$10
#define C	$11
#define LDC	$8

####	Pointer A, B, C	####
/* Running pointers into A and B; these are the bases used by gsLQC1 (R12, R13). */
#define AO	$12
#define BO	$13

/* Output pointers: CO1 is set to C and CO2 to C + LDC for each tile row pass. */
#define CO1	$14
#define CO2	$15

/* Addresses handed to FETCH inside the K loops. */
#define PREA	$18
#define PREB	$19

####	Used registers	####
/* A1..A8: operands loaded from AO; reused as scratch during write-back. */
#define A1	$f0
#define A2	$f1
#define A3	$f2
#define A4	$f3
#define A5	$f4
#define	A6	$f5
#define	A7	$f6
#define	A8	$f7

/*
 * B1..B8: operands loaded from BO plus their PLU results; reused as scratch
 * during write-back. B8 shares $f15 with ALPHA (see below).
 */
#define B1	$f8
#define B2	$f9
#define B3	$f10
#define B4	$f11
#define B5	$f12
#define	B6	$f13
#define	B7	$f14
#define	B8	$f15

/*
 * C11..C44: the sixteen accumulators updated by MADPS. C11 is $f16; the
 * prologue spills $f16 to 160($sp) before it is cleared, and the write-back
 * reloads that slot as alpha_i.
 */
#define C11	$f16
#define C12	$f17
#define C21	$f18
#define C22	$f19
#define C31	$f20
#define C32	$f21
#define C41	$f22
#define C42	$f23
#define C13	$f24
#define C14	$f25
#define C23	$f26
#define C24	$f27
#define C33	$f28
#define C34	$f29
#define C43	$f30
#define C44	$f31

/*
 * Loop counters: I counts row tiles (decremented in the write-back),
 * J is N >> 1, L is the K-loop trip count.
 */
#define I	$2
#define J	$3
#define L	$7

####	Alpha register	####
/*
 * ALPHA aliases B8 ($f15). The prologue spills $f15 to 152($sp) and the
 * write-back reloads that slot as alpha_r.
 */
#define ALPHA	$f15

/* Bare FP register numbers for the fq/ft fields of gsLQC1/gsSQC1. */
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24 
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16 
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4 
#define F3 3 
#define F2 2 
#define F1 1 
#define F0 0

/*
 * Bare general register numbers for the base field of gsLQC1/gsSQC1.
 * R12 is the register behind AO ($12) and R13 the one behind BO ($13).
 */
#define	R12	12
#define	R13	13

/* R14..R17 are not referenced in the visible part of this file. */
#define R14	14
#define R15	15
#define	R16	16
#define	R17	17

/*
 * Extra integer registers used only by the TRMM build. OFFSET is loaded from
 * the stack in the prologue; KK is derived from it; TEMP is scratch.
 */
#if defined(TRMMKERNEL)
#define	OFFSET	$23
#define	KK		$24
#define TEMP	$25
#endif


	/*
	 * Kernel entry. PROLOGUE, LDARG and ST are macros defined outside this
	 * file. The code below:
	 *   - loads LDC from 0($sp) BEFORE $sp is lowered,
	 *   - reserves STACKSIZE bytes and spills $16..$22 and $f24..$f28,
	 *   - TRMM build: also spills $23..$25 and loads OFFSET from
	 *     STACKSIZE+8($sp) (an offset relative to the lowered $sp),
	 *   - when __64BIT__ is not defined: also spills $f20..$f23.
	 * NOTE(review): the kernel writes $f24..$f31 (C13..C44) but only
	 * $f24..$f28 are spilled here. The matching restore code is outside this
	 * chunk - confirm the spill set against the callee-saved registers of
	 * the target ABI.
	 */
	PROLOGUE

	LDARG	LDC,   0($sp)
	daddiu	$sp,$sp,-STACKSIZE

	sd	$16,   0($sp)
	sd	$17,   8($sp)
	sd	$18,  16($sp)
	sd	$19,  24($sp)
	sd	$20,  32($sp)
	sd	$21,  40($sp)
	sd	$22,  48($sp)

	ST	$f24, 56($sp)
	ST	$f25, 64($sp)
	ST	$f26, 72($sp)
	ST	$f27, 80($sp)
	ST	$f28, 88($sp)

#if defined(TRMMKERNEL)
	sd	$23,  96($sp)
	sd	$24, 104($sp)
	sd	$25, 112($sp)

	LDARG	OFFSET, STACKSIZE+8($sp)
#endif

#ifndef __64BIT__
	ST	$f20,120($sp)
	ST	$f21,128($sp)
	ST	$f22,136($sp)
	ST	$f23,144($sp)
#endif

	.align	4
.L2:
	/* J = N >> 1: number of column pairs handled by the 2-column code. */
	dsra	J, N, 1				#	NR=2
	/* Spill $f15 (ALPHA); the write-back reloads 152($sp) as alpha_r. */
	ST		$f15, 152($sp)

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	KK, OFFSET
#endif

	dsll	LDC, LDC, ZBASE_SHIFT#	LDC*SIZE
	/* No column pair: leave for .L1, which is outside this chunk. */
	blez	J, .L1
	/*
	 * Follows the branch directly (delay slot position, so it runs on both
	 * paths): spill $f16; the write-back reloads 160($sp) as alpha_i.
	 */
	ST		$f16, 160($sp)

/*
 * .L24: set-up for one pair of C columns (the branch back to this label is
 * outside this chunk). Computes I = M >> 2 row tiles, resets AO to A, points
 * CO1/CO2 at the two columns and advances C by 2*LDC for the next pair.
 * PREA = A + (K << (1 + ZBASE_SHIFT)) is the address FETCHed from inside the
 * K loop.
 */
.L24:
#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	dsra	I, M, 2				#	MR=8 reals (4 complex rows per tile)
	move	AO, A				#	Reset A

	dsll	PREA, K, 1 + ZBASE_SHIFT
	move	CO1, C

	daddu	CO2, C,   LDC
	daddu	PREA, AO, PREA

	blez	I, .L22
	/* Placed right after the branch (delay slot position): C += 2*LDC. */
	daddu	C,   CO2, LDC

	.align	4
/*
 * .L241: per-tile set-up, entered once per 4-row tile (see bgtz I, .L241
 * after the write-back). Both build variants below do the same work in a
 * slightly different instruction order:
 *   - point BO at the start of the B panel (TRMM may skip KK steps in AO/BO),
 *   - clear the sixteen accumulators by copying a zeroed C11,
 *   - preload the first B pair and the first two A pairs with gsLQC1,
 *   - build B3/B4 from B1/B2 with PLU,
 *   - set PREB = BO + (K << ZBASE_SHIFT) and touch the C tile with FETCH,
 *   - compute L = number of 4-step K-loop passes and skip to .L242 if none.
 * MTC, MOV and PLU are macros defined outside this file.
 * NOTE(review): PLU is presumably the paired-single lower/upper shuffle that
 * swaps the two halves of its source - confirm.
 */
.L241:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip KK steps: 4 complex per step in A, 2 complex per step in B. */
	dsll	L, KK, 2 + ZBASE_SHIFT
	dsll	TEMP, KK, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	dsll	PREB, K, ZBASE_SHIFT
	MOV		C21, C11
	MOV		C22, C11
	
	gsLQC1(R13, F9, F8, 0)		#	B1 B2
	MOV		C31, C11
	MOV		C32, C11

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C41, C11
	MOV		C42, C11

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MOV		C13, C11
	MOV		C14, C11

	MOV		C23, C11
	MOV		C24, C11

	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	MOV		C44, C11

	PLU		B3,	B1, B1
	PLU		B4, B2, B2
	daddu	PREB, BO, PREB

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 8 * SIZE(CO1)
	FETCH	$0, 0 * SIZE(CO2)
	FETCH	$0, 8 * SIZE(CO2)
	/* TEMP = number of K steps this tile really needs in the TRMM build. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 4
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L,  TEMP, 2
	blez	L, .L242
	NOP

#else

	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	K loop is unrolled by 4
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	dsll	PREB, K, ZBASE_SHIFT
	MOV		C21, C11
	MOV		C22, C11
	
	gsLQC1(R13, F9, F8, 0)		#	B1 B2
	MOV		C31, C11
	MOV		C32, C11

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C41, C11
	MOV		C42, C11

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MOV		C13, C11
	MOV		C14, C11

	FETCH	$0, 0 * SIZE(CO1)
	MOV		C23, C11
	MOV		C24, C11

	FETCH	$0, 0 * SIZE(CO2)
	MOV		C33, C11
	MOV		C34, C11

	MOV		C43, C11
	MOV		C44, C11
	daddu	PREB, BO, PREB

	PLU		B3,	B1, B1
	PLU		B4, B2, B2
	
	FETCH	$0, 8 * SIZE(CO1)
	blez	L, .L242
	/* Placed right after the branch (delay slot position). */
	FETCH	$0, 8 * SIZE(CO2)
#endif

/*
 * .L2410: main K loop for the 4-row tile, unrolled four times.
 * Each pass runs four groups of 16 MADPS (one per accumulator C11..C44).
 * Groups alternate between two operand sets so loads for the next group can
 * be issued while the current one is computed:
 *   groups 1 and 3 use A1..A4 with B1..B4,
 *   groups 2 and 4 use A5..A8 with B5..B8.
 * B3/B4 and B7/B8 are produced from B1/B2 and B5/B6 by PLU.
 * One pass consumes gsLQC1 offsets 0..7 from AO and 0..3 from BO, then AO is
 * advanced by 8*4*SIZE and BO by 4*4*SIZE; the loads issued after those
 * updates (offsets 0 and 1 again) fetch the operands for the next pass or
 * for the tail code at .L242 / .L247.
 * The inline tag "B3 B4" on a gsLQC1 names the third and fourth B values of
 * the pass; the destination registers are $f12/$f13, i.e. the B5/B6 aliases.
 * L is decremented at the top and tested at the bottom; the final MADPS
 * follows the bgtz directly (delay slot position).
 * MADPS is a macro defined outside this file.
 */
.L2410:
	daddiu	L, L, -1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	gsLQC1(R12, F5, F4, 2)		#	A5 A6
	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	gsLQC1(R12, F7, F6, 3)		#	A7 A8
	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1

	FETCH	$0, 0 * SIZE(PREB)
	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2

	FETCH	$0, 0 * SIZE(PREA)
	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6
	daddu	PREB, PREB, 8 * SIZE

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	/* Group 2: second K step, operands A5..A8 / B5..B8. */
	gsLQC1(R13, F9, F8, 2)		#	B1 B2
	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5

	gsLQC1(R12, F1, F0, 4)		#	A1 A2
	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6

	gsLQC1(R12, F3, F2, 5)		#	A3 A4
	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5

	FETCH	$0, 8 * SIZE(PREA)
	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	MADPS	C24, C24, A6, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8
	
	/* Group 3: third K step, operands A1..A4 / B1..B4. */
	gsLQC1(R13, F13, F12, 3)	#	B3 B4
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	gsLQC1(R12, F5, F4, 6)		#	A5 A6
	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	gsLQC1(R12, F7, F6, 7)		#	A7 A8
	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	daddiu	BO, BO, 4 * 4 * SIZE	#	4KR*4NR

	FETCH	$0, 16 * SIZE(PREA)
	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	daddiu	AO, AO, 8 * 4 * SIZE 	#	4KR*8MR

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	/*
	 * Group 4: fourth K step. AO/BO already point at the next pass, so the
	 * loads below use offsets 0 and 1.
	 */
	gsLQC1(R13, F9, F8, 0)		#	B1 B2
	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5

	FETCH	$0, 24 * SIZE(PREA)
	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddu	PREA, PREA, 32 * SIZE

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	MADPS	C24, C24, A6, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2

	MADPS	C34, C34, A7, B8
	bgtz	L, .L2410
	MADPS	C44, C44, A8, B8


	.align	4
/*
 * .L242: K tail for the 4-row tile, two K steps.
 * Runs when bit 1 of the step count is set (K, or TEMP in the TRMM build).
 * It is the first half of the main loop body without the FETCHes: one group
 * with A1..A4 / B1..B4, one group with A5..A8 / B5..B8. AO advances by
 * 4*4*SIZE and BO by 2*4*SIZE, and the operands for a possible final single
 * step at .L247 are reloaded from offsets 0 and 1.
 */
.L242:
#ifndef	TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L247
	NOP

	gsLQC1(R13, F13, F12, 1)	#	B3 B4
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	gsLQC1(R12, F5, F4, 2)		#	A5 A6
	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	gsLQC1(R12, F7, F6, 3)		#	A7 A8
	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	daddiu	BO, BO, 2 * 4 * SIZE	#	2KR*4NR

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	daddiu	AO, AO, 4 * 4 * SIZE

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	/* AO/BO were already advanced above, hence offsets 0 and 1 here. */
	gsLQC1(R13, F9, F8, 0)		#	B1 B2
	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	MADPS	C24, C24, A6, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8
	
	.align	4
/*
 * .L247: K tail for the 4-row tile, one K step.
 * Runs when bit 0 of the step count is set (K, or TEMP in the TRMM build).
 * Uses the operands already sitting in A1..A4 / B1..B4, so it issues no
 * loads; AO advances by 2*4*SIZE and BO by 1*4*SIZE.
 */
.L247:
#ifndef	TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L240
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	daddiu	BO, BO, 1 * 4 * SIZE	#	1KR*4NR

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	daddiu	AO, AO, 2 * 4 * SIZE

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4


	.align	4
/*
 * .L240: write-back of the 4-row tile (GEMM build, first conjugation case).
 *
 * Step 1 - CVTU puts a second value derived from each accumulator into a
 *          scratch register (A1..A8, B1..B8).
 *          NOTE(review): CVTU is defined outside this file; presumably it
 *          extracts the upper paired-single half as a scalar - confirm.
 * Step 2 - each accumulator is combined with its CVTU value. For this
 *          variant: Cx1/Cx2 = Cx1/Cx2 - scratch, Cx3/Cx4 = scratch + Cx3/Cx4.
 *          Afterwards Cx1 (column CO1) and Cx2 (column CO2) act as the real
 *          parts and Cx3 / Cx4 as the imaginary parts for row x.
 * Step 3 - A1/A2 are reloaded with alpha_r/alpha_i from 152/160($sp) once
 *          their CVTU values have been consumed.
 * Step 4 - C is updated in place, with real parts at even and imaginary
 *          parts at odd multiples of SIZE:
 *              re += real*alpha_r - imag*alpha_i
 *              im += imag*alpha_r + real*alpha_i
 *          (assuming MADD d,a,b,c is a + b*c and NMSUB d,a,b,c is a - b*c;
 *          both are macros defined outside this file).
 * The other conjugation cases differ only in the signs used in step 2.
 * I is decremented here; it is tested by bgtz I, .L241 after the write-back.
 */
.L240:							#	Write Back
#ifndef TRMMKERNEL
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C31
	CVTU	A4, C41

	CVTU	A5, C13
	CVTU	A6, C23

	CVTU	A7, C33
	CVTU	A8, C43

	CVTU	B1, C12
	CVTU	B2, C22

	CVTU	B3, C32
	CVTU	B4, C42

	CVTU	B5, C14
	CVTU	B6, C24

	CVTU	B7, C34
	CVTU	B8, C44

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	load alpha_r
	SUB		C41, C41, A4
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C13, A5, C13		#	ad'+'cb
	ADD		C23, A6, C23
	ADD		C33, A7, C33
	ADD		C43, A8, C43
	SUB		C12, C12, B1
	SUB		C22, C22, B2
	SUB		C32, C32, B3
	SUB		C42, C42, B4
	ADD		C14, B5, C14
	ADD		C24, B6, C24
	ADD		C34, B7, C34
	ADD		C44, B8, C44

	/* Column CO1: B1,B3,B5,B7 = real parts, B2,B4,B6,B8 = imaginary parts. */
	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	/*
	 * Column CO2: Cx3/Cx1 are dead by now and are reused to hold the old C
	 * values (Cx3 = real part, Cx1 = imaginary part); Cx2/Cx4 still hold
	 * the product for this column.
	 */
	LD		C13, 0 * SIZE(CO2)
	LD		C23, 2 * SIZE(CO2)
	LD		C33, 4 * SIZE(CO2)
	LD		C43, 6 * SIZE(CO2)
	LD		C11, 1 * SIZE(CO2)
	LD		C21, 3 * SIZE(CO2)
	LD		C31, 5 * SIZE(CO2)
	LD		C41, 7 * SIZE(CO2)

	/* CO1 stores are interleaved with the CO2 arithmetic. */
	MADD	C13, C13, C12, A1
	MADD	C23, C23, C22, A1

	MADD	C33, C33, C32, A1
	ST		B1, 0 * SIZE(CO1)

	MADD	C43, C43, C42, A1
	ST		B3, 2 * SIZE(CO1)

	MADD	C11, C11, C14, A1
	ST		B5, 4 * SIZE(CO1)

	MADD	C21, C21, C24, A1
	ST		B7, 6 * SIZE(CO1)

	MADD	C31, C31, C34, A1
	ST		B2, 1 * SIZE(CO1)

	MADD	C41, C41, C44, A1
	ST		B4, 3 * SIZE(CO1)

	NMSUB	C13, C13, C14, A2
	ST		B6, 5 * SIZE(CO1)

	NMSUB	C23, C23, C24, A2
	ST		B8, 7 * SIZE(CO1)

	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2
	
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C13, A5, C13		#	ad'+'cb
	SUB		C23, A6, C23
	SUB		C33, A7, C33
	SUB		C43, A8, C43
	ADD		C12, B1, C12
	ADD		C22, B2, C22
	ADD		C32, B3, C32
	ADD		C42, B4, C42
	SUB		C14, B5, C14
	SUB		C24, B6, C24
	SUB		C34, B7, C34
	SUB		C44, B8, C44

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	LD		C13, 0 * SIZE(CO2)
	LD		C23, 2 * SIZE(CO2)
	LD		C33, 4 * SIZE(CO2)
	LD		C43, 6 * SIZE(CO2)
	LD		C11, 1 * SIZE(CO2)
	LD		C21, 3 * SIZE(CO2)
	LD		C31, 5 * SIZE(CO2)
	LD		C41, 7 * SIZE(CO2)

	MADD	C13, C13, C12, A1
	MADD	C23, C23, C22, A1

	MADD	C33, C33, C32, A1
	ST		B1, 0 * SIZE(CO1)

	MADD	C43, C43, C42, A1
	ST		B3, 2 * SIZE(CO1)

	MADD	C11, C11, C14, A1
	ST		B5, 4 * SIZE(CO1)

	MADD	C21, C21, C24, A1
	ST		B7, 6 * SIZE(CO1)

	MADD	C31, C31, C34, A1
	ST		B2, 1 * SIZE(CO1)

	MADD	C41, C41, C44, A1
	ST		B4, 3 * SIZE(CO1)

	NMSUB	C13, C13, C14, A2
	ST		B6, 5 * SIZE(CO1)

	NMSUB	C23, C23, C24, A2
	ST		B8, 7 * SIZE(CO1)

	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2
	
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)

#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
	SUB		C13, C13, A5		#	ad'+'cb
	SUB		C23, C23, A6
	SUB		C33, C33, A7
	SUB		C43, C43, A8
	ADD		C12, B1, C12
	ADD		C22, B2, C22
	ADD		C32, B3, C32
	ADD		C42, B4, C42
	SUB		C14, C14, B5
	SUB		C24, C24, B6
	SUB		C34, C34, B7
	SUB		C44, C44, B8

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	LD		C13, 0 * SIZE(CO2)
	LD		C23, 2 * SIZE(CO2)
	LD		C33, 4 * SIZE(CO2)
	LD		C43, 6 * SIZE(CO2)
	LD		C11, 1 * SIZE(CO2)
	LD		C21, 3 * SIZE(CO2)
	LD		C31, 5 * SIZE(CO2)
	LD		C41, 7 * SIZE(CO2)

	MADD	C13, C13, C12, A1
	MADD	C23, C23, C22, A1

	MADD	C33, C33, C32, A1
	ST		B1, 0 * SIZE(CO1)

	MADD	C43, C43, C42, A1
	ST		B3, 2 * SIZE(CO1)

	MADD	C11, C11, C14, A1
	ST		B5, 4 * SIZE(CO1)

	MADD	C21, C21, C24, A1
	ST		B7, 6 * SIZE(CO1)

	MADD	C31, C31, C34, A1
	ST		B2, 1 * SIZE(CO1)

	MADD	C41, C41, C44, A1
	ST		B4, 3 * SIZE(CO1)

	NMSUB	C13, C13, C14, A2
	ST		B6, 5 * SIZE(CO1)

	NMSUB	C23, C23, C24, A2
	ST		B8, 7 * SIZE(CO1)

	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2
	
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)

#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	SUB		C41, C41, A4
	LD		A2, 160($sp)
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	ADD		C13, A5, C13		#	ad'+'cb
	ADD		C23, A6, C23
	ADD		C33, A7, C33
	ADD		C43, A8, C43
	SUB		C12, C12, B1
	SUB		C22, C22, B2
	SUB		C32, C32, B3
	SUB		C42, C42, B4
	ADD		C14, B5, C14
	ADD		C24, B6, C24
	ADD		C34, B7, C34
	ADD		C44, B8, C44
	NEG		C13, C13
	NEG		C23, C23
	NEG		C33, C33
	NEG		C43, C43
	NEG		C14, C14
	NEG		C24, C24
	NEG		C34, C34
	NEG		C44, C44

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	LD		C13, 0 * SIZE(CO2)
	LD		C43, 6 * SIZE(CO2)
	LD		C23, 2 * SIZE(CO2)
	LD		C33, 4 * SIZE(CO2)
	LD		C11, 1 * SIZE(CO2)
	LD		C21, 3 * SIZE(CO2)
	LD		C31, 5 * SIZE(CO2)
	LD		C41, 7 * SIZE(CO2)

	MADD	C13, C13, C12, A1
	ST		B1, 0 * SIZE(CO1)

	MADD	C23, C23, C22, A1
	ST		B3, 2 * SIZE(CO1)

	MADD	C33, C33, C32, A1
	ST		B5, 4 * SIZE(CO1)

	MADD	C43, C43, C42, A1
	ST		B7, 6 * SIZE(CO1)

	MADD	C11, C11, C14, A1
	ST		B2, 1 * SIZE(CO1)

	MADD	C21, C21, C24, A1
	ST		B4, 3 * SIZE(CO1)

	MADD	C31, C31, C34, A1
	ST		B6, 5 * SIZE(CO1)

	MADD	C41, C41, C44, A1
	ST		B8, 7 * SIZE(CO1)

	NMSUB	C13, C13, C14, A2
	NMSUB	C23, C23, C24, A2
	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)

#endif

#else
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C31
	CVTU	A4, C41

	CVTU	A5, C13
	CVTU	A6, C23

	CVTU	A7, C33
	CVTU	A8, C43

	CVTU	B1, C12
	CVTU	B2, C22

	CVTU	B3, C32
	CVTU	B4, C42

	CVTU	B5, C14
	CVTU	B6, C24

	CVTU	B7, C34
	CVTU	B8, C44

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	load alpha_r
	SUB		C41, C41, A4
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
	ADD		C13, A5, C13		#	ad'+'cb
	ADD		C23, A6, C23
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C33, A7, C33
	ADD		C43, A8, C43
	SUB		C12, C12, B1
	SUB		C22, C22, B2
	SUB		C32, C32, B3
	SUB		C42, C42, B4
	ADD		C14, B5, C14
	ADD		C24, B6, C24
	ADD		C34, B7, C34
	ADD		C44, B8, C44

	MUL		B1, C11, A1		#	A1 = alpha_r
	MUL		B3, C21, A1
	MUL		B5, C31, A1
	MUL		B7, C41, A1
	MUL		B2, C13, A1
	MUL		B4, C23, A1
	MUL		B6, C33, A1
	MUL		B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2
	
	ST		B1, 0 * SIZE(CO1)
	MUL		C13, C12, A1
	MUL		C23, C22, A1

	ST		B3, 2 * SIZE(CO1)
	MUL		C33, C32, A1
	MUL		C43, C42, A1

	ST		B5, 4 * SIZE(CO1)
	MUL		C11, C14, A1
	MUL		C21, C24, A1

	ST		B7, 6 * SIZE(CO1)
	MUL		C31, C34, A1
	MUL		C41, C44, A1

	ST		B2, 1 * SIZE(CO1)
	NMSUB	C13, C13, C14, A2
	NMSUB	C23, C23, C24, A2

	ST		B4, 3 * SIZE(CO1)
	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	ST		B6, 5 * SIZE(CO1)
	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2
	
	ST		B8, 7 * SIZE(CO1)
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C13, A5, C13		#	ad'+'cb
	SUB		C23, A6, C23
	SUB		C33, A7, C33
	SUB		C43, A8, C43
	ADD		C12, B1, C12
	ADD		C22, B2, C22
	ADD		C32, B3, C32
	ADD		C42, B4, C42
	SUB		C14, B5, C14
	SUB		C24, B6, C24
	SUB		C34, B7, C34
	SUB		C44, B8, C44

	MUL		B1, C11, A1		#	A1 = alpha_r
	MUL		B3, C21, A1
	MUL		B5, C31, A1
	MUL	 	B7, C41, A1
	MUL		B2, C13, A1
	MUL		B4, C23, A1
	MUL		B6, C33, A1
	MUL		B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	MUL		C13, C12, A1
	MUL		C23, C22, A1

	ST		B1, 0 * SIZE(CO1)
	MUL		C33, C32, A1
	MUL		C43, C42, A1

	ST		B3, 2 * SIZE(CO1)
	MUL		C11, C14, A1
	MUL		C21, C24, A1

	ST		B5, 4 * SIZE(CO1)
	MUL 	C31, C34, A1
	MUL 	C41, C44, A1

	ST		B7, 6 * SIZE(CO1)
	NMSUB	C13, C13, C14, A2
	NMSUB	C23, C23, C24, A2

	ST		B2, 1 * SIZE(CO1)
	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	ST		B4, 3 * SIZE(CO1)
	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2

	ST		B6, 5 * SIZE(CO1)
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		B8, 7 * SIZE(CO1)
	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)

#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
	SUB		C13, C13, A5		#	ad'+'cb
	SUB		C23, C23, A6
	SUB		C33, C33, A7
	SUB		C43, C43, A8
	ADD		C12, B1, C12
	ADD		C22, B2, C22
	ADD		C32, B3, C32
	ADD		C42, B4, C42
	SUB		C14, C14, B5
	SUB		C24, C24, B6

	SUB		C34, C34, B7
	SUB		C44, C44, B8

	MUL	B1, C11, A1		#	A1 = alpha_r
	MUL	B3, C21, A1
	MUL	B5, C31, A1
	MUL	B7, C41, A1
	MUL	B2, C13, A1
	MUL	B4, C23, A1
	MUL	B6, C33, A1
	MUL	B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	MUL	C13, C12, A1
	MUL	C23, C22, A1

	ST		B1, 0 * SIZE(CO1)
	MUL	C33, C32, A1
	MUL	C43, C42, A1

	ST		B3, 2 * SIZE(CO1)
	MUL	C11, C14, A1
	MUL	C21, C24, A1

	ST		B5, 4 * SIZE(CO1)
	MUL	C31, C34, A1
	MUL	C41, C44, A1

	ST		B7, 6 * SIZE(CO1)
	NMSUB	C13, C13, C14, A2
	NMSUB	C23, C23, C24, A2

	ST		B2, 1 * SIZE(CO1)
	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	ST		B4, 3 * SIZE(CO1)
	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2

	ST		B6, 5 * SIZE(CO1)
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		B8, 7 * SIZE(CO1)
	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)

#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	SUB		C41, C41, A4
	LD		A2, 160($sp)
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	ADD		C13, A5, C13		#	ad'+'cb
	ADD		C23, A6, C23
	ADD		C33, A7, C33
	ADD		C43, A8, C43
	SUB		C12, C12, B1
	SUB		C22, C22, B2
	SUB		C32, C32, B3
	SUB		C42, C42, B4
	ADD		C14, B5, C14
	ADD		C24, B6, C24
	ADD		C34, B7, C34
	ADD		C44, B8, C44

	NEG		C13, C13
	NEG		C23, C23
	NEG		C33, C33
	NEG		C43, C43
	NEG		C14, C14
	NEG		C24, C24
	NEG		C34, C34
	NEG		C44, C44

	MUL		B1, C11, A1		#	A1 = alpha_r
	MUL		B3, C21, A1
	MUL		B5, C31, A1
	MUL		B7, C41, A1
	MUL		B2, C13, A1
	MUL		B4, C23, A1
	MUL		B6, C33, A1
	MUL		B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	MUL		C13, C12, A1
	MUL		C23, C22, A1

	ST		B3, 2 * SIZE(CO1)
	MUL		C33, C32, A1
	MUL		C43, C42, A1

	ST		B5, 4 * SIZE(CO1)
	MUL		C11, C14, A1
	MUL		C21, C24, A1

	ST		B7, 6 * SIZE(CO1)
	MUL		C31, C34, A1
	MUL		C41, C44, A1

	ST		B2, 1 * SIZE(CO1)
	NMSUB	C13, C13, C14, A2
	NMSUB	C23, C23, C24, A2

	ST		B4, 3 * SIZE(CO1)
	NMSUB	C33, C33, C34, A2
	NMSUB	C43, C43, C44, A2

	ST		B6, 5 * SIZE(CO1)
	MADD	C11, C11, C12, A2
	MADD	C21, C21, C22, A2

	ST		B8, 7 * SIZE(CO1)
	MADD	C31, C31, C32, A2
	MADD	C41, C41, C42, A2

	ST		C13, 0 * SIZE(CO2)
	ST		C23, 2 * SIZE(CO2)
	ST		C33, 4 * SIZE(CO2)
	ST		C43, 6 * SIZE(CO2)
	ST		C11, 1 * SIZE(CO2)
	ST		C21, 3 * SIZE(CO2)
	ST		C31, 5 * SIZE(CO2)
	ST		C41, 7 * SIZE(CO2)
#endif


#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -4
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	L, TEMP, 2 + ZBASE_SHIFT
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4
#endif

#endif
	daddiu	CO1, CO1, 8 * SIZE
	bgtz	I, .L241
	daddiu	CO2, CO2, 8 * SIZE

	.align	4
/*
 * .L22 / .L221: set-up for the remaining 2-row tile of the current column
 * pair, taken when bit 1 of M is set. It mirrors the 4-row set-up with half
 * the A data: eight accumulators (C11..C14, C21..C24) are cleared, one A pair
 * and one B pair are preloaded, B3/B4 are built with PLU, and L receives the
 * number of 4-step K-loop passes (branch to .L222 when there are none).
 * In the TRMM build AO and BO are both offset by KK << (1 + ZBASE_SHIFT)
 * unless BO restarts at B, and the step count comes from TEMP.
 */
.L22:
	andi	I, M, 2				#	MR=4 reals (2 complex rows)
	blez	I, .L21
	NOP

	.align	4
.L221:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	TEMP, KK, 1 + ZBASE_SHIFT		#	NR=2

	daddu	AO, AO, TEMP
	daddu	BO, B,  TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	MOV		C21, C11
	MOV		C22, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11
	MOV		C14, C11

	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)

	FETCH	$0, 8 * SIZE(CO1)
	MOV		C24, C11
	
	FETCH	$0, 0 * SIZE(CO2)
	FETCH	$0, 8 * SIZE(CO2)

	PLU		B3,	B1, B1
	PLU		B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2							#	MR=2
#else	
	daddiu	TEMP, KK, 2							#	NR=2
#endif
	dsra	L,  TEMP, 2
	blez	L, .L222
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	K loop is unrolled by 4

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	MOV		C21, C11
	MOV		C22, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11
	MOV		C14, C11

	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)

	FETCH	$0, 8 * SIZE(CO1)
	MOV		C24, C11
	
	FETCH	$0, 0 * SIZE(CO2)
	FETCH	$0, 8 * SIZE(CO2)

	PLU		B3,	B1, B1
	blez	L, .L222
	/* Placed right after the branch (delay slot position). */
	PLU		B4, B2, B2
#endif

/*
 * .L2210: main K loop for the 2-row tile, unrolled four times.
 * Each pass runs four groups of 8 MADPS on C11..C14 / C21..C24. The A pair
 * changes every group (A1/A2, A3/A4, A5/A6, A7/A8) while the B set alternates
 * between B1..B4 and B5..B8. One pass consumes gsLQC1 offsets 0..3 from both
 * AO and BO, then both pointers advance by 4*4*SIZE; the loads issued after
 * that (offset 0) fetch the operands of the next pass or of the tail code.
 * The final PLU follows the bgtz directly (delay slot position).
 */
.L2210:
	daddiu	L, L, -1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	gsLQC1(R12, F5, F4, 2)		#	A5 A6
	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	gsLQC1(R13, F9, F8, 2)		#	B1 B2
	MADPS	C11, C11, A3, B5
	MADPS	C21, C21, A4, B5

	MADPS	C12, C12, A3, B6
	MADPS	C22, C22, A4, B6

	MADPS	C13, C13, A3, B7
	MADPS	C23, C23, A4, B7

	MADPS	C14, C14, A3, B8
	MADPS	C24, C24, A4, B8

	gsLQC1(R12, F7, F6, 3)		#	A7 A8
	PLU		B3,	B1, B1
	PLU		B4, B2, B2

	gsLQC1(R13, F13, F12, 3)	#	B3 B4
	MADPS	C11, C11, A5, B1
	MADPS	C21, C21, A6, B1

	MADPS	C12, C12, A5, B2
	MADPS	C22, C22, A6, B2
	daddiu	BO, BO, 4 * 4 * SIZE	#	4KR*4NR

	daddiu	AO, AO, 4 * 4 * SIZE 	#	4KR*4MR
	MADPS	C13, C13, A5, B3
	MADPS	C23, C23, A6, B3

	MADPS	C14, C14, A5, B4
	MADPS	C24, C24, A6, B4

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	gsLQC1(R13, F9, F8, 0)		#	B1 B2
	MADPS	C11, C11, A7, B5
	MADPS	C21, C21, A8, B5

	MADPS	C12, C12, A7, B6
	MADPS	C22, C22, A8, B6

	MADPS	C13, C13, A7, B7
	MADPS	C23, C23, A8, B7

	MADPS	C14, C14, A7, B8
	MADPS	C24, C24, A8, B8

	PLU		B3,	B1, B1
	bgtz	L, .L2210
	PLU		B4, B2, B2


	.align	4
/*
 * .L222: K tail for the 2-row tile, two K steps.
 * Runs when bit 1 of the step count is set (K, or TEMP in the TRMM build).
 * One group with A1/A2 and B1..B4, one with A3/A4 and B5..B8. AO and BO both
 * advance by 2*4*SIZE, and the operands for a possible final single step at
 * .L227 are reloaded from offset 0.
 */
.L222:
#ifndef TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L227
	NOP

	gsLQC1(R13, F13, F12, 1)	#	B3 B4
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6
	daddiu	BO, BO, 2 * 4 * SIZE

	daddiu	AO, AO, 2 * 4 * SIZE
	MADPS	C11, C11, A3, B5
	MADPS	C21, C21, A4, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A3, B6
	MADPS	C22, C22, A4, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C13, C13, A3, B7
	MADPS	C23, C23, A4, B7

	MADPS	C14, C14, A3, B8
	MADPS	C24, C24, A4, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2


	.align	4
/*
 * .L227: K tail for the 2-row tile, one K step.
 * Runs when bit 0 of the step count is set (K, or TEMP in the TRMM build).
 * Uses the operands already in A1/A2 and B1..B4, so it issues no loads; AO
 * and BO both advance by 4*SIZE.
 */
.L227:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L220
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	daddiu	BO, BO, 4 * SIZE
	daddiu	AO, AO, 4 * SIZE

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	.align	4
.L220:							#	Write Back
#ifndef TRMMKERNEL
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C13
	CVTU	A4, C23

	CVTU	A5, C12
	CVTU	A6, C22

	CVTU	A7, C14
	CVTU	A8, C24


#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	SUB		C22, C22, A6
	ADD		C14, A7, C14
	ADD		C24, A8, C24

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B7, 2 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)
	LD		B8, 3 * SIZE(CO2)

	MADD	B5, B5, C12, A1
	MADD	B7, B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MADD	B6, B6, C14, A1
	MADD	B8, B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, A3, C13		#	ad'+'cb
	SUB		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C12, A5, C12
	ADD		C22, A6, C22
	SUB		C14, A7, C14
	SUB		C24, A8, C24

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B7, 2 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)
	LD		B8, 3 * SIZE(CO2)

	MADD	B5, B5, C12, A1
	MADD	B7, B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MADD	B6, B6, C14, A1
	MADD	B8, B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)

#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, C13, A3		#	ad'+'cb
	SUB		C23, C23, A4
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	LD		A2, 160($sp)		#	load alpha_i
	ADD		C12, A5, C12
	ADD		C22, A6, C22
	SUB		C14, C14, A7
	SUB		C24, C24, A8
	
	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B7, 2 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)
	LD		B8, 3 * SIZE(CO2)

	MADD	B5, B5, C12, A1
	MADD	B7, B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MADD	B6, B6, C14, A1
	MADD	B8, B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)

#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	SUB		C22, C22, A6
	ADD		C14, A7, C14
	ADD		C24, A8, C24
	NEG		C13, C13
	NEG		C23, C23
	NEG		C14, C14
	NEG		C24, C24


	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B7, 2 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)
	LD		B8, 3 * SIZE(CO2)

	MADD	B5, B5, C12, A1
	MADD	B7, B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MADD	B6, B6, C14, A1
	MADD	B8, B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2

	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)
#endif

#else
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C13
	CVTU	A4, C23

	CVTU	A5, C12
	CVTU	A6, C22

	CVTU	A7, C14
	CVTU	A8, C24


#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	SUB		C22, C22, A6
	ADD		C14, A7, C14
	ADD		C24, A8, C24

	MUL	B1, C11, A1		#	A1 = alpha_r
	MUL	B3, C21, A1
	MUL	B2, C13, A1
	MUL	B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2


	MUL	B5, C12, A1
	MUL	B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MUL	B6, C14, A1
	MUL	B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, A3, C13		#	ad'+'cb
	SUB		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_r
	ADD		C12, A5, C12
	ADD		C22, A6, C22
	SUB		C14, A7, C14
	SUB		C24, A8, C24

	MUL	B1, C11, A1		#	A1 = alpha_r
	MUL	B3, C21, A1
	MUL	B2, C13, A1
	MUL	B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	MUL	B5, C12, A1
	MUL	B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MUL	B6, C14, A1
	MUL	B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)

#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, C13, A3		#	ad'+'cb
	SUB		C23, C23, A4
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
	ADD		C12, A5, C12
	ADD		C22, A6, C22
	SUB		C14, C14, A7
	SUB		C24, C24, A8
	
	MUL	B1, C11, A1		#	A1 = alpha_r
	MUL	B3, C21, A1
	MUL	B2, C13, A1
	MUL	B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	MUL	B5, C12, A1
	MUL	B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MUL	B6, C14, A1
	MUL	B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)

#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	SUB		C22, C22, A6
	ADD		C14, A7, C14
	ADD		C24, A8, C24
	NEG		C13, C13
	NEG		C23, C23
	NEG		C14, C14
	NEG		C24, C24

	MUL	B1, C11, A1		#	A1 = alpha_r
	MUL	B3, C21, A1
	MUL	B2, C13, A1
	MUL	B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	MUL	B5, C12, A1
	MUL	B7, C22, A1

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	MUL	B6, C14, A1
	MUL	B8, C24, A1

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	NMSUB	B7, B7, C24, A2

	MADD	B6, B6, C12, A2
	MADD	B8, B8, C22, A2

	ST		B5, 0 * SIZE(CO2)
	ST		B7, 2 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
	ST		B8, 3 * SIZE(CO2)
#endif

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif

#endif
	daddiu	CO1, CO1, 4 * SIZE
	daddiu	CO2, CO2, 4 * SIZE


	.align	4
/*
 * .L21: row remainder (M & 1) for the current pair of output columns.
 * Falls through into .L211 when M is odd, otherwise jumps to .L20.
 */
.L21:
	andi	I, M, 1				#	I = M & 1
	blez	I, .L20				#	even M: no single-row tile to do
	NOP							#	branch delay slot

	.align	4
/*
 * .L211: set up the 1-row x 2-column tile.
 *
 * TRMM build : position AO/BO from KK (or restart BO at B) and put the
 *              trip count for this tile in TEMP.
 * plain build: BO restarts at B and the trip count is K.
 *
 * Both paths zero the accumulators C11, C12, C13, C14, preload A1/A2 from
 * AO (gsLQC1 base R12 = $12) and B1/B2 from BO (base R13 = $13), build
 * B3/B4 from B1/B2 with PLU, touch the two destination columns with FETCH,
 * and leave L = count >> 2 for the 4-step loop at .L2110.  When L <= 0 the
 * loop is skipped by branching to .L212.
 * NOTE(review): PLU/MTC/MOV/FETCH semantics come from macros defined outside
 * this chunk (FETCH is "ld" per the top of the file).
 */
.L211:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	L,    KK, ZBASE_SHIFT			#	A offset for KK steps, MR=1
	dsll	TEMP, KK, 1 + ZBASE_SHIFT		#	B offset for KK steps, NR=2

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif
	MTC		$0, C11				#	clear result registers
	MOV		C12, C11		
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11
	MOV		C14, C11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 0 * SIZE(CO2)

	PLU		B3,	B1, B1
	PLU		B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1							#	MR=1
#else	
	daddiu	TEMP, KK, 2							#	NR=2
#endif
	dsra	L,  TEMP, 2					#	L = TEMP / 4 loop passes
	blez	L, .L212
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	L = K / 4 loop passes (loop body covers 4 k-steps)

	MTC		$0, C11				#	clear result registers
	MOV		C12, C11		
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11
	MOV		C14, C11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 0 * SIZE(CO2)

	PLU		B3,	B1, B1
	blez	L, .L212
	PLU		B4, B2, B2			#	executed in the branch delay slot
#endif

/*
 * .L2110: main accumulation loop of the 1 x 2 tile, L passes.
 * Each pass multiplies A1, A2, A3, A4 in turn against a fresh B pair
 * (B1/B2 or B5/B6) and its PLU-derived partner (B3/B4 or B7/B8), adding
 * into C11, C12 (direct pair) and C13, C14 (PLU pair).
 * Loads for the next step are interleaved with the multiply-adds, and the
 * last PLU sits in the delay slot of the loop branch, so the statement
 * order must be preserved.
 * On exit A1/A2 and B1..B4 already hold the data for the next k-step.
 */
.L2110:
	daddiu	L, L, -1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12, F13)
	MADPS	C11, C11, A1, B1
	MADPS	C12, C12, A1, B2

	MADPS	C13, C13, A1, B3
	MADPS	C14, C14, A1, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	gsLQC1(R13, F9, F8, 2)		#	B1 B2
	MADPS	C11, C11, A2, B5
	MADPS	C12, C12, A2, B6

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C13, C13, A2, B7
	MADPS	C14, C14, A2, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2

	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12, F13)
	MADPS	C11, C11, A3, B1
	MADPS	C12, C12, A3, B2
	daddiu	BO, BO, 4 * 4 * SIZE	#	step BO past the four B quadwords of this pass

	daddiu	AO, AO, 2 * 4 * SIZE 	#	step AO past the two A quadwords of this pass
	MADPS	C13, C13, A3, B3
	MADPS	C14, C14, A3, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	gsLQC1(R13, F9, F8, 0)		#	B1 B2 for the next pass (BO already advanced)
	MADPS	C11, C11, A4, B5
	MADPS	C12, C12, A4, B6

	gsLQC1(R12, F1, F0, 0)		#	A1 A2 for the next pass (AO already advanced)
	MADPS	C13, C13, A4, B7
	MADPS	C14, C14, A4, B8

	PLU		B3,	B1, B1
	bgtz	L, .L2110
	PLU		B4, B2, B2			#	branch delay slot


	.align	4
/*
 * .L212: two-step tail of the 1 x 2 tile, taken when bit 1 of the trip
 * count (K, or TEMP for TRMM) is set.  Uses the preloaded A1/A2 and B1..B4,
 * loads B5/B6, then reloads A1/A2 and B1..B4 for a possible final single
 * step at .L217.
 */
.L212:
#ifndef TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L217
	NOP

	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12, F13)
	MADPS	C11, C11, A1, B1
	MADPS	C12, C12, A1, B2

	MADPS	C13, C13, A1, B3
	MADPS	C14, C14, A1, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6
	daddiu	BO, BO, 2 * 4 * SIZE	#	step BO past the two B quadwords used here

	MADPS	C11, C11, A2, B5
	MADPS	C12, C12, A2, B6
	daddiu	AO, AO, 4 * SIZE		#	step AO past the A quadword used here

	MADPS	C13, C13, A2, B7
	MADPS	C14, C14, A2, B8

	gsLQC1(R12, F1, F0, 0)		#	A1 A2 reloaded from the advanced AO
	gsLQC1(R13, F9, F8, 0)		#	B1 B2
	PLU		B3,	B1, B1
	PLU		B4, B2, B2


	.align	4
/*
 * .L217: final single step of the 1 x 2 tile, taken when bit 0 of the trip
 * count (K, or TEMP for TRMM) is set.  Only A1 and B1..B4 are consumed;
 * AO and BO are advanced by half of what the two-step tail at .L212 uses.
 */
.L217:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L210
	NOP

	MADPS	C11, C11, A1, B1
	daddiu	BO, BO, 4 * SIZE
	MADPS	C12, C12, A1, B2
	daddiu	AO, AO, 2 * SIZE

	MADPS	C13, C13, A1, B3
	MADPS	C14, C14, A1, B4

	.align	4
/*
 * .L210: write-back of the 1 x 2 tile, non-TRMM (GEMM) half.
 *
 * CVTU copies a part of each paired accumulator into A1/A3/A5/A7
 * (presumably the upper single; the macro is defined outside this chunk).
 * Exactly one of the four conjugation variants below is compiled in; each
 * combines accumulator and extracted part with its own sign pattern so that
 *   C11 / C13 = real / imaginary result for the column at CO1
 *   C12 / C14 = real / imaginary result for the column at CO2
 * alpha_r is loaded into A4 and alpha_i into A2 from the stack frame, and
 * the tile is accumulated into C:
 *   re += C1x * alpha_r - C1y * alpha_i
 *   im += C1y * alpha_r + C1x * alpha_i
 * (assuming MADD d,a,b,c = a + b*c and NMSUB d,a,b,c = a - b*c -- TODO
 * confirm against the macro definitions).
 * I is decremented here; this tile is entered at most once per column pair.
 */
.L210:							#	Write Back
#ifndef TRMMKERNEL
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A3, C13
	CVTU	A5, C12
	CVTU	A7, C14

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	ADD		C14, A7, C14

	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)

	MADD	B5, B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MADD	B6, B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, A3, C13		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C12, A5, C12
	SUB		C14, A7, C14

	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)

	MADD	B5, B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MADD	B6, B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)

#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, C13, A3		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	LD		A2, 160($sp)		#	load alpha_i
	ADD		C12, A5, C12
	SUB		C14, C14, A7
	
	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)

	MADD	B5, B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MADD	B6, B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
	LD		A4, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	ADD		C14, A7, C14
	NEG		C13, C13			#	imaginary parts are negated in this variant
	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	NEG		C14, C14

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	LD		B5, 0 * SIZE(CO2)
	LD		B6, 1 * SIZE(CO2)

	MADD	B5, B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MADD	B6, B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2

	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
#endif

#else
/*
 * TRMM half of the 1 x 2 write-back.  Same sign handling per conjugation
 * variant as the GEMM half, but the result is alpha * tile only: MUL starts
 * each output value, so the old contents of C are never loaded.
 * alpha_r is in A4 and alpha_i in A2.
 * After the stores, AO/BO are moved past the part of the panels this tile
 * did not consume (K - KK - MR or NR steps) and KK is advanced for LEFT.
 * The last two instructions step both column pointers to the next row.
 */
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A3, C13
	CVTU	A5, C12
	CVTU	A7, C14

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	ADD		C14, A7, C14

	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	MUL B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MUL B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, A3, C13		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C12, A5, C12
	SUB		C14, A7, C14

	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	MUL B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MUL B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)

#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, C13, A3		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	LD		A2, 160($sp)		#	load alpha_i
	ADD		C12, A5, C12
	SUB		C14, C14, A7
	
	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	MUL B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MUL B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2
	
	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
	LD		A4, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C12, C12, A5
	ADD		C14, A7, C14
	NEG		C13, C13			#	imaginary parts are negated in this variant
	NEG		C14, C14

	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	MUL B5, C12, A4
	ST		B1, 0 * SIZE(CO1)
	MUL B6, C14, A4
	ST		B2, 1 * SIZE(CO1)

	NMSUB	B5, B5, C14, A2
	MADD	B6, B6, C12, A2

	ST		B5, 0 * SIZE(CO2)
	ST		B6, 1 * SIZE(CO2)
#endif


#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1				#	minus the tile height (1 row)
#else
	daddiu	TEMP, TEMP, -2				#	minus the tile width (2 columns)
#endif
	dsll	L,    TEMP, ZBASE_SHIFT		#	remaining steps scaled for the 1-wide A panel
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT	#	remaining steps scaled for the 2-wide B panel

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif

#endif
	daddiu	CO1, CO1, 2 * SIZE			#	one complex element written per column
	daddiu	CO2, CO2, 2 * SIZE


	.align	4
/*
 * .L20: end of one pass over a pair of columns.
 * Decrements the column-pair counter J, makes B point at the B panel of the
 * next pair (BO has been advanced past the current one), advances KK by the
 * tile width for right-side TRMM, and loops back to .L24 while J > 0.
 */
.L20:
	daddiu	J, J, -1
	move	B, BO

#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2
#endif

	bgtz	J, .L24
	NOP							#	branch delay slot


	.align	4
/*
 * .L1: column remainder.  When N is odd one last single column is processed
 * starting at .L14; otherwise control goes to .L999.
 */
.L1:
	andi	J, N, 1
	blez	J, .L999
	NOP							#	branch delay slot

/*
 * .L14: start of the single-column pass.
 * I = M / 4 counts the 4-row tiles, AO restarts at A, CO1 points at the
 * column in C, and C is advanced by LDC in the branch delay slot (so it is
 * updated whether or not the branch to .L12 is taken).
 */
.L14:
	dsra	I, M, 2				#	I = M / 4 tiles of MR=4 rows
	move	AO, A				#	Reset A

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	move	CO1, C
	blez	I, .L12				#	fewer than 4 rows: go to the 2-row remainder
	daddu	C,   CO1, LDC		#	branch delay slot: C += LDC

	.align	4
/*
 * .L141: set up one 4-row x 1-column tile.
 *
 * TRMM build : position AO/BO from KK (or restart BO at B) and put the
 *              trip count for this tile in TEMP.
 * plain build: BO restarts at B and the trip count is K.
 *
 * Both paths zero the eight accumulators used by this tile
 * (C11, C21, C31, C41 and C13, C23, C33, C43), preload A1..A4 from AO and
 * B1/B2 from BO, build B3/B4 with PLU, touch the destination with FETCH,
 * and leave L = count >> 2 for the 4-step loop at .L1410.
 */
.L141:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	L,    KK, 2 + ZBASE_SHIFT	#	A offset for KK steps, MR=4
	dsll	TEMP, KK, ZBASE_SHIFT		#	B offset for KK steps, NR=1

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif
	MTC		$0, C11				#	clear result registers
	MOV		C21, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C31, C11
	MOV		C41, C11

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MOV		C13, C11
	MOV		C23, C11

	FETCH	$0, 0 * SIZE(CO1)
	MOV		C33, C11
	MOV		C43, C11

	FETCH	$0, 8 * SIZE(CO1)
	PLU		B3,	B1, B1
	PLU		B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 4						#	define Mr=4
#else
	daddiu	TEMP, KK, 1						#	define	NR=1
#endif
	dsra	L,  TEMP, 2					#	L = TEMP / 4 loop passes
	blez	L, .L142
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	L = K / 4 loop passes (loop body covers 4 k-steps)

	MTC		$0, C11				#	clear result registers
	MOV		C21, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C31, C11
	MOV		C41, C11

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MOV		C13, C11
	MOV		C23, C11

	FETCH	$0, 0 * SIZE(CO1)
	MOV		C33, C11
	MOV		C43, C11

	FETCH	$0, 8 * SIZE(CO1)
	PLU		B3,	B1, B1
	blez	L, .L142
	PLU		B4, B2, B2			#	executed in the branch delay slot
#endif

/*
 * .L1410: main accumulation loop of the 4 x 1 tile, L passes of four steps.
 * Step 1: A1..A4 with B1 / B3      Step 2: A5..A8 with B2 / B4
 * Step 3: A1..A4 with B5 / B7      Step 4: A5..A8 with B6 / B8
 * The direct B value feeds C11, C21, C31, C41 and the PLU-derived one feeds
 * C13, C23, C33, C43.  A registers are reloaded between uses and the final
 * PLU sits in the delay slot of the loop branch, so statement order matters.
 * On exit A1..A4 and B1..B4 already hold the data for the next k-step.
 */
.L1410:
	daddiu	L, L, -1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	gsLQC1(R12, F7, F6, 3)		#	A7 A8
	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12, F13)

	gsLQC1(R12, F1, F0, 4)		#	A1 A2
	MADPS	C11, C11, A5, B2
	MADPS	C21, C21, A6, B2

	gsLQC1(R12, F3, F2, 5)		#	A3 A4
	MADPS	C31, C31, A7, B2
	MADPS	C41, C41, A8, B2
	daddiu	BO, BO, 2 * 4 * SIZE	#	step BO past the two B quadwords of this pass

	MADPS	C13, C13, A5, B4
	MADPS	C23, C23, A6, B4

	MADPS	C33, C33, A7, B4
	MADPS	C43, C43, A8, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6
	
	MADPS	C11, C11, A1, B5
	MADPS	C21, C21, A2, B5
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	gsLQC1(R12, F7, F6, 7)		#	A7 A8
	MADPS	C31, C31, A3, B5
	MADPS	C41, C41, A4, B5

	daddiu	AO, AO, 8 * 4 * SIZE 	#	step AO past the eight A quadwords of this pass
	MADPS	C13, C13, A1, B7
	MADPS	C23, C23, A2, B7

	MADPS	C33, C33, A3, B7
	MADPS	C43, C43, A4, B7
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 for the next pass (BO already advanced)

	gsLQC1(R12, F1, F0, 0)		#	A1 A2 for the next pass (AO already advanced)
	MADPS	C11, C11, A5, B6
	MADPS	C21, C21, A6, B6

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C31, C31, A7, B6
	MADPS	C41, C41, A8, B6

	MADPS	C13, C13, A5, B8
	MADPS	C23, C23, A6, B8

	MADPS	C33, C33, A7, B8
	MADPS	C43, C43, A8, B8

	PLU		B3,	B1, B1
	bgtz	L, .L1410
	PLU		B4, B2, B2			#	branch delay slot


	.align	4
/*
 * .L142: two-step tail of the 4 x 1 tile, taken when bit 1 of the trip
 * count (K, or TEMP for TRMM) is set.
 * Step 1 uses A1..A4 with B1/B3, step 2 uses A5..A8 with B2/B4.
 * Before BO is advanced, the quadword at offset 1 is loaded so that B1
 * (F8) already holds the B value for a possible final step at .L147; the
 * other half of that load lands in F13 and is not used in this block.
 */
.L142:
#ifndef TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L147
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	gsLQC1(R12, F7, F6, 3)		#	A7 A8
	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	daddiu	AO, AO, 4 * 4 * SIZE 	#	step AO past the four A quadwords used here

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3
	gsLQC1(R13, F13, F8, 1)	#	next B1 into F8 (F13 receives the other half)

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MADPS	C11, C11, A5, B2
	MADPS	C21, C21, A6, B2

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C31, C31, A7, B2
	MADPS	C41, C41, A8, B2
	daddiu	BO, BO, 4 * SIZE	#	step BO past the B quadword used here

	MADPS	C13, C13, A5, B4
	MADPS	C23, C23, A6, B4

	MADPS	C33, C33, A7, B4
	MADPS	C43, C43, A8, B4
	PLU		B3,	B1, B1

	
	.align	4
/*
 * .L147: final single step of the 4 x 1 tile, taken when bit 0 of the trip
 * count (K, or TEMP for TRMM) is set.  Consumes A1..A4 with B1 and B3 and
 * advances AO/BO by half of what the two-step tail at .L142 uses.
 */
.L147:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L140
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	daddiu	BO, BO, 2 * SIZE

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	daddiu	AO, AO, 2 * 4 * SIZE

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3


	.align	4
/*
 * .L140: write-back of the 4 x 1 tile, non-TRMM (GEMM) prelude.
 * Decrements the tile counter I and uses CVTU to copy a part of each of the
 * eight live accumulators into A1..A8 (presumably the upper single; the
 * macro is defined outside this chunk):
 *   A1..A4 <- C11, C21, C31, C41      A5..A8 <- C13, C23, C33, C43
 * Only these eight accumulators are cleared and updated for this tile, and
 * every conjugation variant that follows overwrites B1..B8 with LD before
 * reading them, so no CVTU into B1..B8 from C12..C44 is needed here.
 */
.L140:							#	Write Back
#ifndef TRMMKERNEL
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C31
	CVTU	A4, C41

	CVTU	A5, C13
	CVTU	A6, C23

	CVTU	A7, C33
	CVTU	A8, C43

/*
 * GEMM write-back variants for the 4 x 1 tile; exactly one is compiled in.
 * On entry A1..A4 / A5..A8 hold the CVTU parts of C11..C41 / C13..C43.
 * Each variant applies its sign pattern so that Cx1 becomes the real and
 * Cx3 the imaginary part of row x, then reuses A1/A2 for alpha_r/alpha_i
 * (the alpha loads are placed after the last use of the old A1/A2 values).
 * The four complex elements at CO1 are then updated as
 *   re += Cx1 * alpha_r - Cx3 * alpha_i
 *   im += Cx3 * alpha_r + Cx1 * alpha_i
 * (assuming MADD d,a,b,c = a + b*c and NMSUB d,a,b,c = a - b*c -- TODO
 * confirm against the macro definitions).
 */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	load alpha_r
	SUB		C41, C41, A4
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C13, A5, C13		#	ad'+'cb
	ADD		C23, A6, C23
	ADD		C33, A7, C33
	ADD		C43, A8, C43

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C13, A5, C13		#	ad'+'cb
	SUB		C23, A6, C23
	SUB		C33, A7, C33
	SUB		C43, A8, C43

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
	SUB		C13, C13, A5		#	ad'+'cb
	SUB		C23, C23, A6
	SUB		C33, C33, A7
	SUB		C43, C43, A8

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	AC'+'BD
	SUB		C21, C21, A2
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	LOAD ALPHA_R
#	LD		A1, 0 * SIZE(A)		#	LOAD ALPHA_R
	SUB		C41, C41, A4
	LD		A2, 160($sp)		#	LOAD ALPHA_I
#	LD		A2, 0 * SIZE(A)		#	LOAD ALPHA_I

	ADD		C13, A5, C13		#	AD'+'CB
	ADD		C23, A6, C23
	ADD		C33, A7, C33
	ADD		C43, A8, C43
	NEG		C13, C13		#	imaginary parts are negated in this variant
	NEG		C23, C23
	NEG		C33, C33
	NEG		C43, C43


	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B5, 4 * SIZE(CO1)
	LD		B7, 6 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)
	LD		B6, 5 * SIZE(CO1)
	LD		B8, 7 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = ALPHA_R
	MADD	B3, B3, C21, A1
	MADD	B5, B5, C31, A1
	MADD	B7, B7, C41, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	MADD	B6, B6, C33, A1
	MADD	B8, B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = ALPHA_I
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif

#else
/*
 * TRMM prelude of the 4 x 1 write-back.
 * Decrements the tile counter I and uses CVTU to copy a part of each of the
 * eight live accumulators into A1..A8 (presumably the upper single; the
 * macro is defined outside this chunk):
 *   A1..A4 <- C11, C21, C31, C41      A5..A8 <- C13, C23, C33, C43
 * Only these eight accumulators are cleared and updated for this tile, and
 * every conjugation variant that follows overwrites B1..B8 with MUL before
 * reading them, so no CVTU into B1..B8 from C12..C44 is needed here.
 */
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C31
	CVTU	A4, C41

	CVTU	A5, C13
	CVTU	A6, C23

	CVTU	A7, C33
	CVTU	A8, C43

/*
 * TRMM write-back variants for the 4 x 1 tile; exactly one is compiled in.
 * Same sign handling as the GEMM variants, but each output starts from MUL,
 * so the stored value is alpha * tile and the old contents of C are never
 * loaded.  A1/A2 are reused for alpha_r/alpha_i after their CVTU values
 * have been consumed.
 * After the stores, AO/BO are moved past the part of the panels this tile
 * did not consume and KK is advanced for LEFT; the block ends with the
 * 4-row tile loop branch, whose delay slot steps CO1 to the next tile.
 */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	load alpha_r
	SUB		C41, C41, A4
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C13, A5, C13		#	ad'+'cb
	ADD		C23, A6, C23
	ADD		C33, A7, C33
	ADD		C43, A8, C43

	MUL B1, C11, A1		#	A1 = alpha_r
	MUL B3, C21, A1
	MUL B5, C31, A1
	MUL B7, C41, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	MUL B6, C33, A1
	MUL B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	SUB		C13, A5, C13		#	ad'+'cb
	SUB		C23, A6, C23
	SUB		C33, A7, C33
	SUB		C43, A8, C43

	MUL B1, C11, A1		#	A1 = alpha_r
	MUL B3, C21, A1
	MUL B5, C31, A1
	MUL B7, C41, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	MUL B6, C33, A1
	MUL B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	ADD		C31, A3, C31
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	ADD		C41, A4, C41
	LD		A2, 160($sp)		#	load alpha_i
	SUB		C13, C13, A5		#	ad'+'cb
	SUB		C23, C23, A6
	SUB		C33, C33, A7
	SUB		C43, C43, A8

	MUL B1, C11, A1		#	A1 = alpha_r
	MUL B3, C21, A1
	MUL B5, C31, A1
	MUL B7, C41, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	MUL B6, C33, A1
	MUL B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	AC'+'BD
	SUB		C21, C21, A2
	SUB		C31, C31, A3
	LD		A1, 152($sp)		#	LOAD ALPHA_R
#	LD		A1, 0 * SIZE(A)		#	LOAD ALPHA_R
	SUB		C41, C41, A4
	LD		A2, 160($sp)		#	LOAD ALPHA_I
#	LD		A2, 0 * SIZE(A)		#	LOAD ALPHA_I

	ADD		C13, A5, C13		#	AD'+'CB
	ADD		C23, A6, C23
	ADD		C33, A7, C33
	ADD		C43, A8, C43
	NEG		C13, C13		#	imaginary parts are negated in this variant
	NEG		C23, C23
	NEG		C33, C33
	NEG		C43, C43

	MUL B1, C11, A1		#	A1 = ALPHA_R
	MUL B3, C21, A1
	MUL B5, C31, A1
	MUL B7, C41, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	MUL B6, C33, A1
	MUL B8, C43, A1
	NMSUB	B1, B1, C13, A2		#	A2 = ALPHA_I
	NMSUB	B3, B3, C23, A2
	NMSUB	B5, B5, C33, A2
	NMSUB	B7, B7, C43, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2
	MADD	B6, B6, C31, A2
	MADD	B8, B8, C41, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B5, 4 * SIZE(CO1)
	ST		B7, 6 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
	ST		B6, 5 * SIZE(CO1)
	ST		B8, 7 * SIZE(CO1)
#endif


#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -4				#	minus the tile height (4 rows)
#else
	daddiu	TEMP, TEMP, -1				#	minus the tile width (1 column)
#endif

	dsll	L,    TEMP, 2 + ZBASE_SHIFT	#	remaining steps scaled for the 4-wide A panel
	dsll	TEMP, TEMP, ZBASE_SHIFT		#	remaining steps scaled for the 1-wide B panel

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4
#endif

#endif
	bgtz	I, .L141					#	more 4-row tiles in this column
	daddiu	CO1, CO1, 8 * SIZE			#	branch delay slot: four complex elements written

	.align	4
/*
 * .L12: row remainder of the single-column pass.  When bit 1 of M is set a
 * 2-row tile is processed starting at .L121; otherwise control goes to .L11.
 */
.L12:
	andi	I, M, 2				#	I != 0 when a 2-row tile (MR=2) remains
	blez	I, .L11
	NOP							#	branch delay slot

	.align	4
/*
 * .L121: set up the 2-row x 1-column tile.
 *
 * TRMM build : position AO/BO from KK (or restart BO at B) and put the
 *              trip count for this tile in TEMP.
 * plain build: BO restarts at B and the trip count is K.
 *
 * Both paths zero the accumulators C11, C21, C13, C23, preload A1/A2 from
 * AO and B1/B2 from BO, build B3/B4 with PLU, touch the destination with
 * FETCH, and leave L = count >> 2 for the 4-step loop at .L1210.
 */
.L121:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	L, KK, 1 + ZBASE_SHIFT		#	A offset for KK steps, MR=2
	dsll	TEMP, KK,  ZBASE_SHIFT		#	B offset for KK steps, NR=1

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	MTC		$0, C11				#	clear result registers
	MOV		C21, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11
	MOV		C23, C11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 8 * SIZE(CO1)
	
	PLU		B3,	B1, B1
	PLU		B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2					#	MR=2
#else
	daddiu	TEMP, KK, 1					#	NR=1
#endif
	dsra	L,  TEMP, 2					#	L = TEMP / 4 loop passes
	blez	L, .L122
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	L = K / 4 loop passes (loop body covers 4 k-steps)

	MTC		$0, C11				#	clear result registers
	MOV		C21, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11
	MOV		C23, C11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 8 * SIZE(CO1)
	
	PLU		B3,	B1, B1
	blez	L, .L122
	PLU		B4, B2, B2			#	executed in the branch delay slot
#endif

/*
 * .L1210: main accumulation loop of the 2 x 1 tile, L passes of four steps.
 * Step 1: A1/A2 with B1 / B3      Step 2: A3/A4 with B2 / B4
 * Step 3: A5/A6 with B5 / B7      Step 4: A7/A8 with B6 / B8
 * The direct B value feeds C11, C21 and the PLU-derived one feeds C13, C23.
 * Loads are interleaved with the multiply-adds and the final PLU sits in
 * the delay slot of the loop branch, so statement order matters.
 * On exit A1/A2 and B1..B4 already hold the data for the next k-step.
 */
.L1210:
	daddiu	L, L, -1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12, F13)
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	gsLQC1(R12, F5, F4, 2)		#	A5 A6
	PLU		B7,	B5, B5
	PLU		B8, B6, B6
	daddiu	BO, BO, 2 * 4 * SIZE	#	step BO past the two B quadwords of this pass

	MADPS	C11, C11, A3, B2
	MADPS	C21, C21, A4, B2

	gsLQC1(R12, F7, F6, 3)		#	A7 A8
	MADPS	C13, C13, A3, B4
	MADPS	C23, C23, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	daddiu	AO, AO, 4 * 4 * SIZE 	#	step AO past the four A quadwords of this pass

	gsLQC1(R13, F9, F8, 0)		#	B1 B2 for the next pass (BO already advanced)
	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	gsLQC1(R12, F1, F0, 0)		#	A1 A2 for the next pass (AO already advanced)
	MADPS	C11, C11, A7, B6
	MADPS	C21, C21, A8, B6

	MADPS	C13, C13, A7, B8
	MADPS	C23, C23, A8, B8

	PLU		B3,	B1, B1
	bgtz	L, .L1210
	PLU		B4, B2, B2			#	branch delay slot


	.align	4
/*
 * .L122: two-step tail of the 2 x 1 tile, taken when bit 1 of the trip
 * count (K, or TEMP for TRMM) is set.
 * Step 1 uses A1/A2 with B1/B3, step 2 uses A3/A4 with B2/B4; afterwards
 * A1/A2, B1/B2 and B3 are reloaded for a possible final step at .L127.
 * B5..B8 take no part in this tail (no B5/B6 load happens here), so no PLU
 * into B7 is issued.
 */
.L122:
#ifndef	TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L127
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	daddiu	BO, BO, 1 * 4 * SIZE	#	step BO past the B quadword used here

	daddiu	AO, AO, 2 * 4 * SIZE	#	step AO past the two A quadwords used here
	MADPS	C11, C11, A3, B2
	MADPS	C21, C21, A4, B2

	MADPS	C13, C13, A3, B4
	MADPS	C23, C23, A4, B4

	gsLQC1(R13, F9, F8, 0)		#	B1 B2 from the advanced BO
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 from the advanced AO
	PLU		B3, B1, B1

	.align	4
.L127:
#ifndef	TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L120
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	daddiu	BO, BO, 2 * SIZE
	daddiu	AO, AO, 4 * SIZE

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	.align	4
/*
 * .L120: write-back for the tile accumulated in C11/C21/C13/C23.
 *
 * Steps, identical in the plain and the TRMMKERNEL build:
 *   1. I is decremented and CVTU copies a second value out of each
 *      accumulator into A1..A4 (presumably the upper half of the
 *      paired-single register -- confirm against the CVTU definition).
 *   2. Exactly one of the four conjugation groups below is expected to
 *      be compiled in.  It combines each accumulator with its CVTU copy
 *      by ADD or SUB so that C11/C21 hold the real parts and C13/C23
 *      hold the imaginary parts of the two products.
 *   3. alpha_r is loaded from 152($sp) and alpha_i from 160($sp).
 *   4. The result is formed in B1..B4 and stored at CO1 offsets
 *      0, 2 (real parts) and 1, 3 (imaginary parts), in units of SIZE.
 *
 * Assuming MADD d,a,b,c computes a + b*c and NMSUB d,a,b,c computes
 * a - b*c, the update is
 *      re = base + C_re * alpha_r - C_im * alpha_i
 *      im = base + C_im * alpha_r + C_re * alpha_i
 * where base is the value loaded from CO1 in the plain kernel.  The
 * TRMMKERNEL build starts with MUL instead, so C is not read.
 *
 * After the TRMMKERNEL stores, AO/BO are advanced past the unused part
 * of the panels and KK is updated.  Finally CO1 and CO2 both advance
 * by 4*SIZE.
 */
.L120:							#	Write Back
#ifndef	TRMMKERNEL
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C13
	CVTU	A4, C23


#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	/* Current contents of C: offsets 0/2 pair with C11/C21, 1/3 with C13/C23. */
	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, A3, C13		#	ad'+'cb
	SUB		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, C13, A3		#	ad'+'cb
	SUB		C23, C23, A4
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	LD		A2, 160($sp)		#	load alpha_i
	
	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	/* Both factors conjugated: the imaginary sums are negated. */
	NEG		C13, C13		#	ad'+'cb
	NEG		C23, C23

	LD		B1, 0 * SIZE(CO1)
	LD		B3, 2 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)
	LD		B4, 3 * SIZE(CO1)

	MADD	B1, B1, C11, A1		#	A1 = alpha_r
	MADD	B3, B3, C21, A1
	MADD	B2, B2, C13, A1
	MADD	B4, B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif

#else
	/* TRMMKERNEL build: same combination, but C is overwritten (MUL first). */
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A2, C21

	CVTU	A3, C13
	CVTU	A4, C23


#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	MUL B1, C11, A1		#	A1 = alpha_r
	MUL B3, C21, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)

	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, A3, C13		#	ad'+'cb
	SUB		C23, A4, C23
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	MUL B1, C11, A1		#	A1 = alpha_r
	MUL B3, C21, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	ADD		C21, A2, C21
	SUB		C13, C13, A3		#	ad'+'cb
	SUB		C23, C23, A4
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	LD		A2, 160($sp)		#	load alpha_i
	
	MUL B1, C11, A1		#	A1 = alpha_r
	MUL B3, C21, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	SUB		C21, C21, A2
	ADD		C13, A3, C13		#	ad'+'cb
	ADD		C23, A4, C23
	LD		A1, 152($sp)		#	load alpha_r
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A2, 160($sp)
#	LD		A2, 0 * SIZE(A)		#	load alpha_i
	/* Both factors conjugated: the imaginary sums are negated. */
	NEG		C13, C13		#	ad'+'cb
	NEG		C23, C23

	MUL B1, C11, A1		#	A1 = alpha_r
	MUL B3, C21, A1
	MUL B2, C13, A1
	MUL B4, C23, A1
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	NMSUB	B3, B3, C23, A2
	MADD	B2, B2, C11, A2
	MADD	B4, B4, C21, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B3, 2 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
	ST		B4, 3 * SIZE(CO1)
#endif
	/*
	 * TRMM pointer fix-up.  For these LEFT/TRANSA combinations:
	 * TEMP = K - KK - 2 (LEFT) or K - KK - 1 (otherwise), then
	 * AO += TEMP << (1 + ZBASE_SHIFT) and BO += TEMP << ZBASE_SHIFT.
	 */
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -1
#endif
	dsll	L, TEMP, 1 + ZBASE_SHIFT
	dsll	TEMP, TEMP, ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	/* KK advances by the two elements just written. */
	daddiu	KK, KK, 2
#endif

#endif
	/* Step both C column pointers past the four SIZE-wide values stored. */
	daddiu	CO1, CO1, 4 * SIZE
	daddiu	CO2, CO2, 4 * SIZE


	.align	4
/*
 * .L11: test bit 0 of M.  When it is clear there is no final
 * single-element tile and control skips to .L10; otherwise execution
 * falls through into .L111.
 * NOTE(review): the NOP is presumably the branch delay slot -- confirm.
 */
.L11:
	andi	I, M, 1
	blez	I, .L10
	NOP

	.align	4
/*
 * .L111: set-up for the last tile, which uses only accumulators C11/C13.
 *
 * Both build variants:
 *   - point BO at the B panel (TRMMKERNEL may also offset AO and BO by
 *     KK << ZBASE_SHIFT, depending on LEFT/TRANSA),
 *   - clear C11 with MTC from $0 and copy it into C13,
 *   - preload B1/B2 through R13 (the register number of BO) and A1/A2
 *     through R12 (the register number of AO) with gsLQC1, a
 *     hand-encoded .word instruction (presumably a Loongson quad-word
 *     FP load -- confirm against the CPU manual),
 *   - touch CO1 with FETCH, which HEAD defines as ld, into $0,
 *   - build B3/B4 from B1/B2 with PLU,
 *   - set L to the trip count shifted right by 2 (four K steps per loop
 *     pass) and skip to .L112 when that is zero or negative.
 *
 * In the plain build the final PLU sits directly after blez, presumably
 * in the branch delay slot so that it executes on both paths -- confirm
 * the file is assembled in noreorder mode.
 */
.L111:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip KK elements (scaled by ZBASE_SHIFT) in both panels. */
	dsll	TEMP, KK,  ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, B,  TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11

	FETCH	$0, 0 * SIZE(CO1)

	PLU		B3,	B1, B1
	PLU		B4, B2, B2
	/* TEMP = trip count for this tile; both non-subtract branches add 1 to KK. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	L,  TEMP, 2
	blez	L, .L112
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	L = K >> 2 (loop is unrolled 4x)

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MOV		C13, C11

	FETCH	$0, 0 * SIZE(CO1)

	PLU		B3,	B1, B1
	blez	L, .L112
	PLU		B4, B2, B2
#endif

/*
 * .L1110: main K loop for the C11/C13 tile, four K steps per pass.
 *
 * On entry A1/A2 and B1/B2 are already loaded and B3/B4 hold the PLU
 * copies of B1/B2.  Each pass:
 *   - loads the next pair of B values into $f12/$f13 (B5/B6) and the
 *     next pair of A values into A3/A4 using gsLQC1 offset 1, before
 *     the pointers move,
 *   - advances BO and AO by 2 * 4 * SIZE each,
 *   - accumulates A1*B1, A2*B2, A3*B5, A4*B6 into C11 and the same A
 *     values times the PLU copies B3, B4, B7, B8 into C13,
 *   - reloads A1/A2 and B1/B2 from offset 0 of the advanced pointers
 *     and rebuilds B3/B4 for the next pass.
 * The last PLU follows bgtz directly, presumably in the branch delay
 * slot -- confirm noreorder mode.
 */
.L1110:
	daddiu	L, L, -1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6
	MADPS	C11, C11, A1, B1

	gsLQC1(R12, F3, F2, 1)		#	A3 A4
	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 2 * 4 * SIZE	#	four steps, 2 * SIZE of B each

	PLU		B7,	B5, B5
	PLU		B8, B6, B6
	daddiu	AO, AO, 2 * 4 * SIZE 	#	four steps, 2 * SIZE of A each

	MADPS	C11, C11, A2, B2
	MADPS	C13, C13, A2, B4

	MADPS	C11, C11, A3, B5
	MADPS	C13, C13, A3, B7

	gsLQC1(R13, F9, F8, 0)		#	B1 B2
	MADPS	C11, C11, A4, B6

	gsLQC1(R12, F1, F0, 0)		#	A1 A2
	MADPS	C13, C13, A4, B8

	PLU		B3,	B1, B1
	bgtz	L, .L1110
	PLU		B4, B2, B2


	.align	4
/*
 * .L112: two-step K remainder for the C11/C13 tile.
 *
 * Taken when bit 1 of the trip count (K, or TEMP under TRMMKERNEL) is
 * set; otherwise control skips to .L117.  Uses the preloaded A1/A2,
 * B1/B2 and their PLU copies B3/B4, advances BO and AO by 4*SIZE, then
 * reloads A1/A2 and B1/B2 and rebuilds B3 only.  B4 is not rebuilt;
 * the following .L117 step reads just A1, B1 and B3.
 */
.L112:
#ifndef	TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L117
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 4 * SIZE
	daddiu	AO, AO, 4 * SIZE

	MADPS	C11, C11, A2, B2
	MADPS	C13, C13, A2, B4

	/* Refill from the advanced pointers for a possible final odd step. */
	gsLQC1(R13, F9, F8, 0)
	gsLQC1(R12, F1, F0, 0)
	PLU		B3,	B1, B1


	.align	4
/*
 * .L117: last odd K step for the C11/C13 tile.
 *
 * Taken when bit 0 of the trip count (K, or TEMP under TRMMKERNEL) is
 * set; otherwise control goes to the .L110 write-back.  Advances BO and
 * AO by 2*SIZE each and accumulates A1*B1 into C11 and A1*B3 into C13.
 */
.L117:
#ifndef	TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L110
	NOP

	daddiu	BO, BO, 2 * SIZE
	daddiu	AO, AO, 2 * SIZE

	MADPS	C11, C11, A1, B1
	MADPS	C13, C13, A1, B3


	.align	4
/*
 * .L110: write-back for the tile accumulated in C11/C13.
 *
 * I is decremented, then CVTU copies a second value out of C11 and C13
 * into A1 and A3 (presumably the upper half of the paired-single
 * register -- confirm against the CVTU definition).  One conjugation
 * group combines them by ADD/SUB so that C11 holds the real part and
 * C13 the imaginary part of the product.
 *
 * alpha_r is loaded into A4 (not A1) from 152($sp) and alpha_i into A2
 * from 160($sp).  Assuming MADD d,a,b,c computes a + b*c and
 * NMSUB d,a,b,c computes a - b*c:
 *      B1 = base + C11 * alpha_r - C13 * alpha_i  -> 0 * SIZE(CO1)
 *      B2 = base + C13 * alpha_r + C11 * alpha_i  -> 1 * SIZE(CO1)
 * where base is the value loaded from CO1 in the plain kernel; the
 * TRMMKERNEL build starts with MUL, so C is not read.
 *
 * After the TRMMKERNEL stores, AO/BO are advanced and KK is updated.
 * Finally CO1 and CO2 both advance by 2*SIZE.
 */
.L110:							#	Write Back
#ifndef	TRMMKERNEL
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A3, C13

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, A3, C13		#	ad'+'cb
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i

	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, C13, A3		#	ad'+'cb
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
	
	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
	NEG		C13, C13
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)

	LD		B1, 0 * SIZE(CO1)
	LD		B2, 1 * SIZE(CO1)

	MADD	B1, B1, C11, A4		#	A4 = alpha_r
	MADD	B2, B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif

#else
	/* TRMMKERNEL build: same combination, but C is overwritten (MUL first). */
	daddiu	I, I, -1
	CVTU	A1, C11
	CVTU	A3, C13

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/*	(a + bi) * (c + di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
#	LD		A1, 0 * SIZE(A)		#	load alpha_r
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i
#	LD		A2, 0 * SIZE(A)		#	load alpha_i

	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
	/*	(a + bi) * (c - di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, A3, C13		#	ad'+'cb
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i

	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif

#if	  defined(RN) || defined(RT) || defined(CN) || defined(CT)
	/*	(a - bi) * (c + di) */
	ADD		C11, A1, C11		#	ac'+'bd
	SUB		C13, C13, A3		#	ad'+'cb
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)		#	load alpha_i

	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
	/*	(a - bi) * (c - di) */
	SUB		C11, C11, A1		#	ac'+'bd
	ADD		C13, A3, C13		#	ad'+'cb
	NEG		C13, C13
	LD		A4, 152($sp)		#	load alpha_r
	LD		A2, 160($sp)

	MUL B1, C11, A4		#	A4 = alpha_r
	MUL B2, C13, A4
	NMSUB	B1, B1, C13, A2		#	A2 = alpha_i
	MADD	B2, B2, C11, A2

	ST		B1, 0 * SIZE(CO1)
	ST		B2, 1 * SIZE(CO1)
#endif


	/*
	 * TRMM pointer fix-up.  For these LEFT/TRANSA combinations:
	 * TEMP = K - KK - 1 (both branches subtract 1), then AO and BO
	 * each advance by TEMP << ZBASE_SHIFT.
	 */
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	TEMP, TEMP, ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	/* KK advances by the one element just written. */
	daddiu	KK, KK, 1
#endif

#endif
	/* Step both C column pointers past the two SIZE-wide values stored. */
	daddiu	CO1, CO1, 2 * SIZE
	daddiu	CO2, CO2, 2 * SIZE


	.align	4
/*
 * .L10: end of this column pass.  B takes the current value of BO, and
 * in the TRMMKERNEL build without LEFT, KK is incremented by 1.
 * Execution then falls through into the .L999 epilogue.
 */
.L10:
	move	B, BO
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 1
#endif

.L999:
/*
 * .L999: function epilogue.  Restores the registers from the frame of
 * STACKSIZE bytes, releases the frame and returns through $31.
 *
 * Frame slots read here (byte offsets from $sp):
 *     0 ..  48   $16 .. $22
 *    56 ..  88   $f24 .. $f28
 *    96 .. 112   $23 .. $25        (TRMMKERNEL only)
 *   120 .. 144   $f20 .. $f23      (only when __64BIT__ is not defined)
 *
 * NOTE(review): HEAD also names $f29..$f31 as accumulators (C34, C43,
 * C44) but they are not restored here -- confirm against the prologue
 * and the target ABI whether they need saving.
 */
	ld	$16,   0($sp)
	ld	$17,   8($sp)
	ld	$18,  16($sp)
	ld	$19,  24($sp)
	ld	$20,  32($sp)
	ld	$21,  40($sp)
	ld	$22,  48($sp)

	LD	$f24, 56($sp)
	LD	$f25, 64($sp)
	LD	$f26, 72($sp)
	LD	$f27, 80($sp)
	LD	$f28, 88($sp)

#if defined(TRMMKERNEL)
	/* OFFSET, KK and TEMP live in $23..$25 in the TRMM build. */
	ld	$23,  96($sp)
	ld	$24, 104($sp)
	ld	$25, 112($sp)
#endif

#ifndef __64BIT__
	LD	$f20,120($sp)
	LD	$f21,128($sp)
	LD	$f22,136($sp)
	LD	$f23,144($sp)
#endif

	/* Pop the frame and return; the nop follows the jump (presumably its delay slot). */
	daddiu	$sp,$sp,STACKSIZE
	j	$31
	nop

	EPILOGUE
