fetch
dup
and1
and2
multiply
shift
or

pack
store

0	u	fetch A
	v	or C
1	u	fetch B
	v	or D
2	u	dup A
	v	dup B
3	u	shift C
	v	and1 A
4	u	multiply A		(not available until clock 7)
	v	and1 B
5	u	multiply B		(not available until clock 8)
	v	shift D
6	u	and2 A
	v	and2 B
7	u	or A			(multiply A available)
	v	pack CD
8	u	store CD		(multiply B available)
	v	or B
9	u	fetch C
	v	shift A
10	u	fetch D
	v	shift B
11	u	dup C
	v	pack AB
12	u	store AB
	v	dup D
13	u	and1 C
	v	and1 D
14	u	multiply C		(not available until clock 0)
	v	and2 C
15	u	multiply D		(not available until clock 1)
	v	and2 D
16	u	update loop cntr
	v	loop back



; Software-pipelined schedule with the MMX instruction chosen for each slot.
; Columns: clock, pipe (u/v), operation, instruction, register bookkeeping.
; Register map used throughout:
;   A = mm0, copy of A = mm1, B = mm2, copy of B = mm3
;   C = mm4, copy of C = mm1, D = mm5, copy of D = mm3
;   mm6 = and1 mask, mm7 = and2 mask, MULT = pmaddwd multiplier
; Clocks 0-8 finish C/D (fetched at clocks 9-10 of the previous pass) while
; A/B are started; mm1/mm3 are only reused after "or C"/"or D" free them.
; NOTE(review): every fetch reads [esi+ebp*2] and both stores write
; [edi+ebp]; the per-quadword displacements still have to be filled in.
; NOTE(review): there is no MMX-register form of packusdw, so packssdw is
; used; this presumably relies on each shifted dword fitting in 0..7FFFh --
; confirm against the mask and MULT constants.
0	u	fetch A		movq		mm0,[esi+ebp*2]	;allocate 0
	v	or C			por		mm4,mm1		;free 1
1	u	fetch B		movq		mm2,[esi+ebp*2]	;allocate 2
	v	or D			por		mm5,mm3		;free 3
2	u	dup A			movq		mm1,mm0		;allocate 1
	v	dup B			movq		mm3,mm2		;allocate 3
3	u	shift C		psrlq		mm4,6
	v	and1 A			pand		mm0,mm6
4	u	multiply A		pmaddwd	mm0,MULT
	v	and1 B			pand		mm2,mm6
5	u	multiply B		pmaddwd	mm2,MULT
	v	shift D		psrlq		mm5,6
6	u	and2 A			pand		mm1,mm7
	v	and2 B			pand		mm3,mm7
7	u	or A			por		mm0,mm1		;free 1
	v	pack CD		packssdw	mm4,mm5		;free 5
8	u	store CD		movq		[edi+ebp],mm4		;free 4
	v	or B			por		mm2,mm3		;free 3
9	u	fetch C		movq		mm4,[esi+ebp*2]	;allocate 4
	v	shift A		psrlq		mm0,6
10	u	fetch D		movq		mm5,[esi+ebp*2]	;allocate 5
	v	shift B		psrlq		mm2,6
11	u	dup C			movq		mm1,mm4		;allocate 1
	v	pack AB		packssdw	mm0,mm2		;free 2
12	u	store AB		movq		[edi+ebp],mm0		;free 0
	v	dup D			movq		mm3,mm5		;allocate 3
13	u	and1 C			pand		mm4,mm6
	v	and1 D			pand		mm5,mm6
14	u	multiply C		pmaddwd	mm4,MULT
	v	and2 C			pand		mm1,mm7
15	u	multiply D		pmaddwd	mm5,MULT
	v	and2 D			pand		mm3,mm7
16	u	update loop cntr	sub		ebp,16
	v	loop back		jne		looptop

;------------------------------------------------------------------------------
| b0 g0 r0 b1 | g1 r1 b2 g2 | r2 b3 g3 r3 | b4 g4 r4 b5 | g5 r5 b6 g6 | r6 b7 g7 r7 |


	; Expand eight packed 24-bit pixels (quadwords src0..src2, memory order
	; b0 g0 r0 b1 g1 r1 ...) into four quadwords dest0..dest3, each holding
	; two 32-bit pixels with a zero pad byte.
	; Register comments list bytes most-significant first; ".." is a zero byte.
	; Mask values this sequence needs (the constants are not defined here):
	;   MASK1 = MASK3 = 0000000000FFFFFFh	(bytes 2..0)
	;   MASK2         = 0000FFFFFF000000h	(bytes 5..3)
	;   MASK4         = 00000000FFFFFF00h	(bytes 3..1)
	;   MASK5         = 00FFFFFF00000000h	(bytes 6..4)

	; dest0 = pixels 0,1 (all of both pixels lives in src0)
	movq	mm0,[src0]		;mm0 = g2b2r1g1b1r0g0b0
	movq	mm1,mm0		;mm1 = g2b2r1g1b1r0g0b0
	movq	mm2,mm0		;mm2 = g2b2r1g1b1r0g0b0
	pand	mm0,MASK1		;mm0 = ..........r0g0b0
	pand	mm1,MASK2		;mm1 = ....r1g1b1......
	psrlq	mm2,48			;mm2 = ............g2b2
	psllq	mm1,8			;mm1 = ..r1g1b1........
	por	mm0,mm1		;mm0 = ..r1g1b1..r0g0b0
	movq	[dest0],mm0

	; dest1 = pixels 2,3 (pixel 2 straddles src0/src1)
	movq	mm3,[src1]		;mm3 = b5r4g4b4r3g3b3r2
	movq	mm4,mm3		;mm4 = b5r4g4b4r3g3b3r2
	movq	mm5,mm3		;mm5 = b5r4g4b4r3g3b3r2
	psllq	mm4,16			;mm4 = g4b4r3g3b3r2....
	por	mm2,mm4		;mm2 = g4b4r3g3b3r2g2b2
	pand	mm2,MASK3		;mm2 = ..........r2g2b2
	pand	mm5,MASK4		;mm5 = ........r3g3b3..
	psllq	mm5,24			;mm5 = ..r3g3b3........ (bytes 3..1 -> 6..4)
	por	mm2,mm5		;mm2 = ..r3g3b3..r2g2b2
	movq	[dest1],mm2

	; dest2 = pixels 4,5 (pixel 5 straddles src1/src2)
	movq	mm1,mm3		;mm1 = b5r4g4b4r3g3b3r2
	psrlq	mm3,32			;mm3 = ........b5r4g4b4
	psrlq	mm1,24			;mm1 = ......b5r4g4b4r3 (b5 lands in byte 4)
	movq	mm2,[src2]		;mm2 = r7g7b7r6g6b6r5g5
	movq	mm0,mm2		;mm0 = r7g7b7r6g6b6r5g5
	psllq	mm0,40			;mm0 = b6r5g5..........
	pand	mm3,MASK1		;mm3 = ..........r4g4b4
	por	mm1,mm0		;mm1 = b6r5g5b5r4g4b4r3
	pand	mm1,MASK5		;mm1 = ..r5g5b5........
	por	mm1,mm3		;mm1 = ..r5g5b5..r4g4b4
	movq	[dest2],mm1

	; dest3 = pixels 6,7 (both entirely inside src2, still held in mm2)
	movq	mm0,mm2		;mm0 = r7g7b7r6g6b6r5g5
	psrlq	mm2,16			;mm2 = ....r7g7b7r6g6b6
	psrlq	mm0,8			;mm0 = ..r7g7b7r6g6b6r5
	pand	mm2,MASK1		;mm2 = ..........r6g6b6
	pand	mm0,MASK5		;mm0 = ..r7g7b7........
	por	mm0,mm2		;mm0 = ..r7g7b7..r6g6b6
	movq	[dest3],mm0



src0		g2b2r1g1b1r0g0b0
src1		b5r4g4b4r3g3b3r2
src2		r7g7b7r6g6b6r5g5

src0->a0	..........r0g0b0
src0->a1	b2r1g1b1r0g0b0..
a1->a2		..r1g1b1........
a0+a2->dst0	..r1g1b1..r0g0b0

src0->b0	............g2b2
src1->b1	g4b4r3g3b3r2....
b0+b1->b2	g4b4r3g3b3r2g2b2
src1->b3	b4r3g3b3r2......
b2->b4		..........r2g2b2
b3+b4->dst1	b4r3g3b3r2r2g2b2

src1->c1	........b5r4g4b4
src1->c2	......b5r4g4b4r3
src2->c3	b6r5g5..........
c2+c3->c4	b6r5g5b5r4g4b4r3
c4->c5		b6r5g5b5........
c1+c5->dst2	b6r5g5b5b5r4g4b4

src2->d1	....r7g7b7r6g6b6
src2->d2	..r7g7b7r6g6b6r5
d2->d3		..r7g7b7........
d1->d4		..........r6g6b6
d3+d4->dst3	..r7g7b7..r6g6b6
