	.386
	.mmx
	.model	flat

	extern	_MMX_enabled:byte
	extern	_FPU_enabled:byte

_TEXT64	segment public use32 para 'CODE'

;Weight tables for the MMX bilinear column loop.  Each entry is one qword
;holding the same 16-bit weight in all four word lanes, and the loop indexes
;them with the 4-bit horizontal fraction f = x_accum>>28 (table + f*8).
;
;bilinMMX_tab1[f] = f; it is multiplied into the right-hand pixels.
bilinMMX_tab1	dq	0000000000000000h
		dq	0001000100010001h
		dq	0002000200020002h
		dq	0003000300030003h
		dq	0004000400040004h
		dq	0005000500050005h
		dq	0006000600060006h
		dq	0007000700070007h
		dq	0008000800080008h
		dq	0009000900090009h
		dq	000a000a000a000ah
		dq	000b000b000b000bh
		dq	000c000c000c000ch
		dq	000d000d000d000dh
		dq	000e000e000e000eh
		dq	000f000f000f000fh

;bilinMMX_tab2[f] = 16-f; it is multiplied into the left-hand pixels.
bilinMMX_tab2	dq	0010001000100010h
		dq	000f000f000f000fh
		dq	000e000e000e000eh
		dq	000d000d000d000dh
		dq	000c000c000c000ch
		dq	000b000b000b000bh
		dq	000a000a000a000ah
		dq	0009000900090009h
		dq	0008000800080008h
		dq	0007000700070007h
		dq	0006000600060006h
		dq	0005000500050005h
		dq	0004000400040004h
		dq	0003000300030003h
		dq	0002000200020002h
		dq	0001000100010001h

;'zero' has no reference in the code visible in this file.
;'sixteen' is the value the MMX path subtracts y_frac from to get 16-y_frac.
zero	dq	0000000000000000h
sixteen	dq	0010001000100010h

;**************************************************************************
;
;asm_resize_nearest(
;	[esp+ 4] Pixel32 *dst + width,
;	[esp+ 8] Pixel32 *src,
;	[esp+12] ulong -width*4,
;	[esp+16] ulong height,
;	[esp+20] ulong srcpitch,
;	[esp+24] ulong dstpitch,
;	[esp+28] ulong xaccum,
;	[esp+32] ulong yaccum,
;	[esp+36] ulong xfrac,
;	[esp+40] ulong yfrac,
;	[esp+44] ulong xistep,
;	[esp+48] ulong yistep);
;
;**************************************************************************

	public	_asm_resize_nearest

;Point-sampled stretch of 32-bit pixels (cdecl, arguments on the stack as
;listed in the header comment).
;
;The source address is kept shifted right by two in esi (its low two bits
;are discarded) so that a single ADC can add the integer pixel step plus
;the carry out of the 32-bit x fraction; pixels are fetched via [esi*4].
;ebp starts at the value of [esp+12] and is stepped by 4 until it reaches
;zero, so the destination is addressed as [edi+ebp].
;
;ebp/edi/esi/ebx are saved and restored; eax, ecx, edx and the flags are
;clobbered.  The source pointer, y accumulator and row counter are updated
;in place in the caller's argument slots.

_asm_resize_nearest:
	push	ebp
	push	edi
	push	esi
	push	ebx

	mov	esi,[esp+ 8+16]		;esi = source row pointer
	mov	edi,[esp+ 4+16]		;edi = destination pointer (dst + width)

rowloop_nearest:
	shr	esi,2			;esi = source address / 4
	mov	ebp,[esp+12+16]		;ebp = column counter (counts up to 0)

	;EAX	pixel being copied
	;EBX	x accumulator (fraction)
	;ECX	x fractional increment
	;EDX	x integer increment
	;ESI	source address / 4
	;EDI	destination
	;EBP	loop counter

	mov	edx,[esp+44+16]
	mov	ecx,[esp+36+16]
	mov	ebx,[esp+28+16]
colloop_nearest:
	mov	eax,[esi*4]		;fetch source pixel
	add	ebx,ecx			;advance x fraction; CF = carry into integer part
	adc	esi,edx			;source += integer step + carry
	mov	[edi+ebp],eax		;store pixel (MOV leaves flags untouched)
	add	ebp,4
	jne	colloop_nearest

	mov	esi,[esp+ 8+16]		;reload source pointer
	mov	eax,[esp+32+16]		;get y accumulator

	add	esi,[esp+48+16]		;add integer source bump
	add	eax,[esp+40+16]		;add y fraction; CF set when it wraps

	sbb	ebx,ebx			;ebx = -1 if need fractional step
	mov	[esp+32+16],eax		;store new y accumulator

	;The fractional step moves the SOURCE down one row, so it must use
	;srcpitch ([esp+20]); the destination advances by dstpitch ([esp+24]).
	and	ebx,[esp+20+16]		;ebx = srcpitch if the fraction wrapped, else 0
	add	edi,[esp+24+16]		;advance dest to next row

	add	esi,ebx			;add y fractional step
	mov	ebp,[esp+16+16]		;get y counter

	mov	[esp+ 8+16],esi		;store new source ptr
	dec	ebp			;decrement y counter

	mov	[esp+16+16],ebp		;store y counter (MOV keeps ZF from DEC)
	jne	rowloop_nearest		;continue until all rows done

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	ret

	public	_asm_resize_bilinear

;**************************************************************************
;
;asm_resize_bilinear(
;	[esp+ 4] void *dst,
;	[esp+ 8] void *src,
;	[esp+12] ulong w,
;	[esp+16] ulong h,
;	[esp+20] ulong dstpitch,
;	[esp+24] ulong srcpitch,
;	[esp+28] ulong xaccum,
;	[esp+32] ulong yaccum,
;	[esp+36] ulong xfrac,
;	[esp+40] ulong yfrac,
;	[esp+44] long xint,
;	[esp+48] long yint,
;	[esp+52] ulong xprecopy,
;	[esp+56] ulong xpostcopy,
;	[esp+60] void *srclimit);
;
;**************************************************************************
;
;Entry point.  Jumps to the MMX variant when bit 0 of _MMX_enabled is set,
;else to the FPU variant when bit 0 of _FPU_enabled is set, else falls
;through to the integer-only implementation below.
;
;Integer implementation: the top four bits of each 32-bit accumulator are
;used as a weight 0..15 (x_frac, y_frac) and 16 minus that as the
;complementary weight.  Red/blue (mask 00ff00ffh) and green (mask 0000ff00h)
;are blended in separate registers; the four weight products sum to 256, so
;the result is shifted right by 8.
;
;w, xprecopy and xpostcopy are used as counters that are stepped by +4 until
;they reach zero, and dst is indexed as [dst+counter].
;NOTE(review): this implies the caller passes negated byte counts and a
;right-edge dst pointer, although the header says "ulong w" -- confirm.
;
;The dst, src, yaccum and h argument slots are updated in place.


_asm_resize_bilinear:
	test	_MMX_enabled,1
	jnz	asm_resize_bilinear_MMX

        test	_FPU_enabled,1
	jnz	asm_resize_bilinear_FPU

	push	ebp
	push	edi
	push	esi
	push	ebx

	sub	esp,32

	;compute y_frac, y_frac2

	mov	eax,[esp+32+16+32]
	mov	ebx,16
	shr	eax,28
	mov	[esp+24],eax
	sub	ebx,eax
	mov	[esp+28],ebx

;
;	[esp+28] y_frac2
;	[esp+24] y_frac
;	[esp+20] dest ptr
;	[esp+16] tstore2
;	[esp+12] tstore1
;	[esp+ 8] x_accum
;	[esp+ 4] x_frac2
;	[esp+ 0] x_frac

rowloop_bilinear:
	mov	ebp,[esp+12+16+32]		;ebp = -w*4
	mov	eax,[esp+28+16+32]		;compute x_frac, x_frac2
	mov	ebx,16
	mov	[esp+8],eax
	shr	eax,28
	mov	[esp],eax			;x_frac
	sub	ebx,eax
	mov	[esp+4],ebx			;x_frac2 = 16-x_frac

	mov	esi,[esp+ 8+16+32]		;load source ptr
	mov	edi,[esp+24+16+32]		;load source pitch

	mov	edx,[esp+4+16+32]		;load dest pointer
	cmp	esi,[esp+60+16+32]		;src >= srclimit bound for 2lines?

	sbb	ecx,ecx				;ecx=-1 if not
	mov	[esp+20],edx			;store in temporary

	;edi becomes 0 when src >= srclimit (unsigned), so the "next row"
	;fetches at [esi*4+edi] read the current row again.
	and	edi,ecx

	;keep the source address divided by 4 so ADC can step it in pixels
	shr	esi,2
	mov	ecx,[esp+52+16+32]		;load precopy value
	or	ecx,ecx
	jz	colloop_bilinear_start

	;do precopy
	;dest base for the precopy is dst + [esp+12]; the counter is xprecopy

	add	edx,ebp
	mov	[esp+20],edx
	mov	ebp,ecx

	call	bilinear_prepostcopy

	;restore the column counter and dest base for the main loop
	mov	ebp,[esp+12+16+32]
	mov	edx,[esp+4+16+32]
	mov	[esp+20],edx

colloop_bilinear_start:
	or	ebp,ebp
	jz	bilinear_check_postcopy
colloop_bilinear:
	;top row: blend left ([esi*4]) and right ([esi*4+4]) horizontally,
	;then scale by y_frac2
	mov	eax,[esi*4]
	mov	ecx,[esi*4+4]
	mov	ebx,eax
	mov	edx,ecx
	and	eax,00ff00ffh
	and	ebx,0000ff00h
	and	ecx,00ff00ffh
	and	edx,0000ff00h
	imul	eax,[esp+4]		;x_frac2
	imul	ebx,[esp+4]		;x_frac2
	imul	ecx,[esp]		;x_frac
	imul	edx,[esp]		;x_frac
	add	eax,ecx
	add	ebx,edx
	imul	eax,[esp+28]		;y_frac2
	imul	ebx,[esp+28]		;y_frac2
	mov	[esp+12],eax		;tstore1
	mov	[esp+16],ebx		;tstore2

	;bottom row (offset by edi): same horizontal blend, scaled by y_frac
	mov	eax,[esi*4+edi]
	mov	ecx,[esi*4+edi+4]
	mov	ebx,eax
	mov	edx,ecx
	and	eax,00ff00ffh
	and	ebx,0000ff00h
	and	ecx,00ff00ffh
	and	edx,0000ff00h
	imul	eax,[esp+4]		;x_frac2
	imul	ebx,[esp+4]		;x_frac2
	imul	ecx,[esp]		;x_frac
	imul	edx,[esp]		;x_frac
	add	eax,ecx
	add	ebx,edx
	imul	eax,[esp+24]		;y_frac
	imul	ebx,[esp+24]		;y_frac
	add	eax,[esp+12]		;tstore1
	add	ebx,[esp+16]		;tstore2

	;weights total 256: red/blue >>8, green (now in bits 16..23) >>8
	shr	eax,8
	and	ebx,00ff0000h
	shr	ebx,8
	and	eax,00ff00ffh

	or	eax,ebx			;[data write ] u
	mov	edx,[esp+20]		;[data write ] v

	mov	ebx,[esp+8]		;[frac update] u x_accum
	mov	ecx,[esp+36+16+32]	;[frac update] v xfrac

	mov	[edx+ebp],eax		;[data write ] u
	add	ebx,ecx			;[frac update] v: update x_accum

	adc	esi,[esp+44+16+32]	;[frac update] v: update source pointer [2 cycles]
	mov	[esp+8],ebx		;[frac update] u: store x_accum

	shr	ebx,28			;[frac update] u: x_frac = x_accum>>28
	mov	eax,16			;[frac update] v:

	sub	eax,ebx			;[frac update] u: x_frac2 = 16 - x_frac
	mov	[esp],ebx		;[frac update] v: store x_frac

	mov	[esp+4],eax		;[frac update] u: store x_frac2

	add	ebp,4
	jne	colloop_bilinear

bilinear_check_postcopy:
	mov	ebp,[esp+56+16+32]		;check for postcopy
	or	ebp,ebp
	jz	bilinear_no_postcopy

	;dest base for the postcopy = current dest base minus the counter
	sub	edx,ebp
	mov	[esp+20],edx

	call	bilinear_prepostcopy

bilinear_no_postcopy:
	mov	eax,[esp+32+16+32]		;load yaccum
	mov	edx,[esp+ 4+16+32]		;load destination ptr

	add	edx,[esp+20+16+32]		;next destination line
	add	eax,[esp+40+16+32]		;add yfrac increment to yaccum

	sbb	ebx,ebx				;ebx=-1 if need fractional src increment
	mov	esi,[esp+ 8+16+32]		;reload source ptr

	add	esi,[esp+48+16+32]		;add integral source increment
	and	ebx,[esp+24+16+32]		;ebx = fractional src delta

	add	esi,ebx				;add fractional source increment
	mov	[esp+ 4+16+32],edx		;store destination ptr
	mov	[esp+ 8+16+32],esi		;store source ptr
	mov	[esp+32+16+32],eax		;store yaccum
	;recompute y_frac / y_frac2 for the next row
	shr	eax,28
	mov	ebx,16
	mov	[esp+24],eax
	sub	ebx,eax
	mov	[esp+28],ebx

	dec	dword ptr [esp+16+16+32]	;next line!!
	jne	rowloop_bilinear

	add	esp,32

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	ret

;Vertical-only blend used by the integer bilinear routine for its precopy
;and postcopy runs.  Reached with CALL from inside that routine, so every
;local/argument offset carries an extra +4 for the return address.
;
;In:	esi = source address / 4, edi = offset to the second row (or 0),
;	ebp = column counter (stepped by 4 until zero),
;	caller's [esp+20] = dest base, [esp+8] = x_accum,
;	[esp+24]/[esp+28] = y_frac / y_frac2.
;Out:	esi and x_accum advanced; caller's x_frac/x_frac2 refreshed.
;Clobbers eax, ebx, ecx, edx, ebp, flags.
;
;Only the two vertical weights are applied and they sum to 16, so each
;channel is scaled by 16 rather than 256: red/blue end up 4 bits up and
;green occupies bits 12..19.

bilinear_prepostcopy:
	mov	eax,[esi*4]		;top pixel
	mov	ecx,[esi*4+edi]		;bottom pixel
	mov	ebx,eax
	mov	edx,ecx
	and	eax,00ff00ffh
	and	ebx,0000ff00h
	and	ecx,00ff00ffh
	and	edx,0000ff00h
	imul	eax,[esp+28+4]		;y_frac2
	imul	ebx,[esp+28+4]		;y_frac2
	imul	ecx,[esp+24+4]		;y_frac
	imul	edx,[esp+24+4]		;y_frac
	add	eax,ecx
	add	ebx,edx

	shr	eax,4
	and	ebx,000ff000h		;green is in bits 12..19 here (x16, not x256)
	shr	ebx,4
	and	eax,00ff00ffh

	or	eax,ebx			;[data write ] u
	mov	edx,[esp+20+4]		;[data write ] v

	mov	ebx,[esp+8+4]		;[frac update] u x_accum
	mov	ecx,[esp+36+16+36]	;[frac update] v xfrac

	mov	[edx+ebp],eax		;[data write ] u
	add	ebx,ecx			;[frac update] v: update x_accum

	adc	esi,[esp+44+16+36]	;[frac update] v: update source pointer [2 cycles]
	mov	[esp+8+4],ebx		;[frac update] u: store x_accum

	shr	ebx,28			;[frac update] u: x_frac = x_accum>>28
	mov	eax,16			;[frac update] v:

	sub	eax,ebx			;[frac update] u: x_frac2 = 16 - x_frac
	mov	[esp+0+4],ebx		;[frac update] v: store x_frac

	mov	[esp+4+4],eax		;[frac update] u: store x_frac2

	add	ebp,4
	jne	bilinear_prepostcopy

	ret

;
;
;******* FPU optimized version.
;
;

;Bias constants for the FPU path.  Adding 2^(63+n) to a non-negative value
;under 64-bit precision with round-down leaves floor(value / 2^n) in the
;low bits of the 64-bit mantissa, which the code then reads back from the
;stored real10 as two dwords.
;
;real80_adjust	real10	4611686018427387904.0			;2^62
;real80_adjust	real10	9223372036854775808.0			;2^63
;real80_adjust	real10	13835058055282163712.0			;1.5 * 2^63
;real80_adjust	real10	1180591620717411303424.0	;shift right by 7
real80_adjust	real10	2361183241434822606848.0	;shift right by 8 (2^71); weights sum to 256
real80_adjust16	real10	147573952589676412928.0		;shift right by 4 (2^67); weights sum to 16

;x87 implementation of asm_resize_bilinear; reached by a jump from
;_asm_resize_bilinear, so the stack still holds the return address followed
;by the 15 arguments.
;
;Each pixel is split into red/blue (low dword) and green (high dword) of a
;64-bit integer, loaded with FILD, weighted with four multiplies, and biased
;with real80_adjust so that the stored real10 mantissa holds the result
;already shifted right by 8.
;
;The routine builds a 32-byte aligned frame, copies the arguments into it
;and works on the copies; the original esp is kept at [esp+LOCALS-4].
;REP MOVSD is used for the copy.
;NOTE(review): this relies on the direction flag being clear on entry.
;
;The x87 stack holds, top first: x_frac x_frac2 y_frac y_frac2 bias
;(labelled x1 x2 y1 y2 cv in the comments below).

asm_resize_bilinear_FPU:
	push	ebp
	push	edi
	push	esi
	push	ebx

	mov	eax,esp
	and	esp,-32

LOCALS=92

	sub	esp,LOCALS
	mov	[esp+LOCALS-4],eax

        ;copy down parameters.

	mov	esi,eax
	lea	edi,[esp+24]
	add	esi,20			;skip 4 saved registers + return address
	mov	ecx,15
	rep	movsd

	;flip the FPU into 80-bit, round-down mode.
	;(control word bits 8-9 = precision, bits 10-11 = rounding)

	fstcw	[esp+84]
	mov	eax,[esp+84]
	and	eax,0fffff0ffh
	or	eax,000000700h
	mov	[esp],eax
	fldcw	[esp]

	;compute y_frac and y_frac2.

	mov	eax,[esp+52]
	mov	ebx,16
	shr	eax,28
	mov	[esp],eax
	sub	ebx,eax
	mov	[esp+4],ebx

;	ffree	st(0)
;	ffree	st(1)
;	ffree	st(2)
;	ffree	st(3)
;	ffree	st(4)
;	ffree	st(5)
;	ffree	st(6)
;	ffree	st(7)

	;prime FPU stack.

	fld	real80_adjust
	fild	dword ptr [esp+4]
	fild	dword ptr [esp]

;******************************************************
;
;	[esp+ 88] original esp
;	[esp+ 84] old FP control word
;	[esp+ 80] srclimit
;	[esp+ 76] xpostcopy
;	[esp+ 72] xprecopy
;	[esp+ 68] yint
;	[esp+ 64] xint
;	[esp+ 60] yfrac
;	[esp+ 56] xfrac
;	[esp+ 52] yaccum
;	[esp+ 48] xaccum
;	[esp+ 44] srcstride
;	[esp+ 40] dststride
;	[esp+ 36] height
;	[esp+ 32] width
;	[esp+ 28] src
;	[esp+ 24] dst
;	[esp+ 20] xaccum'
;	[esp+ 16]
;	[esp+ 12]
;	[esp+  8]
;	[esp+  4]
;	[esp+  0]

	mov	edx,[esp+24]		;load dest. pointer
rowloop_bilinear_FPU:
	mov	ebp,[esp+32]		;load width count
	mov	eax,[esp+48]		;copy xaccum
	mov	[esp+20],eax		;xaccum' = xaccum

	mov	esi,[esp+28]		;load source pointer
	mov	edi,[esp+44]		;load source stride
	cmp	esi,[esp+80]		;can we access the next scanline?

	sbb	ebx,ebx			;ebx=-1 if yes
	shr	esi,2			;divide source ptr by 4 (!)
	and	edi,ebx			;kill source stride if there's no next scanline

	;precompute x_frac, x_frac2

	mov	eax,[esp+20]
	shr	eax,28
	mov	ebx,16
	mov	[esp],eax
	sub	ebx,eax
	mov	[esp+4],ebx

	fild	dword ptr [esp+4]	;stack: x2 y1 y2 cv
	fild	dword ptr [esp]		;stack: x1 x2 y1 y2 cv

	;check for precopy

	mov	ecx,[esp+72]
	or	ecx,ecx
	jz	colloop_bilinear_start_FPU

	;do precopy

	add	edx,ebp			;precopy dest base = dst + width count
	mov	ebp,ecx

	call	bilinear_prepostcopy_FPU

	mov	ebp,[esp+32]		;reload width count
	sub	edx,ebp			;undo the add above: edx = dst again

colloop_bilinear_start_FPU:
	or	ebp,ebp
	jz	bilinear_check_postcopy_FPU

colloop_bilinear_FPU:
	mov	eax,[esi*4]
	mov	ecx,[esi*4+edi]
	mov	ebx,eax
	and	eax,00ff00ffh
	and	ebx,0000ff00h
	mov	[esp+0],eax
	mov	[esp+4],ebx
	mov	ebx,ecx

	fild	qword ptr [esp+0]	;stack: tl x1 x2 y1 y2 cv

	and	ecx,00ff00ffh
	and	ebx,0000ff00h

	mov	[esp+8],ecx
	mov	[esp+12],ebx

	fmul	st,st(4)		;stack: (tl*y2) x1 x2 y1 y2 cv
	fild	qword ptr [esp+8]	;stack: bl (tl*y2) x1 x2 y1 y2 cv

	mov	eax,[esi*4+4]
	mov	ecx,[esi*4+edi+4]

	mov	ebx,eax
	and	eax,00ff00ffh

	fmul	st,st(4)		;stack: (bl*y1) (tl*y2) x1 x2 y1 y2 cv

	and	ebx,0000ff00h
	mov	[esp+0],eax

	mov	[esp+4],ebx
	mov	ebx,ecx

	faddp	st(1),st		;stack: (bl*y1+tl*y2) x1 x2 y1 y2 cv
	fild	qword ptr [esp+0]	;stack: tr (bl*y1+tl*y2) x1 x2 y1 y2 cv

	and	ecx,00ff00ffh
	and	ebx,0000ff00h

	mov	[esp+8],ecx
	mov	[esp+12],ebx

	fmul	st,st(5)		;stack: (tr*y2) (bl*y1+tl*y2) x1 x2 y1 y2 cv
	fild	qword ptr [esp+8]	;stack: br (tr*y2) (bl*y1+tl*y2) x1 x2 y1 y2 cv

	mov	ebx,[esp+20]		;[frac update] u x_accum
	mov	ecx,[esp+56]		;[frac update] v x_inc

	mov	eax,[esp+64]		;[frac update] u xint
	add	ebx,ecx			;[frac update] v

	fmul	st,st(5)		;stack: (br*y1) (tr*y2) (bl*y1+tl*y2) x1 x2 y1 y2 cv

	adc	esi,eax			;[frac update] u: update source pointer
	mov	[esp+20],ebx		;[frac update] v

	shr	ebx,28			;[frac update] u
	mov	eax,16			;[frac update] v

	faddp	st(1),st		;stack: (br*y1+tr*y2) (bl*y1+tl*y2) x1 x2 y1 y2 cv
	fxch	st(1)			;stack: (bl*y1+tl*y2) (br*y1+tr*y2) x1 x2 y1 y2 cv
	fmulp	st(3),st		;stack: (br*y1+tr*y2) x1 (x2*(bl*y1+tl*y2)) y1 y2 cv

	mov	[esp],ebx		;[frac update] u x_frac
	sub	eax,ebx			;[frac update] v x_frac2

	fmulp	st(1),st		;stack: ((br*y1+tr*y2)*x1) (x2*(bl*y1+tl*y2)) y1 y2 cv
	fxch	st(1)			;stack: (x2*(bl*y1+tl*y2)) ((br*y1+tr*y2)*x1) y1 y2 cv
	fadd	st,st(4)		;stack: (x2*(bl*y1+tl*y2))+cv ((br*y1+tr*y2)*x1) y1 y2 cv

	mov	[esp+4],eax		;[frac update] v x_frac2

	faddp	st(1),st		;stack: ((x2*(bl*y1+tl*y2))+((br*y1+tr*y2)*x1)+cv) y1 y2 cv

	fild	dword ptr [esp+0]	;stack: x1 res y1 y2 cv
	fild	dword ptr [esp+4]	;stack: x2 x1 res y1 y2 cv
	fxch	st(2)			;stack: res x1 x2 y1 y2 cv
	fstp	real10 ptr [esp+8]	;stack: x1 x2 y1 y2 cv

	;the low 8 bytes of the stored real10 are the mantissa:
	;[esp+8] = red/blue, [esp+12] = green (plus the bias bit, masked off)
	mov	eax,[esp+8]		;[data merge ] u
	mov	ecx,[esp+12]		;[data merge ] v

	and	eax,00ff00ffh
	and	ecx,0000ff00h

	or	eax,ecx			;[data write ] u

	mov	[edx+ebp],eax		;[data write ] u

	add	ebp,4
	jne	colloop_bilinear_FPU

bilinear_check_postcopy_FPU:

	mov	ebp,[esp+76]
	or	ebp,ebp
	jz	bilinear_no_postcopy_FPU

	sub	edx,ebp			;postcopy dest base = dst - postcopy count

	call	bilinear_prepostcopy_FPU

	;******************

bilinear_no_postcopy_FPU:
	mov	eax,[esp+52]		;load yaccum
	mov	edx,[esp+24]		;load dest. pointer

	add	edx,[esp+40]		;next destination line
	add	eax,[esp+60]		;add fractional y increment

	sbb	ebx,ebx			;ebx = -1 if fraction overflowed
	mov	esi,[esp+28]		;reload source ptr

	add	esi,[esp+68]		;add integer increment to source ptr
	and	ebx,[esp+44]		;ebx = fractional y bump

	add	esi,ebx			;bump source ptr if fraction overflowed
	mov	[esp+52],eax		;store yaccum

	shr	eax,28			;eax = y_frac
	mov	ebx,16

	mov	[esp],eax		;store y_frac
	sub	ebx,eax

	mov	[esp+28],esi		;store source ptr
	mov	[esp+4],ebx		;store y_frac2

	mov	[esp+24],edx

	;drop x1 x2 y1 y2 (keeping cv) and push the new y weights
	fstp	st(0)
	fstp	st(0)
	fstp	st(0)
	fstp	st(0)
	fild	dword ptr [esp+4]
	fild	dword ptr [esp+0]

	dec	dword ptr [esp+36]
	jne	rowloop_bilinear_FPU

	;ditch fp values

	fstp	st(0)
	fstp	st(0)
	fstp	st(0)

	;restore FPU rounding and precision

	fldcw	[esp+84]

	mov	esp,[esp+LOCALS-4]

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	ret

;Vertical-only blend used by asm_resize_bilinear_FPU for its precopy and
;postcopy runs.  Reached with CALL, so every frame offset carries an extra
;+4 for the return address.
;
;In:	esi = source address / 4, edi = offset to the second row (or 0),
;	edx = dest base, ebp = column counter (stepped by 4 until zero),
;	x87 stack = x1 x2 y1 y2 cv.
;Out:	esi and xaccum' advanced; x87 stack = x1 x2 y1 y2 cv again, with
;	x1 = x_frac on top and x2 = 16-x_frac below it, recomputed from the
;	updated xaccum'.
;Clobbers eax, ebx, ecx, ebp, flags and the caller's scratch at [esp+0..19].
;
;While the loop runs the bias is swapped for real80_adjust16 because only
;the two y weights (sum 16) are applied.

bilinear_prepostcopy_FPU:
	fld	real80_adjust16		;stack: cv' x1 x2 y1 y2 cv
	fxch	st(5)			;stack: cv x1 x2 y1 y2 cv'
	fstp	st(0)			;stack: x1 x2 y1 y2 cv'
	fstp	st(0)
	fstp	st(0)			;stack: y1 y2 cv'

colloop_bilinear_prepostcopy_FPU:
	mov	eax,[esi*4]
	mov	ecx,[esi*4+edi]
	mov	ebx,eax
	and	eax,00ff00ffh
	and	ebx,0000ff00h
	mov	[esp+0+4],eax
	mov	[esp+4+4],ebx
	mov	ebx,ecx

	fild	qword ptr [esp+0+4]	;stack: tl y1 y2 cv

	and	ecx,00ff00ffh
	and	ebx,0000ff00h

	mov	[esp+8+4],ecx
	mov	[esp+12+4],ebx

	fmul	st,st(2)		;stack: (tl*y2) y1 y2 cv
	fild	qword ptr [esp+8+4]	;stack: bl (tl*y2) y1 y2 cv
	fmul	st,st(2)		;stack: (bl*y1) (tl*y2) y1 y2 cv
	faddp	st(1),st		;stack: (bl*y1+tl*y2) y1 y2 cv

	mov	ebx,[esp+20+4]		;[frac update] u x_accum
	mov	ecx,[esp+56+4]		;[frac update] v x_inc

	mov	eax,[esp+64+4]		;[frac update] u xint
	add	ebx,ecx			;[frac update] v

	adc	esi,eax			;[frac update] u: update source pointer
	mov	[esp+20+4],ebx		;[frac update] v

	fadd	st,st(3)		;stack: (bl*y1+tl*y2)+cv y1 y2 cv

	fstp	real10 ptr [esp+8+4]	;stack: y1 y2 cv

	mov	eax,[esp+8+4]		;[data merge ] u
	mov	ecx,[esp+12+4]		;[data merge ] v

	and	eax,00ff00ffh
	and	ecx,0000ff00h

	or	eax,ecx			;[data write ] u

	mov	[edx+ebp],eax		;[data write ] u

	add	ebp,4
	jne	colloop_bilinear_prepostcopy_FPU

	fld	real80_adjust		;stack: cv y1 y2 cv'
	fxch	st(3)			;stack: cv' y1 y2 cv
	fstp	st(0)			;stack: y1 y2 cv

	;xaccum' lives at the caller's [esp+20], i.e. [esp+20+4] in here
	mov	ebx,[esp+20+4]		;[frac update] v

	shr	ebx,28			;[frac update] u
	mov	eax,16			;[frac update] v

	mov	[esp+0+4],ebx		;[frac update] u x_frac
	sub	eax,ebx			;[frac update] v x_frac2

	mov	[esp+4+4],eax		;[frac update] v x_frac2

	;push x_frac2 first so that x_frac ends up in st(0), which is the
	;order the caller's column loop expects
	fild	dword ptr [esp+4+4]	;stack: x2 y1 y2 cv
	fild	dword ptr [esp+0+4]	;stack: x1 x2 y1 y2 cv

	ret

;
;
;******* MMX optimized version.
;
;

;MMX implementation of asm_resize_bilinear; reached by a jump from
;_asm_resize_bilinear, so arguments are at [esp+N+16+32] once the four
;registers are pushed and the 32-byte local frame is allocated.
;
;Register roles inside a row:
;	eax	x accumulator		ebx	x fractional increment
;	ecx	x integer increment outside the column loop; inside it,
;		the table index x_accum>>28
;	edx	destination base	ebp	column counter (steps by 4 to 0)
;	esi	source address / 4	edi	offset to second row (or 0)
;	mm5	zero (for byte->word unpack)
;	mm6	y_frac2 x4		mm7	y_frac x4
;
;The column loop is software-pipelined: the lines tagged [last] finish and
;store the pixel computed in the previous iteration, which is why the
;store address is [edx+ebp-4].

asm_resize_bilinear_MMX:
	push	ebp
	push	edi
	push	esi
	push	ebx

	sub	esp,32

;******************************************************
;
;	[esp+28]
;	[esp+24]
;	[esp+20] y_frac2
;	[esp+16] y_frac2
;	[esp+12] y_frac
;	[esp+ 8] y_frac
;	[esp+ 4] sixteen
;	[esp+ 0] sixteen

	mov	eax,00100010h
	mov	[esp+0],eax
	mov	[esp+4],eax

bilinear_rowloop_MMX:
	mov		esi,[esp+ 8+16+32]		;esi = source
	mov		edi,[esp+24+16+32]		;edi = source pitch

	cmp		esi,[esp+60+16+32]		;past two-line limit?
	sbb		ebx,ebx
	and		edi,ebx				;set pitch=0 if so

	mov		edx,[esp+4+16+32]		;edx = destination
	mov		ebx,[esp+36+16+32]		;ebx = fractional x increment
	mov		ecx,[esp+44+16+32]		;ecx = integer x increment

	;build y_frac and 16-y_frac as four-word vectors (eax is scratch here)
	mov		eax,[esp+32+16+32]		;eax = y accumulator
	shr		eax,28
	movd		mm5,eax
	punpcklwd	mm5,mm5
	movq		mm4,sixteen
	punpckldq	mm5,mm5
	psubw		mm4,mm5
	movq		[esp+8],mm5
	movq		[esp+16],mm4
	pxor		mm5,mm5

	;load the x accumulator only now: the y_frac setup above used eax as
	;scratch, and both the precopy helper and the column loop read eax
	mov		eax,[esp+28+16+32]		;eax = x accumulator

	shr		esi,2

	mov	ebp,[esp+52+16+32]		;load precopy value
	or	ebp,ebp
	jz	colloop_bilinear_start_MMX

	;do precopy

	add	edx,[esp+12+16+32]

	call	bilinear_prepostcopy_MMX

	mov	edx,[esp+4+16+32]

colloop_bilinear_start_MMX:
	mov	ebp,[esp+12+16+32]
	or	ebp,ebp
	jz	bilinear_check_postcopy_MMX

	movq		mm6,[esp+16]
	movq		mm7,[esp+8]

	;<------------- begin pre-entry phase ------------->


	mov		ecx,eax
	shr		ecx,28

	movd		mm0,[esi*4]		;mm0 = top left pixel
	movd		mm1,[esi*4+4]		;mm1 = top right pixel
	movd		mm3,[esi*4+edi+4]	;mm3 = bottom right pixel
	movd		mm2,[esi*4+edi]		;mm2 = bottom left pixel
	punpcklbw	mm0,mm5
	pmullw		mm0,[bilinMMX_tab2 + ecx*8]
	punpcklbw	mm1,mm5
	pmullw		mm1,[bilinMMX_tab1 + ecx*8]
	punpcklbw	mm2,mm5
	punpcklbw	mm3,mm5
	jmp		short bilinear_colloop_MMX_entry

	align		16

bilinear_colloop_MMX:
	movd		mm0,[esi*4]		;mm0 = top left pixel
	paddw		mm4,mm2			;[last]

	movd		mm1,[esi*4+4]		;mm1 = top right pixel
	psrlw		mm4,8			;[last]

	movd		mm3,[esi*4+edi+4]	;mm3 = bottom right pixel
	packuswb	mm4,mm4			;[last]

	movd		mm2,[esi*4+edi]		;mm2 = bottom left pixel
	punpcklbw	mm0,mm5

	pmullw		mm0,[bilinMMX_tab2 + ecx*8]
	punpcklbw	mm1,mm5

	pmullw		mm1,[bilinMMX_tab1 + ecx*8]
	punpcklbw	mm2,mm5

	movd		[edx+ebp-4],mm4		;[last]
	punpcklbw	mm3,mm5

bilinear_colloop_MMX_entry:
	pmullw		mm2,[bilinMMX_tab2 + ecx*8]
	movq		mm4,mm0

	pmullw		mm3,[bilinMMX_tab1 + ecx*8]
	paddw		mm4,mm1

	add		eax,ebx			;update x accumulator
	pmullw		mm4,mm6

	mov		ecx,[esp+44+16+32]
	paddw		mm2,mm3

	adc		esi,ecx			;update source address
	pmullw		mm2,mm7

	mov		ecx,eax
	;stall

	shr		ecx,28
	add		ebp,4

	jnz		bilinear_colloop_MMX

	;<-------------- begin exit phase -------------->

	paddw		mm4,mm2			;[last]
	psrlw		mm4,8			;[last]
	packuswb	mm4,mm4			;[last]
	movd		[edx+ebp-4],mm4		;[last]
	mov		ecx,[esp+44+16+32]	;ecx = integer x increment again


bilinear_check_postcopy_MMX:
	mov	ebp,[esp+56+16+32]		;check for postcopy
	or	ebp,ebp
	jz	bilinear_no_postcopy_MMX

	sub	edx,ebp

	call	bilinear_prepostcopy_MMX

	;********************************

bilinear_no_postcopy_MMX:
	mov	eax,[esp+32+16+32]		;eax = y accumulator
	mov	edx,[esp+ 4+16+32]		;reload destination pointer

	add	edx,[esp+20+16+32]		;advance to next destination line
	add	eax,[esp+40+16+32]		;add y fraction to y accumulator

	sbb	ebx,ebx				;ebx = -1 if we have a fractional increment
	mov	esi,[esp+ 8+16+32]		;reload source pointer

	add	esi,[esp+48+16+32]		;add y integer increment
	and	ebx,[esp+24+16+32]		;ebx = y fractional increment

	add	esi,ebx				;add y fractional increment
	mov	[esp+ 4+16+32],edx		;store destination pointer

	mov	[esp+ 8+16+32],esi		;store source pointer
	mov	[esp+32+16+32],eax		;store new y accumulator

	;y_frac / y_frac2 are rebuilt from the stored y accumulator at the top
	;of the row loop, so nothing else needs saving here.

	dec	dword ptr [esp+16+16+32]
	jne	bilinear_rowloop_MMX

	add	esp,32

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	emms
	ret

	align		16
;Vertical-only blend used by asm_resize_bilinear_MMX for its precopy and
;postcopy runs.  Reached with CALL, so local offsets carry an extra +4.
;
;In:	eax = x accumulator, ebx = x fractional increment,
;	ecx = x integer increment, esi = source address / 4,
;	edi = offset to the second row (or 0), edx = dest base,
;	ebp = column counter (stepped by 4 until zero), mm5 = zero.
;Out:	eax and esi advanced.  Clobbers ebp, flags, mm0, mm2, mm6, mm7.
;
;mm7 (x_accum>>28 replicated) and mm6 (16 minus that) are computed on every
;iteration but are not consumed by any instruction in this routine.
;The MMX instructions between "add ebp,4" and "jne" do not modify EFLAGS,
;so the branch still tests the result of that add.
bilinear_prepostcopy_MMX:
	movd		mm7,eax

	movq		mm6,[esp+4]

	psrld		mm7,28

	movd		mm0,[esi*4]		;mm0 = top left pixel
	punpcklwd	mm7,mm7

	movd		mm2,[esi*4+edi]		;mm2 = bottom left pixel
	punpckldq	mm7,mm7

	punpcklbw	mm0,mm5
	psubw		mm6,mm7

	punpcklbw	mm2,mm5

	add		eax,ebx			;update x accumulator

	adc		esi,ecx			;update source address

	pmullw		mm0,[esp+16+4]		;top * y_frac2
	add		ebp,4

	pmullw		mm2,[esp+8+4]		;[last] bottom * y_frac

	paddw		mm0,mm2			;[last]

	psrlw		mm0,4			;[last] weights sum to 16

	packuswb	mm0,mm0			;[last]

	movd		[edx+ebp-4],mm0		;[last]
	jne		bilinear_prepostcopy_MMX

	ret


;**************************************************************************
;
;void asm_bitmap_xlat1(
;	[esp+ 8] Pixel32 *src,
;	[esp+ 4] Pixel32 *dst,
;	[esp+16] PixOffset spitch,
;	[esp+12] PixOffset dpitch,
;	[esp+20] PixDim w,
;	[esp+24] PixDim h,
;	[esp+28] const Pixel8 *tbl);
;
;**************************************************************************

	public	_asm_bitmap_xlat1

;Runs the blue, green and red bytes of every pixel through one shared
;256-entry byte table and writes the result with the top byte cleared.
;The column counter at [esp+20] is stepped by 4 until it reaches zero and
;is used as an offset from both row pointers.  The row counter argument is
;decremented in place.  ebp/edi/esi/ebx are preserved.

_asm_bitmap_xlat1:
	push	ebp
	push	edi
	push	esi
	push	ebx

	mov	edx,[esp+28+16]		;edx = lookup table
	mov	edi,[esp+ 4+16]		;edi = destination row
	mov	esi,[esp+ 8+16]		;esi = source row

xlat1_next_row:
	mov	ebp,[esp+20+16]		;ebp = column offset, runs up to zero

xlat1_next_pixel:
	mov	eax,[esi+ebp]		;eax = xxRRGGBB
	movzx	ebx,al			;ebx = blue index
	movzx	ecx,ah			;ecx = green index
	shr	eax,16
	and	eax,000000ffh		;eax = red index

	mov	bl,[edx+ebx]		;ebx = 000000BB (translated)
	mov	cl,[edx+ecx]		;ecx = 000000GG (translated)
	mov	al,[edx+eax]		;eax = 000000RR (translated)

	shl	ecx,8			;ecx = 0000GG00
	shl	eax,16			;eax = 00RR0000
	or	eax,ebx
	or	eax,ecx			;eax = 00RRGGBB

	mov	[edi+ebp],eax		;store translated pixel

	add	ebp,4
	jne	xlat1_next_pixel

	add	edi,[esp+12+16]		;destination += dpitch
	add	esi,[esp+16+16]		;source += spitch

	dec	dword ptr [esp+24+16]	;one row done
	jnz	xlat1_next_row

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	ret

;**************************************************************************

;void asm_bitmap_xlat3(
;	[esp+ 4] Pixel32 *dst,
;	[esp+ 8] Pixel32 *src,
;	[esp+12] PixOffset dpitch,
;	[esp+16] PixOffset spitch,
;	[esp+20] PixDim w,
;	[esp+24] PixDim h,
;	[esp+28] const Pixel32 *tbl);

	public	_asm_bitmap_xlat3

;Per-channel translate through a table of 256 dwords: the blue byte of the
;source selects an entry whose low byte becomes the new blue, the green
;byte selects an entry whose bits 8..15 become the new green, and the red
;byte selects an entry whose bits 16..23 become the new red.  The top byte
;of the output is zero.  The column counter at [esp+20] is stepped by 4
;until it reaches zero; the row counter argument is decremented in place.
;ebp/edi/esi/ebx are preserved.

_asm_bitmap_xlat3:
	push	ebp
	push	edi
	push	esi
	push	ebx

	mov	edx,[esp+28+16]		;edx = lookup table
	mov	edi,[esp+ 4+16]		;edi = destination row
	mov	esi,[esp+ 8+16]		;esi = source row

xlat3_next_row:
	mov	ebp,[esp+20+16]		;ebp = column offset, runs up to zero

xlat3_next_pixel:
	mov	eax,[esi+ebp]		;eax = xxRRGGBB
	movzx	ebx,al			;ebx = blue index
	movzx	ecx,ah			;ecx = green index
	shr	eax,16
	and	eax,000000ffh		;eax = red index

	mov	ebx,[edx+ebx*4]		;table entry selected by blue
	mov	ecx,[edx+ecx*4]		;table entry selected by green
	mov	eax,[edx+eax*4]		;table entry selected by red

	and	ebx,000000ffh		;keep its blue lane
	and	ecx,0000ff00h		;keep its green lane
	and	eax,00ff0000h		;keep its red lane

	or	eax,ebx
	or	eax,ecx			;eax = 00RRGGBB

	mov	[edi+ebp],eax		;store translated pixel

	add	ebp,4
	jne	xlat3_next_pixel

	add	edi,[esp+12+16]		;destination += dpitch
	add	esi,[esp+16+16]		;source += spitch

	dec	dword ptr [esp+24+16]	;one row done
	jnz	xlat3_next_row

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	ret

_TEXT64	ends

	end
