;
; Huffyuv v2.1.1, by Ben Rudiak-Gould.
; http://www.math.berkeley.edu/~benrg/huffyuv.html
;
; This file is copyright 2000 Ben Rudiak-Gould, and distributed under
; the terms of the GNU General Public License, v2 or later.  See
; http://www.gnu.org/copyleft/gpl.html.
;

;
; This file makes heavy use of macros to define a bunch of almost-identical
; functions -- see huffyuv_a.h.
;

	BITS 32
	CPU 586

; The section alignment must be at least 32 for the 'align 32' directives
; below to be meaningful, because 'align' only pads relative to the start of
; the section; 256 is requested here.

SECTION	.text align=256

; Encoder tables, defined outside this file.  For each channel N the code
; below reads a byte from encodeN_shift[value] and a dword from
; encodeN_add_shifted[value].

	EXTERN	encode1_shift
	EXTERN	encode1_add_shifted
	EXTERN	encode2_shift
	EXTERN	encode2_add_shifted
	EXTERN	encode3_shift
	EXTERN	encode3_add_shifted

; Decoder tables, defined outside this file.  decodeN is read as a table of
; dwords which are then used as addresses; decodeN_shift is read as a table
; of bytes.

	EXTERN	decode1
	EXTERN	decode1_shift
	EXTERN	decode2
	EXTERN	decode2_shift
	EXTERN	decode3
	EXTERN	decode3_shift

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro	HUFF_CODEC_PROC_START	0
;; Shared prologue of the Huffman compress/decompress procedures.
;; Saves ebp, edi, esi and ebx, then loads the three cdecl stack arguments:
;;	esi = first argument, edi = second argument, ebp = third argument
;; and finally fetches the dword at [esi] into eax.

	push	ebp
	push	edi
	push	esi
	push	ebx

	; the four pushes above moved the arguments 16 bytes further from esp
	mov	ebp, [esp+16+12]
	mov	edi, [esp+16+8]
	mov	esi, [esp+16+4]
	mov	eax, [esi]

%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; HUFF_COMPRESS channel, index, back1, back2, increment
;;
;; Encodes one source byte and appends its code to the output bit stream.
;;   channel   - table prefix; channel_shift (bytes) and channel_add_shifted
;;               (dwords) are both indexed by the value being encoded
;;   index     - byte offset of the sample from esi; also used to make the
;;               local label unique within a procedure
;;   back1     - if nonzero, the byte back1 positions earlier is subtracted
;;   back2     - if nonzero, the byte back2 positions earlier is subtracted;
;;               if back1 is nonzero as well, the byte back1+back2 positions
;;               earlier is added back
;;   increment - added to esi; the sample is addressed as esi+index+increment
;;               before the addition takes place
;;
;; Registers: esi = source pointer, edi = output pointer, edx = bit buffer,
;;            ch = number of bits held in edx minus 32 (the procedures in
;;            this file start it at -32).
;; Clobbers eax, ebx, cl and the flags.
%macro	HUFF_COMPRESS	5
%define	CHANNEL		%1
%assign	INDEX		%2
%assign	BACK1		%3
%assign	BACK2 		%4
%assign	INCREMENT	%5
%define	BACK		BACK1 || BACK2

	; fetch the sample: into cl when a prediction has to be subtracted first,
	; otherwise straight into ebx as the table index
%if BACK
	mov	cl,[esi+INDEX+INCREMENT]
%else
	movzx	ebx,byte [esi+INDEX+INCREMENT]
%endif
;	xor	ebx,ebx
%if BACK1
	sub	cl,[esi+INDEX+INCREMENT-BACK1]
%endif
%if BACK2
	sub	cl,[esi+INDEX+INCREMENT-BACK2]
%if BACK1
	add	cl,[esi+INDEX+INCREMENT-BACK1-BACK2]
%endif
%endif
%if INCREMENT
	add	esi,INCREMENT
%endif
;	mov	bl,cl
%if BACK
	movzx	ebx,cl
%endif
	; cl = byte from the _shift table (used below as the shift count),
	; eax = dword from the _add_shifted table (its top cl bits are shifted in)
	mov	cl,[CHANNEL %+ _shift+ebx]
	mov	eax,[CHANNEL %+ _add_shifted + ebx*4]
	add	ch,cl
	jl	.nostore_ %+ INDEX	; ch still negative: edx has room for the whole code

	; edx fills up: shift in only the bits that still fit, write the dword out
	sub	cl,ch		; cl = number of bits that still fit into edx
	sub	ch,32		; one full dword is leaving the buffer
	shld	edx,eax,cl
	add	cl,ch		; cl = original cl - 32; equivalent, because shld only looks at the lower 5 bits of the count
	mov	[edi],edx
	add	edi,4

.nostore_ %+ INDEX
	; shift the code into the low end of edx
	shld	edx,eax,cl

%undef	CHANNEL
%undef	INDEX
%undef	BACK1
%undef	BACK2
%undef	INCREMENT
%undef	BACK
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro	HUFF_COMPRESS_END	1
;; Tail of a compression procedure: closes the loop named by the argument,
;; writes out any bits still pending in edx, returns the output pointer in
;; eax and restores the registers saved by HUFF_CODEC_PROC_START.
%define	LOOPNAME		%1

	; keep going until the source pointer reaches the limit held in ebp
	cmp	esi, ebp
	jne	LOOPNAME

	; ch == -32 means there are no pending bits in edx
	cmp	ch, -32
	jle	.flushed
	; left-justify the pending bits (shift count is -ch) and emit a final dword
	mov	cl, ch
	neg	cl
	shl	edx, cl
	mov	[edi], edx
	add	edi, 4
.flushed:

	; return value: address just past the last dword written
	mov	eax, edi

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp

	ret

%undef	LOOPNAME
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; HUFF_COMPRESS_PROC_YUV procname, uyvy, delta, decorrelate
;;
;; Emits one global compression procedure for 4-byte groups of packed YUV data.
;;   procname    - name of the generated global procedure
;;   uyvy        - 0 or 1; shifts the byte offsets so that encode1 is applied
;;                 to offsets 1 and 3 instead of 0 and 2 (and encode2/encode3
;;                 to offsets 0 and 2 instead of 1 and 3)
;;   delta       - 0 or 1; when 1 each byte is coded as the difference from
;;                 the byte 2 (encode1) or 4 (encode2/encode3) positions back
;;   decorrelate - scales the back2 distance of the encode3 byte
;;                 (every instantiation below passes 0)
%macro	HUFF_COMPRESS_PROC_YUV	4
%define	PROCNAME		%1
%define	UYVY			%2
%define	DELTA			%3
%define	DECORRELATE		%4

	GLOBAL	PROCNAME

;;unsigned long* __cdecl procname(
;;	[esp+ 4] unsigned char* src,
;;	[esp+ 8] unsigned long* dst,
;;	[esp+12] unsigned char* src_end);
;;
;; Returns the address just past the last dword written to dst.

PROCNAME:

	HUFF_CODEC_PROC_START
	; eax holds the first four source bytes; they are stored uncompressed
%if UYVY
	; bswap followed by a 16-bit rotate swaps the two bytes of each 16-bit half
	bswap	eax
	rol	eax,16
%endif
	mov	[edi],eax
	add	edi,4
	mov	ch,-32		; bit buffer is empty
	sub	ebp,4		; the loop reads 4 bytes ahead of esi, so stop at src_end-4
	align	32

.loop0
	; the first invocation also advances esi by 4
	HUFF_COMPRESS	encode1,0+UYVY,2*DELTA,0,4
	HUFF_COMPRESS	encode2,1-UYVY,4*DELTA,0,0
	HUFF_COMPRESS	encode1,2+UYVY,2*DELTA,0,0
	HUFF_COMPRESS	encode3,3-UYVY,4*DELTA,2*DECORRELATE,0
	HUFF_COMPRESS_END	.loop0

	; local label placed after the end of the procedure; nothing in this file refers to it
. %+ PROCNAME

%undef	PROCNAME
%undef	UYVY
%undef	DELTA
%undef	DECORRELATE
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; instantiations: name, uyvy, delta, decorrelate
HUFF_COMPRESS_PROC_YUV	asm_CompressYUY2,0,0,0
HUFF_COMPRESS_PROC_YUV	asm_CompressYUY2Delta,0,1,0
HUFF_COMPRESS_PROC_YUV	asm_CompressUYVY,1,0,0
HUFF_COMPRESS_PROC_YUV	asm_CompressUYVYDelta,1,1,0

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; HUFF_COMPRESS_PROC_RGB procname, rgba, decorrelate
;;
;; Emits one global compression procedure for packed 3- or 4-byte pixels.
;;   procname    - name of the generated global procedure
;;   rgba        - 0 for 3 bytes per pixel, 1 for 4 bytes per pixel
;;   decorrelate - when 1, byte 1 of each pixel is coded first and bytes 0
;;                 and 2 are additionally predicted from byte 1 (of the same
;;                 and of the previous pixel)
;; Every byte is coded as a difference from the same byte of the previous pixel.
%macro	HUFF_COMPRESS_PROC_RGB	3
%define	PROCNAME		%1
%define	RGBA			%2
%define	DECORRELATE		%3

	GLOBAL	PROCNAME

;;unsigned char* __cdecl procname(
;;	[esp+ 4] unsigned char* src,
;;	[esp+ 8] unsigned char* dst,
;;	[esp+12] unsigned char* src_end);
;;
;; Returns the address just past the last dword written to dst.

PROCNAME:

	HUFF_CODEC_PROC_START
	; eax holds the first four source bytes; the first pixel is stored uncompressed
%if 0 == RGBA
	shl	eax,8		; 3-byte pixels: move the three pixel bytes into bits 8..31
%endif
	mov	[edi],eax
	add	edi,4
	mov	ch,-32		; bit buffer is empty
	sub	ebp,3+RGBA	; the loop reads one pixel ahead of esi
	align	32

.loop0
	; the first invocation of each variant also advances esi by one pixel
%if DECORRELATE
	HUFF_COMPRESS	encode2,1,3+RGBA,0,3+RGBA
	HUFF_COMPRESS	encode1,0,3+RGBA,-1,0	; back2 = -1: also subtracts the byte that follows (byte 1)
	HUFF_COMPRESS	encode3,2,3+RGBA,1,0	; back2 = 1: also subtracts the byte that precedes (byte 1)
%else
	HUFF_COMPRESS	encode1,0,3+RGBA,0,3+RGBA
	HUFF_COMPRESS	encode2,1,3+RGBA,0,0
	HUFF_COMPRESS	encode3,2,3+RGBA,0,0
%endif
%if RGBA
	HUFF_COMPRESS	encode3,3,4,0,0		; fourth byte of the pixel
%endif
	HUFF_COMPRESS_END	.loop0

	; local label placed after the end of the procedure; nothing in this file refers to it
. %+ PROCNAME

%undef	PROCNAME
%undef	RGBA
%undef	DECORRELATE
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; instantiations: name, rgba, decorrelate
HUFF_COMPRESS_PROC_RGB	asm_CompressRGBDelta,0,0
HUFF_COMPRESS_PROC_RGB	asm_CompressRGBDeltaDecorrelate,0,1
HUFF_COMPRESS_PROC_RGB	asm_CompressRGBADelta,1,0
HUFF_COMPRESS_PROC_RGB	asm_CompressRGBADeltaDecorrelate,1,1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	GLOBAL	mmx_RowDiff

;void __cdecl mmx_RowDiff(
;	[esp+ 4] unsigned char* src,
;	[esp+ 8] unsigned char* dst,
;	[esp+12] unsigned char* src_end,
;	[esp+16] int stride);
;
; Copies the first stride bytes of src to dst (skipped when src == dst), then
; stores, for every remaining byte up to src_end, the source byte minus the
; source byte located stride bytes before it.  The difference pass runs
; backwards from the end of the buffer.  The MMX loops work on whole 16- and
; 32-byte blocks and run at least once, so they may touch bytes beyond the
; exact range.

mmx_RowDiff:

	push	ebp		; saved and restored, but not otherwise used here
	push	edi
	push	esi
	push	ebx

	mov	esi,[esp+4+16]
	mov	edi,[esp+8+16]
	mov	ecx,[esp+16+16]
	add	ecx,esi		; ecx = src + stride = end of the first row

	cmp	esi,edi
	je	.diff		; in place: the first row needs no copying

	; make sure we're 8-byte aligned
.loop0
	test	edi,7
	jz	.endloop0
	mov	al,[esi]
	inc	esi
	mov	[edi],al
	inc	edi
	jmp	.loop0
.endloop0

	; copy the (rest of the) first row
	; 16 bytes per iteration, until esi has reached or passed the end of the row
.loop1
	movq	mm0,[esi]
	movq	mm1,[esi+8]
	add	esi,16
	movq	[edi],mm0
	movq	[edi+8],mm1
	add	edi,16
	cmp	esi,ecx
	jb	.loop1

	; diff the remaining rows
.diff
	mov	esi,[esp+12+16]	; esi = src_end
	mov	ecx,[esp+4+16]
	mov	edi,[esp+8+16]
	mov	ebx,[esp+16+16]
	add	edi,esi
	sub	edi,ecx		; edi = dst + (src_end - src)
	add	ecx,ebx		; ecx = src + stride = lowest address to be diffed
	neg	ebx		; ebx = -stride, so [esi+ebx] is the byte one row up

	; align again (sigh...)
	; byte at a time, moving downwards, until edi is 8-byte aligned
.loop2
	test	edi,7
	jz	.endloop2
	mov	al,[esi-1]
	sub	al,[esi+ebx-1]
	dec	esi
	mov	[edi-1],al
	dec	edi
	jmp	.loop2
.endloop2

	mov	edx,32
	sub	esi,edx
	sub	edi,edx
	align	32
	; 32 bytes per iteration, moving downwards through memory
.loop3
	movq	mm3,[esi+24]
	movq	mm2,[esi+16]
	movq	mm6,[esi+ebx+16]
	psubb	mm3,[esi+ebx+24]	; 2
	psubb	mm2,mm6
	movq	mm1,[esi+8]
	movq	[edi+24],mm3		; 2
	movq	mm5,[esi+ebx+8]
	movq	mm0,[esi]
	movq	[edi+16],mm2		; 2
	psubb	mm1,mm5
	psubb	mm0,[esi+ebx]		; 2
	sub	esi,edx
	movq	[edi+8],mm1		; 2
	cmp	esi,ecx			; flags are consumed by the jae below
	movq	[edi],mm0		; 2
	lea	edi,[edi-32]		; lea (unlike sub) leaves the flags intact
	jae	.loop3

	; and more alignment
	; undo the last step of the loop, then finish byte by byte down to ecx
	add	esi,edx
	add	edi,edx
.loop4
	cmp	esi,ecx
	jbe	.endloop4
	mov	al,[esi-1]
	sub	al,[esi+ebx-1]
	dec	esi
	mov	[edi-1],al
	dec	edi
	jmp	.loop4
.endloop4

	emms
	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	retn

	; local label placed after the end of the procedure; nothing in this file refers to it
.mmx_RowDiff

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	GLOBAL	mmx_RowAccum

;void __cdecl mmx_RowAccum(
;	[esp+ 4] unsigned char* buf,
;	[esp+ 8] unsigned char* buf_end,
;	[esp+12] int stride);
;
; Adds to every byte from buf+stride onwards the byte located stride bytes
; before it, moving upwards through memory towards buf_end.  The 32-byte MMX
; loop runs at least once.

mmx_RowAccum:

	push	ebp
	push	esi
	push	ebx

	; esi = buf + stride (first byte to update), ebx = -stride
	mov	ebx, [esp+12+12]
	mov	esi, [esp+12+4]
	add	esi, ebx
	neg	ebx

	; byte at a time until esi is 8-byte aligned
.align_head:
	test	esi, 7
	jz	.head_done
	mov	al, [esi+ebx]
	add	[esi], al
	inc	esi
	jmp	.align_head
.head_done:

	; ecx = last address at which a whole 32-byte block still fits
	mov	ecx, [esp+12+8]
	sub	ecx, 32
	align	32
.block32:
	movq	mm0, [esi]
	movq	mm1, [esi+8]
	movq	mm5, [esi+ebx+24]
	paddb	mm0, [esi+ebx]
	movq	mm2, [esi+16]
	movq	mm4, [esi+ebx+16]
	paddb	mm1, [esi+ebx+8]
	movq	mm3, [esi+24]
	paddb	mm2, mm4
	movq	[esi], mm0
	paddb	mm3, mm5
	movq	[esi+8], mm1
	movq	[esi+16], mm2
	movq	[esi+24], mm3
	add	esi, 32
	cmp	esi, ecx
	jbe	.block32

	; whatever is left below buf_end is handled one byte at a time
	add	ecx, 32
.tail:
	cmp	esi, ecx
	jae	.tail_done
	mov	al, [esi+ebx]
	add	[esi], al
	inc	esi
	jmp	.tail
.tail_done:

	emms
	pop	ebx
	pop	esi
	pop	ebp
	ret

.mmx_RowAccum:

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; YUV_SHIFT mmb, mma, uyvy
;;
;; Builds in mmb the predictor bytes for the eight bytes held in mma: for each
;; byte, the previous byte of the same channel, taken from mma itself or from
;; the preceding quadword supplied in mmb.
;;   mmb  - in: the quadword preceding mma; out: the predictor bytes
;;   mma  - the current quadword; only read, never written
;;   uyvy - 0 or 1; selects the two shift counts (8/16 or 16/8)
%macro	YUV_SHIFT	3	; clobbers mm4,5
;define	MMB		%1
;define	MMA		%2
;define	UYVY		%3

; mma:mmb = ABCDEFGHIJKLMNOP (VYUYVYUY...) - backwards from mem order
;   we want mmb = EDGFIHKJ (prev pixel of same channel)

	; three rounds of byte interleaving
	movq	mm4,%1
	punpcklbw	%1,%2		; mm4:mmb = AIBJCKDLEMFNGOHP
	punpckhbw	mm4,%2
	movq	mm5,%1
	punpcklbw	%1,mm4		; mm5:mmb = AEIMBFJNCGKODHLP
	punpckhbw	mm5,mm4
	movq	mm4,%1
	punpcklbw	%1,mm5		; mm4:mmb = ACEGIKMOBDFHJLNP
	punpckhbw	mm4,mm5
	; shift each half by its own distance, then interleave the halves again
	psllq	%1,8+8* %3		; mm4:mmb = EGIKMO__DFHJLNP_
	psllq	mm4,16-8* %3
	punpckhbw	%1,mm4		; mmb = EDGFIHKJ (for YUY2; different for UYVY)

%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; MEDIAN_PREDICT_PROC procname, uyvy
;;
;; Emits one global procedure that writes prediction residuals to dst.
;; The first stride+8 bytes are predicted from the previous byte of the same
;; channel (as produced by YUV_SHIFT); all later bytes are predicted by the
;; median of left, above and left+above-aboveleft.
;;   procname - name of the generated global procedure
;;   uyvy     - 0 or 1; passed through to YUV_SHIFT
%macro	MEDIAN_PREDICT_PROC	2
%define	PROCNAME		%1
%define	UYVY			%2

	GLOBAL	PROCNAME

;void __cdecl mmx_MedianPredict(
;	[esp+ 4] unsigned char* src,
;	[esp+ 8] unsigned char* dst,
;	[esp+12] unsigned char* src_end,
;	[esp+16] int stride);

PROCNAME:

	push	ebp
	mov	ebp,esp
	push	edi
	push	esi
	push	ebx

	; do the first row
	; (arguments are at ebp+4+offset because of the saved ebp)
	mov	esi,[ebp+4+4]
	mov	edi,[ebp+8+4]
	mov	ebx,[ebp+16+4]
	lea	ecx,[ebx+esi+8]	; first loop ends 8 bytes into the second row
	neg	ebx		; ebx = -stride, so [esi+ebx] is the byte one row up

	pxor	mm2,mm2		; no previous quadword before the first one
	movq	mm3,[esi]	; for use in next loop

.loop0
	movq	mm0,[esi]
	YUV_SHIFT	mm2,mm0,UYVY	; mm2 = left predictor for mm0
	add	esi,8
	movq	mm1,mm0
	psubb	mm1,mm2
	movq	[edi],mm1
	movq	mm2,mm0		; current quadword becomes the previous one
	add	edi,8
	cmp	esi,ecx
	jb	.loop0

	mov	ecx,[ebp+8+4]	; recopy first group of four, just for consistency with other compression methods
	movd	[ecx],mm3

	; do the remaining rows
	mov	ecx,[ebp+12+4]
	; mm2,3 are already initialized from previous loop
	; (mm2 = last quadword processed; mm3 = first quadword of the first row,
	; which is the quadword up and to the left of the first one handled below)

	align	32

	; pixel arrangement:
	;    mm3 mm1
	;    mm2 mm0

.loop1
	; mm2,3 <- appropriate left and above-left pixels
	movq	mm0,[esi]
	movq	mm1,[esi+ebx]

	YUV_SHIFT	mm2,mm0,UYVY	; note: clobbers mm4,5
	add	esi,8
	YUV_SHIFT	mm3,mm1,UYVY

	; mm4 <- median of mm1,mm2,(mm1+mm2-mm3)

	movq	mm4,mm2		; (mm2,mm4) <- (min(mm1,mm2),max(mm1,mm2))
	movq	mm5,mm2		; mm5 <- mm1+mm2-mm3
	psubusb	mm4,mm1
	paddb	mm5,mm1
	psubb	mm2,mm4
	psubb	mm5,mm3
	paddb	mm4,mm1

	psubusb	mm2,mm5		; mm2 = max(mm2,mm5)
	paddb	mm2,mm5

	movq	mm5,mm4		; mm4 = min(mm2,mm4)
	psubusb	mm5,mm2
	psubb   mm4,mm5		; now mm4 = median

	; write out the result and loop
	movq	mm2,mm0		; current row quadword becomes the next "left"
	movq	mm3,mm1		; row-above quadword becomes the next "above-left"
	psubb	mm0,mm4
	movq	[edi],mm0
	cmp	esi,ecx		; flags are consumed by the jb below
	lea	edi,[edi+8]	; lea (unlike add) leaves the flags intact
	jb	.loop1

	emms
	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	retn

	; local label placed after the end of the procedure; nothing in this file refers to it
. %+ PROCNAME

%undef	PROCNAME
%undef	UYVY
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; instantiations: name, uyvy
MEDIAN_PREDICT_PROC	mmx_MedianPredictYUY2,0
MEDIAN_PREDICT_PROC	mmx_MedianPredictUYVY,1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; SALC (opcode D6h): al = 0FFh if the carry flag is set, 00h otherwise.
; Emitted as a raw byte; see http://www.df.lth.se/~john_e/gems/gem0004.html
%macro	salc	0
	db	0xD6
%endmacro

;; MEDIAN_RESTORE ofs1, ofs2, increment
;;
;; Undoes median prediction for one byte: computes the median of
;;   left       = [esi+ofs2]
;;   above      = [esi+ebx+ofs1]
;;   left + above - aboveleft, with aboveleft = [esi+ebx+ofs2]
;; and adds it to the byte at [esi+ofs1].  esi is advanced by increment
;; part-way through, which is why the final store subtracts it again.
;; ebx must hold the negated stride.
;; Each compare-and-swap uses sub (to set carry) + salc (mask in al) + and.
;; Clobbers al, ah, cl, ch, dh and the flags.
%macro	MEDIAN_RESTORE	3
%define	OFS1		%1
%define	OFS2		%2
%define	INCREMENT	%3

	mov	ah,[esi+OFS2]
	mov	ch,[esi+ebx+OFS1]
	mov	dh,[esi+ebx+OFS2]
	neg	dh		; compute ah+ch-dh
	mov	cl,ch		; (interleaved) exchange ah,ch if necessary so ah<ch
	add	dh,ch
	add	dh,ah
	sub	cl,ah		; carry set when ch < ah
	salc
	and	cl,al		; cl = ch-ah if a swap is needed, else 0
	sub	ch,cl
	add	ah,cl
	mov	cl,dh		; exchange ch,dh (but toss dh)
	sub	cl,ch		; carry set when dh < ch
	salc
%if INCREMENT
	add	esi,INCREMENT
%endif
	and	cl,al
	add	ch,cl		; ch = min(ch,dh)
	mov	cl,ch		; exchange ah,ch (but toss ah)
	sub	cl,ah		; carry set when ch < ah
	salc
	and	cl,al
	sub	ch,cl		; now ch = median
	add	[esi+OFS1-INCREMENT],ch

%undef	OFS1
%undef	OFS2
%undef	INCREMENT
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	GLOBAL	asm_MedianRestore

;void __cdecl asm_MedianRestore(
;	[esp+ 4] unsigned char* buf,
;	[esp+ 8] unsigned char* buf_end,
;	[esp+12] int stride);
;
; Reverses median prediction in place.  The bytes from buf+4 up to
; buf+stride+8 are rebuilt by adding an earlier byte of the same 4-byte
; pattern; everything after that, up to buf_end, is rebuilt with
; MEDIAN_RESTORE, four bytes per loop iteration.

asm_MedianRestore:

	push	ebp
	mov	ebp, esp
	push	esi
	push	edi
	push	ebx

	; arguments sit at ebp+4+offset because of the saved ebp
	mov	esi, [ebp+4+4]
	mov	ebx, [ebp+4+12]
	lea	edi, [esi+ebx+8]		; end of the left-predicted part
	add	esi, 4				; the first four bytes are left as they are
	neg	ebx				; ebx = -stride, as MEDIAN_RESTORE expects

	; first part: left prediction, one 4-byte group per iteration

.left_row:
	mov	al, [esi-2]
	mov	ah, [esi-3]
	mov	dl, [esi]
	mov	dh, [esi-1]
	add	dl, al
	add	[esi+1], ah
	mov	[esi], dl
	add	[esi+2], dl
	add	[esi+3], dh
	add	esi, 4
	cmp	esi, edi
	jb	.left_row

	; remaining part: median prediction up to buf_end

	mov	edi, [ebp+4+8]

	align	32
.median_rows:
	MEDIAN_RESTORE	0,-2,0
	MEDIAN_RESTORE	1,-3,2
	cmp	esi, edi
	jb	.median_rows

	pop	ebx
	pop	edi
	pop	esi
	pop	ebp
	ret

.asm_MedianRestore:

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; HUFF_DECOMPRESS decode_table, decode_shift_table, offs, back1, back2
;;
;; Decodes one byte from the bit stream and stores it at [edi+offs].
;;   decode_table       - table of dwords used as addresses, indexed by the
;;                        position of the highest set bit of the next 32 bits
;;   decode_shift_table - table of bytes indexed by the decoded value; the
;;                        entry is added to the bit position
;;   offs               - byte offset of the output byte from edi
;;   back1              - if nonzero, the byte back1 positions earlier in the
;;                        output is added
;;   back2              - if nonzero, the byte back2 positions earlier is
;;                        added; if back1 is nonzero as well, the byte
;;                        back1+back2 positions earlier is subtracted
;;
;; Registers: eax = bit position in the stream, esi = start of the stream,
;;            edi = output pointer.  Clobbers ebx, ecx, edx and the flags.
%macro	HUFF_DECOMPRESS		5
%define	DECODE_TABLE		%1
%define	DECODE_SHIFT_TABLE	%2
%define	OFFS			%3
%assign	BACK1			%4
%assign	BACK2			%5

	; edx = the next 32 bits of the stream, starting at bit position eax
	mov	ebx,eax
	mov	cl,al		; shld uses only the low 5 bits: position within the dword
	shr	ebx,5		; index of the dword containing the position
	mov	edx,[esi+ebx*4]
	mov	ebx,[esi+ebx*4+4]
	shld	edx,ebx,cl
	or	edx,1		; ensure that edx is non-zero
	bsr	ebx,edx		; ebx = index of the highest set bit
	btr	edx,ebx		; clear that bit
	mov	ebx,[DECODE_TABLE+ebx*4]	; address of the sub-table for this bit index
	mov	cl,[ebx]	; first byte of the sub-table is a shift count
	shr	edx,cl		; remaining bits become the index into the sub-table
;	xor	ecx,ecx
;	mov	cl,[ebx+edx+1]
;	xor	ebx,ebx
;	mov	bl,[DECODE_SHIFT_TABLE+ecx]
	movzx	ecx,byte [ebx+edx+1]	; decoded value
	movzx	ebx,byte [DECODE_SHIFT_TABLE+ecx]	; amount to advance the bit position by
%if BACK1
	add	cl,[edi+OFFS-BACK1]
%endif
	add	eax,ebx
%if BACK2
	add	cl,[edi+OFFS-BACK2]
%if BACK1
	sub	cl,[edi+OFFS-BACK1-BACK2]
%endif
%endif
	mov	[edi+OFFS],cl

%undef	DECODE_TABLE
%undef	DECODE_SHIFT_TABLE
%undef	OFFS
%undef	BACK1
%undef	BACK2
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; HUFF_DECOMPRESS_PROC_YUV procname, delta, decorrelate
;;
;; Emits one global decompression procedure producing 4 output bytes per
;; loop iteration.
;;   procname    - name of the generated global procedure
;;   delta       - 0 or 1; when 1 each decoded byte has the output byte
;;                 2 (decode1) or 4 (decode2/decode3) positions back added
;;   decorrelate - scales the back2 distance of the decode3 byte
;;                 (both instantiations in this file pass 0)
%macro	HUFF_DECOMPRESS_PROC_YUV	3
%define	PROCNAME			%1
%define	DELTA				%2
%define	DECORRELATE			%3

	GLOBAL	PROCNAME

;;void __cdecl procname(
;;	[esp+ 4] unsigned long* src,
;;	[esp+ 8] unsigned char* dst,
;;	[esp+12] unsigned char* dst_end);

PROCNAME:

	HUFF_CODEC_PROC_START
	; the first source dword is the first four output bytes, stored as is
	mov	[edi],eax
	add	edi,4
	mov	eax,32		; bit position: decoding starts right after the first dword

	align	32

.loop0
	HUFF_DECOMPRESS	decode1,decode1_shift,0,2*DELTA,0
	HUFF_DECOMPRESS	decode2,decode2_shift,1,4*DELTA,0
	HUFF_DECOMPRESS	decode1,decode1_shift,2,2*DELTA,0
	HUFF_DECOMPRESS	decode3,decode3_shift,3,4*DELTA,2*DECORRELATE

	add	edi,4
	cmp	edi,ebp		; ebp = dst_end
	jb	.loop0

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	retn

	; local label placed after the end of the procedure; nothing in this file refers to it
. %+ PROCNAME

%undef	PROCNAME
%undef	DELTA
%undef	DECORRELATE
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; HUFF_DECOMPRESS_PROC_RGB procname, src32, dst32, decorrelate
;;
;; Emits one global decompression procedure producing one pixel per loop
;; iteration.
;;   procname    - name of the generated global procedure
;;   src32       - 1 if the stream carries a fourth byte per pixel, else 0
;;   dst32       - 1 for 4 output bytes per pixel, 0 for 3
;;   decorrelate - when 1, byte 1 of each pixel is decoded first and bytes 0
;;                 and 2 are additionally predicted from byte 1 (of the same
;;                 and of the previous pixel)
;; Every byte has the same byte of the previous output pixel added to it.
%macro	HUFF_DECOMPRESS_PROC_RGB	4
%define	PROCNAME			%1
%define	SRC32				%2
%define	DST32				%3
%define	DECORRELATE			%4

	GLOBAL	PROCNAME

;;void __cdecl procname(
;;	[esp+ 4] unsigned long* src,
;;	[esp+ 8] unsigned char* dst,
;;	[esp+12] unsigned char* dst_end);

PROCNAME:

	HUFF_CODEC_PROC_START
	; the first source dword holds the first pixel uncompressed
%if 0 == SRC32
	shr	eax,8		; 3-byte source pixels: the pixel bytes are in bits 8..31
%endif
	mov	[edi],eax	; always writes 4 bytes, even when pixels are 3 bytes wide
	add	edi,DST32+3
	mov	eax,32		; bit position: decoding starts right after the first dword

	align	32

.loop0
%if DECORRELATE
	HUFF_DECOMPRESS	decode2,decode2_shift,1,DST32+3,0
	HUFF_DECOMPRESS	decode1,decode1_shift,0,DST32+3,-1	; back2 = -1: also adds the byte that follows (byte 1)
	HUFF_DECOMPRESS	decode3,decode3_shift,2,DST32+3,1	; back2 = 1: also adds the byte that precedes (byte 1)
%else
	HUFF_DECOMPRESS	decode1,decode1_shift,0,DST32+3,0
	HUFF_DECOMPRESS	decode2,decode2_shift,1,DST32+3,0
	HUFF_DECOMPRESS	decode3,decode3_shift,2,DST32+3,0
%endif
%if DST32
%if SRC32
	HUFF_DECOMPRESS	decode3,decode3_shift,3,4,0
%else
	mov	byte [edi+3],0	; no fourth byte in the stream: write zero
%endif
%endif

	add	edi,DST32+3
	cmp	edi,ebp		; ebp = dst_end
	jb	.loop0

	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	retn

	; local label placed after the end of the procedure; nothing in this file refers to it
. %+ PROCNAME

%undef	PROCNAME
%undef	SRC32
%undef	DST32
%undef	DECORRELATE
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; YUV instantiations: name, delta, decorrelate
HUFF_DECOMPRESS_PROC_YUV	asm_DecompressHFYU16,0,0
HUFF_DECOMPRESS_PROC_YUV	asm_DecompressHFYU16Delta,1,0
; RGB instantiations: name, src32, dst32, decorrelate
HUFF_DECOMPRESS_PROC_RGB	asm_DecompressHFYU24To24Delta,0,0,0
HUFF_DECOMPRESS_PROC_RGB	asm_DecompressHFYU24To24DeltaDecorrelate,0,0,1
HUFF_DECOMPRESS_PROC_RGB	asm_DecompressHFYU24To32Delta,0,1,0
HUFF_DECOMPRESS_PROC_RGB	asm_DecompressHFYU24To32DeltaDecorrelate,0,1,1
HUFF_DECOMPRESS_PROC_RGB	asm_DecompressHFYU32To32Delta,1,1,0
HUFF_DECOMPRESS_PROC_RGB	asm_DecompressHFYU32To32DeltaDecorrelate,1,1,1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	GLOBAL	asm_SwapFields

;void __cdecl asm_SwapFields(
;	[esp+ 4] unsigned char* buf,
;	[esp+ 8] unsigned char* buf_end,
;	[esp+12] int stride);
;
; Exchanges each row with the row that follows it, one pair of rows at a
; time, until buf_end is reached.  Rows are swapped stride/4 dwords at a time.

asm_SwapFields:

	push	ebp
	push	esi
	push	ebx

	; ebx = stride, ebp = buf_end, esi = buf (12 bytes of saved registers)
	mov	ebx, [esp+12+12]
	mov	ebp, [esp+12+8]
	mov	esi, [esp+12+4]

.next_pair:
	mov	ecx, ebx
	shr	ecx, 2			; dwords per row
.swap_dword:
	mov	eax, [esi]
	mov	edx, [esi+ebx]
	dec	ecx			; sets the zero flag tested by jnz below
	mov	[esi+ebx], eax
	mov	[esi], edx
	lea	esi, [esi+4]		; lea keeps the flags from dec intact
	jnz	.swap_dword
	add	esi, ebx		; skip the row that was just swapped in
	cmp	esi, ebp
	jb	.next_pair

	pop	ebx
	pop	esi
	pop	ebp
	ret

.asm_SwapFields:

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	align	8

; Constant table for the YUV -> RGB conversion below.  The conversion code
; addresses it through edx using the ofs_* offsets, so the order and the
; 8-byte size of every entry must stay in step with those definitions.
; Products are shifted right by 14 bits after use, i.e. the multipliers are
; fixed-point values scaled by 2^14 (the chroma inputs are pre-doubled).

yuv2rgb_constants:

x0000_0000_0010_0010	dw	0x0010,0x0010,0x0000,0x0000	; subtracted from the two luma words
x0080_0080_0080_0080	dw	0x0080,0x0080,0x0080,0x0080	; subtracted from the chroma words
x00FF_00FF_00FF_00FF	dw	0x00FF,0x00FF,0x00FF,0x00FF	; mask keeping the low byte of each word
x00002000_00002000	dd	0x00002000,0x00002000		; rounding term (half of 2^14)
cy			dw	0x4A85,0x0000,0x4A85,0x0000	; luma multiplier
crv			dw	0x0000,0x3313,0x0000,0x3313	; multiplier for the second chroma word (red)
cgu_cgv			dw	0xF377,0xE5FC,0xF377,0xE5FC	; multipliers for both chroma words (green)
cbu			dw	0x408D,0x0000,0x408D,0x0000	; multiplier for the first chroma word (blue)

%define	ofs_x0000_0000_0010_0010	0
%define	ofs_x0080_0080_0080_0080	8
%define	ofs_x00FF_00FF_00FF_00FF	16
%define	ofs_x00002000_00002000		24
%define	ofs_cy				32
%define	ofs_crv				40
%define	ofs_cgu_cgv			48
%define	ofs_cbu				56

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; GET_Y mma, uyvy
;; Reduces every 16-bit word of mma to one of its bytes, zero-extended:
;; the low byte when uyvy is 0, the high byte otherwise.
;; Expects edx to point at yuv2rgb_constants.
%macro	GET_Y	2

%if (%2) == 0
	pand		%1,[edx+ofs_x00FF_00FF_00FF_00FF]
%else
	psrlw		%1,8
%endif

%endmacro

	
;; GET_UV mma, uyvy
;; Counterpart of GET_Y: reduces every 16-bit word of mma to the byte that
;; GET_Y discards (the high byte when uyvy is 0, the low byte when it is 1).
;; Expects edx to point at yuv2rgb_constants.
%macro	GET_UV	2

%if 1- %2
	psrlw		%1,8
%else
	pand		%1,[edx+ofs_x00FF_00FF_00FF_00FF]
%endif

%endmacro

    
;; YUV2RGB_INNER_LOOP uyvy, rgb32, no_next_pixel
;;
;; Converts the 8 source bytes at esi into 12 (rgb32 = 0) or 16 (rgb32 = 1)
;; output bytes at edi, then advances esi by 8 and edi by the output size.
;;   uyvy          - 0 or 1; passed to GET_Y / GET_UV to pick the luma and
;;                   chroma bytes of each word
;;   rgb32         - 0 or 1; selects the output size and store sequence
;;   no_next_pixel - 0: the chroma of the following dword at esi+8 is read and
;;                   esi is compared with ecx at the end (flags left for the
;;                   caller); 1: the dword at esi+4 is used instead and no
;;                   comparison is made
;; Expects edx to point at yuv2rgb_constants.  Clobbers mm0-mm7 and the flags.
%macro	YUV2RGB_INNER_LOOP	3
%define	UYVY			%1
%define	RGB32			%2
%define	NO_NEXT_PIXEL		%3

;; This YUV422->RGB conversion code uses only four MMX registers per
;; source dword, so I convert two dwords in parallel.  Lines corresponding
;; to the "second pipe" are indented an extra space.  There's almost no
;; overlap, except at the end and in the two lines marked ***.

	movd		mm0,[esi]
	 movd		 mm5,[esi+4]
	movq		mm1,mm0
	GET_Y		mm0,UYVY	; mm0 = __________Y1__Y0
	 movq		 mm4,mm5
	GET_UV		mm1,UYVY	; mm1 = __________V0__U0
	 GET_Y		 mm4,UYVY
	movq		mm2,mm5		; *** avoid reload from [esi+4]
	 GET_UV		 mm5,UYVY
	psubw		mm0,[edx+ofs_x0000_0000_0010_0010]
	 movd		 mm6,[esi+8-4*(NO_NEXT_PIXEL)]
	GET_UV		mm2,UYVY	; mm2 = __________V2__U2
	 psubw		 mm4,[edx+ofs_x0000_0000_0010_0010]
	paddw		mm2,mm1		; sum of this and the next chroma pair
	 GET_UV		 mm6,UYVY
	psubw		mm1,[edx+ofs_x0080_0080_0080_0080]
	 paddw		 mm6,mm5
	psllq		mm2,32		; move the summed chroma into the upper two words
	 psubw		 mm5,[edx+ofs_x0080_0080_0080_0080]
	punpcklwd	mm0,mm2		; mm0 = ______Y1______Y0
	 psllq		 mm6,32
	pmaddwd		mm0,[edx+ofs_cy]	; mm0 scaled
	 punpcklwd	 mm4,mm6
	paddw		mm1,mm1
	 pmaddwd	 mm4,[edx+ofs_cy]
	 paddw		 mm5,mm5
	paddw		mm1,mm2		; mm1 = __V1__U1__V0__U0 * 2
	paddd		mm0,[edx+ofs_x00002000_00002000]
	 paddw		 mm5,mm6
	movq		mm2,mm1
	 paddd		 mm4,[edx+ofs_x00002000_00002000]
	movq		mm3,mm1
	 movq		 mm6,mm5
	pmaddwd		mm1,[edx+ofs_crv]
	 movq		 mm7,mm5
	paddd		mm1,mm0
	 pmaddwd	 mm5,[edx+ofs_crv]
	psrad		mm1,14		; mm1 = RRRRRRRRrrrrrrrr
	 paddd		 mm5,mm4
	pmaddwd		mm2,[edx+ofs_cgu_cgv]
	 psrad		 mm5,14
	paddd		mm2,mm0
	 pmaddwd	 mm6,[edx+ofs_cgu_cgv]
	psrad		mm2,14		; mm2 = GGGGGGGGgggggggg
	 paddd		 mm6,mm4
	pmaddwd		mm3,[edx+ofs_cbu]
	 psrad		 mm6,14
	paddd		mm3,mm0
	 pmaddwd	 mm7,[edx+ofs_cbu]
	; pointer updates; the stores below therefore use negative offsets from edi
       add	       esi,8
       add	       edi,12+4*RGB32
%if 0 == NO_NEXT_PIXEL
	; sets the flags for the caller's loop branch; nothing after this line modifies them
       cmp	       esi,ecx
%endif
	psrad		mm3,14		; mm3 = BBBBBBBBbbbbbbbb
	 paddd		 mm7,mm4
	pxor		mm0,mm0
	 psrad		 mm7,14
	packssdw	mm3,mm2	; mm3 = GGGGggggBBBBbbbb
	 packssdw	 mm7,mm6
	packssdw	mm1,mm0	; mm1 = ________RRRRrrrr
	 packssdw	 mm5,mm0	; *** avoid pxor mm4,mm4
	movq		mm2,mm3
	 movq		 mm6,mm7
	punpcklwd	mm2,mm1	; mm2 = RRRRBBBBrrrrbbbb
	 punpcklwd	 mm6,mm5
	punpckhwd	mm3,mm1	; mm3 = ____GGGG____gggg
	 punpckhwd	 mm7,mm5
	movq		mm0,mm2
	 movq		 mm4,mm6
	punpcklwd	mm0,mm3	; mm0 = ____rrrrggggbbbb
	 punpcklwd	 mm4,mm7
%if 0 == RGB32
	psllq		mm0,16
	 psllq		 mm4,16
%endif
	punpckhwd	mm2,mm3	; mm2 = ____RRRRGGGGBBBB
	 punpckhwd	 mm6,mm7
	packuswb	mm0,mm2	; mm0 = __RRGGBB__rrggbb <- ta dah!
	 packuswb	 mm4,mm6

%if RGB32
	movd	[edi-16],mm0	; store the quadwords independently
	 movd	 [edi-8],mm4	; (and in pieces since we may not be aligned)
	psrlq	mm0,32
	 psrlq	 mm4,32
	movd	[edi-12],mm0
	 movd	 [edi-4],mm4
%else
	psrlq	mm0,8		; pack the two quadwords into 12 bytes
	psllq	mm4,8		; (note: the two shifts above leave
	movd	[edi-12],mm0	; mm0,4 = __RRGGBBrrggbb__)
	psrlq	mm0,32
	por	mm4,mm0
	movd	[edi-8],mm4
	psrlq	mm4,32
	movd	[edi-4],mm4
%endif

%undef	UYVY
%undef	RGB32
%undef	NO_NEXT_PIXEL
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; YUV2RGB_PROC procname, uyvy, rgb32
;;
;; Emits one global conversion procedure built around YUV2RGB_INNER_LOOP.
;; Source rows are taken from the last row to the first while the output is
;; written in ascending order.
;;   procname - name of the generated global procedure
;;   uyvy     - 0 or 1; passed to YUV2RGB_INNER_LOOP
;;   rgb32    - 0 or 1; passed to YUV2RGB_INNER_LOOP
%macro	YUV2RGB_PROC	3
%define	PROCNAME	%1
;define	UYVY		%2
;define	RGB32		%3

	GLOBAL	PROCNAME

;;void __cdecl procname(
;;	[esp+ 4] const unsigned char* src,
;;	[esp+ 8] unsigned char* dst,
;;	[esp+12] const unsigned char* src_end,
;;	[esp+16] int stride);

PROCNAME:

	push	esi
	push	edi

	; two pushes: arguments are 8 bytes further from esp
	mov	eax,[esp+16+8]		; eax = stride
	mov	esi,[esp+12+8]		; read source bottom-up
	mov	edi,[esp+8+8]
	mov	edx, yuv2rgb_constants	; base address for the ofs_* constants

.loop0
	lea	ecx,[esi-8]		; ecx = end of this row minus the final 8-byte group
	sub	esi,eax			; esi = start of this row

	align 32
.loop1
	YUV2RGB_INNER_LOOP	%2,%3,0
	jb	.loop1			; uses the flags of the cmp esi,ecx inside the macro

	; final group of the row: must not read the dword beyond the row
	YUV2RGB_INNER_LOOP	%2,%3,1

	sub	esi,eax			; back to the start of this row = end of the row before it
	cmp	esi,[esp+4+8]		; continue while above src
	ja	.loop0

	emms
	pop	edi
	pop	esi
	retn

	; local label placed after the end of the procedure; nothing in this file refers to it
. %+ PROCNAME

%undef	PROCNAME
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; instantiations: name, uyvy, rgb32
YUV2RGB_PROC	mmx_YUY2toRGB24,0,0
YUV2RGB_PROC	mmx_YUY2toRGB32,0,1
YUV2RGB_PROC	mmx_UYVYtoRGB24,1,0
YUV2RGB_PROC	mmx_UYVYtoRGB32,1,1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
