;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; asmblit.asm
;
; Copy Bitmap to DirectDraw Surface
;
; for MSVC and GCC, use 'COFF' format:
;    nasmw -f win32 asmblit.asm
;
; for BCC32, use 'OMF' format:
;    nasmw -f obj asmblit.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ; All code and data in this file is assembled for 32-bit mode.
        BITS 32

%ifdef SUPPORT_32BPP
        ; Constants for the 32bpp-source converters.  Each mask is stored
        ; twice (two dwords = one qword) so that blitcore can load it with a
        ; single movq and apply it to two 32-bit pixels at once.
        SECTION .data USE32 CLASS=DATA
        ALIGN 32
mask_r:   dd 0x000000f8, 0x000000f8     ; top 5 bits of source byte 0
mask_g15: dd 0x0000f800, 0x0000f800     ; top 5 bits of source byte 1
mask_g16: dd 0x0000fc00, 0x0000fc00     ; top 6 bits of source byte 1
mask_b:   dd 0x00f80000, 0x00f80000     ; top 5 bits of source byte 2

; Right-shift counts applied after masking to pack the three fields into a
; 15-bit value (fields end up at bits 0-4, 5-9 and 10-14).
%define shift_r15 3
%define shift_g15 6
%define shift_b15 9

; Right-shift counts for a 16-bit value (fields at bits 0-4, 5-10, 11-15).
; NOTE(review): no user of the *16 shifts is visible in this part of the
; file; presumably consumed by 32to16 macros further down.
%define shift_r16 3
%define shift_g16 5
%define shift_b16 8
%endif

        ; Code section.  The two externals are supplied by the caller's
        ; module:
        ;   _palette_lookup - dword variable; blitcore loads its value into
        ;                     ebx and the line macros index it as a table of
        ;                     dwords ([ebx + index * 4]).
        ;   _brightmask     - 8-byte mask loaded with movq and ANDed in after
        ;                     every 1-bit right shift of the brightness macros.
        SECTION .text USE32 CLASS=CODE
        EXTERN _palette_lookup
        EXTERN _brightmask

;-----------------------------------------------------------------------------
; blit core
;-----------------------------------------------------------------------------

;
; blitcore srcbpp, dstbpp, xscale, yscale, scanline, brightness, mmx,
;          wshift, nearjump
;
; Emits one global function named
;     _asmblit_<srcbpp>to<dstbpp>_<xscale>x<yscale>_sl<scanline>_bl<brightness>_mmx<mmx>
;
; Stack arguments of the generated function (first argument at [esp + 4] on
; entry, function returns with a plain "ret"):
;     pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight
; nSrcLen / nDstLen are added to pSrc / pDst after every source row.
; The argument slots pSrc, pDst, nWidth and (for scanline == 1 with
; brightness == 0) nDstLen are modified in place on the stack.
;
; Registers: ebp, esi, edi (and ebx when regnum == 4) are saved and
; restored; eax, ecx, edx, flags and the MMX registers are clobbered.
; Inside the loops: esi = source cursor, edi = destination cursor,
; ecx = remaining inner iterations, ebp = remaining rows, and for
; non-32bpp sources ebx = value loaded from _palette_lookup.
;
; Macro parameters:
;   scanline   2 selects the EXPANDLINE* row macro; otherwise xscale == 2
;              selects DOUBLELINE* and anything else selects LINE*.
;   brightness passed to the row macro for the dimmed passes.
;   mmx        selects the MMX0 / MMX1 / MMX2 family of row macros.
;   wshift     nWidth is shifted right by this amount to obtain the number
;              of row-macro expansions per row.
;   nearjump   0: both loop branches short; 1: row loop near, inner loops
;              short; 2: both near.  Any other value is rejected.
;
; Combinations for which no dimmed code exists (mmx == 0, or 24bpp output
; with scanline == 2) and brightness != 0 emit only a jump to the matching
; brightness-0 function.
;
; A zero nHeight, or an nWidth that is zero after the wshift shift, returns
; immediately without touching memory; the dec/jnz loops would otherwise
; start from a zero counter and wrap around.
;
%macro blitcore 9

%define srcbpp     %1
%define dstbpp     %2
%define xscale     %3
%define yscale     %4
%define scanline   %5
%define brightness %6
%define mmx        %7
%define wshift     %8
%define nearjump   %9

%if (srcbpp == 32 && (dstbpp >= 24 || mmx != 0))
%define regnum 3
%else
%define regnum 4
%endif

%if (nearjump == 0)
%define range1 short
%define range2 short
%elif (nearjump == 1)
%define range1 near
%define range2 short
%elif (nearjump == 2)
%define range1 near
%define range2 near
%else
%error "blitcore: nearjump must be 0, 1 or 2"
%endif

%if (scanline == 2)
    %define linefunc EXPANDLINE%1to%2MMX%7
%elif (xscale == 2)
    %define linefunc DOUBLELINE%1to%2MMX%7
%else
    %define linefunc LINE%1to%2MMX%7
%endif

; Argument slots, valid once the regnum registers have been pushed.
%define pSrc     dword [esp + (regnum * 4) +  4]
%define pDst     dword [esp + (regnum * 4) +  8]
%define nSrcLen  dword [esp + (regnum * 4) + 12]
%define nDstLen  dword [esp + (regnum * 4) + 16]
%define nWidth   dword [esp + (regnum * 4) + 20]
%define nHeight  dword [esp + (regnum * 4) + 24]

;--------------------------------------------------------------

        GLOBAL _asmblit_%1to%2_%3x%4_sl%5_bl%6_mmx%7
        ALIGN 32
_asmblit_%1to%2_%3x%4_sl%5_bl%6_mmx%7:

%if ((mmx == 0 || (scanline == 2 && dstbpp == 24)) && brightness != 0)

        ; No dimmed implementation for this combination: tail-jump to the
        ; brightness-0 variant, which sees the same stack arguments.
        jmp         near _asmblit_%1to%2_%3x%4_sl%5_bl0_mmx%7

%else
        push        ebp
        push        esi
        push        edi
%if (regnum == 4)
        push        ebx
%endif

        mov         ebp, nHeight
%if (scanline == 1 && brightness == 0)
        ; Only one destination row is written per source row, so step the
        ; destination by two rows each time.
        shl         nDstLen, 1
%endif
%if (wshift != 0)
        shr         nWidth, wshift
%endif

        ; Empty blit: leave before entering loops that count down from 0.
        test        ebp, ebp
        jz          near L4@%1%2%3%4%5%6%7
        cmp         nWidth, 0
        je          near L4@%1%2%3%4%5%6%7

%if (srcbpp != 32)
        mov         ebx, [_palette_lookup]
%endif
%if (mmx != 0)
  %if (dstbpp == 16 && scanline == 2 && brightness == 0)
        ; mm2 = 0 is interleaved with the pixels by VSCANLINE_BRIGHTNESS.
        pxor        mm2, mm2
  %endif
  %if (srcbpp == 32)
    %if (dstbpp == 15)
        movq        mm5, [mask_r]
        movq        mm6, [mask_g15]
        movq        mm7, [mask_b]
    %elif (dstbpp == 16)
        movq        mm5, [mask_r]
        movq        mm6, [mask_g16]
        movq        mm7, [mask_b]
    %endif
  %endif
%endif
%if (brightness != 0)
  %if (dstbpp == 15 || dstbpp == 16)
        movq        mm4, [_brightmask]
  %elif (dstbpp == 24 || dstbpp == 32)
        movq        mm6, [_brightmask]
  %endif
%endif

        ; ---- per source row ----
        ALIGN 16
L1@%1%2%3%4%5%6%7:
        mov         esi, pSrc
        mov         edi, pDst
        mov         ecx, nWidth

        ; ---- first destination row ----
        ALIGN 16
L2@%1%2%3%4%5%6%7:
%if (scanline == 2 && brightness != 0)
        linefunc %6
%else
        linefunc 0
%endif
        dec         ecx
        jnz         range2 L2@%1%2%3%4%5%6%7

%if ((yscale == 2 && scanline != 1) || (scanline == 1 && brightness != 0))
        ; ---- second destination row from the same source row ----
        mov         eax, nDstLen
        mov         esi, pSrc
        add         pDst, eax
        mov         edi, pDst
        mov         ecx, nWidth

        ALIGN 16
L3@%1%2%3%4%5%6%7:
        linefunc %6
        dec         ecx
        jnz         range2 L3@%1%2%3%4%5%6%7
%endif
        mov         eax, nSrcLen
        mov         edx, nDstLen
        add         pSrc, eax
        add         pDst, edx
        dec         ebp
        jnz         range1 L1@%1%2%3%4%5%6%7

L4@%1%2%3%4%5%6%7:
%if (regnum == 4)
        pop         ebx
%endif
        pop         edi
        pop         esi
        pop         ebp
%if (mmx > 0)
        emms
%endif
        ret

%endif

;--------------------------------------------------------------

%undef pSrc
%undef pDst
%undef nSrcLen
%undef nDstLen
%undef nWidth
%undef nHeight

%undef srcbpp
%undef dstbpp
%undef xscale
%undef yscale
%undef scanline
%undef brightness
%undef mmx
%undef wshift
%undef nearjump
%undef linefunc
%undef regnum
%undef range1
%undef range2
%endmacro

;------------------------------------------------------------------
; Scanline Brightness
;------------------------------------------------------------------

;
; HSCANLINE_BRIGHTNESS level, reg, tmp, mask
;
; Dims the packed pixels in MMX register "reg" in place.
;   level 25 : reg = reg/4          (two masked 1-bit right shifts)
;   level 50 : reg = reg/2          (one masked 1-bit right shift)
;   level 75 : reg = reg/2 + reg/4  ("tmp" receives the quarter value)
;   other    : emits nothing
; "mask" is ANDed in after every shift.  The 75 case reads dstbpp (defined
; by blitcore) to choose word-wise or byte-wise addition.
;
%macro HSCANLINE_BRIGHTNESS 4
%if (%1 == 25 || %1 == 50 || %1 == 75)
        ; every supported level starts from the half-brightness value
        psrlq       %2, 1
        pand        %2, %4
  %if (%1 == 75)
        ; add a quarter on top of the half
        movq        %3, %2
        psrlq       %3, 1
        pand        %3, %4
    %if (dstbpp >= 24)
        paddb       %2, %3
    %else
        paddw       %2, %3
    %endif
  %elif (%1 == 25)
        ; halve once more
        psrlq       %2, 1
        pand        %2, %4
  %endif
%endif
%endmacro

;
; VSCANLINE_BRIGHTNESS level, reg, tmp1, tmp2, mask
;
; Interleaves the pixels in the low half of "reg" with a dimmed copy of
; themselves, so every pixel is followed by its dimmed twin (word-wise for
; dstbpp < 24, dword-wise otherwise).
;   level 0  : only for 16 -> 16, interleaves "reg" with "tmp1" as it is on
;              entry; emits nothing for other formats.
;   level 25 : twin = pixel/4
;   level 50 : twin = pixel/2
;   level 75 : twin = pixel/2 + pixel/4  (uses "tmp2")
; "mask" is ANDed in after every 1-bit right shift.  srcbpp / dstbpp are the
; defines set up by blitcore.
;
%macro VSCANLINE_BRIGHTNESS 5
%if (%1 == 0)
  %if (srcbpp == 16 && dstbpp == 16)
        punpcklwd   %2, %3
  %endif
%elif (%1 == 25 || %1 == 50 || %1 == 75)
        ; tmp1 = half-brightness copy
        movq        %3, %2
        psrlq       %3, 1
        pand        %3, %5
  %if (%1 == 75)
        ; tmp1 = half + quarter
        movq        %4, %3
        psrlq       %4, 1
        pand        %4, %5
    %if (dstbpp < 24)
        paddw       %3, %4
    %else
        paddb       %3, %4
    %endif
  %elif (%1 == 25)
        ; tmp1 = quarter
        psrlq       %3, 1
        pand        %3, %5
  %endif
        ; pixel, twin, pixel, twin ...
  %if (dstbpp < 24)
        punpcklwd   %2, %3
  %else
        punpckldq   %2, %3
  %endif
%endif
%endmacro


;------------------------------------------------------------------
; 16to16
;------------------------------------------------------------------

; LINE16to16MMX0: 1:1 copy step without MMX.
; Reads four 16-bit source values (8 bytes at esi), looks each one up in the
; dword table at ebx and writes four 16-bit results (8 bytes at edi).
; For each pair the low word of the first table entry and the high word of
; the second entry are merged into one dword.
; NOTE(review): this presumably relies on every table entry holding the
; same 16-bit colour in both halves - confirm against the table builder.
; Clobbers eax, edx, flags; advances esi and edi by 8.  Parameter 1 unused.
%macro LINE16to16MMX0 1
        mov         eax, [esi]              ; source values 0 and 1
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        and         eax, 0x0000ffff
        and         edx, 0xffff0000
        or          eax, edx
        mov         [edi], eax
        mov         eax, [esi + 4]          ; source values 2 and 3
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        and         eax, 0x0000ffff
        and         edx, 0xffff0000
        or          eax, edx
        mov         [edi + 4], eax
        add         esi, 8
        add         edi, 8
%endmacro

; DOUBLELINE16to16MMX0: horizontal-doubling step without MMX.
; Reads two 16-bit source values (4 bytes at esi) and stores the complete
; dword table entry of each one, i.e. 8 bytes at edi.
; NOTE(review): the doubling presumably comes from each table entry holding
; the colour in both 16-bit halves - confirm.
; Clobbers eax, edx, flags; esi += 4, edi += 8.  Parameter 1 unused.
%macro DOUBLELINE16to16MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        mov         [edi], eax
        mov         [edi + 4], edx
        add         esi, 4
        add         edi, 8
%endmacro

; EXPANDLINE16to16MMX0: vertical-scanline step without MMX.
; Reads two 16-bit source values (4 bytes at esi).  For each one the low
; word of its table entry is zero-extended to a dword and stored, so every
; output pixel is followed by a zero pixel (8 bytes at edi).
; Clobbers eax, edx, flags; esi += 4, edi += 8.  Parameter 1 unused.
%macro EXPANDLINE16to16MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movzx       eax, word [ebx + eax * 4]
        movzx       edx, word [ebx + edx * 4]
        mov         [edi], eax
        mov         [edi + 4], edx
        add         esi, 4
        add         edi, 8
%endmacro

; LINE16to16MMX1 level: 1:1 copy step using MMX.
; Reads four 16-bit source values (8 bytes at esi), gathers the low words
; of their four table entries into mm0, optionally dims them with
; HSCANLINE_BRIGHTNESS (level = parameter 1, mask in mm4) and stores
; 8 bytes at edi.
; Clobbers eax, edx, flags, mm0-mm2; esi += 8, edi += 8.
%macro LINE16to16MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpcklwd   mm0, mm1                ; low dword of mm0 = pixels 0, 1
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm1, [ebx + eax * 4]
        movd        mm2, [ebx + edx * 4]
        punpcklwd   mm1, mm2                ; low dword of mm1 = pixels 2, 3
        punpckldq   mm0, mm1                ; mm0 = pixels 0..3
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; DOUBLELINE16to16MMX1 level: horizontal-doubling step using MMX.
; Reads two 16-bit source values (4 bytes at esi), places the complete
; dword table entries of both side by side in mm0, optionally dims them
; (level = parameter 1, mask in mm4) and stores 8 bytes at edi.
; NOTE(review): as in the MMX0 variant, doubling presumably depends on the
; table entry carrying the colour in both halves - confirm.
; Clobbers eax, edx, flags, mm0, mm1; esi += 4, edi += 8.
%macro DOUBLELINE16to16MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpckldq   mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        movq        [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; EXPANDLINE16to16MMX1 level: vertical-scanline step using MMX.
; Reads two 16-bit source values (4 bytes at esi), packs the low words of
; their table entries into mm0 and lets VSCANLINE_BRIGHTNESS interleave
; them with their dimmed twins (level = parameter 1, mask in mm4).
; With level 0 the twins are taken from mm2, which blitcore zeroes for the
; 16bpp / scanline 2 / brightness 0 case.  Stores 8 bytes at edi.
; Clobbers eax, edx, flags, mm0, mm1 and, for non-zero levels, mm2/mm3;
; esi += 4, edi += 8.
%macro EXPANDLINE16to16MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpcklwd   mm0, mm1
        VSCANLINE_BRIGHTNESS %1, mm0, mm2, mm3, mm4
        movq        [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; LINE16to16MMX2 level: same data flow as LINE16to16MMX1, with prefetchnta
; hints on the source and a non-temporal movntq store to the destination.
; Four 16-bit source values in, 8 bytes out.
; Clobbers eax, edx, flags, mm0-mm2; esi += 8, edi += 8.
%macro LINE16to16MMX2 1
        mov         eax, [esi]
        prefetchnta [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpcklwd   mm0, mm1
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm1, [ebx + eax * 4]
        movd        mm2, [ebx + edx * 4]
        punpcklwd   mm1, mm2
        punpckldq   mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        prefetchnta [esi + 8]
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; DOUBLELINE16to16MMX2 level: same data flow as DOUBLELINE16to16MMX1, with
; a prefetchnta hint for the next source dword and a non-temporal store.
; Two 16-bit source values in, 8 bytes out.
; Clobbers eax, edx, flags, mm0, mm1; esi += 4, edi += 8.
%macro DOUBLELINE16to16MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpckldq   mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        prefetchnta [esi + 4]
        movntq      [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; EXPANDLINE16to16MMX2 level: same data flow as EXPANDLINE16to16MMX1, with
; a prefetchnta hint for the next source dword and a non-temporal store.
; Two 16-bit source values in, 8 bytes out (pixel, twin, pixel, twin).
; Clobbers eax, edx, flags, mm0, mm1 and, for non-zero levels, mm2/mm3;
; esi += 4, edi += 8.
%macro EXPANDLINE16to16MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpcklwd   mm0, mm1
        VSCANLINE_BRIGHTNESS %1, mm0, mm2, mm3, mm4
        prefetchnta [esi + 4]
        movntq      [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; Instantiations of the 16 -> 16 blitters, one exported function per row.
; Columns follow the blitcore parameter order:
;   sb = srcbpp, db = dstbpp, xs = xscale, ys = yscale, sl = scanline,
;   bl = brightness, mmx = row-macro family, ws = wshift (log2 of the source
;   values consumed per row-macro expansion), nj = nearjump.
; Rows marked "dummy" hit blitcore's forwarding case and consist of a single
; jump to the matching bl = 0 function.
;         sb db xs ys sl  bl mmx ws nj
blitcore 16, 16, 1, 1, 0,  0,  0, 2, 0
blitcore 16, 16, 1, 2, 0,  0,  0, 2, 1
blitcore 16, 16, 1, 2, 1,  0,  0, 2, 0
blitcore 16, 16, 1, 2, 1, 25,  0, 2, 0 ; dummy
blitcore 16, 16, 1, 2, 1, 50,  0, 2, 0 ; dummy
blitcore 16, 16, 1, 2, 1, 75,  0, 2, 0 ; dummy
blitcore 16, 16, 2, 1, 0,  0,  0, 1, 0
blitcore 16, 16, 2, 1, 2,  0,  0, 1, 0
blitcore 16, 16, 2, 1, 2, 25,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 1, 2, 50,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 1, 2, 75,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 2, 0,  0,  0, 1, 1
blitcore 16, 16, 2, 2, 1,  0,  0, 1, 0
blitcore 16, 16, 2, 2, 1, 25,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 2, 1, 50,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 2, 1, 75,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 2, 2,  0,  0, 1, 1
blitcore 16, 16, 2, 2, 2, 25,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 2, 2, 50,  0, 1, 0 ; dummy
blitcore 16, 16, 2, 2, 2, 75,  0, 1, 0 ; dummy

; MMX1 family
blitcore 16, 16, 1, 1, 0,  0,  1, 2, 0
blitcore 16, 16, 1, 2, 0,  0,  1, 2, 1
blitcore 16, 16, 1, 2, 1,  0,  1, 2, 0
blitcore 16, 16, 1, 2, 1, 25,  1, 2, 1
blitcore 16, 16, 1, 2, 1, 50,  1, 2, 1
blitcore 16, 16, 1, 2, 1, 75,  1, 2, 1
blitcore 16, 16, 2, 1, 0,  0,  1, 1, 0
blitcore 16, 16, 2, 1, 2,  0,  1, 1, 0
blitcore 16, 16, 2, 1, 2, 25,  1, 1, 1
blitcore 16, 16, 2, 1, 2, 50,  1, 1, 1
blitcore 16, 16, 2, 1, 2, 75,  1, 1, 1
blitcore 16, 16, 2, 2, 0,  0,  1, 1, 1
blitcore 16, 16, 2, 2, 1,  0,  1, 1, 0
blitcore 16, 16, 2, 2, 1, 25,  1, 1, 1
blitcore 16, 16, 2, 2, 1, 50,  1, 1, 1
blitcore 16, 16, 2, 2, 1, 75,  1, 1, 1
blitcore 16, 16, 2, 2, 2,  0,  1, 1, 1
blitcore 16, 16, 2, 2, 2, 25,  1, 1, 1
blitcore 16, 16, 2, 2, 2, 50,  1, 1, 1
blitcore 16, 16, 2, 2, 2, 75,  1, 1, 1

; MMX2 family (prefetchnta / movntq)
blitcore 16, 16, 1, 1, 0,  0,  2, 2, 1
blitcore 16, 16, 1, 2, 0,  0,  2, 2, 1
blitcore 16, 16, 1, 2, 1,  0,  2, 2, 1
blitcore 16, 16, 1, 2, 1, 25,  2, 2, 1
blitcore 16, 16, 1, 2, 1, 50,  2, 2, 1
blitcore 16, 16, 1, 2, 1, 75,  2, 2, 1
blitcore 16, 16, 2, 1, 0,  0,  2, 1, 0
blitcore 16, 16, 2, 1, 2,  0,  2, 1, 0
blitcore 16, 16, 2, 1, 2, 25,  2, 1, 1
blitcore 16, 16, 2, 1, 2, 50,  2, 1, 1
blitcore 16, 16, 2, 1, 2, 75,  2, 1, 1
blitcore 16, 16, 2, 2, 0,  0,  2, 1, 1
blitcore 16, 16, 2, 2, 1,  0,  2, 1, 0
blitcore 16, 16, 2, 2, 1, 25,  2, 1, 1
blitcore 16, 16, 2, 2, 1, 50,  2, 1, 1
blitcore 16, 16, 2, 2, 1, 75,  2, 1, 1
blitcore 16, 16, 2, 2, 2,  0,  2, 1, 1
blitcore 16, 16, 2, 2, 2, 25,  2, 1, 1
blitcore 16, 16, 2, 2, 2, 50,  2, 1, 1
blitcore 16, 16, 2, 2, 2, 75,  2, 1, 1

;------------------------------------------------------------------
; 16to24
;------------------------------------------------------------------

; LINE16to24MMX0: 1:1 copy step without MMX, 3 bytes per output pixel.
; Reads two 16-bit source values (4 bytes at esi), looks each one up in the
; dword table at ebx and writes two 3-byte pixels (6 bytes at edi).
; The second pixel is stored as word + byte so that exactly 6 bytes are
; written; a dword store at edi + 3 would spill one byte past the pixel
; pair and, on the last pair of a row, past the end of the row.
; Clobbers eax, edx, flags; esi += 4, edi += 6.  Parameter 1 unused.
%macro LINE16to24MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        mov         [edi], eax              ; bytes 0-3; byte 3 is replaced below
        mov         [edi + 3], dx           ; second pixel, bytes 3-4
        shr         edx, 16
        mov         [edi + 5], dl           ; second pixel, byte 5
        add         esi, 4
        add         edi, 6
%endmacro

; DOUBLELINE16to24MMX0: horizontal-doubling step without MMX, 3 bytes per
; output pixel.
; Reads two 16-bit source values (4 bytes at esi) and writes each looked-up
; pixel twice (12 bytes at edi).  Overlapping dword stores are used for the
; first three pixels (the stray fourth byte is overwritten by the next
; store); the last pixel is stored as word + byte so that nothing is
; written beyond the 12 bytes this step owns.
; Clobbers eax, edx, flags; esi += 4, edi += 12.  Parameter 1 unused.
%macro DOUBLELINE16to24MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        mov         [edi], eax
        mov         [edi + 3], eax
        mov         [edi + 6], edx
        mov         [edi + 9], dx           ; last pixel, bytes 9-10
        shr         edx, 16
        mov         [edi + 11], dl          ; last pixel, byte 11
        add         esi, 4
        add         edi, 12
%endmacro

; EXPANDLINE16to24MMX0: vertical-scanline step without MMX, 3 bytes per
; output pixel.
; Reads two 16-bit source values (4 bytes at esi) and stores their dword
; table entries at edi and edi + 6, i.e. at every other 3-byte pixel slot of
; the 12 bytes this step advances over.
; NOTE(review): bytes 4-5 and 10-11 of each step are never written and
; bytes 3 and 9 receive the top byte of the table entry, whereas the MMX
; variants store the whole 24-byte group.  Presumably the surface is
; expected to be cleared beforehand and the entry's top byte to be zero -
; confirm.
; Clobbers eax, edx, flags; esi += 4, edi += 12.  Parameter 1 unused.
%macro EXPANDLINE16to24MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        mov         [edi], eax
        mov         [edi + 6], edx
        add         esi, 4
        add         edi, 12
%endmacro

; LINE16to24MMX1 level: 1:1 copy step using MMX, 3 bytes per output pixel.
; Reads eight 16-bit source values (16 bytes at esi), looks them up in the
; dword table at ebx and packs the eight results at 3-byte spacing into
; three quadwords (24 bytes at edi).  Pixels that straddle a quadword
; boundary are kept in mm3 / mm2 and their remaining bytes shifted into the
; next quadword.  Each quadword is optionally dimmed with
; HSCANLINE_BRIGHTNESS (level = parameter 1, mask in mm6) before the store.
; NOTE(review): the OR-based packing presumably requires the top byte of
; every table entry to be zero - confirm.
; Clobbers eax, edx, flags, mm0-mm3; esi += 16, edi += 24.
%macro LINE16to24MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]    ; pixel 0
        movd        mm1, [ebx + edx * 4]    ; pixel 1
        psllq       mm1, 24
        por         mm0, mm1
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm1, [ebx + eax * 4]    ; pixel 2
        movd        mm2, [ebx + edx * 4]    ; pixel 3
        movq        mm3, mm1                ; keep pixel 2 for the next quadword
        psllq       mm1, 48
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi], mm0              ; output bytes 0-7
        psrlq       mm3, 16                 ; last byte of pixel 2
        psllq       mm2, 8
        por         mm2, mm3
        mov         eax, [esi + 8]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]    ; pixel 4
        movd        mm1, [ebx + edx * 4]    ; pixel 5
        psllq       mm0, 32
        por         mm0, mm2
        movq        mm2, mm1                ; keep pixel 5 for the next quadword
        psllq       mm1, 56
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi + 8], mm0          ; output bytes 8-15
        mov         eax, [esi + 12]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]    ; pixel 6
        movd        mm1, [ebx + edx * 4]    ; pixel 7
        psllq       mm0, 16
        psrlq       mm2, 8                  ; last two bytes of pixel 5
        por         mm0, mm2
        psllq       mm1, 40
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi + 16], mm0         ; output bytes 16-23
        add         esi, 16
        add         edi, 24
%endmacro

; DOUBLELINE16to24MMX1 level: horizontal-doubling step using MMX, 3 bytes
; per output pixel.
; Reads four 16-bit source values (8 bytes at esi), looks them up in the
; dword table at ebx and writes each result twice at 3-byte spacing
; (24 bytes at edi, three quadwords).  Each quadword is optionally dimmed
; with HSCANLINE_BRIGHTNESS (level = parameter 1, mask in mm6).
; NOTE(review): the OR-based packing presumably requires the top byte of
; every table entry to be zero - confirm.
; Clobbers eax, edx, flags, mm0-mm3; esi += 8, edi += 24.
%macro DOUBLELINE16to24MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]    ; pixel 0
        movd        mm1, [ebx + edx * 4]    ; pixel 1
        movq        mm2, mm0
        movq        mm3, mm1
        psllq       mm2, 24
        por         mm0, mm2                ; pixel 0 twice
        psllq       mm1, 48
        por         mm0, mm1                ; + first two bytes of pixel 1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi], mm0              ; output bytes 0-7
        movq        mm2, mm3
        psllq       mm2, 8
        psrlq       mm3, 16
        por         mm2, mm3                ; rest of pixel 1 + its second copy
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]    ; pixel 2
        movd        mm1, [ebx + edx * 4]    ; pixel 3
        movq        mm3, mm0
        psllq       mm0, 32
        por         mm0, mm2
        movq        mm2, mm3
        psllq       mm3, 56
        por         mm0, mm3
        HSCANLINE_BRIGHTNESS %1, mm0, mm3, mm6
        movq        [edi + 8], mm0          ; output bytes 8-15
        psrlq       mm2, 8                  ; rest of second copy of pixel 2
        movq        mm0, mm1
        psllq       mm0, 16
        por         mm0, mm2
        psllq       mm1, 40
        por         mm0, mm1                ; pixel 3 twice
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi + 16], mm0         ; output bytes 16-23
        add         esi, 8
        add         edi, 24
%endmacro

; EXPANDLINE16to24MMX1: vertical-scanline step using MMX, 3 bytes per
; output pixel.
; Reads four 16-bit source values (8 bytes at esi), looks them up in the
; dword table at ebx and places the results at output byte offsets 0, 6, 12
; and 18 of a 24-byte group; all three quadwords are stored in full.
; Parameter 1 is not used: blitcore forwards every non-zero brightness of
; the 24bpp / scanline 2 combination to the brightness-0 function.
; NOTE(review): the 3-byte slots in between come out as zero only if the
; top byte of every table entry is zero - confirm.
; Clobbers eax, edx, flags, mm0-mm2; esi += 8, edi += 24.
%macro EXPANDLINE16to24MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]    ; pixel 0 -> bytes 0-2
        movd        mm1, [ebx + edx * 4]    ; pixel 1 -> bytes 6-8
        movq        mm2, mm1
        psllq       mm1, 48
        por         mm0, mm1
        movq        [edi], mm0
        psrlq       mm2, 16                 ; last byte of pixel 1
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]    ; pixel 2 -> bytes 12-14
        movd        mm1, [ebx + edx * 4]    ; pixel 3 -> bytes 18-20
        psllq       mm0, 32
        por         mm0, mm2
        movq        [edi + 8], mm0
        psllq       mm1, 16
        movq        [edi + 16], mm1
        add         esi, 8
        add         edi, 24
%endmacro

; LINE16to24MMX2 level: same data flow as LINE16to24MMX1 (eight 16-bit
; source values in, 24 bytes out at 3-byte spacing), with prefetchnta hints
; on the source and non-temporal movntq stores to the destination.
; Brightness level = parameter 1, mask in mm6.
; Clobbers eax, edx, flags, mm0-mm3; esi += 16, edi += 24.
%macro LINE16to24MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        prefetchnta [esi + 4]
        psllq       mm1, 24
        por         mm0, mm1
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm1, [ebx + eax * 4]
        movd        mm2, [ebx + edx * 4]
        movq        mm3, mm1
        psllq       mm1, 48
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        prefetchnta [esi + 8]
        movntq      [edi], mm0              ; output bytes 0-7
        psrlq       mm3, 16
        psllq       mm2, 8
        por         mm2, mm3
        mov         eax, [esi + 8]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        psllq       mm0, 32
        por         mm0, mm2
        movq        mm2, mm1
        psllq       mm1, 56
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        prefetchnta [esi + 12]
        movntq      [edi + 8], mm0          ; output bytes 8-15
        mov         eax, [esi + 12]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        psllq       mm0, 16
        psrlq       mm2, 8
        por         mm0, mm2
        psllq       mm1, 40
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        prefetchnta [esi + 16]
        movntq      [edi + 16], mm0         ; output bytes 16-23
        add         esi, 16
        add         edi, 24
%endmacro

; DOUBLELINE16to24MMX2 level: same data flow as DOUBLELINE16to24MMX1 (four
; 16-bit source values in, each result written twice, 24 bytes out), with
; prefetchnta hints on the source and non-temporal movntq stores.
; Brightness level = parameter 1, mask in mm6.
; Clobbers eax, edx, flags, mm0-mm3; esi += 8, edi += 24.
%macro DOUBLELINE16to24MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        movq        mm2, mm0
        movq        mm3, mm1
        psllq       mm2, 24
        por         mm0, mm2
        psllq       mm1, 48
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        prefetchnta [esi + 4]
        movntq      [edi], mm0              ; output bytes 0-7
        movq        mm2, mm3
        psllq       mm2, 8
        psrlq       mm3, 16
        por         mm2, mm3
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        movq        mm3, mm0
        psllq       mm0, 32
        por         mm0, mm2
        movq        mm2, mm3
        psllq       mm3, 56
        por         mm0, mm3
        HSCANLINE_BRIGHTNESS %1, mm0, mm3, mm6
        movntq      [edi + 8], mm0          ; output bytes 8-15
        psrlq       mm2, 8
        movq        mm0, mm1
        psllq       mm0, 16
        por         mm0, mm2
        psllq       mm1, 40
        por         mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        prefetchnta [esi + 8]
        movntq      [edi + 16], mm0         ; output bytes 16-23
        add         esi, 8
        add         edi, 24
%endmacro

; EXPANDLINE16to24MMX2: same data flow as EXPANDLINE16to24MMX1 (four 16-bit
; source values in, results at output byte offsets 0, 6, 12 and 18 of a
; 24-byte group), with prefetchnta hints on the source and non-temporal
; movntq stores.  Parameter 1 is not used.
; Clobbers eax, edx, flags, mm0-mm2; esi += 8, edi += 24.
%macro EXPANDLINE16to24MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        prefetchnta [esi + 4]
        movq        mm2, mm1
        psllq       mm1, 48
        por         mm0, mm1
        movntq      [edi], mm0
        psrlq       mm2, 16
        mov         eax, [esi + 4]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        psllq       mm0, 32
        por         mm0, mm2
        movntq      [edi + 8], mm0
        psllq       mm1, 16
        prefetchnta [esi + 8]
        movntq      [edi + 16], mm1
        add         esi, 8
        add         edi, 24
%endmacro

; Instantiations of the 16 -> 24 blitters, one exported function per row.
; Columns follow the blitcore parameter order:
;   sb = srcbpp, db = dstbpp, xs = xscale, ys = yscale, sl = scanline,
;   bl = brightness, mmx = row-macro family, ws = wshift (log2 of the source
;   values consumed per row-macro expansion), nj = nearjump.
; Rows marked "dummy" hit blitcore's forwarding case (mmx == 0, or 24bpp
; with sl == 2, combined with bl != 0) and consist of a single jump to the
; matching bl = 0 function.
;         sb db xs ys sl  bl mmx ws nj
blitcore 16, 24, 1, 1, 0,  0,  0, 1, 0
blitcore 16, 24, 1, 2, 0,  0,  0, 1, 1
blitcore 16, 24, 1, 2, 1,  0,  0, 1, 0
blitcore 16, 24, 1, 2, 1, 25,  0, 1, 0 ; dummy
blitcore 16, 24, 1, 2, 1, 50,  0, 1, 0 ; dummy
blitcore 16, 24, 1, 2, 1, 75,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 1, 0,  0,  0, 1, 0
blitcore 16, 24, 2, 1, 2,  0,  0, 1, 0
blitcore 16, 24, 2, 1, 2, 25,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 1, 2, 50,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 1, 2, 75,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 2, 0,  0,  0, 1, 1
blitcore 16, 24, 2, 2, 1,  0,  0, 1, 0
blitcore 16, 24, 2, 2, 1, 25,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 2, 1, 50,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 2, 1, 75,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 2, 2,  0,  0, 1, 1
blitcore 16, 24, 2, 2, 2, 25,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 2, 2, 50,  0, 1, 0 ; dummy
blitcore 16, 24, 2, 2, 2, 75,  0, 1, 0 ; dummy

; MMX1 family
blitcore 16, 24, 1, 1, 0,  0,  1, 3, 2
blitcore 16, 24, 1, 2, 0,  0,  1, 3, 2
blitcore 16, 24, 1, 2, 1,  0,  1, 3, 2
blitcore 16, 24, 1, 2, 1, 25,  1, 3, 2
blitcore 16, 24, 1, 2, 1, 50,  1, 3, 2
blitcore 16, 24, 1, 2, 1, 75,  1, 3, 2
blitcore 16, 24, 2, 1, 0,  0,  1, 2, 2
blitcore 16, 24, 2, 1, 2,  0,  1, 2, 1
blitcore 16, 24, 2, 1, 2, 25,  1, 2, 0 ; dummy
blitcore 16, 24, 2, 1, 2, 50,  1, 2, 0 ; dummy
blitcore 16, 24, 2, 1, 2, 75,  1, 2, 0 ; dummy
blitcore 16, 24, 2, 2, 0,  0,  1, 2, 2
blitcore 16, 24, 2, 2, 1,  0,  1, 2, 2
blitcore 16, 24, 2, 2, 1, 25,  1, 2, 2
blitcore 16, 24, 2, 2, 1, 50,  1, 2, 2
blitcore 16, 24, 2, 2, 1, 75,  1, 2, 2
blitcore 16, 24, 2, 2, 2,  0,  1, 2, 1
blitcore 16, 24, 2, 2, 2, 25,  1, 2, 0 ; dummy
blitcore 16, 24, 2, 2, 2, 50,  1, 2, 0 ; dummy
blitcore 16, 24, 2, 2, 2, 75,  1, 2, 0 ; dummy

; MMX2 family (prefetchnta / movntq)
blitcore 16, 24, 1, 1, 0,  0,  2, 3, 2
blitcore 16, 24, 1, 2, 0,  0,  2, 3, 2
blitcore 16, 24, 1, 2, 1,  0,  2, 3, 2
blitcore 16, 24, 1, 2, 1, 25,  2, 3, 2
blitcore 16, 24, 1, 2, 1, 50,  2, 3, 2
blitcore 16, 24, 1, 2, 1, 75,  2, 3, 2
blitcore 16, 24, 2, 1, 0,  0,  2, 2, 2
blitcore 16, 24, 2, 1, 2,  0,  2, 2, 1
blitcore 16, 24, 2, 1, 2, 25,  2, 2, 0 ; dummy
blitcore 16, 24, 2, 1, 2, 50,  2, 2, 0 ; dummy
blitcore 16, 24, 2, 1, 2, 75,  2, 2, 0 ; dummy
blitcore 16, 24, 2, 2, 0,  0,  2, 2, 2
blitcore 16, 24, 2, 2, 1,  0,  2, 2, 2
blitcore 16, 24, 2, 2, 1, 25,  2, 2, 2
blitcore 16, 24, 2, 2, 1, 50,  2, 2, 2
blitcore 16, 24, 2, 2, 1, 75,  2, 2, 2
blitcore 16, 24, 2, 2, 2,  0,  2, 2, 1
blitcore 16, 24, 2, 2, 2, 25,  2, 2, 0 ; dummy
blitcore 16, 24, 2, 2, 2, 50,  2, 2, 0 ; dummy
blitcore 16, 24, 2, 2, 2, 75,  2, 2, 0 ; dummy

;------------------------------------------------------------------
; 16to32
;------------------------------------------------------------------

; LINE16to32MMX0: 1:1 copy step without MMX, 4 bytes per output pixel.
; Reads two 16-bit source values (4 bytes at esi) and stores the dword
; table entry of each one (8 bytes at edi).
; Clobbers eax, edx, flags; esi += 4, edi += 8.  Parameter 1 unused.
%macro LINE16to32MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        mov         [edi], eax
        mov         [edi + 4], edx
        add         esi, 4
        add         edi, 8
%endmacro

; DOUBLELINE16to32MMX0: horizontal-doubling step without MMX, 4 bytes per
; output pixel.
; Reads two 16-bit source values (4 bytes at esi) and stores the dword
; table entry of each one twice (16 bytes at edi).
; Clobbers eax, edx, flags; esi += 4, edi += 16.  Parameter 1 unused.
%macro DOUBLELINE16to32MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        mov         [edi], eax
        mov         [edi + 4], eax
        mov         [edi + 8], edx
        mov         [edi + 12], edx
        add         esi, 4
        add         edi, 16
%endmacro

; EXPANDLINE16to32MMX0: vertical-scanline step without MMX, 4 bytes per
; output pixel.
; Reads two 16-bit source values (4 bytes at esi) and stores their dword
; table entries at edi and edi + 8, i.e. into every other pixel slot of the
; 16 bytes this step advances over.
; NOTE(review): the slots at edi + 4 and edi + 12 are never written here,
; whereas the MMX variants store zero (or a dimmed twin) there.  Presumably
; the surface is expected to be cleared beforehand - confirm.
; Clobbers eax, edx, flags; esi += 4, edi += 16.  Parameter 1 unused.
%macro EXPANDLINE16to32MMX0 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        mov         eax, [ebx + eax * 4]
        mov         edx, [ebx + edx * 4]
        mov         [edi], eax
        mov         [edi + 8], edx
        add         esi, 4
        add         edi, 16
%endmacro

; LINE16to32MMX1 level: 1:1 copy step using MMX, 4 bytes per output pixel.
; Reads two 16-bit source values (4 bytes at esi), combines their dword
; table entries in mm0, optionally dims them with HSCANLINE_BRIGHTNESS
; (level = parameter 1, mask in mm6) and stores 8 bytes at edi.
; Clobbers eax, edx, flags, mm0, mm1; esi += 4, edi += 8.
%macro LINE16to32MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpckldq   mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; DOUBLELINE16to32MMX1 level: horizontal-doubling step using MMX, 4 bytes
; per output pixel.
; Reads two 16-bit source values (4 bytes at esi), duplicates each dword
; table entry into both halves of its own MMX register, optionally dims
; both registers (level = parameter 1, mask in mm6, mm2 as scratch) and
; stores 16 bytes at edi.
; Clobbers eax, edx, flags, mm0, mm1 and, for level 75, mm2;
; esi += 4, edi += 16.
%macro DOUBLELINE16to32MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpckldq   mm0, mm0
        punpckldq   mm1, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm2, mm6
        HSCANLINE_BRIGHTNESS %1, mm1, mm2, mm6
        movq        [edi], mm0
        movq        [edi + 8], mm1
        add         esi, 4
        add         edi, 16
%endmacro

; EXPANDLINE16to32MMX1 level: vertical-scanline step using MMX, 4 bytes per
; output pixel.
; Reads two 16-bit source values (4 bytes at esi).  Each dword table entry
; is loaded with movd (upper half of the register zero) and then paired by
; VSCANLINE_BRIGHTNESS with its dimmed twin (level = parameter 1, mask in
; mm6); with level 0 the macro emits nothing for this format, so the twin
; slot stays zero.  Stores 16 bytes at edi.
; Clobbers eax, edx, flags, mm0, mm1 and, for non-zero levels, mm2/mm3;
; esi += 4, edi += 16.
%macro EXPANDLINE16to32MMX1 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        VSCANLINE_BRIGHTNESS %1, mm0, mm2, mm3, mm6
        VSCANLINE_BRIGHTNESS %1, mm1, mm2, mm3, mm6
        movq        [edi], mm0
        movq        [edi + 8], mm1
        add         esi, 4
        add         edi, 16
%endmacro

; LINE16to32MMX2 level: same data flow as LINE16to32MMX1 (two 16-bit source
; values in, 8 bytes out), with a prefetchnta hint for the next source
; dword and a non-temporal movntq store.
; Clobbers eax, edx, flags, mm0, mm1; esi += 4, edi += 8.
%macro LINE16to32MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpckldq   mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        prefetchnta [esi + 4]
        movntq      [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; DOUBLELINE16to32MMX2 level: same data flow as DOUBLELINE16to32MMX1 (two
; 16-bit source values in, each result written twice, 16 bytes out), with a
; prefetchnta hint for the next source dword and non-temporal stores.
; Clobbers eax, edx, flags, mm0, mm1 and, for level 75, mm2;
; esi += 4, edi += 16.
%macro DOUBLELINE16to32MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        punpckldq   mm0, mm0
        punpckldq   mm1, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm2, mm6
        HSCANLINE_BRIGHTNESS %1, mm1, mm2, mm6
        prefetchnta [esi + 4]
        movntq      [edi], mm0
        movntq      [edi + 8], mm1
        add         esi, 4
        add         edi, 16
%endmacro

; EXPANDLINE16to32MMX2 level: same data flow as EXPANDLINE16to32MMX1 (two
; 16-bit source values in, pixel + twin pairs, 16 bytes out), with a
; prefetchnta hint for the next source dword and non-temporal stores.
; Clobbers eax, edx, flags, mm0, mm1 and, for non-zero levels, mm2/mm3;
; esi += 4, edi += 16.
%macro EXPANDLINE16to32MMX2 1
        mov         eax, [esi]
        mov         edx, eax
        and         eax, 0x0000ffff
        shr         edx, 16
        movd        mm0, [ebx + eax * 4]
        movd        mm1, [ebx + edx * 4]
        VSCANLINE_BRIGHTNESS %1, mm0, mm2, mm3, mm6
        VSCANLINE_BRIGHTNESS %1, mm1, mm2, mm3, mm6
        prefetchnta [esi + 4]
        movntq      [edi], mm0
        movntq      [edi + 8], mm1
        add         esi, 4
        add         edi, 16
%endmacro

; Instantiations of the 16 -> 32 blitters, one exported function per row.
; Columns follow the blitcore parameter order:
;   sb = srcbpp, db = dstbpp, xs = xscale, ys = yscale, sl = scanline,
;   bl = brightness, mmx = row-macro family, ws = wshift (log2 of the source
;   values consumed per row-macro expansion), nj = nearjump.
; Rows marked "dummy" hit blitcore's forwarding case and consist of a single
; jump to the matching bl = 0 function.
;         sb db xs ys sl  bl mmx ws nj
blitcore 16, 32, 1, 1, 0,  0,  0, 1, 0
blitcore 16, 32, 1, 2, 0,  0,  0, 1, 1
blitcore 16, 32, 1, 2, 1,  0,  0, 1, 0
blitcore 16, 32, 1, 2, 1, 25,  0, 1, 0 ; dummy
blitcore 16, 32, 1, 2, 1, 50,  0, 1, 0 ; dummy
blitcore 16, 32, 1, 2, 1, 75,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 1, 0,  0,  0, 1, 0
blitcore 16, 32, 2, 1, 2,  0,  0, 1, 0
blitcore 16, 32, 2, 1, 2, 25,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 1, 2, 50,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 1, 2, 75,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 2, 0,  0,  0, 1, 1
blitcore 16, 32, 2, 2, 1,  0,  0, 1, 0
blitcore 16, 32, 2, 2, 1, 25,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 2, 1, 50,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 2, 1, 75,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 2, 2,  0,  0, 1, 1
blitcore 16, 32, 2, 2, 2, 25,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 2, 2, 50,  0, 1, 0 ; dummy
blitcore 16, 32, 2, 2, 2, 75,  0, 1, 0 ; dummy

; MMX1 family
blitcore 16, 32, 1, 1, 0,  0,  1, 1, 0
blitcore 16, 32, 1, 2, 0,  0,  1, 1, 1
blitcore 16, 32, 1, 2, 1,  0,  1, 1, 0
blitcore 16, 32, 1, 2, 1, 25,  1, 1, 1
blitcore 16, 32, 1, 2, 1, 50,  1, 1, 1
blitcore 16, 32, 1, 2, 1, 75,  1, 1, 1
blitcore 16, 32, 2, 1, 0,  0,  1, 1, 0
blitcore 16, 32, 2, 1, 2,  0,  1, 1, 0
blitcore 16, 32, 2, 1, 2, 25,  1, 1, 1
blitcore 16, 32, 2, 1, 2, 50,  1, 1, 1
blitcore 16, 32, 2, 1, 2, 75,  1, 1, 1
blitcore 16, 32, 2, 2, 0,  0,  1, 1, 1
blitcore 16, 32, 2, 2, 1,  0,  1, 1, 0
blitcore 16, 32, 2, 2, 1, 25,  1, 1, 1
blitcore 16, 32, 2, 2, 1, 50,  1, 1, 1
blitcore 16, 32, 2, 2, 1, 75,  1, 1, 1
blitcore 16, 32, 2, 2, 2,  0,  1, 1, 1
blitcore 16, 32, 2, 2, 2, 25,  1, 1, 1
blitcore 16, 32, 2, 2, 2, 50,  1, 1, 1
blitcore 16, 32, 2, 2, 2, 75,  1, 1, 1

; MMX2 family (prefetchnta / movntq)
blitcore 16, 32, 1, 1, 0,  0,  2, 1, 0
blitcore 16, 32, 1, 2, 0,  0,  2, 1, 1
blitcore 16, 32, 1, 2, 1,  0,  2, 1, 0
blitcore 16, 32, 1, 2, 1, 25,  2, 1, 1
blitcore 16, 32, 1, 2, 1, 50,  2, 1, 1
blitcore 16, 32, 1, 2, 1, 75,  2, 1, 1
blitcore 16, 32, 2, 1, 0,  0,  2, 1, 0
blitcore 16, 32, 2, 1, 2,  0,  2, 1, 0
blitcore 16, 32, 2, 1, 2, 25,  2, 1, 1
blitcore 16, 32, 2, 1, 2, 50,  2, 1, 1
blitcore 16, 32, 2, 1, 2, 75,  2, 1, 1
blitcore 16, 32, 2, 2, 0,  0,  2, 1, 1
blitcore 16, 32, 2, 2, 1,  0,  2, 1, 0
blitcore 16, 32, 2, 2, 1, 25,  2, 1, 1
blitcore 16, 32, 2, 2, 1, 50,  2, 1, 1
blitcore 16, 32, 2, 2, 1, 75,  2, 1, 1
blitcore 16, 32, 2, 2, 2,  0,  2, 1, 1
blitcore 16, 32, 2, 2, 2, 25,  2, 1, 1
blitcore 16, 32, 2, 2, 2, 50,  2, 1, 1
blitcore 16, 32, 2, 2, 2, 75,  2, 1, 1

%ifdef SUPPORT_32BPP
;------------------------------------------------------------------
; 32to15
;------------------------------------------------------------------

; LINE32to15MMX0: convert one 32bpp pixel at [esi] into one 15bpp pixel
; stored as a word at [edi], using integer registers only.
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax, edx and flags; advances esi by 4 and edi by 2.
; The source dword is re-read from memory for each channel.
%macro LINE32to15MMX0 1
        mov         eax, [esi]
        and         eax, 0x000000f8         ; keep bits 7..3
        shr         eax, shift_r15          ; -> bits 4..0
        mov         edx, eax
        mov         eax, [esi]
        and         eax, 0x0000f800         ; keep bits 15..11
        shr         eax, shift_g15          ; -> bits 9..5
        or          edx, eax
        mov         eax, [esi]
        and         eax, 0x00f80000         ; keep bits 23..19
        shr         eax, shift_b15          ; -> bits 14..10
        or          eax, edx                ; eax = packed 15-bit pixel
        mov         [edi], ax
        add         esi, 4
        add         edi, 2
%endmacro

; DOUBLELINE32to15MMX0: convert one 32bpp pixel at [esi] to 15bpp and
; store it twice (low and high word of one dword) at [edi].
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax, ebx, edx and flags; advances esi by 4 and edi by 4.
%macro DOUBLELINE32to15MMX0 1
        mov         eax, [esi]
        mov         ebx, eax                ; ebx = untouched copy of the source pixel
        and         eax, 0x000000f8         ; keep bits 7..3
        shr         eax, shift_r15          ; -> bits 4..0
        mov         edx, eax
        mov         eax, ebx
        and         eax, 0x0000f800         ; keep bits 15..11
        shr         eax, shift_g15          ; -> bits 9..5
        or          edx, eax
        mov         eax, ebx
        and         eax, 0x00f80000         ; keep bits 23..19
        shr         eax, shift_b15          ; -> bits 14..10
        or          eax, edx                ; eax = packed 15-bit pixel
        mov         edx, eax
        shl         eax, 16
        or          eax, edx                ; same pixel in both 16-bit halves
        mov         [edi], eax
        add         esi, 4
        add         edi, 4
%endmacro

; EXPANDLINE32to15MMX0: convert one 32bpp pixel at [esi] to 15bpp and
; store it as a full dword at [edi]: the pixel lands in the low word and
; the high word is zero (the packed value never exceeds bit 14).
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax, ebx, edx and flags; advances esi by 4 and edi by 4.
%macro EXPANDLINE32to15MMX0 1
        mov         eax, [esi]
        mov         ebx, eax                ; ebx = untouched copy of the source pixel
        and         eax, 0x000000f8         ; keep bits 7..3
        shr         eax, shift_r15          ; -> bits 4..0
        mov         edx, eax
        mov         eax, ebx
        and         eax, 0x0000f800         ; keep bits 15..11
        shr         eax, shift_g15          ; -> bits 9..5
        or          edx, eax
        mov         eax, ebx
        and         eax, 0x00f80000         ; keep bits 23..19
        shr         eax, shift_b15          ; -> bits 14..10
        or          eax, edx
        mov         [edi], eax              ; pixel followed by a zero word
        add         esi, 4
        add         edi, 4
%endmacro

; LINE32to15MMX1: convert four 32bpp pixels at [esi .. esi+15] into four
; 15bpp pixels (8 bytes) stored at [edi].
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 are presumably preloaded with the mask_r /
; mask_g15 / mask_b constants and mm4 with the brightness mask by the
; caller (blitcore) - not visible here, confirm.
; Clobbers mm0-mm3; advances esi by 16 and edi by 8.
%macro LINE32to15MMX1 1
        ; pixels 0 and 1: one packed pixel in the low word of each dword
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r15
        pand        mm1, mm6
        psrlq       mm1, shift_g15
        pand        mm2, mm7
        psrlq       mm2, shift_b15
        por         mm0, mm1
        por         mm0, mm2
        ; bring pixel 1 down next to pixel 0: low dword = p0 | p1 << 16
        movq        mm1, mm0
        psrlq       mm1, 16
        por         mm0, mm1
        ; pixels 2 and 3, same sequence, result in the low dword of mm1
        movq        mm1, [esi + 8]
        movq        mm2, mm1
        movq        mm3, mm1
        pand        mm1, mm5
        psrlq       mm1, shift_r15
        pand        mm2, mm6
        psrlq       mm2, shift_g15
        pand        mm3, mm7
        psrlq       mm3, shift_b15
        por         mm1, mm2
        por         mm1, mm3
        movq        mm2, mm1
        psrlq       mm2, 16
        por         mm1, mm2
        ; combine the two low dwords: mm0 = pixels 0..3
        punpckldq   mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        movq        [edi], mm0
        add         esi, 16
        add         edi, 8
%endmacro

; DOUBLELINE32to15MMX1: convert two 32bpp pixels at [esi] to 15bpp and
; store each one twice (four output pixels, 8 bytes) at [edi].
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro DOUBLELINE32to15MMX1 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r15
        pand        mm1, mm6
        psrlq       mm1, shift_g15
        pand        mm2, mm7
        psrlq       mm2, shift_b15
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        movq        mm1, mm0
        psllq       mm0, 16                 ; copy of each pixel in the high word
        por         mm0, mm1                ; each dword = pixel | pixel << 16
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; EXPANDLINE32to15MMX1: convert two 32bpp pixels at [esi] to 15bpp,
; leaving each in the low word of its dword, pass the result through
; VSCANLINE_BRIGHTNESS (defined elsewhere in the file) and store 8 bytes
; at [edi].
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro EXPANDLINE32to15MMX1 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r15
        pand        mm1, mm6
        psrlq       mm1, shift_g15
        pand        mm2, mm7
        psrlq       mm2, shift_b15
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        VSCANLINE_BRIGHTNESS %1, mm0, mm1, mm2, mm4
        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; LINE32to15MMX2: same conversion as LINE32to15MMX1 (four 32bpp pixels ->
; four 15bpp pixels), with prefetchnta hints on the source and a
; non-temporal movntq store to the destination.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm3; advances esi by 16 and edi by 8.
%macro LINE32to15MMX2 1
        movq        mm0, [esi]
        prefetchnta [esi + 8]               ; second half of this group
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r15
        pand        mm1, mm6
        psrlq       mm1, shift_g15
        pand        mm2, mm7
        psrlq       mm2, shift_b15
        por         mm0, mm1
        por         mm0, mm2
        ; low dword = p0 | p1 << 16
        movq        mm1, mm0
        psrlq       mm1, 16
        por         mm0, mm1
        ; pixels 2 and 3
        movq        mm1, [esi + 8]
        movq        mm2, mm1
        movq        mm3, mm1
        pand        mm1, mm5
        psrlq       mm1, shift_r15
        pand        mm2, mm6
        psrlq       mm2, shift_g15
        pand        mm3, mm7
        psrlq       mm3, shift_b15
        por         mm1, mm2
        por         mm1, mm3
        movq        mm2, mm1
        psrlq       mm2, 16
        por         mm1, mm2
        punpckldq   mm0, mm1                ; mm0 = pixels 0..3
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        prefetchnta [esi + 16]              ; start of the next group
        movntq      [edi], mm0
        add         esi, 16
        add         edi, 8
%endmacro

; DOUBLELINE32to15MMX2: same conversion as DOUBLELINE32to15MMX1 (two
; source pixels, each stored twice as 15bpp), with a prefetchnta hint for
; the next source qword and a non-temporal movntq store.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro DOUBLELINE32to15MMX2 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r15
        pand        mm1, mm6
        psrlq       mm1, shift_g15
        pand        mm2, mm7
        psrlq       mm2, shift_b15
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        movq        mm1, mm0
        psllq       mm0, 16
        por         mm0, mm1                ; each dword = pixel | pixel << 16
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        prefetchnta [esi + 8]
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; EXPANDLINE32to15MMX2: same conversion as EXPANDLINE32to15MMX1 (two
; source pixels, each left in the low word of its dword), with a
; prefetchnta hint for the next source qword and a non-temporal store.
; %1 is forwarded to VSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro EXPANDLINE32to15MMX2 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r15
        pand        mm1, mm6
        psrlq       mm1, shift_g15
        pand        mm2, mm7
        psrlq       mm2, shift_b15
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        VSCANLINE_BRIGHTNESS %1, mm0, mm1, mm2, mm4
        prefetchnta [esi + 8]
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; Instantiate the 32 -> 15 bpp blitters.
; Columns: sb = srcbpp, db = dstbpp, xs = xscale, ys = yscale,
;          sl = scanline, bl = brightness, mmx = line-macro suffix (MMX0/1/2),
;          ws = wshift, nj = nearjump (0 = short/short, 1 = near/short,
;          2 = near/near jump ranges).
; Line macro selection inside blitcore: sl == 2 -> EXPANDLINE...,
; otherwise xs == 2 -> DOUBLELINE..., otherwise LINE...
; NOTE(review): rows tagged "dummy" request a brightness that the selected
; line macro does not apply (the MMX0 macros ignore their parameter);
; presumably they only keep the variant set complete - confirm.
;         sb db xs ys sl  bl mmx ws nj
blitcore 32, 15, 1, 1, 0,  0,  0, 0, 0
blitcore 32, 15, 1, 2, 0,  0,  0, 0, 1
blitcore 32, 15, 1, 2, 1,  0,  0, 0, 0
blitcore 32, 15, 1, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 15, 1, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 15, 1, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 1, 0,  0,  0, 0, 1
blitcore 32, 15, 2, 1, 2,  0,  0, 0, 0
blitcore 32, 15, 2, 1, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 1, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 1, 2, 75,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 2, 0,  0,  0, 0, 1
blitcore 32, 15, 2, 2, 1,  0,  0, 0, 1
blitcore 32, 15, 2, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 2, 2,  0,  0, 0, 1
blitcore 32, 15, 2, 2, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 2, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 15, 2, 2, 2, 75,  0, 0, 0 ; dummy

; mmx = 1 variants
blitcore 32, 15, 1, 1, 0,  0,  1, 2, 1
blitcore 32, 15, 1, 2, 0,  0,  1, 2, 1
blitcore 32, 15, 1, 2, 1,  0,  1, 2, 1
blitcore 32, 15, 1, 2, 1, 25,  1, 2, 1
blitcore 32, 15, 1, 2, 1, 50,  1, 2, 1
blitcore 32, 15, 1, 2, 1, 75,  1, 2, 2 ; k7
blitcore 32, 15, 2, 1, 0,  0,  1, 2, 0
blitcore 32, 15, 2, 1, 2,  0,  1, 1, 0
blitcore 32, 15, 2, 1, 2, 25,  1, 1, 0
blitcore 32, 15, 2, 1, 2, 50,  1, 1, 0
blitcore 32, 15, 2, 1, 2, 75,  1, 1, 0
blitcore 32, 15, 2, 2, 0,  0,  1, 1, 1
blitcore 32, 15, 2, 2, 1,  0,  1, 1, 0
blitcore 32, 15, 2, 2, 1, 25,  1, 1, 1
blitcore 32, 15, 2, 2, 1, 50,  1, 1, 1
blitcore 32, 15, 2, 2, 1, 75,  1, 1, 1
blitcore 32, 15, 2, 2, 2,  0,  1, 1, 1
blitcore 32, 15, 2, 2, 2, 25,  1, 1, 1
blitcore 32, 15, 2, 2, 2, 50,  1, 1, 1
blitcore 32, 15, 2, 2, 2, 75,  1, 1, 1

; mmx = 2 variants
blitcore 32, 15, 1, 1, 0,  0,  2, 2, 1
blitcore 32, 15, 1, 2, 0,  0,  2, 2, 1
blitcore 32, 15, 1, 2, 1,  0,  2, 2, 1
blitcore 32, 15, 1, 2, 1, 25,  2, 2, 2
blitcore 32, 15, 1, 2, 1, 50,  2, 2, 1
blitcore 32, 15, 1, 2, 1, 75,  2, 2, 2
blitcore 32, 15, 2, 1, 0,  0,  2, 1, 0
blitcore 32, 15, 2, 1, 2,  0,  2, 1, 0
blitcore 32, 15, 2, 1, 2, 25,  2, 1, 0
blitcore 32, 15, 2, 1, 2, 50,  2, 1, 0
blitcore 32, 15, 2, 1, 2, 75,  2, 1, 0
blitcore 32, 15, 2, 2, 0,  0,  2, 1, 1
blitcore 32, 15, 2, 2, 1,  0,  2, 1, 0
blitcore 32, 15, 2, 2, 1, 25,  2, 1, 1
blitcore 32, 15, 2, 2, 1, 50,  2, 1, 1
blitcore 32, 15, 2, 2, 1, 75,  2, 1, 1
blitcore 32, 15, 2, 2, 2,  0,  2, 1, 1
blitcore 32, 15, 2, 2, 2, 25,  2, 1, 1
blitcore 32, 15, 2, 2, 2, 50,  2, 1, 1
blitcore 32, 15, 2, 2, 2, 75,  2, 1, 1

;------------------------------------------------------------------
; 32to16
;------------------------------------------------------------------

; LINE32to16MMX0: convert one 32bpp pixel at [esi] into one 16bpp pixel
; (5/6/5 bit fields) stored as a word at [edi], integer registers only.
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax, ebx, edx and flags; advances esi by 4 and edi by 2.
%macro LINE32to16MMX0 1
        mov         eax, [esi]
        mov         ebx, eax                ; ebx = untouched copy of the source pixel
        and         eax, 0x000000f8         ; keep bits 7..3
        shr         eax, shift_r16          ; -> bits 4..0
        mov         edx, eax
        mov         eax, ebx
        and         eax, 0x0000fc00         ; keep bits 15..10
        shr         eax, shift_g16          ; -> bits 10..5
        or          edx, eax
        mov         eax, ebx
        and         eax, 0x00f80000         ; keep bits 23..19
        shr         eax, shift_b16          ; -> bits 15..11
        or          eax, edx                ; eax = packed 16-bit pixel
        mov         [edi], ax
        add         esi, 4
        add         edi, 2
%endmacro

; DOUBLELINE32to16MMX0: convert one 32bpp pixel at [esi] to 16bpp and
; store it twice (low and high word of one dword) at [edi].
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax, ebx, edx and flags; advances esi by 4 and edi by 4.
%macro DOUBLELINE32to16MMX0 1
        mov         eax, [esi]
        mov         ebx, eax                ; ebx = untouched copy of the source pixel
        and         eax, 0x000000f8         ; keep bits 7..3
        shr         eax, shift_r16          ; -> bits 4..0
        mov         edx, eax
        mov         eax, ebx
        and         eax, 0x0000fc00         ; keep bits 15..10
        shr         eax, shift_g16          ; -> bits 10..5
        or          edx, eax
        mov         eax, ebx
        and         eax, 0x00f80000         ; keep bits 23..19
        shr         eax, shift_b16          ; -> bits 15..11
        or          eax, edx                ; eax = packed 16-bit pixel
        mov         edx, eax
        shl         eax, 16
        or          eax, edx                ; same pixel in both 16-bit halves
        mov         [edi], eax
        add         esi, 4
        add         edi, 4
%endmacro

; EXPANDLINE32to16MMX0: convert one 32bpp pixel at [esi] to 16bpp and
; store it as a full dword at [edi]: the pixel lands in the low word and
; the high word is zero (the packed value never exceeds bit 15).
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax, ebx, edx and flags; advances esi by 4 and edi by 4.
%macro EXPANDLINE32to16MMX0 1
        mov         eax, [esi]
        mov         ebx, eax                ; ebx = untouched copy of the source pixel
        and         eax, 0x000000f8         ; keep bits 7..3
        shr         eax, shift_r16          ; -> bits 4..0
        mov         edx, eax
        mov         eax, ebx
        and         eax, 0x0000fc00         ; keep bits 15..10
        shr         eax, shift_g16          ; -> bits 10..5
        or          edx, eax
        mov         eax, ebx
        and         eax, 0x00f80000         ; keep bits 23..19
        shr         eax, shift_b16          ; -> bits 15..11
        or          eax, edx
        mov         [edi], eax              ; pixel followed by a zero word
        add         esi, 4
        add         edi, 4
%endmacro

; LINE32to16MMX1: convert four 32bpp pixels at [esi .. esi+15] into four
; 16bpp pixels (8 bytes) stored at [edi].
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 are presumably preloaded with the mask_r /
; mask_g16 / mask_b constants and mm4 with the brightness mask by the
; caller (blitcore) - not visible here, confirm.
; Clobbers mm0-mm3; advances esi by 16 and edi by 8.
%macro LINE32to16MMX1 1
        ; pixels 0 and 1: one packed pixel in the low word of each dword
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r16
        pand        mm1, mm6
        psrlq       mm1, shift_g16
        pand        mm2, mm7
        psrlq       mm2, shift_b16
        por         mm0, mm1
        por         mm0, mm2
        ; bring pixel 1 down next to pixel 0: low dword = p0 | p1 << 16
        movq        mm1, mm0
        psrlq       mm1, 16
        por         mm0, mm1
        ; pixels 2 and 3, same sequence, result in the low dword of mm1
        movq        mm1, [esi + 8]
        movq        mm2, mm1
        movq        mm3, mm1
        pand        mm1, mm5
        psrlq       mm1, shift_r16
        pand        mm2, mm6
        psrlq       mm2, shift_g16
        pand        mm3, mm7
        psrlq       mm3, shift_b16
        por         mm1, mm2
        por         mm1, mm3
        movq        mm2, mm1
        psrlq       mm2, 16
        por         mm1, mm2
        ; combine the two low dwords: mm0 = pixels 0..3
        punpckldq   mm0, mm1
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        movq        [edi], mm0
        add         esi, 16
        add         edi, 8
%endmacro

; DOUBLELINE32to16MMX1: convert two 32bpp pixels at [esi] to 16bpp and
; store each one twice (four output pixels, 8 bytes) at [edi].
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro DOUBLELINE32to16MMX1 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r16
        pand        mm1, mm6
        psrlq       mm1, shift_g16
        pand        mm2, mm7
        psrlq       mm2, shift_b16
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        movq        mm1, mm0
        psllq       mm0, 16                 ; copy of each pixel in the high word
        por         mm0, mm1                ; each dword = pixel | pixel << 16
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; EXPANDLINE32to16MMX1: convert two 32bpp pixels at [esi] to 16bpp,
; leaving each in the low word of its dword, pass the result through
; VSCANLINE_BRIGHTNESS (defined elsewhere in the file) and store 8 bytes
; at [edi].
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro EXPANDLINE32to16MMX1 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r16
        pand        mm1, mm6
        psrlq       mm1, shift_g16
        pand        mm2, mm7
        psrlq       mm2, shift_b16
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        VSCANLINE_BRIGHTNESS %1, mm0, mm1, mm2, mm4
        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; LINE32to16MMX2: same conversion as LINE32to16MMX1 (four 32bpp pixels ->
; four 16bpp pixels), with prefetchnta hints on the source and a
; non-temporal movntq store to the destination.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm3; advances esi by 16 and edi by 8.
%macro LINE32to16MMX2 1
        movq        mm0, [esi]
        prefetchnta [esi + 8]               ; second half of this group
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r16
        pand        mm1, mm6
        psrlq       mm1, shift_g16
        pand        mm2, mm7
        psrlq       mm2, shift_b16
        por         mm0, mm1
        por         mm0, mm2
        ; low dword = p0 | p1 << 16
        movq        mm1, mm0
        psrlq       mm1, 16
        por         mm0, mm1
        ; pixels 2 and 3
        movq        mm1, [esi + 8]
        movq        mm2, mm1
        movq        mm3, mm1
        pand        mm1, mm5
        psrlq       mm1, shift_r16
        pand        mm2, mm6
        psrlq       mm2, shift_g16
        pand        mm3, mm7
        psrlq       mm3, shift_b16
        por         mm1, mm2
        por         mm1, mm3
        movq        mm2, mm1
        psrlq       mm2, 16
        por         mm1, mm2
        punpckldq   mm0, mm1                ; mm0 = pixels 0..3
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        prefetchnta [esi + 16]              ; start of the next group
        movntq      [edi], mm0
        add         esi, 16
        add         edi, 8
%endmacro

; DOUBLELINE32to16MMX2: same conversion as DOUBLELINE32to16MMX1 (two
; source pixels, each stored twice as 16bpp), with a prefetchnta hint for
; the next source qword and a non-temporal movntq store.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro DOUBLELINE32to16MMX2 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r16
        pand        mm1, mm6
        psrlq       mm1, shift_g16
        pand        mm2, mm7
        psrlq       mm2, shift_b16
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        movq        mm1, mm0
        psllq       mm0, 16
        por         mm0, mm1                ; each dword = pixel | pixel << 16
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm4
        prefetchnta [esi + 8]
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; EXPANDLINE32to16MMX2: same conversion as EXPANDLINE32to16MMX1 (two
; source pixels, each left in the low word of its dword), with a
; prefetchnta hint for the next source qword and a non-temporal store.
; %1 is forwarded to VSCANLINE_BRIGHTNESS (defined elsewhere in the file).
; NOTE(review): mm5/mm6/mm7 presumably hold the channel masks and mm4 the
; brightness mask, set up by the caller - confirm.
; Clobbers mm0-mm2; advances esi by 8 and edi by 8.
%macro EXPANDLINE32to16MMX2 1
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm2, mm0
        pand        mm0, mm5
        psrlq       mm0, shift_r16
        pand        mm1, mm6
        psrlq       mm1, shift_g16
        pand        mm2, mm7
        psrlq       mm2, shift_b16
        por         mm0, mm1
        por         mm0, mm2                ; one pixel in the low word of each dword
        VSCANLINE_BRIGHTNESS %1, mm0, mm1, mm2, mm4
        prefetchnta [esi + 8]
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; Instantiate the 32 -> 16 bpp blitters.
; Columns: sb = srcbpp, db = dstbpp, xs = xscale, ys = yscale,
;          sl = scanline, bl = brightness, mmx = line-macro suffix (MMX0/1/2),
;          ws = wshift, nj = nearjump (0 = short/short, 1 = near/short,
;          2 = near/near jump ranges).
; Line macro selection inside blitcore: sl == 2 -> EXPANDLINE...,
; otherwise xs == 2 -> DOUBLELINE..., otherwise LINE...
; NOTE(review): rows tagged "dummy" request a brightness that the selected
; line macro does not apply (the MMX0 macros ignore their parameter);
; presumably they only keep the variant set complete - confirm.
;         sb db xs ys sl  bl mmx ws nj
blitcore 32, 16, 1, 1, 0,  0,  0, 0, 0
blitcore 32, 16, 1, 2, 0,  0,  0, 0, 1
blitcore 32, 16, 1, 2, 1,  0,  0, 0, 0
blitcore 32, 16, 1, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 16, 1, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 16, 1, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 1, 0,  0,  0, 0, 0
blitcore 32, 16, 2, 1, 2,  0,  0, 0, 0
blitcore 32, 16, 2, 1, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 1, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 1, 2, 75,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 2, 0,  0,  0, 0, 1
blitcore 32, 16, 2, 2, 1,  0,  0, 0, 0
blitcore 32, 16, 2, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 2, 2,  0,  0, 0, 1
blitcore 32, 16, 2, 2, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 2, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 16, 2, 2, 2, 75,  0, 0, 0 ; dummy

; mmx = 1 variants
blitcore 32, 16, 1, 1, 0,  0,  1, 2, 1
blitcore 32, 16, 1, 2, 0,  0,  1, 2, 1
blitcore 32, 16, 1, 2, 1,  0,  1, 2, 1
blitcore 32, 16, 1, 2, 1, 25,  1, 2, 1
blitcore 32, 16, 1, 2, 1, 50,  1, 2, 1
blitcore 32, 16, 1, 2, 1, 75,  1, 2, 2 ; k7
blitcore 32, 16, 2, 1, 0,  0,  1, 1, 0
blitcore 32, 16, 2, 1, 2,  0,  1, 1, 0
blitcore 32, 16, 2, 1, 2, 25,  1, 1, 0
blitcore 32, 16, 2, 1, 2, 50,  1, 1, 0
blitcore 32, 16, 2, 1, 2, 75,  1, 1, 0
blitcore 32, 16, 2, 2, 0,  0,  1, 1, 1
blitcore 32, 16, 2, 2, 1,  0,  1, 1, 0
blitcore 32, 16, 2, 2, 1, 25,  1, 1, 1
blitcore 32, 16, 2, 2, 1, 50,  1, 1, 1
blitcore 32, 16, 2, 2, 1, 75,  1, 1, 1
blitcore 32, 16, 2, 2, 2,  0,  1, 1, 1
blitcore 32, 16, 2, 2, 2, 25,  1, 1, 1
blitcore 32, 16, 2, 2, 2, 50,  1, 1, 1
blitcore 32, 16, 2, 2, 2, 75,  1, 1, 1

; mmx = 2 variants
blitcore 32, 16, 1, 1, 0,  0,  2, 2, 1
blitcore 32, 16, 1, 2, 0,  0,  2, 2, 1
blitcore 32, 16, 1, 2, 1,  0,  2, 2, 1
blitcore 32, 16, 1, 2, 1, 25,  2, 2, 2
blitcore 32, 16, 1, 2, 1, 50,  2, 2, 1
blitcore 32, 16, 1, 2, 1, 75,  2, 2, 2
blitcore 32, 16, 2, 1, 0,  0,  2, 1, 0
blitcore 32, 16, 2, 1, 2,  0,  2, 1, 0
blitcore 32, 16, 2, 1, 2, 25,  2, 1, 0
blitcore 32, 16, 2, 1, 2, 50,  2, 1, 0
blitcore 32, 16, 2, 1, 2, 75,  2, 1, 0
blitcore 32, 16, 2, 2, 0,  0,  2, 1, 1
blitcore 32, 16, 2, 2, 1,  0,  2, 1, 0
blitcore 32, 16, 2, 2, 1, 25,  2, 1, 1
blitcore 32, 16, 2, 2, 1, 50,  2, 1, 1
blitcore 32, 16, 2, 2, 1, 75,  2, 1, 1
blitcore 32, 16, 2, 2, 2,  0,  2, 1, 1
blitcore 32, 16, 2, 2, 2, 25,  2, 1, 1
blitcore 32, 16, 2, 2, 2, 50,  2, 1, 1
blitcore 32, 16, 2, 2, 2, 75,  2, 1, 1

;------------------------------------------------------------------
; 32to24
;------------------------------------------------------------------

; LINE32to24MMX0: copy one 32bpp pixel at [esi] to a 3-byte pixel at [edi].
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax and flags; advances esi by 4 and edi by 3.
; The store is a full dword, so the top byte of the source pixel is also
; written to [edi + 3], i.e. the first byte of the following output pixel.
; NOTE(review): for the last pixel of a row that byte lands one byte past
; the row's pixel data - confirm the destination has room for it.
%macro LINE32to24MMX0 1
        mov         eax, [esi]
        mov         [edi], eax
        add         esi, 4
        add         edi, 3
%endmacro

; DOUBLELINE32to24MMX0: copy one 32bpp pixel at [esi] to two consecutive
; 3-byte pixels at [edi] and [edi + 3].
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax and flags; advances esi by 4 and edi by 6.
; Both stores are full dwords, so bytes [edi .. edi + 6] are written: the
; top byte of the source pixel also lands at [edi + 6].
%macro DOUBLELINE32to24MMX0 1
        mov         eax, [esi]
        mov         [edi], eax
        mov         [edi + 3], eax          ; overwrites the spilled top byte of the first store
        add         esi, 4
        add         edi, 6
%endmacro

; EXPANDLINE32to24MMX0: copy one 32bpp pixel at [esi] to [edi] and skip
; the following 3-byte pixel slot.
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax and flags; advances esi by 4 and edi by 6.
; The dword store writes [edi .. edi + 3]; bytes [edi + 4] and [edi + 5]
; are left as they were.
%macro EXPANDLINE32to24MMX0 1
        mov         eax, [esi]
        mov         [edi], eax
        add         esi, 4
        add         edi, 6
%endmacro

; LINE32to24MMX1: pack eight 32bpp pixels p0..p7 at [esi .. esi+31] into
; eight 3-byte pixels (24 bytes) at [edi].
;   qword 0 (mm0) = p0 | p1 << 24 | p2 << 48
;   qword 1 (mm1) = p2 >> 16 | p3 << 8 | p4 << 32 | p5 << 56
;   qword 2 (mm5) = p5 >> 8 | p6 << 16 | p7 << 40
; The pieces are combined with OR, so bits 31..24 of each source pixel
; overlap the first byte of the following pixel; the pack is only clean
; when that byte is zero.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file),
; invoked once per output qword with mm2 and [_brightmask].
; Clobbers mm0-mm7; advances esi by 32 and edi by 24.
%macro LINE32to24MMX1 1
        movd        mm0, [esi]
        movd        mm1, [esi + 4]
        psllq       mm1, 24
        por         mm0, mm1                ; p0 | p1 << 24
        movd        mm2, [esi + 8]
        movq        mm1, mm2
        psllq       mm1, 48                 ; low two bytes of p2 close qword 0
        movd        mm3, [esi + 12]
        por         mm0, mm1
        psrlq       mm2, 16                 ; remaining bytes of p2 open qword 1
        psllq       mm3, 8
        movd        mm4, [esi + 16]
        psllq       mm4, 32
        por         mm2, mm3
        por         mm2, mm4
        movd        mm5, [esi + 20]
        movq        mm1, mm5
        psllq       mm1, 56                 ; low byte of p5 closes qword 1
        por         mm1, mm2
        movd        mm6, [esi + 24]
        psrlq       mm5, 8                  ; remaining bytes of p5 open qword 2
        psllq       mm6, 16
        por         mm5, mm6
        movd        mm7, [esi + 28]
        psllq       mm7, 40
        HSCANLINE_BRIGHTNESS %1, mm0, mm2, [_brightmask]
        movq        [edi], mm0
        por         mm5, mm7
        HSCANLINE_BRIGHTNESS %1, mm1, mm2, [_brightmask]
        movq        [edi + 8], mm1
        HSCANLINE_BRIGHTNESS %1, mm5, mm2, [_brightmask]
        movq        [edi + 16], mm5
        add         esi, 32
        add         edi, 24
%endmacro

; DOUBLELINE32to24MMX1: read four 32bpp pixels p0..p3 at [esi .. esi+15]
; and write each twice as 3-byte pixels (eight pixels, 24 bytes) at [edi].
;   qword 0 (mm0) = p0 | p0 << 24 | p1 << 48
;   qword 1 (mm1) = p1 >> 16 | p1 << 8 | p2 << 32 | p2 << 56
;   qword 2 (mm2) = p2 >> 8 | p3 << 16 | p3 << 40
; The pieces are combined with OR, so the pack is only clean when bits
; 31..24 of each source pixel are zero.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file),
; invoked once per output qword with mm5 and mm6.
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller - confirm.
; Clobbers mm0-mm4 directly; advances esi by 16 and edi by 24.
%macro DOUBLELINE32to24MMX1 1
        movd        mm0, [esi]
        movq        mm4, mm0
        psllq       mm4, 24
        por         mm0, mm4                ; p0 | p0 << 24
        movd        mm1, [esi + 4]
        movq        mm4, mm1
        psllq       mm4, 48
        por         mm0, mm4                ; qword 0 complete
        movq        mm4, mm1
        psrlq       mm1, 16                 ; last byte of the first p1 copy
        psllq       mm4, 8                  ; second p1 copy
        por         mm1, mm4
        movd        mm2, [esi + 8]
        movq        mm4, mm2
        psllq       mm4, 32                 ; first p2 copy
        por         mm1, mm4
        movq        mm4, mm2
        psllq       mm4, 56                 ; first byte of the second p2 copy
        por         mm1, mm4                ; qword 1 complete
        psrlq       mm2, 8                  ; rest of the second p2 copy
        movd        mm3, [esi + 12]
        movq        mm4, mm3
        HSCANLINE_BRIGHTNESS %1, mm0, mm5, mm6
        movq        [edi], mm0
        psllq       mm3, 16                 ; first p3 copy
        por         mm2, mm3
        psllq       mm4, 40                 ; second p3 copy
        HSCANLINE_BRIGHTNESS %1, mm1, mm5, mm6
        movq        [edi + 8], mm1
        por         mm2, mm4                ; qword 2 complete
        HSCANLINE_BRIGHTNESS %1, mm2, mm5, mm6
        movq        [edi + 16], mm2
        add         esi, 16
        add         edi, 24
%endmacro

; EXPANDLINE32to24MMX1: read four 32bpp pixels p0..p3 at [esi .. esi+15]
; and write them as 3-byte pixels into every other pixel slot of a 24-byte
; span at [edi]; the slots in between are written as zero.
;   output bytes:  0-2 = p0,  6-8 = p1,  12-14 = p2,  18-20 = p3
;   qword 0 (mm0) = p0 | p1 << 48      (p1 bytes 0,1 -> output bytes 6,7)
;   qword 1 (mm1) = p1 >> 16 | p2 << 32 (p1 byte 2   -> output byte 8)
;   qword 2 (mm3) = p3 << 16
; Fix: the p1 remainder was shifted right by 8, which put byte 1 of p1 at
; output byte 8 and leaked byte 2 into the gap pixel; the shift has to be
; 16, matching how LINE/DOUBLELINE32to24 split a pixel across two qwords.
; The pieces are combined with OR, so the gaps are only zero when bits
; 31..24 of each source pixel are zero.
; The macro parameter %1 is accepted but not referenced (no brightness
; processing in this variant).
; Clobbers mm0-mm4; advances esi by 16 and edi by 24.
%macro EXPANDLINE32to24MMX1 1
        movd        mm0, [esi]              ; p0
        movd        mm1, [esi + 4]          ; p1
        movd        mm2, [esi + 8]          ; p2
        movd        mm3, [esi + 12]         ; p3
        movq        mm4, mm1
        psllq       mm4, 48
        por         mm0, mm4                ; qword 0 = p0 | p1 << 48
        psrlq       mm1, 16                 ; third byte of p1 -> byte 0 of qword 1
        psllq       mm2, 32
        por         mm1, mm2                ; qword 1 = p1 >> 16 | p2 << 32
        psllq       mm3, 16                 ; qword 2 = p3 << 16
        movq        [edi], mm0
        movq        [edi + 8], mm1
        movq        [edi + 16], mm3
        add         esi, 16
        add         edi, 24
%endmacro

; LINE32to24MMX2: same packing as LINE32to24MMX1 (eight 32bpp pixels
; p0..p7 -> 24 bytes), with prefetchnta hints interleaved with the loads
; and non-temporal movntq stores.
;   qword 0 (mm0) = p0 | p1 << 24 | p2 << 48
;   qword 1 (mm1) = p2 >> 16 | p3 << 8 | p4 << 32 | p5 << 56
;   qword 2 (mm5) = p5 >> 8 | p6 << 16 | p7 << 40
; The pieces are combined with OR, so the pack is only clean when bits
; 31..24 of each source pixel are zero.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file),
; invoked once per output qword with mm2 and [_brightmask].
; Clobbers mm0-mm7; advances esi by 32 and edi by 24.
%macro LINE32to24MMX2 1
        movd        mm0, [esi]
        movd        mm1, [esi + 4]
        prefetchnta [esi + 8]
        psllq       mm1, 24
        por         mm0, mm1                ; p0 | p1 << 24
        movd        mm2, [esi + 8]
        prefetchnta [esi + 12]
        movq        mm1, mm2
        psllq       mm1, 48                 ; low two bytes of p2 close qword 0
        movd        mm3, [esi + 12]
        prefetchnta [esi + 16]
        por         mm0, mm1
        psrlq       mm2, 16                 ; remaining bytes of p2 open qword 1
        psllq       mm3, 8
        movd        mm4, [esi + 16]
        prefetchnta [esi + 20]
        psllq       mm4, 32
        por         mm2, mm3
        por         mm2, mm4
        movd        mm5, [esi + 20]
        prefetchnta [esi + 24]
        movq        mm1, mm5
        psllq       mm1, 56                 ; low byte of p5 closes qword 1
        por         mm1, mm2
        movd        mm6, [esi + 24]
        prefetchnta [esi + 28]
        psrlq       mm5, 8                  ; remaining bytes of p5 open qword 2
        psllq       mm6, 16
        por         mm5, mm6
        movd        mm7, [esi + 28]
        psllq       mm7, 40
        HSCANLINE_BRIGHTNESS %1, mm0, mm2, [_brightmask]
        movntq      [edi], mm0
        por         mm5, mm7
        HSCANLINE_BRIGHTNESS %1, mm1, mm2, [_brightmask]
        movntq      [edi + 8], mm1
        prefetchnta [esi + 32]              ; start of the next group
        HSCANLINE_BRIGHTNESS %1, mm5, mm2, [_brightmask]
        movntq      [edi + 16], mm5
        add         esi, 32
        add         edi, 24
%endmacro

; DOUBLELINE32to24MMX2: same packing as DOUBLELINE32to24MMX1 (four 32bpp
; pixels p0..p3, each written twice as a 3-byte pixel, 24 bytes total),
; with prefetchnta hints and non-temporal movntq stores.
;   qword 0 (mm0) = p0 | p0 << 24 | p1 << 48
;   qword 1 (mm1) = p1 >> 16 | p1 << 8 | p2 << 32 | p2 << 56
;   qword 2 (mm2) = p2 >> 8 | p3 << 16 | p3 << 40
; The pieces are combined with OR, so the pack is only clean when bits
; 31..24 of each source pixel are zero.
; %1 is forwarded to HSCANLINE_BRIGHTNESS (defined elsewhere in the file),
; invoked once per output qword with mm5 and mm6.
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller - confirm.
; Clobbers mm0-mm4 directly; advances esi by 16 and edi by 24.
%macro DOUBLELINE32to24MMX2 1
        movd        mm0, [esi]
        prefetchnta [esi + 4]
        movq        mm4, mm0
        psllq       mm4, 24
        por         mm0, mm4                ; p0 | p0 << 24
        movd        mm1, [esi + 4]
        prefetchnta [esi + 8]
        movq        mm4, mm1
        psllq       mm4, 48
        por         mm0, mm4                ; qword 0 complete
        movq        mm4, mm1
        psrlq       mm1, 16                 ; last byte of the first p1 copy
        psllq       mm4, 8                  ; second p1 copy
        por         mm1, mm4
        movd        mm2, [esi + 8]
        prefetchnta [esi + 12]
        movq        mm4, mm2
        psllq       mm4, 32                 ; first p2 copy
        por         mm1, mm4
        movq        mm4, mm2
        psllq       mm4, 56                 ; first byte of the second p2 copy
        por         mm1, mm4                ; qword 1 complete
        psrlq       mm2, 8                  ; rest of the second p2 copy
        movd        mm3, [esi + 12]
        movq        mm4, mm3
        HSCANLINE_BRIGHTNESS %1, mm0, mm5, mm6
        movntq      [edi], mm0
        psllq       mm3, 16                 ; first p3 copy
        por         mm2, mm3
        psllq       mm4, 40                 ; second p3 copy
        HSCANLINE_BRIGHTNESS %1, mm1, mm5, mm6
        movntq      [edi + 8], mm1
        por         mm2, mm4                ; qword 2 complete
        HSCANLINE_BRIGHTNESS %1, mm2, mm5, mm6
        movntq      [edi + 16], mm2
        prefetchnta [esi + 16]              ; start of the next group
        add         esi, 16
        add         edi, 24
%endmacro

; EXPANDLINE32to24MMX2: read four 32bpp pixels p0..p3 at [esi .. esi+15]
; and write them as 3-byte pixels into every other pixel slot of a 24-byte
; span at [edi] (slots in between written as zero), using prefetchnta
; hints and non-temporal movntq stores.
;   output bytes:  0-2 = p0,  6-8 = p1,  12-14 = p2,  18-20 = p3
;   qword 0 (mm0) = p0 | p1 << 48      (p1 bytes 0,1 -> output bytes 6,7)
;   qword 1 (mm1) = p1 >> 16 | p2 << 32 (p1 byte 2   -> output byte 8)
;   qword 2 (mm3) = p3 << 16
; Fixes:
;   * mm4 was copied from mm1 BEFORE mm1 was loaded from [esi + 4], so
;     qword 0 was built from whatever mm1 held on entry instead of p1.
;     p1 is now loaded first.
;   * the p1 remainder was shifted right by 8 instead of 16, putting the
;     wrong byte of p1 at output byte 8 and leaking into the gap pixel.
; Instruction count and encodings are unchanged, so code size (and the
; short/near jump ranges chosen in the blitcore table) are unaffected.
; The pieces are combined with OR, so the gaps are only zero when bits
; 31..24 of each source pixel are zero.
; The macro parameter %1 is accepted but not referenced (no brightness
; processing in this variant).
; Clobbers mm0-mm4; advances esi by 16 and edi by 24.
%macro EXPANDLINE32to24MMX2 1
        movd        mm0, [esi]              ; p0
        prefetchnta [esi + 4]
        movd        mm1, [esi + 4]          ; p1 (must be loaded before it is copied)
        prefetchnta [esi + 8]
        movq        mm4, mm1
        psllq       mm4, 48
        por         mm0, mm4                ; qword 0 = p0 | p1 << 48
        psrlq       mm1, 16                 ; third byte of p1 -> byte 0 of qword 1
        movd        mm2, [esi + 8]          ; p2
        prefetchnta [esi + 12]
        psllq       mm2, 32
        por         mm1, mm2                ; qword 1 = p1 >> 16 | p2 << 32
        movd        mm3, [esi + 12]         ; p3
        movntq      [edi], mm0
        psllq       mm3, 16                 ; qword 2 = p3 << 16
        movntq      [edi + 8], mm1
        movntq      [edi + 16], mm3
        prefetchnta [esi + 16]              ; start of the next group
        add         esi, 16
        add         edi, 24
%endmacro

; Instantiate the 32 -> 24 bpp blitters.
; Columns: sb = srcbpp, db = dstbpp, xs = xscale, ys = yscale,
;          sl = scanline, bl = brightness, mmx = line-macro suffix (MMX0/1/2),
;          ws = wshift, nj = nearjump (0 = short/short, 1 = near/short,
;          2 = near/near jump ranges).
; Line macro selection inside blitcore: sl == 2 -> EXPANDLINE...,
; otherwise xs == 2 -> DOUBLELINE..., otherwise LINE...
; NOTE(review): rows tagged "dummy" request a brightness that the selected
; line macro does not apply (the MMX0 macros and the EXPANDLINE32to24
; macros ignore their parameter); presumably they only keep the variant
; set complete - confirm.
;         sb db xs ys sl  bl mmx ws nj
blitcore 32, 24, 1, 1, 0,  0,  0, 0, 0
blitcore 32, 24, 1, 2, 0,  0,  0, 0, 0
blitcore 32, 24, 1, 2, 1,  0,  0, 0, 0
blitcore 32, 24, 1, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 24, 1, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 24, 1, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 1, 0,  0,  0, 0, 0
blitcore 32, 24, 2, 1, 2,  0,  0, 0, 0
blitcore 32, 24, 2, 1, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 1, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 1, 2, 75,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 2, 0,  0,  0, 0, 1
blitcore 32, 24, 2, 2, 1,  0,  0, 0, 0
blitcore 32, 24, 2, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 2, 2,  0,  0, 0, 0
blitcore 32, 24, 2, 2, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 2, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 24, 2, 2, 2, 75,  0, 0, 0 ; dummy

; mmx = 1 variants
blitcore 32, 24, 1, 1, 0,  0,  1, 3, 1
blitcore 32, 24, 1, 2, 0,  0,  1, 3, 1
blitcore 32, 24, 1, 2, 1,  0,  1, 3, 1
blitcore 32, 24, 1, 2, 1, 25,  1, 3, 2
blitcore 32, 24, 1, 2, 1, 50,  1, 3, 2
blitcore 32, 24, 1, 2, 1, 75,  1, 3, 2
blitcore 32, 24, 2, 1, 0,  0,  1, 2, 1
blitcore 32, 24, 2, 1, 2,  0,  1, 2, 0
blitcore 32, 24, 2, 1, 2, 25,  1, 2, 0 ; dummy
blitcore 32, 24, 2, 1, 2, 50,  1, 2, 0 ; dummy
blitcore 32, 24, 2, 1, 2, 75,  1, 2, 0 ; dummy
blitcore 32, 24, 2, 2, 0,  0,  1, 2, 1
blitcore 32, 24, 2, 2, 1,  0,  1, 2, 1
blitcore 32, 24, 2, 2, 1, 25,  1, 2, 2
blitcore 32, 24, 2, 2, 1, 50,  1, 2, 2
blitcore 32, 24, 2, 2, 1, 75,  1, 2, 2
blitcore 32, 24, 2, 2, 2,  0,  1, 2, 1
blitcore 32, 24, 2, 2, 2, 25,  1, 2, 0 ; dummy
blitcore 32, 24, 2, 2, 2, 50,  1, 2, 0 ; dummy
blitcore 32, 24, 2, 2, 2, 75,  1, 2, 0 ; dummy

; mmx = 2 variants
blitcore 32, 24, 1, 1, 0,  0,  2, 3, 2
blitcore 32, 24, 1, 2, 0,  0,  2, 3, 2
blitcore 32, 24, 1, 2, 1,  0,  2, 3, 2
blitcore 32, 24, 1, 2, 1, 25,  2, 3, 2
blitcore 32, 24, 1, 2, 1, 50,  2, 3, 2
blitcore 32, 24, 1, 2, 1, 75,  2, 3, 2
blitcore 32, 24, 2, 1, 0,  0,  2, 2, 1
blitcore 32, 24, 2, 1, 2,  0,  2, 2, 0
blitcore 32, 24, 2, 1, 2, 25,  2, 2, 0 ; dummy
blitcore 32, 24, 2, 1, 2, 50,  2, 2, 0 ; dummy
blitcore 32, 24, 2, 1, 2, 75,  2, 2, 0 ; dummy
blitcore 32, 24, 2, 2, 0,  0,  2, 2, 1
blitcore 32, 24, 2, 2, 1,  0,  2, 2, 1
blitcore 32, 24, 2, 2, 1, 25,  2, 2, 2
blitcore 32, 24, 2, 2, 1, 50,  2, 2, 2
blitcore 32, 24, 2, 2, 1, 75,  2, 2, 2
blitcore 32, 24, 2, 2, 2,  0,  2, 2, 1
blitcore 32, 24, 2, 2, 2, 25,  2, 2, 0 ; dummy
blitcore 32, 24, 2, 2, 2, 50,  2, 2, 0 ; dummy
blitcore 32, 24, 2, 2, 2, 75,  2, 2, 0 ; dummy

;------------------------------------------------------------------
; 32to32
;------------------------------------------------------------------

; LINE32to32MMX0: copy one 32bpp pixel from [esi] to [edi] unchanged.
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax and flags; advances esi and edi by 4.
%macro LINE32to32MMX0 1
        mov         eax, [esi]
        mov         [edi], eax
        add         esi, 4
        add         edi, 4
%endmacro

; DOUBLELINE32to32MMX0: copy one 32bpp pixel from [esi] to two adjacent
; destination pixels at [edi] and [edi + 4].
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax and flags; advances esi by 4 and edi by 8.
%macro DOUBLELINE32to32MMX0 1
        mov         eax, [esi]
        mov         [edi], eax
        mov         [edi + 4], eax
        add         esi, 4
        add         edi, 8
%endmacro

; EXPANDLINE32to32MMX0: copy one 32bpp pixel from [esi] to [edi] and skip
; the next destination pixel; [edi + 4 .. edi + 7] is not written (the
; MMX1/MMX2 variants store zero there instead).
; The macro parameter %1 is accepted but not referenced.
; Clobbers eax and flags; advances esi by 4 and edi by 8.
%macro EXPANDLINE32to32MMX0 1
        mov         eax, [esi]
        mov         [edi], eax
        add         esi, 4
        add         edi, 8
%endmacro

; LINE32to32MMX1: copy two 32bpp pixels (one qword) from [esi] to [edi],
; passing them through HSCANLINE_BRIGHTNESS (defined elsewhere in the
; file) with parameter %1.
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller and mm1 is scratch for the helper - confirm.
; Advances esi and edi by 8.
%macro LINE32to32MMX1 1
        movq        mm0, [esi]
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; DOUBLELINE32to32MMX1: read one 32bpp pixel from [esi], duplicate it into
; both dwords of mm0, pass it through HSCANLINE_BRIGHTNESS (defined
; elsewhere in the file) with parameter %1 and store 8 bytes at [edi].
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller and mm1 is scratch for the helper - confirm.
; Advances esi by 4 and edi by 8.
%macro DOUBLELINE32to32MMX1 1
        movd        mm0, [esi]
        punpckldq   mm0, mm0                ; high dword = low dword
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movq        [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; EXPANDLINE32to32MMX1: read one 32bpp pixel from [esi] into the low dword
; of mm0 (movd clears the high dword), pass it through
; VSCANLINE_BRIGHTNESS (defined elsewhere in the file) with parameter %1
; and store 8 bytes at [edi].
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller and mm1/mm2 are scratch for the helper - confirm.
; Advances esi by 4 and edi by 8.
%macro EXPANDLINE32to32MMX1 1
        movd        mm0, [esi]
        VSCANLINE_BRIGHTNESS %1, mm0, mm1, mm2, mm6
        movq        [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; LINE32to32MMX2: same as LINE32to32MMX1 (two 32bpp pixels copied through
; HSCANLINE_BRIGHTNESS), with a prefetchnta hint for the next source qword
; and a non-temporal movntq store.
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller and mm1 is scratch for the helper - confirm.
; Advances esi and edi by 8.
%macro LINE32to32MMX2 1
        movq        mm0, [esi]
        prefetchnta [esi + 8]
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movntq      [edi], mm0
        add         esi, 8
        add         edi, 8
%endmacro

; DOUBLELINE32to32MMX2: same as DOUBLELINE32to32MMX1 (one 32bpp pixel
; duplicated into two destination pixels through HSCANLINE_BRIGHTNESS),
; with a prefetchnta hint for the next source pixel and a non-temporal
; movntq store.
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller and mm1 is scratch for the helper - confirm.
; Advances esi by 4 and edi by 8.
%macro DOUBLELINE32to32MMX2 1
        movd        mm0, [esi]
        punpckldq   mm0, mm0                ; high dword = low dword
        prefetchnta [esi + 4]
        HSCANLINE_BRIGHTNESS %1, mm0, mm1, mm6
        movntq      [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; EXPANDLINE32to32MMX2: same as EXPANDLINE32to32MMX1 (one 32bpp pixel in
; the low dword, high dword cleared by movd, passed through
; VSCANLINE_BRIGHTNESS), with a prefetchnta hint for the next source pixel
; and a non-temporal movntq store.
; NOTE(review): mm6 is presumably preloaded with the brightness mask by
; the caller and mm1/mm2 are scratch for the helper - confirm.
; Advances esi by 4 and edi by 8.
%macro EXPANDLINE32to32MMX2 1
        movd        mm0, [esi]
        VSCANLINE_BRIGHTNESS %1, mm0, mm1, mm2, mm6
        prefetchnta [esi + 4]
        movntq      [edi], mm0
        add         esi, 4
        add         edi, 8
%endmacro

; Instantiate the 32 -> 32 bpp blitters.
; Columns: sb = srcbpp, db = dstbpp, xs = xscale, ys = yscale,
;          sl = scanline, bl = brightness, mmx = line-macro suffix (MMX0/1/2),
;          ws = wshift, nj = nearjump (0 = short/short, 1 = near/short,
;          2 = near/near jump ranges).
; Line macro selection inside blitcore: sl == 2 -> EXPANDLINE...,
; otherwise xs == 2 -> DOUBLELINE..., otherwise LINE...
; NOTE(review): rows tagged "dummy" request a brightness that the selected
; line macro does not apply (the MMX0 macros ignore their parameter);
; presumably they only keep the variant set complete - confirm.
;         sb db xs ys sl  bl mmx ws nj
blitcore 32, 32, 1, 1, 0,  0,  0, 0, 0
blitcore 32, 32, 1, 2, 0,  0,  0, 0, 0
blitcore 32, 32, 1, 2, 1,  0,  0, 0, 0
blitcore 32, 32, 1, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 32, 1, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 32, 1, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 1, 0,  0,  0, 0, 0
blitcore 32, 32, 2, 1, 2,  0,  0, 0, 0
blitcore 32, 32, 2, 1, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 1, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 1, 2, 75,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 2, 0,  0,  0, 0, 1
blitcore 32, 32, 2, 2, 1,  0,  0, 0, 0
blitcore 32, 32, 2, 2, 1, 25,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 2, 1, 50,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 2, 1, 75,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 2, 2,  0,  0, 0, 0
blitcore 32, 32, 2, 2, 2, 25,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 2, 2, 50,  0, 0, 0 ; dummy
blitcore 32, 32, 2, 2, 2, 75,  0, 0, 0 ; dummy

; mmx = 1 variants
blitcore 32, 32, 1, 1, 0,  0,  1, 1, 0
blitcore 32, 32, 1, 2, 0,  0,  1, 1, 1
blitcore 32, 32, 1, 2, 1,  0,  1, 1, 0
blitcore 32, 32, 1, 2, 1, 25,  1, 1, 0
blitcore 32, 32, 1, 2, 1, 50,  1, 1, 0
blitcore 32, 32, 1, 2, 1, 75,  1, 1, 0
blitcore 32, 32, 2, 1, 0,  0,  1, 0, 0
blitcore 32, 32, 2, 1, 2,  0,  1, 0, 0
blitcore 32, 32, 2, 1, 2, 25,  1, 0, 0
blitcore 32, 32, 2, 1, 2, 50,  1, 0, 0
blitcore 32, 32, 2, 1, 2, 75,  1, 0, 0
blitcore 32, 32, 2, 2, 0,  0,  1, 0, 1
blitcore 32, 32, 2, 2, 1,  0,  1, 0, 0
blitcore 32, 32, 2, 2, 1, 25,  1, 0, 0
blitcore 32, 32, 2, 2, 1, 50,  1, 0, 0
blitcore 32, 32, 2, 2, 1, 75,  1, 0, 0
blitcore 32, 32, 2, 2, 2,  0,  1, 0, 1
blitcore 32, 32, 2, 2, 2, 25,  1, 0, 1
blitcore 32, 32, 2, 2, 2, 50,  1, 0, 1
blitcore 32, 32, 2, 2, 2, 75,  1, 0, 1

; mmx = 2 variants
blitcore 32, 32, 1, 1, 0,  0,  2, 1, 0
blitcore 32, 32, 1, 2, 0,  0,  2, 1, 1
blitcore 32, 32, 1, 2, 1,  0,  2, 1, 0
blitcore 32, 32, 1, 2, 1, 25,  2, 1, 0
blitcore 32, 32, 1, 2, 1, 50,  2, 1, 0
blitcore 32, 32, 1, 2, 1, 75,  2, 1, 0
blitcore 32, 32, 2, 1, 0,  0,  2, 0, 0
blitcore 32, 32, 2, 1, 2,  0,  2, 0, 0
blitcore 32, 32, 2, 1, 2, 25,  2, 0, 0
blitcore 32, 32, 2, 1, 2, 50,  2, 0, 0
blitcore 32, 32, 2, 1, 2, 75,  2, 0, 0
blitcore 32, 32, 2, 2, 0,  0,  2, 0, 1
blitcore 32, 32, 2, 2, 1,  0,  2, 0, 0
blitcore 32, 32, 2, 2, 1, 25,  2, 0, 0
blitcore 32, 32, 2, 2, 1, 50,  2, 0, 0
blitcore 32, 32, 2, 2, 1, 75,  2, 0, 0
blitcore 32, 32, 2, 2, 2,  0,  2, 0, 1
blitcore 32, 32, 2, 2, 2, 25,  2, 0, 1
blitcore 32, 32, 2, 2, 2, 50,  2, 0, 1
blitcore 32, 32, 2, 2, 2, 75,  2, 0, 1
%endif

;---------------------------------------------------------------------------

; Feature bits returned by _CheckCPUID (OR-ed together in eax).
%define HAS_MMX      0x0001
%define HAS_MMX2     0x0002 ; Enhanced MMX
%define HAS_SSE      0x0010 ; Streaming SIMD Extensions (Pentium III)
%define HAS_SSE2     0x0020 ; Streaming SIMD Extensions 2 (Pentium 4)
%define HAS_3DNow    0x0100
%define HAS_E3DNow   0x0200 ; 3DNow! + Enhanced MMX (Athlon / Duron)
%define HAS_3DNowPro 0x0310 ; Enhanced 3DNow! + SSE (AthlonMP)

%define EFLAGS_ID          (1 << 21)   ; EFLAGS.ID: writable only if CPUID exists
%define CPUID_EXT_MAX      0x80000000  ; leaf: highest extended leaf number
%define CPUID_EXT_FEATURES 0x80000001  ; leaf: extended feature flags

;---------------------------------------------------------------------------
; _CheckCPUID
; Probe the CPU's SIMD capabilities with CPUID.
; In:    nothing
; Out:   eax = OR of HAS_* bits; 0 when the CPUID instruction is unavailable
; Saves: ebx, ecx, edx, esi (pushed on entry, popped on exit)
; Notes: * The caller's EFLAGS image is restored after the ID-bit probe.
;        * Extended leaf 0x80000001 is queried only when leaf 0x80000000
;          reports that it exists; CPUs without extended leaves return
;          unrelated data for it, which must not be read as feature bits.
;        * SSE also sets HAS_MMX2, as in the original implementation.
;---------------------------------------------------------------------------
        GLOBAL _CheckCPUID
_CheckCPUID:
        push    ebx
        push    edx
        push    ecx
        push    esi

        xor     esi, esi                ; esi = accumulated feature mask

        ; --- Is CPUID available? Try to flip EFLAGS.ID. ---
        pushfd
        pop     eax
        mov     edx, eax                ; edx = original EFLAGS
        xor     eax, EFLAGS_ID
        push    eax
        popfd
        pushfd
        pop     ecx                     ; ecx = EFLAGS after the toggle attempt
        push    edx
        popfd                           ; put the caller's EFLAGS back
        xor     ecx, edx                ; ecx = bits that actually changed
        test    ecx, EFLAGS_ID
        jz      .finish                 ; ID bit stuck -> no CPUID, return 0

        ; --- Extended (AMD-defined) feature flags, if that leaf exists. ---
        mov     eax, CPUID_EXT_MAX
        cpuid
        cmp     eax, CPUID_EXT_FEATURES
        jb      .mmx_check              ; unsigned: max extended leaf too low

        mov     eax, CPUID_EXT_FEATURES
        cpuid
        test    edx, (1 << 31) ; 3DNow!
        jz      .e3dnow_check
        or      esi, HAS_3DNow
.e3dnow_check:
        test    edx, (1 << 30) ; Enhanced 3DNow!
        jz      .emmx_check
        or      esi, HAS_E3DNow
.emmx_check:
        test    edx, (1 << 22) ; Enhanced MMX
        jz      .mmx_check
        or      esi, HAS_MMX2

        ; --- Standard feature flags (leaf 1). ---
.mmx_check:
        mov     eax, 1
        cpuid
        test    edx, (1 << 23) ; MMX
        jz      .sse_check
        or      esi, HAS_MMX
.sse_check:
        test    edx, (1 << 25) ; SSE
        jz      .sse2_check
        or      esi, HAS_MMX2           ; SSE includes the extended-MMX integer ops
        or      esi, HAS_SSE
.sse2_check:
        test    edx, (1 << 26) ; SSE2
        jz      .finish
        or      esi, HAS_SSE2
.finish:
        mov     eax, esi
        pop     esi
        pop     ecx
        pop     edx
        pop     ebx
        ret
