.intel_syntax
.ifdef LSSE

####################################################################
#
# Macros
#
# mmxcon r, c
#   Broadcast the 32-bit constant c into both dwords of the MMX register r.
#   Clobbers eax, which is left holding c.  Flags are not modified.
.macro mmxcon r, c
        mov     eax, \c                 # eax = constant
        movd    \r, eax                 # low dword = constant, high dword = 0
        pshufw  \r, \r, 0x44            # word order 0,1,0,1: duplicate the low dword
.endm

# swapm x, y
#   Exchange the two dwords stored at [ebp + x] and [ebp + y].
#   Clobbers eax and ebx: on exit eax holds the value that was at
#   [ebp + x] and ebx the value that was at [ebp + y].
.macro swapm x, y
        mov     ebx, [ebp + \y]         # ebx = old value of y
        mov     eax, [ebp + \x]         # eax = old value of x
        mov     [ebp + \x], ebx         # x <- old y
        mov     [ebp + \y], eax         # y <- old x
.endm

# time_b loc
#   Timing helper (begin): executes rdtsc and stores the low 32 bits of the
#   time-stamp counter (eax) into the operand loc.
#   eax and edx are saved around rdtsc and restored, so no general purpose
#   register is changed.  Flags are modified by the sub/add on esp.
#   esp is lowered by 8 while loc is written, so an esp-relative loc must
#   have 8 added to its displacement.
.macro time_b loc # if loc is esp must add 8
        sub esp,8
        mov [esp + 0], eax              # save eax, rdtsc overwrites it
        mov [esp + 4], edx              # save edx, rdtsc overwrites it
        rdtsc                           # edx:eax = time-stamp counter
        mov \loc, eax                   # keep only the low dword
        mov eax, [esp + 0]
        mov edx, [esp + 4]
        add esp,8

.endm

# time_e loc
#   Timing helper (end): executes emms, reads the time-stamp counter and
#   calls _print_cs with
#       eax = low dword of the counter minus the value read from loc
#       edx = ebp
#   eax and edx are restored afterwards.
#   esp is lowered by 8 while loc is read, so an esp-relative loc needs the
#   same 8 byte adjustment as in time_b.
#   NOTE(review): _print_cs is defined outside this file; which other
#   registers it preserves cannot be seen from here - confirm before use.
.macro time_e loc # clears mmx reg, ebp must point to CompareParms
        sub esp,8
        mov [esp + 0], eax              # save eax
        mov [esp + 4], edx              # save edx
        emms                            # leave MMX state before the call
        rdtsc                           # edx:eax = time-stamp counter
        mov edx, \loc                   # edx = start value stored by time_b
        sub  eax, edx                   # eax = elapsed count (low dword only)
        mov  edx, ebp                   # second register argument for _print_cs
        call _print_cs
        mov eax, [esp + 0]
        mov edx, [esp + 4]
        add esp,8
.endm

####################################################################
#
# _subsample_line_yuy2_2
#
        .global _subsample_line_yuy2_2
        #
        # Register calling convention (not a C ABI):
        #   In:   eax = source row
        #         ecx = source length in bytes (added to eax to form the end)
        #         edx = destination row, receives 16-bit words
        #   Out:  nothing; eax and edx are advanced past the processed data
        #   Keeps:    ebx (parked in mm0 while the mask is built), esi, edi, ebp
        #   Clobbers: eax, ecx, edx, mm0-mm4, flags.  No emms is executed.
        #
        # Each iteration reads 16 source bytes and keeps only the even
        # numbered ones (presumably the Y samples of packed YUY2 - confirm
        # against the caller), giving 8 samples s0..s7.  It then writes
        # 4 words:
        #       out[i] = s[2i-1] + s[2i] + s[2i+1]
        # where s[-1] is the last sample of the previous iteration (0 for the
        # first one, carried in mm0).
        #
        # The loop is bottom tested, so one iteration always runs, and whole
        # 16 byte groups are always read.
        #
        .align 16
_subsample_line_yuy2_2:
        prefetchnta [eax]
        lea     ecx, [ecx+eax]          # ecx = end address of the source row
        movd    mm0, ebx                # park ebx, it must survive this routine
        mov     ebx, 0x00FF00FF
        movd    mm4, ebx
        movd    ebx, mm0                # restore ebx
        punpckldq mm4, mm4              # mm4 = 0x00FF in every word [whole loop]
        pxor    mm0, mm0                # mm0 = carried sample, none yet
        .align 16
Lyuy2_loop:
        prefetchnta [eax + 16]

        movq    mm2, [eax]
        movq    mm1, [eax + 8]

        pand    mm2, mm4                # keep the even bytes of each word
        pand    mm1, mm4

        packuswb mm2, mm1               # mm2 = s0..s7 as bytes (values fit, no saturation)

        movq    mm3, mm2
        pand    mm2, mm4                # mm2 = words s0, s2, s4, s6
        psrlq   mm3, 8
        pand    mm3, mm4                # mm3 = words s1, s3, s5, s7
        movq    mm1, mm3
        psllq   mm1, 16                 # mm1 = words 0, s1, s3, s5
        paddw   mm2, mm3                # even sample + following odd sample
        por     mm1, mm0                # lowest word = odd sample carried from last group
        paddw   mm2, mm1                # + preceding odd sample

        movq    [edx], mm2

        movq    mm0, mm3
        psrlq   mm0, 48                 # carry s7 into the next iteration

        add     eax, 16
        add     edx, 8

        cmp     eax, ecx
        jb      Lyuy2_loop              # addresses: the compare must be unsigned
Lyuy2_end:
        ret

####################################################################
#
# _subsample_line_yv12_2
#
        .globl _subsample_line_yv12_2
        #
        # Register calling convention (not a C ABI):
        #   In:   eax = source row of bytes
        #         ecx = source length in bytes (added to eax to form the end)
        #         edx = destination row, receives 16-bit words
        #   Out:  nothing; eax and edx are advanced past the processed data
        #   Keeps:    ebx (parked in mm0 while the mask is built), esi, edi, ebp
        #   Clobbers: eax, ecx, edx, mm0-mm4, flags.  No emms is executed.
        #
        # Each iteration reads 8 source bytes s0..s7 and writes 4 words:
        #       out[i] = s[2i-1] + s[2i] + s[2i+1]
        # where s[-1] is the last byte of the previous iteration (0 for the
        # first one, carried in mm0).
        #
        # The loop is bottom tested, so one iteration always runs, and whole
        # 8 byte groups are always read.
        #
        .align 16
_subsample_line_yv12_2:
        prefetchnta [eax]
        lea     ecx, [ecx+eax]          # ecx = end address of the source row

        movd    mm0, ebx                # park ebx, it must survive this routine
        mov     ebx, 0x00FF00FF
        movd    mm4, ebx
        movd    ebx, mm0                # restore ebx
        punpckldq mm4, mm4              # mm4 = 0x00FF in every word [whole loop]
        pxor    mm0, mm0                # mm0 = carried sample, none yet
        .align 16
Lyv12_loop:
        prefetchnta [eax + 8]

        movq    mm2, [eax]
        movq    mm3, mm2
        pand    mm2, mm4                # mm2 = words s0, s2, s4, s6
        psrlq   mm3, 8
        pand    mm3, mm4                # mm3 = words s1, s3, s5, s7
        movq    mm1, mm3
        psllq   mm1, 16                 # mm1 = words 0, s1, s3, s5
        paddw   mm2, mm3                # even sample + following odd sample
        por     mm1, mm0                # lowest word = odd sample carried from last group
        paddw   mm2, mm1                # + preceding odd sample

        movq    [edx], mm2

        movq    mm0, mm3
        psrlq   mm0, 48                 # carry s7 into the next iteration

        add     eax, 8
        add     edx, 8

        cmp     eax, ecx
        jb      Lyv12_loop              # addresses: the compare must be unsigned
Lyv12_end:
        ret
        
####################################################################
#
# Useful constants
#
        # Offsets, relative to ebp, into the working frame that
        # _compare_fields builds on the stack.

        # -- Copied from the parameter block (cp_size bytes in total) --
        .set    subsample_line,  0      # routine called through edi to subsample one row
        .set    row_size,        4      # passed in ecx to the subsample routine
        .set    s_height,        8      # outer loop runs while line < s_height
        .set    s_width,        12      # elements per subsampled row
        .set    s_pitch,        16      # used to size the initial buffer clears
        .set    scale,          24      # float multiplied into the final sum
        .set    a,              32      # a..e: pointers to five rows of words
        .set    b,              36
        .set    c,              40
        .set    d,              44
        .set    e,              48
        .set    prev0,          52      # prev0/cur0/next0: pointers to rows of words
        .set    cur0,           56
        .set    next0,          60
        .set    prev1,          64      # prev1/cur1/next1: pointers to rows of floats
        .set    cur1,           68
        .set    next1,          72
        .set    cp_size,        76      # number of bytes copied from the parameter block

        # -- Local storage following the copied block --
        .set    line,           cp_size + 0     # row counter, bumped by compare_fields_row

        .set    s_size,         cp_size + 4     # total bytes reserved on the stack

        # -- Stack arguments: s_size + four saved registers + return address --
        .set    parms,          s_size + 20     # pointer to the parameter block

        .set    s_l_data,       s_size + 24     # data/pitch pairs, indexed with esi*8
        .set    s_l_pitch,      s_size + 28
        .set    even,           s_size + 24     # data pointer of pair 0 (same slot as s_l_data)
        .set    odd,            s_size + 32     # data pointer of pair 1
   
####################################################################
#
# compare_fields
#
        .globl _compare_fields
        #
        # Stack arguments (dwords, first one just above the return address):
        #       parameter block pointer, even data, even pitch,
        #       odd data, odd pitch
        # NOTE(review): this looks like a cdecl function returning float,
        # with a C side struct matching the frame offsets - confirm against
        # the C declaration, which is not part of this file.
        #
        # Out:  st(0) = scale * sum over all processed rows and columns of
        #               (prev1[i] + next1[i]) * cur1[i]
        # Saves and restores ebp, edi, esi, ebx.  Clobbers eax, ecx, edx,
        # xmm0-xmm7 and the MMX registers (emms is executed before return).
        # The data and pitch argument slots on the stack are updated in
        # place by compare_fields_row.
        #
        # prev1, cur1 and next1 are accessed with movaps and therefore must
        # be 16 byte aligned.
        #
        .align 16
_compare_fields:
        push  ebp
        push  edi
        push  esi
        push  ebx

        sub   esp, offset flat:s_size
        mov   ebp, esp # ebp = working data ptr, entire file

        # Copy cp_size bytes of the parameter block into the frame.
        mov   esi, [ebp + parms]
        mov   edx,0
Lcopy_loop:
        mov   eax, [esi + edx]
        mov   [ebp + edx], eax
        add   edx, 4
        cmp   edx, offset flat:cp_size
        jl    Lcopy_loop

        # Zero the two history rows; memclear keeps eax intact.
        mov   eax, [ebp + s_pitch]

        mov   esi, [ebp + prev0]
        lea   edi, [esi + 2*eax]        # word row: two bytes per element
        call  memclear

        mov   esi, [ebp + prev1]
        lea   edi, [esi + 4*eax]        # float row: four bytes per element
        call  memclear

        mov   DWORD PTR [ebp + line],  0

        # Prime rows d and e with the first row of each data pointer.
        mov   edi, [ebp + subsample_line]

        mov   eax, [ebp + even]
        mov   edx, [ebp + d]
        mov   ecx, [ebp + row_size]
        call  edi

        mov   eax, [ebp + odd]
        mov   edx, [ebp + e]
        mov   ecx, [ebp + row_size]
        call  edi

        xorps xmm7, xmm7 # xmm7 = accum [entire function]

Lcf_outer:
        # Produce the next two float rows; each call advances line by 2.
        mov   edx, [ebp + cur1]
        call  compare_fields_mul

        mov   edx, [ebp + next1]
        call  compare_fields_mul

        mov   eax, [ebp + prev1]
        mov   ebx, [ebp + cur1]
        mov   ecx, [ebp + next1]

        mov   esi, 0
        mov   edi, [ebp + s_width]
        shl   edi, 2                    # row length in bytes of floats

        .align 16
Lcf_inner:
        movaps xmm0, [eax + esi]        # prev1
        addps  xmm0, [ecx + esi]        # + next1
        mulps  xmm0, [ebx + esi]        # * cur1
        addps  xmm7, xmm0
        add    esi, 16
        cmp    esi,edi
        jl     Lcf_inner

        SWAPM prev1, next1              # newest row becomes prev1 for the next pass

        # outer loop inc and test
        mov     eax, [ebp + line]
        mov     ebx, [ebp + s_height]
        cmp     eax,ebx
        jl      Lcf_outer

Lend:
        emms                            # the x87 stack is used for the return value

        movhlps xmm0, xmm7 # sum up 4 floats in xmm7
        addps   xmm0, xmm7
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0x01
        addss   xmm0, xmm1
        mulss   xmm0, [ebp + scale]
        movss   [ebp+0], xmm0           # reuse frame slot 0 as a scratch float

        fld     dword ptr [ebp+0]       # explicit size: reload the 32-bit float just stored

        add     esp, offset flat:s_size
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret

####################################################################/
#
# compare_fields_mul
#
        .align 16
compare_fields_mul: # edx = dest
                    # will clobber all gp registers except ebp, esp
                    # clobbers mm0-mm6, xmm0-xmm6
                    #
                    # ebp must point at the working frame of _compare_fields.
                    # Calls compare_fields_row twice (into cur0, then next0),
                    # which advances line by 2, then writes one row of floats:
                    #   dest[i] = float(prev0[i] + next0[i]) * float(cur0[i])
                    # The word sum uses unsigned saturation.  dest is written
                    # with movaps and so must be 16 byte aligned.
                    # Finally the prev0 and next0 pointers are exchanged.
        push edx                        # keep dest across the row calls
    
        mov   edx, [ebp + cur0]
        call  compare_fields_row
        
        mov   edx, [ebp + next0]
        call  compare_fields_row
        
        mov edx, [esp]                  # edx = dest again
               
        MMXCON mm0, 0x00000000          # mm0 = 0, used to zero extend words to dwords
                
        mov  eax, [ebp + prev0]
        mov  ebx, [ebp + cur0]
        mov  ecx, [ebp + next0]
        mov  esi, 0                     # esi = byte offset into the word rows
        mov  edi, [ebp + s_width]
        shl  edi, 1                     # edi = row length in bytes of words
        
        .align 16
Lcfm_loop:
        movq      mm1, [eax + esi]      # four words of prev0
        movq      mm2, [ecx + esi]      # four words of next0
        paddusw   mm1, mm2
        
        # Convert the four word sums to floats in xmm0, in element order.
        movq      mm2, mm1
        punpckhwd mm1, mm0  # high 
        cvtpi2ps xmm0, mm1 
        movlhps  xmm0, xmm0             # move the two high results to the upper half
        punpcklwd mm2, mm0  # low
        cvtpi2ps xmm0, mm2  
        
        movq      mm1, [ebx + esi]      # four words of cur0
        
        # Same conversion for cur0, into xmm1.
        movq      mm2, mm1
        punpckhwd mm1, mm0  # high
        cvtpi2ps xmm1, mm1 
        movlhps  xmm1, xmm1
        punpcklwd mm2, mm0  # low
        cvtpi2ps xmm1, mm2  
       
        mulps   xmm0, xmm1
        
        movaps  [edx + 2*esi], xmm0     # floats are twice as wide as the source words
        
        add     esi, 8
        cmp     esi, edi
        jl      Lcfm_loop
             
        SWAPM  prev0, next0
        
        add esp, 4                      # drop the saved dest
        ret
        
####################################################################/
#
# compare_fields_row
#
        .align 16
compare_fields_row: # edx = dest, 
                    # will clobber all gp registers except ebp, esp
                    # clobbers mm0-mm6
                    #
                    # ebp must point at the working frame of _compare_fields.
                    # Steps:
                    #   - exchange the row pointers a with d and b with e
                    #   - subsample three new source rows into c, e and d,
                    #     advancing the data pointers kept in the stack
                    #     argument slots
                    #   - write one row of words to dest:
                    #       dest[i] = abs(3*(B+D) - (4*C + A + E)) + 1
                    #     computed with saturating word arithmetic
                    #   - increment line
        sub esp, 8
        mov [esp + 0], edx              # save dest
        mov [esp + 4], ebp              # save frame pointer, ebp is reused in the loop
                    
        SWAPM  a, d
        SWAPM  b, e
        
        mov  edi, [ebp + subsample_line]

	mov  esi, [ebp + line]
	and  esi, 1                     # esi selects the data/pitch pair from the low bit of line

	# Selected pair: step one pitch and subsample into c.
	mov  ebx, [ebp + s_l_data  + esi*8]
	add  ebx, [ebp + s_l_pitch + esi*8]
	mov  eax, ebx
	mov  edx, [ebp + c]
	mov  ecx, [ebp + row_size]
	call edi

	# Selected pair: step one more pitch and subsample into e.
	add  ebx, [ebp + s_l_pitch + esi*8]
	mov  eax, ebx
	mov  edx, [ebp + e]
	mov  ecx, [ebp + row_size]
	call edi

	mov  [ebp + s_l_data + esi*8], ebx      # remember the advanced pointer
	
	xor  esi, 1                     # switch to the other pair

	# Other pair: step one pitch, remember it, and subsample into d.
	mov  eax, [ebp + s_l_data  + esi*8]
	add  eax, [ebp + s_l_pitch + esi*8]
	mov  [ebp + s_l_data  + esi*8], eax
	mov  edx, [ebp + d]
	mov  ecx, [ebp + row_size]
	call edi
	
	# loop prep
	MMXCON mm5, 0x00010001          # mm5 = 1 in every word
	MMXCON mm6, 0x00030003          # mm6 = 3 in every word

        # Every pointer is moved to the end of its row so that a single
        # negative index (edi) counts up to zero.
        mov     edi, [ebp + s_width]
        shl     edi, 1                  # row length in bytes of words
        mov     eax, [ebp+ a] # eax = a [for inner loop]
        add     eax, edi
        mov     ebx, [ebp+ b] # ebx = b [for inner loop]
        add     ebx, edi
        mov     ecx, [ebp+ c] # ecx = c [for inner loop]
        add     ecx, edi
        mov     edx, [ebp+ d] # edx = d [for inner loop]
        add     edx, edi
        mov     esi, [ebp+ e] # esi = e [for inner loop]
        add     esi, edi
        mov     ebp, [esp +0]           # ebp = dest from here until it is restored below
        add     ebp, edi

        neg     edi       # edi = x [for inner loop]
        
        .align 16
Lcfr_loop:
        movq    mm1, [ebx + edi] # b  = B
        paddsw  mm1, [edx + edi] # b += D
        pmullw  mm1, mm6         # b *= 3
        movq    mm2, [ecx + edi] # c  = C
        psllw   mm2, 2           # c *= 4
        paddsw  mm2, [eax + edi] # c += A
        paddsw  mm2, [esi + edi] # c += E
        movq    mm3, mm2         # c2 = c
        psubusw mm2, mm1         # c = c - b (with unsigned sat)
        psubusw mm1, mm3         # b = b - c2 (with unsigned sat)
        por     mm1, mm2         # b = b | c: one side is 0, giving abs(b - c2)
        paddusw mm1, mm5         # b += 1
        movq    [ebp + edi], mm1

        add	edi, 8
	jl	Lcfr_loop               # loop while the index is still negative
	
	mov     ebp, [esp + 4]          # restore the frame pointer
	
	inc     dword ptr [ebp + line]
	
        add     esp, 8
	ret

####################################################################/
#
# memclear
#
        .align 16
memclear: # esi = start, edi = stop
          #
          # Stores zero dwords from esi up to edi, 8 bytes per pass.
          # The test is at the bottom, so at least 8 bytes are always
          # written, and the last pass may run up to 4 bytes past edi
          # when the length is not a multiple of 8.
          # On return esi is at or beyond edi.  Clobbers esi and flags only.
        mov     dword ptr [esi], 0
        mov     dword ptr [esi + 4], 0
        add     esi, 8
        cmp     esi, edi
        jb      memclear                # addresses: the compare must be unsigned
        ret

.endif # DEF SSE

