51Testing软件测试论坛

 找回密码
 (注-册)加入51Testing

QQ登录

只需一步,快速开始

微信登录,快人一步

手机号码,快捷登录

查看: 2008|回复: 0
打印 上一主题 下一主题

可能是最快的算法alpha blend汇编源代码,Intel官方提供

[复制链接]
  • TA的每日心情
    慵懒
    2015-1-8 08:46
  • 签到天数: 2 天

    连续签到: 1 天

    [LV.1]测试小兵

    跳转到指定楼层
    1#
    发表于 2008-1-9 12:19:40 | 只看该作者 回帖奖励 |倒序浏览 |阅读模式
    Intel官方网站有一个ablend_565的快速汇编算法,理论上是把一块32bit RGBA渲染到16bit的buffer上,我的机器是PIII800,函数在system memory中进行,640*480的256级alpha blending,达到100fps,我想可以满足绝大部分的要求了,在这里,我提供了这个算法的应用,希望可以对大家有所帮助。
    ablend_565函数,源代码可以直接编译使用,无需其他库函数,感谢intel提供这么好的东西。
    首先,我提供一些本人编写的把32bit tga文件读入pRGBABuffer的函数
    文件尺寸保存在 width,height
    //-----------------------------------------------------------------------
    // Name: LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )
    // Desc: 读取32bit tga文件到DWORD缓冲里,返回其尺寸
    // Time: 2002.06.22 00:36
    // Author: RealRender
    // Para:
    // Return:
    // Note: 这段代码来自directx 7.0 sample中的d3dtextr.cpp,我把他提取了出来
    // 方便使用
    //-----------------------------------------------------------------------
    BOOL LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )
    {
        FILE* file = fopen( strPathname, "rb" );
        if( NULL == file )
        return false;
        struct TargaHeader
        {
            BYTE IDLength;
            BYTE ColormapType;
            BYTE ImageType;
            BYTE ColormapSpecification[5];
            WORD XOrigin;
            WORD YOrigin;
            WORD ImageWidth;
            WORD ImageHeight;
            BYTE PixelDepth;
            BYTE ImageDescriptor;
        } tga;
        fread( &tga, sizeof(TargaHeader), 1, file );
        // Only true color, non-mapped images are supported
        if( ( 0 != tga.ColormapType ) ||
            ( tga.ImageType != 10 && tga.ImageType != 2 ) )
        {
            fclose( file );
            return false;
        }
        // Skip the ID field. The first byte of the header is the length of this field
        if( tga.IDLength )
            fseek( file, tga.IDLength, SEEK_CUR );
        DWORD m_dwWidth = tga.ImageWidth;
        DWORD m_dwHeight = tga.ImageHeight;
        DWORD m_dwBPP = tga.PixelDepth;
        DWORD *m_pRGBAData = new DWORD[m_dwWidth*m_dwHeight];
        if( m_pRGBAData == NULL )
        {
            fclose(file);
            return false;
        }
        for( DWORD y=0; y<m_dwHeight; y++ )
        {
            DWORD dwOffset = y*m_dwWidth;
            if( 0 == ( tga.ImageDescriptor & 0x0010 ) )
                dwOffset = (m_dwHeight-y-1)*m_dwWidth;
            for( DWORD x=0; x<m_dwWidth; x )
            {
                if( tga.ImageType == 10 )
                {
                    BYTE PacketInfo = getc( file );
                    WORD PacketType = 0x80 & PacketInfo;
                    WORD PixelCount = ( 0x007f & PacketInfo ) + 1;
                    if( PacketType )
                    {
                        DWORD b = getc( file );
                        DWORD g = getc( file );
                        DWORD r = getc( file );
                        DWORD a = 0xff;
                        if( m_dwBPP == 32 )
                            a = getc( file );
                        while( PixelCount-- )
                        {
                            m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);
                            x++;
                        }
                    }
                    else
                    {
                        while( PixelCount-- )
                        {
                            BYTE b = getc( file );
                            BYTE g = getc( file );
                            BYTE r = getc( file );
                            BYTE a = 0xff;
                            if( m_dwBPP == 32 )
                            a = getc( file );
                            m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);
                            x++;
                        }
                    }
                }
                else
                {
                    BYTE b = getc( file );
                    BYTE g = getc( file );
                    BYTE r = getc( file );
                    BYTE a = 0xff;
                    if( m_dwBPP == 32 )
                        a = getc( file );
                    m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);
                    x++;
                }
            }
        }
        fclose( file );
        // Check for alpha content
        for( DWORD i=0; i<(m_dwWidth*m_dwHeight); i++ )
        {
            if( m_pRGBAData & 0x000000ff != 0xff )
            {
                //m_bHasAlpha = TRUE;
                break;
            }
        }
        *pRGBABuffer = m_pRGBAData;
        *width = m_dwWidth;
        *height = m_dwHeight;
        return true;
    }
    把32bit buffer分割为rgb和alpha的代码。
    注意,分割后的pBitmap一定要是8字节对齐,这是优化的一个重要条件,所以,我的算法中:
    BYTE* p = new BYTE[lSize*2+8];
    BYTE* pOrig = p;
    p += (DWORD)p%8;
    WORD* color = (WORD*)p;
    这是不规范的写法,把指针强行改变为8字节对齐,实际使用的时候,要记住释放的原始指针不是p,而是pOrig,在这里,我没有释放分配的内存,请谅解。
    //-----------------------------------------------------------------------
    // Name: SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
    // Desc:
    // Time: 2002.06.22 00:36
    // Author: RealRender
    // Para:
    // Return:
    // Note: 把从32bit的缓冲建立16bit的565缓冲和8bit的alpha通道
    //-----------------------------------------------------------------------
    void SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
    {
        long lSize = lWidth*lHeight;
        BYTE* alpha = new BYTE[lSize];
        BYTE* p = new BYTE[lSize*2+8];
        // 强行转换为8字节对齐
        p += (DWORD)p%8;
        WORD* color = (WORD*)p;
        DWORD dwPixel;
        DWORD r, g, b, a;
        for( int i = 0; i < lSize; i++ )
        {
            dwPixel = pRGBABuffer;
            r = ((dwPixel>>24)&0x000000ff);
            g = ((dwPixel>>16)&0x000000ff);
            b = ((dwPixel>> 8)&0x000000ff);
            a = ((dwPixel>> 0)&0x000000ff);
            alpha = a;
            // 888i转化为565
            color = RGBTo16( r, g, b );
        }
        *pAlpha = alpha;
        *pBitmap = color;
    }
    //
    这个是Intel官方提供的函数,函数的描述,用我的话来说就是把一个带有256级alpha通道的565颜色数据绘制到16位目标页面。
    函数说明:
    unsigned char *lpAlpha, // 256 级alpha通道
    unsigned int iAlpPitch, // alpha通道的pitch
    unsigned char *lpSrc, // 原色彩缓冲
    unsigned int iSrcX, //
    unsigned int iSrcY, // 原色彩位置
    unsigned int iSrcPitch, // 原色彩pitch
    unsigned char *lpDst, // 目标缓冲
    unsigned int iDstX,
    unsigned int iDstY, // 目标位置
    unsigned int iDstW,
    unsigned int iDstH, // 目标缓冲的尺寸
    unsigned int iDstPitch // 目标缓冲的pitch
    // ablend_565: alpha-blends a span of 16-bit 5:6:5 source pixels onto a
    // 16-bit 5:6:5 destination using a separate alpha plane of one byte per
    // pixel. The work is done four pixels at a time with MMX instructions in
    // an inline _asm block.
    //
    // Parameters:
    //   lpAlpha      - base address of the alpha plane
    //   iAlpPitch    - bytes per row of the alpha plane
    //   lpSrc        - base address of the source pixels
    //   iSrcX, iSrcY - start position in the source; the same position is
    //                  used to index the alpha plane
    //   iSrcPitch    - bytes per row of the source
    //   lpDst        - base address of the destination pixels
    //   iDstX, iDstY - start position in the destination
    //   iDstW, iDstH - width in pixels and number of lines of the span
    //   iDstPitch    - bytes per row of the destination
    //
    // Behaviour:
    //   - If bit 1 or bit 2 of the source start address is set (not 8-byte
    //     aligned, given a 2-byte aligned address) nothing is drawn.
    //   - A group of four pixels whose alpha bytes are all 0xFF is copied
    //     from source to destination; a group whose alpha bytes are all 0 is
    //     skipped; any other group is blended.
    //   - Alpha is reduced to 6 bits (a>>2) for green and to 5 bits (a>>3)
    //     for red and blue before multiplying.
    //   - A tail of 1-3 pixels is blended as a full group of four, but only
    //     the remaining pixels are stored.
    //
    // NOTE(review): the alpha dword at [eax] and the source qword at [esi]
    // are loaded before the remaining-pixel count is tested, and the tail
    // path loads full qwords from [esi] and [edi]; the routine therefore
    // appears to read up to 4 alpha bytes and 8 source/destination bytes
    // beyond the end of each span - confirm the buffers are padded.
    // NOTE(review): the line counter is decremented and tested only after a
    // line has been processed, so iDstH == 0 does not look like it is
    // handled as "nothing to do" - confirm callers never pass 0.
    void ablend_565(unsigned char *lpAlpha,unsigned int iAlpPitch,
    unsigned char *lpSrc,unsigned int iSrcX, unsigned int iSrcY,
    unsigned int iSrcPitch, unsigned char *lpDst,
    unsigned int iDstX, unsigned int iDstY,
    unsigned int iDstW, unsigned int iDstH,
    unsigned int iDstPitch)
    {
    //Mask for isolating the red,green, and blue components
    static __int64 MASKB=0x001F001F001F001F;
    static __int64 MASKG=0x07E007E007E007E0;
    static __int64 MASKSHIFTG=0x03F003F003F003F0;
    static __int64 MASKR=0xF800F800F800F800;
    //constants used by the integer alpha blending equation
    static __int64 SIXTEEN=0x0010001000100010;
    static __int64 FIVETWELVE=0x0200020002000200;
    static __int64 SIXONES=0x003F003F003F003F;
    //pixels are 2 bytes wide, hence X<<1 for src/dst; alpha is 1 byte per pixel
    unsigned char *lpLinearDstBp=(iDstX<<1)+(iDstY*iDstPitch)+lpDst; //base pointer for linear destination
    unsigned char *lpLinearSrcBp=(iSrcX<<1)+(iSrcY*iSrcPitch)+lpSrc; //base pointer for linear source
    unsigned char *lpLinearAlpBp=iSrcX+(iSrcY*iAlpPitch)+lpAlpha; //base pointer for linear alpha
    _asm{
    mov esi,lpLinearSrcBp; //src
    mov edi,lpLinearDstBp; //dst
    mov eax,lpLinearAlpBp; //alpha
    mov ecx,iDstH; //ecx=number of lines to copy
    mov ebx,iDstW; //ebx=span width to copy
    test esi,6; //check if source address is qword aligned
    //since addr coming in is always word aligned(16bit)
    jnz done; //if not qword aligned we don't do anything
    primeloop:
    //preload the alpha and source values for the next group of 4 pixels
    movd mm1,[eax]; //mm1=00 00 00 00 a3 a2 a1 a0
    pxor mm2,mm2; //mm2=0;
    movq mm4,[esi]; //g1: mm4=src3 src2 src1 src0
    punpcklbw mm1,mm2; //mm1=00a3 00a2 00a1 00a0
    loopqword:
    mov edx,[eax]; //edx=the 4 alpha bytes, used by the early out tests
    test ebx,0xFFFFFFFC; //check if only 3 pixels left
    jz checkback; //3 or less pixels left
    //early out tests
    cmp edx,0xffffffff; //test for alpha value of 1
    je copyback; //if 1's copy the source pixels to the destination
    test edx,0xffffffff; //test for alpha value of 0
    jz leavefront; //if so go to the next 4 pixels
    //the alpha blend starts
    //green
    //i=a*sg+(63-a)*dg;
    //i=(i+32)+((i+32)>>6)>>6;
    //red
    //i=a*sr+(31-a)*dr;
    //i=(i+16)+((i+16)>>5)>>5;
    movq mm5,[edi]; //g2: mm5=dst3 dst2 dst1 dst0
    psrlw mm1,2; //mm1=a?>>2 nuke out lower 2 bits
    movq mm7,MASKSHIFTG; //g3: mm7=1 bit shifted green mask
    psrlw mm4,1; //g3a: move src green down by 1 so that we won't overflow
    movq mm0,mm1; //mm0=00a3 00a2 00a1 00a0
    psrlw mm5,1; //g3b: move dst green down by 1 so that we won't overflow
    psrlw mm1,1; //mm1=a?>>1 nuke out lower 1 bits
    pand mm4,mm7; //g5: mm4=sg3 sg2 sg1 sg0
    movq mm2,SIXONES;//g4: mm2=63
    pand mm5,mm7; //g7: mm5=dg3 dg2 dg1 dg0
    movq mm3,[esi]; //b1: mm3=src3 src2 src1 src0
    psubsb mm2,mm0; //g6: mm2=63-a3 63-a2 63-a1 63-a0
    movq mm7,MASKB; //b2: mm7=BLUE MASK
    pmullw mm4,mm0; //g8: mm4=sg?*a?
    movq mm0,[edi]; //b3: mm0=dst3 dst2 dst1 dst0
    pmullw mm5,mm2; //g9: mm5=dg?*(1-a?)
    movq mm2,mm7; //b4: mm2=fiveones
    pand mm3,mm7; //b4: mm3=sb3 sb2 sb1 sb0
    pmullw mm3,mm1; //b6: mm3=sb?*a?
    pand mm0,mm7; //b5: mm0=db3 db2 db1 db0
    movq mm7,[esi]; //r1: mm7=src3 src2 src1 src0
    paddw mm4,mm5; //g10: mm4=sg?*a?+dg?*(1-a?)
    pand mm7,MASKR; //r2: mm7=sr3 sr2 sr1 sr0
    psubsb mm2,mm1; //b5a: mm2=31-a3 31-a2 31-a1 31-a0
    paddw mm4,FIVETWELVE; //g11: mm4=(mm4+512) green
    pmullw mm0,mm2; //b7: mm0=db?*(1-a?)
    movq mm5,mm4; //g12: mm5=mm4 green
    psrlw mm7,11; //r4: shift src red down to position 0
    psrlw mm4,6; //g13: mm4=mm4>>6
    paddw mm4,mm5; //g14: mm4=mm4+mm5 green
    paddw mm0,mm3; //b8: mm0=sb?*a?+db?*(1-a?)
    movq mm5,[edi]; //r3: mm5=dst3 dst2 dst1 dst0
    paddw mm0,SIXTEEN; //b9: mm0=(mm0+16) blue
    pand mm5,MASKR; //r5: mm5=dr3 dr2 dr1 dr0
    psrlw mm4,5; //g15: mm4=0?g0 0?g0 0?g0 0?g0 green
    movq mm3,mm0; //b10: mm3=mm0 blue
    psrlw mm0,5; //b11: mm0=mm0>>5 blue
    psrlw mm5,11; //r6: shift dst red down to position 0
    paddw mm0,mm3; //b12: mm0=mm3+mm0 blue
    psrlw mm0,5; //b13: mm0=000b 000b 000b 000b blue
    pmullw mm7,mm1; //mm7=sr?*a?
    pand mm4,MASKG; //g16: mm4=00g0 00g0 00g0 00g0 green
    pmullw mm5,mm2; //r7: mm5=dr?*(31-a?)
    por mm0,mm4; //mm0=00gb 00gb 00gb 00gb
    add eax,4; //move to next 4 alphas
    add esi,8; //move to next 4 pixels in src
    add edi,8; //move to next 4 pixels in dst
    movd mm1,[eax]; //mm1=00 00 00 00 a3 a2 a1 a0
    paddw mm5,mm7; //r8: mm5=sr?*a?+dr?*(31-a?)
    paddw mm5,SIXTEEN; //r9: mm5=(mm5+16) red
    pxor mm2,mm2; //mm2=0;
    movq mm7,mm5; //r10: mm7=mm5 red
    psrlw mm5,5; //r11: mm5=mm5>>5 red
    movq mm4,[esi]; //g1: mm4=src3 src2 src1 src0
    paddw mm5,mm7; //r12: mm5=mm7+mm5 red
    punpcklbw mm1,mm2; //mm1=00a3 00a2 00a1 00a0
    psrlw mm5,5; //r13: mm5=mm5>>5 red
    psllw mm5,11; //r14: mm5=mm5<<11 red
    por mm0,mm5; //mm0=0rgb 0rgb 0rgb 0rgb
    sub ebx,4; //polished off 4 pixels
    movq [edi-8],mm0; //dst=0rgb 0rgb 0rgb 0rgb (edi was already advanced above)
    jmp loopqword; //go back to start
    copyback:
    movq [edi],mm4; //copy source to destination
    leavefront:
    add edi,8; //advance destination by 4 pixels
    add eax,4; //advance alpha by 4
    add esi,8; //advance source by 4 pixels
    sub ebx,4; //decrease pixel count by 4
    jmp primeloop;
    checkback:
    //reached only when ebx is 0..3 (no bits above bit 1 are set)
    test ebx,0xFF; //check if 0 pixels left
    jz nextline; //done with this span
    //backalign: //work out back end pixels
    //same blend as the main loop, without the preload of the next group
    movq mm5,[edi]; //g2: mm5=dst3 dst2 dst1 dst0
    psrlw mm1,2; //mm1=a?>>2 nuke out lower 2 bits
    movq mm7,MASKSHIFTG; //g3: mm7=shift 1 bit green mask
    psrlw mm4,1; //g3a: move src green down by 1 so that we won't overflow
    movq mm0,mm1; //mm0=00a3 00a2 00a1 00a0
    psrlw mm5,1; //g3b: move dst green down by 1 so that we won't overflow
    psrlw mm1,1; //mm1=a?>>1 nuke out lower 1 bits
    pand mm4,mm7; //g5: mm4=sg3 sg2 sg1 sg0
    movq mm2,SIXONES;//g4: mm2=63
    pand mm5,mm7; //g7: mm5=dg3 dg2 dg1 dg0
    movq mm3,[esi]; //b1: mm3=src3 src2 src1 src0
    psubsb mm2,mm0; //g6: mm2=63-a3 63-a2 63-a1 63-a0
    movq mm7,MASKB; //b2: mm7=BLUE MASK
    pmullw mm4,mm0; //g8: mm4=sg?*a?
    movq mm0,[edi]; //b3: mm0=dst3 dst2 dst1 dst0
    pmullw mm5,mm2; //g9: mm5=dg?*(1-a?)
    movq mm2,mm7; //b4: mm2=fiveones
    pand mm3,mm7; //b4: mm3=sb3 sb2 sb1 sb0
    pmullw mm3,mm1; //b6: mm3=sb?*a?
    pand mm0,mm7; //b5: mm0=db3 db2 db1 db0
    movq mm7,[esi]; //r1: mm7=src3 src2 src1 src0
    paddw mm4,mm5; //g10: mm4=sg?*a?+dg?*(1-a?)
    pand mm7,MASKR; //r2: mm7=sr3 sr2 sr1 sr0
    psubsb mm2,mm1; //b5a: mm2=31-a3 31-a2 31-a1 31-a0
    paddw mm4,FIVETWELVE; //g11: mm4=(i+512) green
    pmullw mm0,mm2; //b7: mm0=db?*(1-a?)
    movq mm5,mm4; //g12: mm5=(i+512) green
    psrlw mm7,11; //r4: shift src red down to position 0
    psrlw mm4,6; //g13: mm4=(i+512)>>6
    paddw mm4,mm5; //g14: mm4=(i+512)+((i+512)>>6) green
    paddw mm0,mm3; //b8: mm0=sb?*a?+db?*(1-a?)
    movq mm5,[edi]; //r3: mm5=dst3 dst2 dst1 dst0
    paddw mm0,SIXTEEN; //b9: mm0=(i+16) blue
    pand mm5,MASKR; //r5: mm5=dr3 dr2 dr1 dr0
    psrlw mm4,5; //g15: mm4=0?g0 0?g0 0?g0 0?g0 green
    movq mm3,mm0; //b10: mm3=(i+16) blue
    psrlw mm0,5; //b11: mm0=(i+16)>>5 blue
    psrlw mm5,11; //r6: shift dst red down to position 0
    paddw mm0,mm3; //b12: mm0=(i+16)+(i+16)>>5 blue
    psrlw mm0,5; //b13: mm0=000b 000b 000b 000b blue
    pmullw mm7,mm1; //mm7=sr?*a?
    pand mm4,MASKG; //g16: mm4=00g0 00g0 00g0 00g0 green
    pmullw mm5,mm2; //r7: mm5=dr?*(31-a?)
    por mm0,mm4; //mm0=00gb 00gb 00gb 00gb
    add eax,4; //move to next 4 alphas
    //stall
    paddw mm5,mm7; //r8: mm5=sr?*a?+dr?*(31-a?)
    paddw mm5,SIXTEEN; //r9: mm5=(i+16) red
    movq mm7,mm5; //r10: mm7=(i+16) red
    psrlw mm5,5; //r11: mm5=(i+16)>>5 red
    paddw mm5,mm7; //r12: mm5=(i+16)+((i+16)>>5) red
    psrlw mm5,5; //r13: mm5=(i+16)+((i+16)>>5)>>5 red
    psllw mm5,11; //r14: mm5=mm5<<11 red
    por mm0,mm5; //mm0=0rgb 0rgb 0rgb 0rgb
    //store only the remaining pixels: 2 at once if bit 1 of the count is set, then 1 if one is left
    test ebx,2; //check if there are 2 pixels
    jz oneendpixel; //goto one pixel if that's it
    movd [edi],mm0; //dst=0000 0000 0rgb 0rgb
    psrlq mm0,32; //mm0>>32
    add edi,4; //edi=edi+4
    sub ebx,2; //saved 2 pixels
    jz nextline; //all done goto next line
    oneendpixel: //work on last pixel
    movd edx,mm0; //edx=0rgb
    mov [edi],dx; //dst=0rgb
    nextline: //goto next line
    dec ecx; //nuke one line
    jz done; //all done
    //step each saved base pointer down by one row and restart the span
    mov eax,lpLinearAlpBp; //alpha
    mov esi,lpLinearSrcBp; //src
    mov edi,lpLinearDstBp; //dst
    add eax,iAlpPitch; //inc alpha ptr by 1 line
    add esi,iSrcPitch; //inc src ptr by 1 line
    add edi,iDstPitch; //inc dst ptr by 1 line
    mov lpLinearAlpBp,eax; //save new alpha base ptr
    mov ebx,iDstW; //ebx=span width to copy
    mov lpLinearSrcBp,esi; //save new src base ptr
    mov lpLinearDstBp,edi; //save new dst base ptr
    jmp primeloop; //start the next span
    done:
    emms
    }
    }
     
    分享到:  QQ好友和群QQ好友和群 QQ空间QQ空间 腾讯微博腾讯微博 腾讯朋友腾讯朋友
    收藏收藏
    回复

    使用道具 举报

    本版积分规则

    关闭

    站长推荐上一条 /1 下一条

    小黑屋|手机版|Archiver|51Testing软件测试网 ( 沪ICP备05003035号 关于我们

    GMT+8, 2024-11-25 23:16 , Processed in 0.065179 second(s), 27 queries .

    Powered by Discuz! X3.2

    © 2001-2024 Comsenz Inc.

    快速回复 返回顶部 返回列表