Subversion Repositories pentevo

Rev

Rev 716 | Blame | Last modification | View Log | Download | RSS feed

  1. // http://scale2x.sourceforge.net/algorithm.html
  2.  
  3. #include "std.h"
  4.  
  5. #include "emul.h"
  6. #include "vars.h"
  7. #include "draw.h"
  8. #include "dxrend.h"
  9. #include "dxrcopy.h"
  10. #include "dxrframe.h"
  11. #include "dxr_advm.h"
  12.  
  13. inline void line_8_any(unsigned char *dst, unsigned char *src)
  14. {
  15.    if (conf.noflic) line8_nf(dst, src, t.sctab8[0]);
  16.    else line8(dst, src, t.sctab8[0]);
  17. }
  18.  
  19. inline void line_32_any(unsigned char *dst, unsigned char *src)
  20. {
  21.    if (conf.noflic) line32_nf(dst, src, t.sctab32[0]);
  22.    else line32(dst, src, t.sctab32[0]);
  23. }
  24.  
  25. #if 1   // switch between vectorized and branched code
  26.  
  27. #ifdef MOD_SSE2
  28.  
// Scale2x (AdvMAME2x) for 8bpp, SSE2 variant.
// Expands one cached source line (index y in an 8-line ring, lines spaced
// sc2lines_width bytes apart) of nPix pixels into two output lines of
// 2*nPix pixels each: dst1 = upper output row, dst2 = lower output row.
// Processes 8 source pixels per iteration.
// NOTE(review): dst1/dst2 must be 16-byte aligned (_mm_store_si128), and
// the unaligned loads below read 4 bytes before the group start (m+i-4)
// even at i==0 — presumably the ring buffer has padding; confirm.
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,   // line above (B row)
      *m = src + ((y+0) & 7)*sc2lines_width,   // current line (E row)
      *l = src + ((y+1) & 7)*sc2lines_width;   // line below (H row)

   for (unsigned i = 0; i < nPix; i += 8) {

      // Fast-path test: if upper == lower for all 8 pixels, Scale2x keeps
      // the centre pixel everywhere — plain doubling (else branch).
      __m64 uu = *(__m64*)(u+i);
      __m64 ll = *(__m64*)(l+i);
      __m64 cmp = _mm_cmpeq_pi8(uu,ll);

      if (_mm_movemask_pi8(cmp) != 0xFF) {

         // Load 16 bytes starting 4 before the group so every pixel's
         // left/right neighbour is present inside the register.
         __m128i mm = _mm_loadu_si128((__m128i*)(m+i-4));
         __m128i uu = _mm_loadu_si128((__m128i*)(u+i-4));
         __m128i ll = _mm_loadu_si128((__m128i*)(l+i-4));

         __m128i md = _mm_slli_si128(mm,1);   // left neighbours (D)
         __m128i mf = _mm_srli_si128(mm,1);   // right neighbours (F)
         // Lanes where D == F or B == H: Scale2x keeps the centre pixel.
         __m128i maskall = _mm_or_si128(_mm_cmpeq_epi8(md,mf), _mm_cmpeq_epi8(uu,ll));

         __m128i e0, e1, v1, v2, v3;

         // e0: selector for left output pixels (D == B), expanded to
         // even byte lanes after dropping the 4 lead-in bytes.
         e0 = _mm_cmpeq_epi8(md,uu);
         e0 = _mm_andnot_si128(maskall, e0);
         e0 = _mm_srli_si128(e0,4);
         e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());

         // e1: selector for right output pixels (B == F), odd byte lanes.
         e1 = _mm_cmpeq_epi8(mf,uu);
         e1 = _mm_andnot_si128(maskall, e1);
         e1 = _mm_srli_si128(e1,4);
         e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);

         e0 = _mm_or_si128(e0, e1);           // combined selector mask

         v1 = _mm_srli_si128(mm,4);           // centre pixels of the group
         v1 = _mm_unpacklo_epi8(v1,v1);       // ...doubled (E|E per pixel)
         v2 = _mm_srli_si128(uu,4);           // upper-line pixels
         v2 = _mm_unpacklo_epi8(v2,v2);       // ...doubled

         // Upper output row: upper pixel where selected, else centre.
         _mm_store_si128((__m128i*)(dst1 + 2*i), _mm_or_si128( _mm_and_si128(e0,v2), _mm_andnot_si128(e0,v1) ) );

         // Same selection against the lower line for the lower output row.
         e0 = _mm_cmpeq_epi8(md,ll);
         e0 = _mm_andnot_si128(maskall, e0);
         e0 = _mm_srli_si128(e0,4);
         e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());

         e1 = _mm_cmpeq_epi8(mf,ll);
         e1 = _mm_andnot_si128(maskall, e1);
         e1 = _mm_srli_si128(e1,4);
         e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);

         e0 = _mm_or_si128(e0, e1);

         v3 = _mm_srli_si128(ll,4);           // lower-line pixels
         v3 = _mm_unpacklo_epi8(v3,v3);       // ...doubled

         _mm_store_si128((__m128i*)(dst2 + 2*i), _mm_or_si128( _mm_and_si128(e0,v3), _mm_andnot_si128(e0,v1) ) );

      } else {

         // u == l for all 8 pixels: plain doubling of the middle line
         // into both output rows.
         __m64 v0 = *(__m64*)(m+i);
         __m128i v1 = _mm_movpi64_epi64(v0);
         v1 = _mm_unpacklo_epi8(v1,v1);
         _mm_store_si128((__m128i*)(dst1 + 2*i), v1);
         _mm_store_si128((__m128i*)(dst2 + 2*i), v1);
      }
   }
}
  100.  
  101. #else // MMX vectorized
  102.  
// Scale2x (AdvMAME2x) for 8bpp, MMX variant — same contract as the SSE2
// build: expands one cached source line (index y in an 8-line ring, lines
// sc2lines_width bytes apart) of nPix pixels into two output lines of
// 2*nPix pixels (dst1 = upper row, dst2 = lower row), 4 pixels per step.
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,   // line above (B row)
      *m = src + ((y+0) & 7)*sc2lines_width,   // current line (E row)
      *l = src + ((y+1) & 7)*sc2lines_width;   // line below (H row)

   for (unsigned i = 0; i < nPix; i += 4) {

      // Any pixel with upper != lower in this group of 4?
      if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {

         // Load 8 bytes starting 2 before the group so left/right
         // neighbours are in the register.
         // NOTE(review): at i==0 this reads 2 bytes before the line start —
         // presumably ring-buffer padding makes this safe; confirm.
         __m64 mm = *(__m64*)(m+i-2);
         __m64 uu = *(__m64*)(u+i-2);
         __m64 ll = *(__m64*)(l+i-2);
         __m64 md = _mm_slli_si64(mm,8);   // left neighbours (D)
         __m64 mf = _mm_srli_si64(mm,8);   // right neighbours (F)
         // Lanes where D == F or B == H: Scale2x keeps the centre pixel.
         __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf), _mm_cmpeq_pi8(uu,ll));

         __m64 e0, e1, v1, v2;

         // e0: selector for left output pixels (D == B), expanded to even
         // byte lanes after dropping the 2 lead-in bytes.
         e0 = _mm_cmpeq_pi8(md,uu);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         // e1: selector for right output pixels (B == F), odd byte lanes.
         e1 = _mm_cmpeq_pi8(mf,uu);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);         // combined selector mask

         v1 = _m_from_int(*(unsigned*)(m+i));   // centre pixels
         v2 = _m_from_int(*(unsigned*)(u+i));   // upper-line pixels
         v1 = _mm_unpacklo_pi8(v1,v1);          // doubled
         v2 = _mm_unpacklo_pi8(v2,v2);

         // Upper output row: upper pixel where selected, else centre.
         *(__m64*)(dst1 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

         // Same selection against the lower line for the lower output row.
         e0 = _mm_cmpeq_pi8(md,ll);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         e1 = _mm_cmpeq_pi8(mf,ll);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);

         v1 = _m_from_int(*(unsigned*)(m+i));   // centre pixels
         v2 = _m_from_int(*(unsigned*)(l+i));   // lower-line pixels
         v1 = _mm_unpacklo_pi8(v1,v1);
         v2 = _mm_unpacklo_pi8(v2,v2);

         *(__m64*)(dst2 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

      } else {

         // u == l: plain doubling of the middle line into both rows.
         __m64 v1 = _m_from_int(*(unsigned*)(m+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         *(__m64*)(dst1 + 2*i) = v1;
         *(__m64*)(dst2 + 2*i) = v1;

      }

   }
}
  172.  
  173. #endif // SSE2
  174.  
  175. #else // MMX branched
  176. // src       dst
  177. // ABC       e0e1
  178. // DEF       e2e3
  179. // GHI
  180.  
  181. /*
  182. if(B != H && D != F)
  183. {                               E0 = E;
  184.     E0 = D == B ? D : E;        E1 = E;
  185.     E1 = B == F ? F : E;        E2 = E;
  186.     E2 = D == H ? D : E;        E3 = E;
  187.     E3 = H == F ? F : E;        if(B != H) continue;
  188. }                          =>   if(D != F)
  189. else                            {
  190. {                                   E0 = D == B ? D : E;
  191.     E0 = E;                         E1 = B == F ? F : E;
  192.     E1 = E;                         E2 = D == H ? D : E;
  193.     E2 = E;                         E3 = H == F ? F : E;
  194.     E3 = E;                     }
  195. }
  196. */
  197.  
  198. void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
  199. {
  200.    const unsigned char
  201.       *u = src + ((y-1) & 7)*sc2lines_width,
  202.       *m = src + ((y+0) & 7)*sc2lines_width,
  203.       *l = src + ((y+1) & 7)*sc2lines_width;
  204.  
  205.    // process 4pix per iteration
  206.    for (unsigned i = 0; i < nPix; i += 4)
  207.    {
  208.       unsigned dw = *(unsigned*)(m+i);
  209.       __m64 v1 = _mm_cvtsi32_si64(dw); // v1   =     0|    0|    0|    0|dw[3]|dw[2]|dw[1]|dw[0]
  210.       v1 = _mm_unpacklo_pi8(v1,v1);    // v1   = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
  211.       *(__m64*)(dst1 + 2*i) = v1;      // e0e1 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
  212.       *(__m64*)(dst2 + 2*i) = v1;      // e2e3 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
  213.  
  214.       dw = *(unsigned*)(u+i) ^ *(unsigned*)(l+i);
  215.       if (!dw)
  216.           continue; // u == l
  217.  
  218.    #define process_pix(n)                                       \
  219.       if ((dw & (0xFF << (8*n))) && m[i+n-1] != m[i+n+1])       \
  220.       {                                                         \
  221.          if (u[i+n] == m[i+n-1])                                \
  222.              dst1[2*(i+n)] = u[i+n];                            \
  223.          if (u[i+n] == m[i+n+1])                                \
  224.              dst1[2*(i+n)+1] = u[i+n];                          \
  225.          if (l[i+n] == m[i+n-1])                                \
  226.              dst2[2*(i+n)] = l[i+n];                            \
  227.          if (l[i+n] == m[i+n+1])                                \
  228.              dst2[2*(i+n)+1] = l[i+n];                          \
  229.       }
  230.  
  231.       process_pix(0);
  232.       process_pix(1);
  233.       process_pix(2);
  234.       process_pix(3);
  235.    #undef process_pix
  236.    }
  237. }
  238.  
  239. #endif // MMX branched
  240.  
  241. void lines_scale2_32(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
  242. {
  243.    const u32 *s = (u32 *)src;
  244.    const u32 *u = s + ((y-1) & 7)*sc2lines_width;
  245.    const u32 *m = s + ((y+0) & 7)*sc2lines_width;
  246.    const u32 *l = s + ((y+1) & 7)*sc2lines_width;
  247.    u32 *d1 = (u32 *)dst1;
  248.    u32 *d2 = (u32 *)dst2;
  249.  
  250.    for (unsigned i = 0; i < nPix; i++)
  251.    {
  252.       d1[2*i] = d1[2*i+1] = d2[2*i] = d2[2*i+1] = m[i];
  253.  
  254.       if (u[i] != l[i] && m[i-1] != m[i+1])
  255.       {
  256.          if (u[i] == m[i-1])
  257.              d1[2*i] = u[i];
  258.          if (u[i] == m[i+1])
  259.              d1[2*i+1] = u[i];
  260.          if (l[i] == m[i-1])
  261.              d2[2*i] = l[i];
  262.          if (l[i] == m[i+1])
  263.              d2[2*i+1] = l[i];
  264.       }
  265.    }
  266. }
  267.  
  268. // 8bpp
  269. void render_scale2(unsigned char *dst, unsigned pitch)
  270. {
  271.    unsigned char *src = rbuf; unsigned delta = temp.scx/4;
  272.    line_8_any(t.scale2buf[0], src);
  273.    // assume 'above' screen line same as line 0
  274.    memcpy(t.scale2buf[(0-1) & 7], t.scale2buf[0], temp.scx);
  275.    for (unsigned y = 0; y < temp.scy; y++)
  276.    {
  277.       src += delta;
  278.       line_8_any(t.scale2buf[(y+1) & 7], src);
  279.       lines_scale2(t.scale2buf[0], y, dst, dst+pitch, temp.scx);
  280.       dst += 2*pitch;
  281.    }
  282. }
  283.  
  284. // 32bpp
  285. void render_scale2_32(unsigned char *dst, unsigned pitch)
  286. {
  287.    unsigned char *src = rbuf;
  288.    unsigned delta = temp.scx/4;
  289.    line_32_any((u8 *)t.scale2buf32[0], src);
  290.  
  291.    // assume 'above' screen line same as line 0
  292.    memcpy(t.scale2buf32[(0-1) & 7], t.scale2buf32[0], temp.scx);
  293.    for (unsigned y = 0; y < temp.scy; y++)
  294.    {
  295.       src += delta;
  296.       line_32_any((u8 *)t.scale2buf32[(y+1) & 7], src);
  297.       lines_scale2_32((u8 *)t.scale2buf32[0], y, dst, dst+pitch, temp.scx);
  298.       dst += 2*pitch;
  299.    }
  300. }
  301.  
  302. // MMX-vectorized version is not ready yet :(
  303. // 8bpp
  304. void lines_scale3(unsigned y, unsigned char *dst, unsigned pitch)
  305. {
  306.  
  307.    const unsigned char
  308.       *u = t.scale2buf[(y-1) & 3],
  309.       *m = t.scale2buf[(y+0) & 3],
  310.       *l = t.scale2buf[(y+1) & 3];
  311.  
  312.    for (unsigned i = 0; i < temp.scx; i += 4)
  313.    {
  314.       unsigned char c;
  315.  
  316.       c = m[i];
  317.       dst[3*i+0+0*pitch+ 0] = dst[3*i+1+0*pitch+ 0] = dst[3*i+2+0*pitch+ 0] = c;
  318.       dst[3*i+0+1*pitch+ 0] = dst[3*i+1+1*pitch+ 0] = dst[3*i+2+1*pitch+ 0] = c;
  319.       dst[3*i+0+2*pitch+ 0] = dst[3*i+1+2*pitch+ 0] = dst[3*i+2+2*pitch+ 0] = c;
  320.  
  321.       c = m[i+1];
  322.       dst[3*i+0+0*pitch+ 3] = dst[3*i+1+0*pitch+ 3] = dst[3*i+2+0*pitch+ 3] = c;
  323.       dst[3*i+0+1*pitch+ 3] = dst[3*i+1+1*pitch+ 3] = dst[3*i+2+1*pitch+ 3] = c;
  324.       dst[3*i+0+2*pitch+ 3] = dst[3*i+1+2*pitch+ 3] = dst[3*i+2+2*pitch+ 3] = c;
  325.  
  326.       c = m[i+2];
  327.       dst[3*i+0+0*pitch+ 6] = dst[3*i+1+0*pitch+ 6] = dst[3*i+2+0*pitch+ 6] = c;
  328.       dst[3*i+0+1*pitch+ 6] = dst[3*i+1+1*pitch+ 6] = dst[3*i+2+1*pitch+ 6] = c;
  329.       dst[3*i+0+2*pitch+ 6] = dst[3*i+1+2*pitch+ 6] = dst[3*i+2+2*pitch+ 6] = c;
  330.  
  331.       c = m[i+3];
  332.       dst[3*i+0+0*pitch+ 9] = dst[3*i+1+0*pitch+ 9] = dst[3*i+2+0*pitch+ 9] = c;
  333.       dst[3*i+0+1*pitch+ 9] = dst[3*i+1+1*pitch+ 9] = dst[3*i+2+1*pitch+ 9] = c;
  334.       dst[3*i+0+2*pitch+ 9] = dst[3*i+1+2*pitch+ 9] = dst[3*i+2+2*pitch+ 9] = c;
  335.  
  336.       unsigned dw = *(unsigned*)(u+i) ^ *(unsigned*)(l+i);
  337.       if (!dw) continue;
  338.  
  339.    #define process_pix(n)                                                                              \
  340.       if ((dw & (0xFF << (8*n))) && m[i+n-1] != m[i+n+1])                                              \
  341.       {                                                                                                \
  342.          if (u[i+n] == m[i+n-1])                                                                       \
  343.              dst[0*pitch+3*(i+n)] = u[i+n];                                                            \
  344.          if ((u[i+n] == m[i+n-1] && m[i+n] != u[i+n+1]) || (u[i+n] == m[i+n+1] && m[i+n] != u[i+n-1])) \
  345.              dst[0*pitch+3*(i+n)+1] = u[i+n];                                                          \
  346.          if (u[i+n] == m[i+n+1])                                                                       \
  347.              dst[0*pitch+3*(i+n)+2] = u[i+n];                                                          \
  348.          if ((u[i+n] == m[i+n-1] && m[i+n] != l[i+n-1]) || (l[i+n] == m[i+n-1] && m[i+n] != u[i+n-1])) \
  349.              dst[1*pitch+3*(i+n)+0] = m[i+n-1];                                                        \
  350.          if ((u[i+n] == m[i+n+1] && m[i+n] != l[i+n+1]) || (l[i+n] == m[i+n+1] && m[i+n] != u[i+n+1])) \
  351.              dst[1*pitch+3*(i+n)+2] = m[i+n+1];                                                        \
  352.          if (l[i+n] == m[i+n-1])                                                                       \
  353.              dst[2*pitch+3*(i+n)] = l[i+n];                                                            \
  354.          if ((l[i+n] == m[i+n-1] && m[i+n] != l[i+n+1]) || (l[i+n] == m[i+n+1] && m[i+n] != l[i+n-1])) \
  355.              dst[2*pitch+3*(i+n)+1] = l[i+n];                                                          \
  356.          if (l[i+n] == m[i+n+1])                                                                       \
  357.              dst[2*pitch+3*(i+n)+2] = l[i+n];                                                          \
  358.       }
  359.  
  360.       process_pix(0);
  361.       process_pix(1);
  362.       process_pix(2);
  363.       process_pix(3);
  364.    #undef process_pix
  365.    }
  366. }
  367.  
  368. // 8bpp
  369. void render_scale3(unsigned char *dst, unsigned pitch)
  370. {
  371.    unsigned char *src = rbuf; unsigned delta = temp.scx/4;
  372.    line_8_any(t.scale2buf[0], src);
  373.    // assume 'above' screen line same as line 0
  374.    memcpy(t.scale2buf[(0-1) & 3], t.scale2buf[0], temp.scx);
  375.    for (unsigned y = 0; y < temp.scy; y++) {
  376.       src += delta;
  377.       line_8_any(t.scale2buf[(y+1) & 3], src);
  378.       lines_scale3(y, dst, pitch);
  379.       dst += 3*pitch;
  380.    }
  381. }
  382.  
  383. // 32bpp
  384. void lines_scale3_32(unsigned y, unsigned char *dst, unsigned pitch)
  385. {
  386.    const u32 *u = t.scale2buf32[(y-1) & 3];
  387.    const u32 *m = t.scale2buf32[(y+0) & 3];
  388.    const u32 *l = t.scale2buf32[(y+1) & 3];
  389.    u32 *d = (u32 *)dst;
  390.    pitch /= sizeof(u32);
  391.  
  392.    for (unsigned i = 0; i < temp.scx; i++)
  393.    {
  394.       d[0*pitch+3*i+0] = d[0*pitch+3*i+1] = d[0*pitch+3*i+2] = m[i];
  395.       d[1*pitch+3*i+0] = d[1*pitch+3*i+1] = d[1*pitch+3*i+2] = m[i];
  396.       d[2*pitch+3*i+0] = d[2*pitch+3*i+1] = d[2*pitch+3*i+2] = m[i];
  397.  
  398.       if (u[i] != l[i] && m[i-1] != m[i+1])
  399.       {
  400.          if (u[i] == m[i-1])
  401.              d[0*pitch+3*i+0] = u[i];
  402.          if ((u[i] == m[i-1] && m[i] != u[i+1]) || (u[i] == m[i+1] && m[i] != u[i-1]))
  403.              d[0*pitch+3*i+1] = u[i];
  404.          if (u[i] == m[i+1])
  405.              d[0*pitch+3*i+2] = u[i];
  406.          if ((u[i] == m[i-1] && m[i] != l[i-1]) || (l[i] == m[i-1] && m[i] != u[i-1]))
  407.              d[1*pitch+3*i+0] = m[i-1];
  408.          if ((u[i] == m[i+1] && m[i] != l[i+1]) || (l[i] == m[i+1] && m[i] != u[i+1]))
  409.              d[1*pitch+3*i+2] = m[i+1];
  410.          if (l[i] == m[i-1])
  411.              d[2*pitch+3*i+0] = l[i];
  412.          if ((l[i] == m[i-1] && m[i] != l[i+1]) || (l[i] == m[i+1] && m[i] != l[i-1]))
  413.              d[2*pitch+3*i+1] = l[i];
  414.          if (l[i] == m[i+1])
  415.              d[2*pitch+3*i+2] = l[i];
  416.       }
  417.    }
  418. }
  419.  
  420. // 32bpp
  421. void render_scale3_32(unsigned char *dst, unsigned pitch)
  422. {
  423.    unsigned char *src = rbuf; unsigned delta = temp.scx/4;
  424.    line_32_any((u8 *)t.scale2buf32[0], src);
  425.    // assume 'above' screen line same as line 0
  426.    memcpy(t.scale2buf32[(0-1) & 3], t.scale2buf32[0], temp.scx);
  427.    for (unsigned y = 0; y < temp.scy; y++)
  428.    {
  429.       src += delta;
  430.       line_32_any((u8 *)t.scale2buf32[(y+1) & 3], src);
  431.       lines_scale3_32(y, dst, pitch);
  432.       dst += 3*pitch;
  433.    }
  434. }
  435.  
// 8bpp Scale4x: applies Scale2x twice. Source lines are expanded into the
// 8-line ring t.scale2buf; a first Scale2x pass writes doubled lines into
// the t.scale4buf ring; a second pass doubles those again straight into
// the destination, producing 4 output lines per source line.
void render_scale4(unsigned char *dst, unsigned pitch)
{
   unsigned char *src = rbuf; unsigned delta = temp.scx/4;   // source line stride in bytes

   // Prime the pipeline with the first two source lines.
   line_8_any(t.scale2buf[0], src); src += delta;
   line_8_any(t.scale2buf[1], src); src += delta;
   // assume 'above' screen line same as line 0
   memcpy(t.scale2buf[(0-1) & 7], t.scale2buf[0], temp.scx);
   // First-pass output for source line 0 -> scale4buf lines 0 and 1.
   lines_scale2(t.scale2buf[0], 0, t.scale4buf[0], t.scale4buf[1], temp.scx);

   for (unsigned y = 0; y < temp.scy; y++) {

      // Read ahead one source line so line y+1 has its lower neighbour.
      line_8_any(t.scale2buf[(y+2) & 7], src); src += delta;

      // First pass: doubled lines for source line y+1 into the ring.
      unsigned char *dst1 = t.scale4buf[(2*y+2) & 7];
      unsigned char *dst2 = t.scale4buf[(2*y+3) & 7];
      lines_scale2(t.scale2buf[0], y+1, dst1, dst2, temp.scx);

      // Second pass: re-double the two intermediate lines of source line y
      // (now with neighbours above and below available) into 4 final lines.
      lines_scale2(t.scale4buf[0], 2*y,   dst+0*pitch, dst+1*pitch, temp.scx*2);
      lines_scale2(t.scale4buf[0], 2*y+1, dst+2*pitch, dst+3*pitch, temp.scx*2);

      dst += 4*pitch;
   }
}
  460.  
  461. void __fastcall render_advmame(unsigned char *dst, unsigned pitch)
  462. {
  463.    switch (conf.videoscale)
  464.    {
  465.       case 2:
  466.           if(temp.obpp == 8) render_scale2(dst, pitch);
  467.           else if(temp.obpp == 32) render_scale2_32(dst, pitch);
  468.       break;
  469.       case 3:
  470.           if(temp.obpp == 8) render_scale3(dst, pitch);
  471.           else if(temp.obpp == 32) render_scale3_32(dst, pitch);
  472.       break;
  473.       case 4: render_scale4(dst, pitch); break;
  474.       default: render_small(dst, pitch); return; // skip noflic test
  475.    }
  476.    if (conf.noflic)
  477.        memcpy(rbuf_s, rbuf, temp.scy*temp.scx/4);
  478.    _mm_empty();
  479. }
  480.