Subversion Repositories pentevo

Rev

Rev 716 | Blame | Last modification | View Log | Download | RSS feed

  1. #include "std.h"
  2.  
  3. #include "emul.h"
  4. #include "vars.h"
  5. #include "draw.h"
  6. #include "dxr_rsm.h"
  7.  
  8. RSM_DATA rsm; // the single RSM_DATA instance: render_rsm(), the rend_rsm_* helpers and calc_rsm_tables() below all operate on it
  9.  
  10. void RSM_DATA::prepare_line_32(unsigned char *src0)
  11. {
  12.    unsigned i;
  13.    for (i = 0; i < line_size_d; i++)
  14.        line_buffer_d[i] = bias;
  15.  
  16.    unsigned line_size = line_size_d / 2, frame = 0;
  17.    __m64 *tab = colortab;
  18.    for (;;)
  19.    {
  20.       unsigned char *src = src0;
  21.       for (i = 0; i < line_size; )
  22.       {
  23.          unsigned s = *(unsigned*)src, attr = (s >> 6) & 0x3FC;
  24.          line_buffer[i+0] = _mm_add_pi8(line_buffer[i+0], tab[((s >> 6) & 3) + attr]);
  25.          line_buffer[i+1] = _mm_add_pi8(line_buffer[i+1], tab[((s >> 4) & 3) + attr]);
  26.          line_buffer[i+2] = _mm_add_pi8(line_buffer[i+2], tab[((s >> 2) & 3) + attr]);
  27.          line_buffer[i+3] = _mm_add_pi8(line_buffer[i+3], tab[((s >> 0) & 3) + attr]);
  28.          attr = (s >> 22) & 0x3FC;
  29.          line_buffer[i+4] = _mm_add_pi8(line_buffer[i+4], tab[((s >>22) & 3) + attr]);
  30.          line_buffer[i+5] = _mm_add_pi8(line_buffer[i+5], tab[((s >>20) & 3) + attr]);
  31.          line_buffer[i+6] = _mm_add_pi8(line_buffer[i+6], tab[((s >>18) & 3) + attr]);
  32.          line_buffer[i+7] = _mm_add_pi8(line_buffer[i+7], tab[((s >>16) & 3) + attr]);
  33.          i += 8, src += 4;
  34.       }
  35.       if (++frame == mix_frames)
  36.           break;
  37.       src0 += rb2_offs;
  38.       if (src0 >= rbuf_s + rb2_offs * mix_frames)
  39.           src0 -= rb2_offs * mix_frames;
  40.       tab += 0x100*4;
  41.    }
  42. }
  43.  
  44. void RSM_DATA::prepare_line_16(unsigned char *src0)
  45. {
  46.    unsigned i;
  47.    for (i = 0; i < line_size_d; i++)
  48.        line_buffer_d[i] = bias;
  49.  
  50.    unsigned line_size = line_size_d / 2, frame = 0;
  51.    __m64 *tab = colortab;
  52.    for (;;) {
  53.       unsigned char *src = src0;
  54.       for (i = 0; i < line_size; ) {
  55.          unsigned s = *(unsigned*)src, attr = (s >> 4) & 0xFF0;
  56.          line_buffer[i+0] = _mm_add_pi8(line_buffer[i+0], tab[((s >> 4)  & 0xF) + attr]);
  57.          line_buffer[i+1] = _mm_add_pi8(line_buffer[i+1], tab[((s >> 0)  & 0xF) + attr]);
  58.          attr = (s >> 20) & 0xFF0;
  59.          line_buffer[i+2] = _mm_add_pi8(line_buffer[i+2], tab[((s >> 20) & 0xF) + attr]);
  60.          line_buffer[i+3] = _mm_add_pi8(line_buffer[i+3], tab[((s >> 16) & 0xF) + attr]);
  61.          src += 4; i += 4;
  62.       }
  63.       if (++frame == mix_frames) break;
  64.       src0 += rb2_offs;
  65.       if (src0 >= rbuf_s + rb2_offs * mix_frames) src0 -= rb2_offs * mix_frames;
  66.       tab += 0x100*16;
  67.    }
  68. }
  69.  
  70. void RSM_DATA::prepare_line_8(unsigned char *src0)
  71. {
  72.    unsigned i;
  73.    for (i = 0; i < line_size_d; i++)
  74.        line_buffer_d[i] = bias;
  75.  
  76.    unsigned frame = 0;
  77.    DWORD *tab = (DWORD*)colortab;
  78.    for (;;) {
  79.       unsigned char *src = src0;
  80.       for (i = 0; i < line_size_d; ) {
  81.          unsigned s = *(unsigned*)src, attr = (s >> 4) & 0xFF0;
  82.          line_buffer_d[i+0] += tab[((s >> 4)  & 0xF) + attr];
  83.          line_buffer_d[i+1] += tab[((s >> 0)  & 0xF) + attr];
  84.          attr = (s >> 20) & 0xFF0;
  85.          line_buffer_d[i+2] += tab[((s >> 20) & 0xF) + attr];
  86.          line_buffer_d[i+3] += tab[((s >> 16) & 0xF) + attr];
  87.          s = *(unsigned*)(src+4), attr = (s >> 4) & 0xFF0;
  88.          line_buffer_d[i+4] += tab[((s >> 4)  & 0xF) + attr];
  89.          line_buffer_d[i+5] += tab[((s >> 0)  & 0xF) + attr];
  90.          attr = (s >> 20) & 0xFF0;
  91.          line_buffer_d[i+6] += tab[((s >> 20) & 0xF) + attr];
  92.          line_buffer_d[i+7] += tab[((s >> 16) & 0xF) + attr];
  93.          src += 8, i += 8;
  94.       }
  95.       if (++frame == mix_frames) break;
  96.       src0 += rb2_offs;
  97.       if (src0 >= rbuf_s + rb2_offs * mix_frames) src0 -= rb2_offs * mix_frames;
  98.       tab += 0x100*16;
  99.    }
  100. }
  101.  
  102. void rend_rsm_8(unsigned char *dst, unsigned pitch, unsigned char *src)
  103. {
  104.    unsigned delta = temp.scx/4;
  105.    for (unsigned y = 0; y < temp.scy; y++) {
  106.       rsm.prepare_line_8(src);
  107.       for (unsigned i = 0; i < rsm.line_size_d; i++)
  108.          *(unsigned*)(dst + i*4) = rsm.line_buffer_d[i];
  109.       dst += pitch; src += delta;
  110.    }
  111. }
  112.  
  113. void rend_rsm_16(unsigned char *dst, unsigned pitch, unsigned char *src)
  114. {
  115.    unsigned delta = temp.scx/4;
  116.    for (unsigned y = 0; y < temp.scy; y++) {
  117.       rsm.prepare_line_32(src);
  118.       // pack truecolor pixel to 16 bit
  119.       if (temp.hi15 == 0)
  120.          for (unsigned i = 0; i < rsm.line_size_d; i+=2) {
  121.             unsigned c1 = rsm.line_buffer_d[i];
  122.             unsigned c2 = rsm.line_buffer_d[i+1];
  123.             *(unsigned*)(dst + i*2) =
  124.               ((c1 >> 3) & 0x1F) + ((c1 >> 5) & 0x07E0) + ((c1 >> 8) & 0xF800) +
  125.               ((c2 << 13) & 0x1F0000) + ((c2 << 11) & 0x07E00000) + ((c2 << 8) & 0xF8000000);
  126.          }
  127.       else /* if (temp.hi15 == 1) */
  128.          for (unsigned i = 0; i < rsm.line_size_d; i+=2) {
  129.             unsigned c1 = rsm.line_buffer_d[i];
  130.             unsigned c2 = rsm.line_buffer_d[i+1];
  131.             *(unsigned*)(dst + i*2) =
  132.               ((c1 >> 3) & 0x1F) + ((c1 >> 6) & 0x03E0) + ((c1 >> 9) & 0x7C00) +
  133.               ((c2 << 13) & 0x1F0000) + ((c2 << 10) & 0x03E00000) + ((c2 << 7) & 0x7C000000);
  134.          }
  135.  
  136.       dst += pitch; src += delta;
  137.    }
  138. }
  139.  
  140. void rend_rsm_16o(unsigned char *dst, unsigned pitch, unsigned char *src)
  141. {
  142.    unsigned delta = temp.scx/4;
  143.    for (unsigned y = 0; y < temp.scy; y++) {
  144.       rsm.prepare_line_16(src);
  145.       for (unsigned i = 0; i < rsm.line_size_d; i++)
  146.          *(unsigned*)(dst + i*4) = rsm.line_buffer_d[i];
  147.  
  148.       dst += pitch; src += delta;
  149.    }
  150. }
  151.  
  152. void rend_rsm_32(unsigned char *dst, unsigned pitch, unsigned char *src)
  153. {
  154.    unsigned delta = temp.scx/4;
  155.    for (unsigned y = 0; y < temp.scy; y++) {
  156.       rsm.prepare_line_32(src);
  157.       for (unsigned i = 0; i < rsm.line_size_d; i++)
  158.          *(unsigned*)(dst + i*4) = rsm.line_buffer_d[i];
  159.       dst += pitch; src += delta;
  160.    }
  161. }
  162.  
  163. void __fastcall render_rsm(unsigned char *dst, unsigned pitch)
  164. {
  165.    rsm.colortab = (__m64*)((int)rsm.tables + rsm.frame * rsm.frame_table_size);
  166.    unsigned char *src = rbuf_s + rb2_offs * rsm.rbuf_dst;
  167.  
  168.    if (temp.obpp ==  8) rend_rsm_8 (dst, pitch, src);
  169.    if (temp.obpp == 16) { if (rsm.mode == 0) rend_rsm_16(dst, pitch, src); else rend_rsm_16o(dst, pitch, src); }
  170.    if (temp.obpp == 32) rend_rsm_32(dst, pitch, src);
  171.  
  172.    _mm_empty(); // EMMS
  173. }
  174.  
  // Greatest common divisor of x and y.
  //
  // Uses the remainder form of Euclid's algorithm. For positive inputs the
  // result is the same as the former repeated-subtraction loop, but it finishes
  // in logarithmic time and also terminates when one argument is zero
  // (gcd(0, n) == gcd(n, 0) == n, gcd(0, 0) == 0); the subtraction loop
  // spun forever in that case.
  unsigned gcd(unsigned x, unsigned y)
  {
     while (y != 0)
     {
        const unsigned rest = x % y;
        x = y;
        y = rest;
     }
     return x;
  }
  180.  
  181. unsigned lcm(unsigned x, unsigned y)
  182. {
  183.    return x*y / gcd(x,y);
  184. }
  185.  
  // Recomputes the derived state of the global `rsm` from conf/temp:
  // table mode, line size, resampling period, number of mixed frames,
  // the per-output-frame needframes[] counts and the weighted colour
  // lookup tables (one table per output frame of the period and per mixed
  // source frame) that render_rsm() selects through rsm.colortab.
  186. void calc_rsm_tables()
  187. {
  188.    rsm.rbuf_dst = rsm.frame = 0;
  189.  
          // Selected renderer is not render_rsm: reduce to a period of one
          // frame with one mixed frame, needframes[0] == 1, and build no tables.
  190.    if (renders[conf.render].func != render_rsm) {
  191.       rsm.mix_frames = rsm.period = 1;
  192.       static unsigned char one = 1;
  193.       rsm.needframes = &one; // rsm.needframes[0]=1
  194.       return;
  195.    }
  196.  
          // Table mode: 2 for 8 bpp output, 1 for 16 bpp with hi15 == 2,
          // otherwise 0 (see the case labels of the switch further down).
  197.    rsm.mode = (temp.obpp == 8)? 2 : 0;
  198.    if (temp.obpp == 16 && temp.hi15 == 2) rsm.mode = 1;
  199.  
          // Accumulator line length in dwords: scx, scx/2 or scx/4 by mode.
  200.    rsm.line_size_d = (temp.scx >> rsm.mode);
  201.  
          // Common multiple of source (conf.intfq) and output (temp.ofq) rates:
          // period = fmax / intfq, step = fmax / ofq.
  202.    unsigned fmax = lcm(conf.intfq, temp.ofq);
  203.    rsm.period = fmax / conf.intfq;
  204.    unsigned step = fmax / temp.ofq;
  205.  
  206.    rsm.mix_frames = (conf.rsm.mode == RSM_SIMPLE)? 2 : conf.rsm.mix_frames;
  207.  
          // Size in bytes of all tables belonging to one output frame:
          // mix_frames * 0x100 groups * entries per group * entry size.
  208.    rsm.frame_table_size = rsm.mix_frames * 0x100;
  209.    if (rsm.mode == 0) rsm.frame_table_size *= 4*sizeof(__m64);
  210.    if (rsm.mode == 1) rsm.frame_table_size *= 16*sizeof(__m64);
  211.    if (rsm.mode == 2) rsm.frame_table_size *= 16*sizeof(DWORD);
  212.  
          // One allocation holds `period` table blocks followed by `period`
          // needframes bytes.
          // NOTE(review): the realloc result is not checked; on failure rsm.data
          // becomes NULL (the old block is no longer referenced) and the writes
          // below go through a null-based pointer.
  213.    rsm.data = (unsigned char*)realloc(rsm.data, rsm.period * (rsm.frame_table_size + 1));
  214.    rsm.tables = (__m64*)rsm.data;
  215.    rsm.needframes = rsm.data + rsm.frame_table_size * rsm.period;
  216.  
          // Stack scratch: one weight per (output frame, mixed frame) pair.
          // `dst` is the write cursor used while computing the weights,
          // `weights` is consumed later while scaling the tables.
  217.    double *weights = (double*)alloca(rsm.period * rsm.mix_frames * sizeof(double));
  218.    double *dst = weights;
  219.  
  220.    unsigned low_bias = 0, dynamic_range = 0xFF;
  221.  
  222.    if (conf.rsm.mode != RSM_SIMPLE) {
             // FIR modes: build a low-pass kernel of fsize+1 taps, a sinc
             // shaped by a 0.54 - 0.46*cos(...) window.
  223.       unsigned fsize = rsm.period * rsm.mix_frames;
  224.       double *flt = (double*)alloca((fsize+1) * sizeof(double));
  225.  
  226.       double cutoff = 0.9;
  227.       if (conf.rsm.mode == RSM_FIR1) cutoff = 0.5;
  228.       if (conf.rsm.mode == RSM_FIR2) cutoff = 0.33333333;
  229.  
  230.       double pi = 4*atan(1.0);
  231.       cutoff *= 1 / (double)rsm.period; // cutoff scale = inftq/maxfq = 1/rsm.period
  232.       double c1 = 0.54 / pi, c2 = 0.46 / pi;
  233.       for (unsigned i = 0; i <= fsize; i++) {
                // Centre tap is set directly to avoid the 0/0 of the general formula.
  234.          if (i == fsize/2) flt[i] = cutoff;
  235.          else flt[i] = sin(pi*cutoff*((double)i - fsize/2)) * (c1 - c2*cos(2*pi*(double)i/fsize)) / ((double)i - fsize/2);
  236.       }
  237.  
             // NOTE(review): `high` and `high_b` are accumulated below but never
             // read afterwards; only the negative sum `low_b` is used.
  238.       double low_b = 0, high_b = 0;
  239.       for (unsigned frame = 0; frame < rsm.period; frame++)
  240.       {
                // pos is this output frame's position on the fmax time scale;
                // srcframe/nextframe are source frame indices rounded up.
  241.          unsigned pos = frame * step, srcframe = pos / rsm.period;
  242.          if (frame) srcframe++; // (pos % rsm.period) != 0
  243.  
  244.          unsigned nextpos = pos + step, nextframe = nextpos / rsm.period;
  245.          if (frame+1 != rsm.period) nextframe++; // (nextpos % rsm.period) != 0
  246.  
  247.          rsm.needframes[frame] = (nextframe - srcframe);
  248.          double low = 0, high = 0;
                // Sample the kernel every `period` taps starting at the phase
                // offset of this output frame; one weight per mixed frame.
  249.          unsigned offset = (srcframe * rsm.period) - pos;
  250.          for (unsigned ch = 0; ch < rsm.mix_frames; ch++) {
  251.             double weight = flt[offset] * rsm.period;
  252.             if (weight < 0) low += weight; else high += weight;
  253.             *dst++ = weight;
  254.             offset += rsm.period;
  255.          }
  256.          if (low < low_b) low_b = low;
  257.          if (high > high_b) high_b = high;
  258.       }
             // The most negative weight sum over all output frames sets the bias;
             // the remainder of the 0..0xFF range is left for the scaled colours.
  259.       low_bias = (unsigned)((-low_b)*0xFF);
  260.       dynamic_range = (0xFF - low_bias);
  261.    } else { // RSM_SIMPLE
  262.  
             // Two weights per output frame, offset/step and (step-offset)/step,
             // which always sum to 1.
  263.       double div = 1 / (double)step;
  264.       for (unsigned frame = 0; frame < rsm.period; frame++)
  265.       {
  266.          unsigned pos = frame * step, srcframe = pos / rsm.period;
  267.          unsigned nextpos = pos + step, nextframe = nextpos / rsm.period;
  268.          unsigned offset = (srcframe == nextframe)? step : (nextpos - nextframe*rsm.period);
  269.          rsm.needframes[frame] = (nextframe - srcframe);
  270.          *dst++ = offset * div;
  271.          *dst++ = (step - offset) * div;
  272.       }
  273.    }
  274.  
          // Replicate low_bias into all four bytes of the accumulator start value.
  275.    rsm.bias = 0x01010101 * low_bias;
  276.  
          // Fill the tables: for every (output frame, mixed frame) pair copy the
          // base colour table in the layout of the current mode, then scale each
          // byte of that copy by the pair's weight.
  277.    unsigned char *dst32 = (unsigned char*)rsm.tables;
  278.    for (unsigned frame = 0; frame < rsm.period; frame++)
  279.       for (unsigned ch = 0; ch < rsm.mix_frames; ch++)
  280.       {
  281.          unsigned char *start_frame = dst32;
  282.          switch (rsm.mode)
  283.          {
  284.             case 0: // rgb 16/32bit - reorder table, MMX processes 2 truecolor pixels at one op
  285.                for (unsigned a = 0; a < 0x100; a++)
  286.                   for (unsigned pix = 0; pix < 4; pix++) {
  287.                      *(DWORD*)(dst32+0) = t.sctab32[0][(pix >>1)*0x100 + a];
  288.                      *(DWORD*)(dst32+4) = t.sctab32[0][(pix & 1)*0x100 + a];
  289.                      dst32 += 8;
  290.                   }
  291.                break;
  292.             case 1: // YUY2 overlay - reorder table, MMX processes 4 overlay pixels at one op
  293.                for (unsigned a = 0; a < 0x100; a++)
  294.                   for (unsigned pix = 0; pix < 16; pix++) {
  295.                      *(DWORD*)(dst32+0) = t.sctab16[0][a*4 + (pix >> 2)];
  296.                      *(DWORD*)(dst32+4) = t.sctab16[0][a*4 + (pix &  3)];
  297.                      dst32 += 8;
  298.                   }
  299.                break;
                // 8 bpp: the base table is copied as-is.
  300.             case 2:
  301.                memcpy(dst32, t.sctab8[0], 0x100*16*sizeof(DWORD)), dst32 += 0x100*16*sizeof(DWORD);
  302.                break;
  303.             default:
  304.                __assume(0);
  305.          }
  306.  
                // Consume the next weight, in the same (frame, ch) order in which
                // the weights were written above.
  307.          double scale = *weights++ * dynamic_range * (1/256.0);
  308.          for (unsigned char *ptr = start_frame; ptr < dst32; ptr++) {
  309.             double color = scale * *ptr;
  310.             if (color > 255) color = 255;
                   // Negative products are stored modulo 256 rather than clamped to 0
                   // (presumably so the byte-wise adds in prepare_line_* act as a
                   // subtraction on top of the bias - confirm).
  311.             if (color < 0) color += 256; // color = 0;
  312.             *ptr = (unsigned char)color;
  313.          }
  314.       }
  315. }
  316.