Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
716 | lvd | 1 | #include "std.h" |
2 | |||
3 | #include "emul.h" |
||
4 | #include "vars.h" |
||
5 | #include "draw.h" |
||
6 | #include "dxr_rsm.h" |
||
7 | |||
8 | RSM_DATA rsm; |
||
9 | |||
10 | void RSM_DATA::prepare_line_32(unsigned char *src0) |
||
11 | { |
||
12 | unsigned i; |
||
13 | for (i = 0; i < line_size_d; i++) |
||
14 | line_buffer_d[i] = bias; |
||
15 | |||
16 | unsigned line_size = line_size_d / 2, frame = 0; |
||
17 | __m64 *tab = colortab; |
||
18 | for (;;) |
||
19 | { |
||
20 | unsigned char *src = src0; |
||
21 | for (i = 0; i < line_size; ) |
||
22 | { |
||
23 | unsigned s = *(unsigned*)src, attr = (s >> 6) & 0x3FC; |
||
24 | line_buffer[i+0] = _mm_add_pi8(line_buffer[i+0], tab[((s >> 6) & 3) + attr]); |
||
25 | line_buffer[i+1] = _mm_add_pi8(line_buffer[i+1], tab[((s >> 4) & 3) + attr]); |
||
26 | line_buffer[i+2] = _mm_add_pi8(line_buffer[i+2], tab[((s >> 2) & 3) + attr]); |
||
27 | line_buffer[i+3] = _mm_add_pi8(line_buffer[i+3], tab[((s >> 0) & 3) + attr]); |
||
28 | attr = (s >> 22) & 0x3FC; |
||
29 | line_buffer[i+4] = _mm_add_pi8(line_buffer[i+4], tab[((s >>22) & 3) + attr]); |
||
30 | line_buffer[i+5] = _mm_add_pi8(line_buffer[i+5], tab[((s >>20) & 3) + attr]); |
||
31 | line_buffer[i+6] = _mm_add_pi8(line_buffer[i+6], tab[((s >>18) & 3) + attr]); |
||
32 | line_buffer[i+7] = _mm_add_pi8(line_buffer[i+7], tab[((s >>16) & 3) + attr]); |
||
33 | i += 8, src += 4; |
||
34 | } |
||
35 | if (++frame == mix_frames) |
||
36 | break; |
||
37 | src0 += rb2_offs; |
||
38 | if (src0 >= rbuf_s + rb2_offs * mix_frames) |
||
39 | src0 -= rb2_offs * mix_frames; |
||
40 | tab += 0x100*4; |
||
41 | } |
||
42 | } |
||
43 | |||
44 | void RSM_DATA::prepare_line_16(unsigned char *src0) |
||
45 | { |
||
46 | unsigned i; |
||
47 | for (i = 0; i < line_size_d; i++) |
||
48 | line_buffer_d[i] = bias; |
||
49 | |||
50 | unsigned line_size = line_size_d / 2, frame = 0; |
||
51 | __m64 *tab = colortab; |
||
52 | for (;;) { |
||
53 | unsigned char *src = src0; |
||
54 | for (i = 0; i < line_size; ) { |
||
55 | unsigned s = *(unsigned*)src, attr = (s >> 4) & 0xFF0; |
||
56 | line_buffer[i+0] = _mm_add_pi8(line_buffer[i+0], tab[((s >> 4) & 0xF) + attr]); |
||
57 | line_buffer[i+1] = _mm_add_pi8(line_buffer[i+1], tab[((s >> 0) & 0xF) + attr]); |
||
58 | attr = (s >> 20) & 0xFF0; |
||
59 | line_buffer[i+2] = _mm_add_pi8(line_buffer[i+2], tab[((s >> 20) & 0xF) + attr]); |
||
60 | line_buffer[i+3] = _mm_add_pi8(line_buffer[i+3], tab[((s >> 16) & 0xF) + attr]); |
||
61 | src += 4; i += 4; |
||
62 | } |
||
63 | if (++frame == mix_frames) break; |
||
64 | src0 += rb2_offs; |
||
65 | if (src0 >= rbuf_s + rb2_offs * mix_frames) src0 -= rb2_offs * mix_frames; |
||
66 | tab += 0x100*16; |
||
67 | } |
||
68 | } |
||
69 | |||
70 | void RSM_DATA::prepare_line_8(unsigned char *src0) |
||
71 | { |
||
72 | unsigned i; |
||
73 | for (i = 0; i < line_size_d; i++) |
||
74 | line_buffer_d[i] = bias; |
||
75 | |||
76 | unsigned frame = 0; |
||
77 | DWORD *tab = (DWORD*)colortab; |
||
78 | for (;;) { |
||
79 | unsigned char *src = src0; |
||
80 | for (i = 0; i < line_size_d; ) { |
||
81 | unsigned s = *(unsigned*)src, attr = (s >> 4) & 0xFF0; |
||
82 | line_buffer_d[i+0] += tab[((s >> 4) & 0xF) + attr]; |
||
83 | line_buffer_d[i+1] += tab[((s >> 0) & 0xF) + attr]; |
||
84 | attr = (s >> 20) & 0xFF0; |
||
85 | line_buffer_d[i+2] += tab[((s >> 20) & 0xF) + attr]; |
||
86 | line_buffer_d[i+3] += tab[((s >> 16) & 0xF) + attr]; |
||
87 | s = *(unsigned*)(src+4), attr = (s >> 4) & 0xFF0; |
||
88 | line_buffer_d[i+4] += tab[((s >> 4) & 0xF) + attr]; |
||
89 | line_buffer_d[i+5] += tab[((s >> 0) & 0xF) + attr]; |
||
90 | attr = (s >> 20) & 0xFF0; |
||
91 | line_buffer_d[i+6] += tab[((s >> 20) & 0xF) + attr]; |
||
92 | line_buffer_d[i+7] += tab[((s >> 16) & 0xF) + attr]; |
||
93 | src += 8, i += 8; |
||
94 | } |
||
95 | if (++frame == mix_frames) break; |
||
96 | src0 += rb2_offs; |
||
97 | if (src0 >= rbuf_s + rb2_offs * mix_frames) src0 -= rb2_offs * mix_frames; |
||
98 | tab += 0x100*16; |
||
99 | } |
||
100 | } |
||
101 | |||
102 | void rend_rsm_8(unsigned char *dst, unsigned pitch, unsigned char *src) |
||
103 | { |
||
104 | unsigned delta = temp.scx/4; |
||
105 | for (unsigned y = 0; y < temp.scy; y++) { |
||
106 | rsm.prepare_line_8(src); |
||
107 | for (unsigned i = 0; i < rsm.line_size_d; i++) |
||
108 | *(unsigned*)(dst + i*4) = rsm.line_buffer_d[i]; |
||
109 | dst += pitch; src += delta; |
||
110 | } |
||
111 | } |
||
112 | |||
113 | void rend_rsm_16(unsigned char *dst, unsigned pitch, unsigned char *src) |
||
114 | { |
||
115 | unsigned delta = temp.scx/4; |
||
116 | for (unsigned y = 0; y < temp.scy; y++) { |
||
117 | rsm.prepare_line_32(src); |
||
118 | // pack truecolor pixel to 16 bit |
||
119 | if (temp.hi15 == 0) |
||
120 | for (unsigned i = 0; i < rsm.line_size_d; i+=2) { |
||
121 | unsigned c1 = rsm.line_buffer_d[i]; |
||
122 | unsigned c2 = rsm.line_buffer_d[i+1]; |
||
123 | *(unsigned*)(dst + i*2) = |
||
124 | ((c1 >> 3) & 0x1F) + ((c1 >> 5) & 0x07E0) + ((c1 >> 8) & 0xF800) + |
||
125 | ((c2 << 13) & 0x1F0000) + ((c2 << 11) & 0x07E00000) + ((c2 << 8) & 0xF8000000); |
||
126 | } |
||
127 | else /* if (temp.hi15 == 1) */ |
||
128 | for (unsigned i = 0; i < rsm.line_size_d; i+=2) { |
||
129 | unsigned c1 = rsm.line_buffer_d[i]; |
||
130 | unsigned c2 = rsm.line_buffer_d[i+1]; |
||
131 | *(unsigned*)(dst + i*2) = |
||
132 | ((c1 >> 3) & 0x1F) + ((c1 >> 6) & 0x03E0) + ((c1 >> 9) & 0x7C00) + |
||
133 | ((c2 << 13) & 0x1F0000) + ((c2 << 10) & 0x03E00000) + ((c2 << 7) & 0x7C000000); |
||
134 | } |
||
135 | |||
136 | dst += pitch; src += delta; |
||
137 | } |
||
138 | } |
||
139 | |||
140 | void rend_rsm_16o(unsigned char *dst, unsigned pitch, unsigned char *src) |
||
141 | { |
||
142 | unsigned delta = temp.scx/4; |
||
143 | for (unsigned y = 0; y < temp.scy; y++) { |
||
144 | rsm.prepare_line_16(src); |
||
145 | for (unsigned i = 0; i < rsm.line_size_d; i++) |
||
146 | *(unsigned*)(dst + i*4) = rsm.line_buffer_d[i]; |
||
147 | |||
148 | dst += pitch; src += delta; |
||
149 | } |
||
150 | } |
||
151 | |||
152 | void rend_rsm_32(unsigned char *dst, unsigned pitch, unsigned char *src) |
||
153 | { |
||
154 | unsigned delta = temp.scx/4; |
||
155 | for (unsigned y = 0; y < temp.scy; y++) { |
||
156 | rsm.prepare_line_32(src); |
||
157 | for (unsigned i = 0; i < rsm.line_size_d; i++) |
||
158 | *(unsigned*)(dst + i*4) = rsm.line_buffer_d[i]; |
||
159 | dst += pitch; src += delta; |
||
160 | } |
||
161 | } |
||
162 | |||
163 | void __fastcall render_rsm(unsigned char *dst, unsigned pitch) |
||
164 | { |
||
165 | rsm.colortab = (__m64*)((int)rsm.tables + rsm.frame * rsm.frame_table_size); |
||
166 | unsigned char *src = rbuf_s + rb2_offs * rsm.rbuf_dst; |
||
167 | |||
168 | if (temp.obpp == 8) rend_rsm_8 (dst, pitch, src); |
||
169 | if (temp.obpp == 16) { if (rsm.mode == 0) rend_rsm_16(dst, pitch, src); else rend_rsm_16o(dst, pitch, src); } |
||
170 | if (temp.obpp == 32) rend_rsm_32(dst, pitch, src); |
||
171 | |||
172 | _mm_empty(); // EMMS |
||
173 | } |
||
174 | |||
175 | unsigned gcd(unsigned x, unsigned y) |
||
176 | { |
||
177 | while (x != y) if (x > y) x -= y; else y -= x; |
||
178 | return x; |
||
179 | } |
||
180 | |||
181 | unsigned lcm(unsigned x, unsigned y) |
||
182 | { |
||
183 | return x*y / gcd(x,y); |
||
184 | } |
||
185 | |||
186 | void calc_rsm_tables() |
||
187 | { |
||
188 | rsm.rbuf_dst = rsm.frame = 0; |
||
189 | |||
190 | if (renders[conf.render].func != render_rsm) { |
||
191 | rsm.mix_frames = rsm.period = 1; |
||
192 | static unsigned char one = 1; |
||
193 | rsm.needframes = &one; // rsm.needframes[0]=1 |
||
194 | return; |
||
195 | } |
||
196 | |||
197 | rsm.mode = (temp.obpp == 8)? 2 : 0; |
||
198 | if (temp.obpp == 16 && temp.hi15 == 2) rsm.mode = 1; |
||
199 | |||
200 | rsm.line_size_d = (temp.scx >> rsm.mode); |
||
201 | |||
202 | unsigned fmax = lcm(conf.intfq, temp.ofq); |
||
203 | rsm.period = fmax / conf.intfq; |
||
204 | unsigned step = fmax / temp.ofq; |
||
205 | |||
206 | rsm.mix_frames = (conf.rsm.mode == RSM_SIMPLE)? 2 : conf.rsm.mix_frames; |
||
207 | |||
208 | rsm.frame_table_size = rsm.mix_frames * 0x100; |
||
209 | if (rsm.mode == 0) rsm.frame_table_size *= 4*sizeof(__m64); |
||
210 | if (rsm.mode == 1) rsm.frame_table_size *= 16*sizeof(__m64); |
||
211 | if (rsm.mode == 2) rsm.frame_table_size *= 16*sizeof(DWORD); |
||
212 | |||
213 | rsm.data = (unsigned char*)realloc(rsm.data, rsm.period * (rsm.frame_table_size + 1)); |
||
214 | rsm.tables = (__m64*)rsm.data; |
||
215 | rsm.needframes = rsm.data + rsm.frame_table_size * rsm.period; |
||
216 | |||
217 | double *weights = (double*)alloca(rsm.period * rsm.mix_frames * sizeof(double)); |
||
218 | double *dst = weights; |
||
219 | |||
220 | unsigned low_bias = 0, dynamic_range = 0xFF; |
||
221 | |||
222 | if (conf.rsm.mode != RSM_SIMPLE) { |
||
223 | unsigned fsize = rsm.period * rsm.mix_frames; |
||
224 | double *flt = (double*)alloca((fsize+1) * sizeof(double)); |
||
225 | |||
226 | double cutoff = 0.9; |
||
227 | if (conf.rsm.mode == RSM_FIR1) cutoff = 0.5; |
||
228 | if (conf.rsm.mode == RSM_FIR2) cutoff = 0.33333333; |
||
229 | |||
230 | double pi = 4*atan(1.0); |
||
231 | cutoff *= 1 / (double)rsm.period; // cutoff scale = inftq/maxfq = 1/rsm.period |
||
232 | double c1 = 0.54 / pi, c2 = 0.46 / pi; |
||
233 | for (unsigned i = 0; i <= fsize; i++) { |
||
234 | if (i == fsize/2) flt[i] = cutoff; |
||
235 | else flt[i] = sin(pi*cutoff*((double)i - fsize/2)) * (c1 - c2*cos(2*pi*(double)i/fsize)) / ((double)i - fsize/2); |
||
236 | } |
||
237 | |||
238 | double low_b = 0, high_b = 0; |
||
239 | for (unsigned frame = 0; frame < rsm.period; frame++) |
||
240 | { |
||
241 | unsigned pos = frame * step, srcframe = pos / rsm.period; |
||
242 | if (frame) srcframe++; // (pos % rsm.period) != 0 |
||
243 | |||
244 | unsigned nextpos = pos + step, nextframe = nextpos / rsm.period; |
||
245 | if (frame+1 != rsm.period) nextframe++; // (nextpos % rsm.period) != 0 |
||
246 | |||
247 | rsm.needframes[frame] = (nextframe - srcframe); |
||
248 | double low = 0, high = 0; |
||
249 | unsigned offset = (srcframe * rsm.period) - pos; |
||
250 | for (unsigned ch = 0; ch < rsm.mix_frames; ch++) { |
||
251 | double weight = flt[offset] * rsm.period; |
||
252 | if (weight < 0) low += weight; else high += weight; |
||
253 | *dst++ = weight; |
||
254 | offset += rsm.period; |
||
255 | } |
||
256 | if (low < low_b) low_b = low; |
||
257 | if (high > high_b) high_b = high; |
||
258 | } |
||
259 | low_bias = (unsigned)((-low_b)*0xFF); |
||
260 | dynamic_range = (0xFF - low_bias); |
||
261 | } else { // RSM_SIMPLE |
||
262 | |||
263 | double div = 1 / (double)step; |
||
264 | for (unsigned frame = 0; frame < rsm.period; frame++) |
||
265 | { |
||
266 | unsigned pos = frame * step, srcframe = pos / rsm.period; |
||
267 | unsigned nextpos = pos + step, nextframe = nextpos / rsm.period; |
||
268 | unsigned offset = (srcframe == nextframe)? step : (nextpos - nextframe*rsm.period); |
||
269 | rsm.needframes[frame] = (nextframe - srcframe); |
||
270 | *dst++ = offset * div; |
||
271 | *dst++ = (step - offset) * div; |
||
272 | } |
||
273 | } |
||
274 | |||
275 | rsm.bias = 0x01010101 * low_bias; |
||
276 | |||
277 | unsigned char *dst32 = (unsigned char*)rsm.tables; |
||
278 | for (unsigned frame = 0; frame < rsm.period; frame++) |
||
279 | for (unsigned ch = 0; ch < rsm.mix_frames; ch++) |
||
280 | { |
||
281 | unsigned char *start_frame = dst32; |
||
282 | switch (rsm.mode) |
||
283 | { |
||
284 | case 0: // rgb 16/32bit - reorder table, MMX processes 2 truecolor pixels at one op |
||
285 | for (unsigned a = 0; a < 0x100; a++) |
||
286 | for (unsigned pix = 0; pix < 4; pix++) { |
||
287 | *(DWORD*)(dst32+0) = t.sctab32[0][(pix >>1)*0x100 + a]; |
||
288 | *(DWORD*)(dst32+4) = t.sctab32[0][(pix & 1)*0x100 + a]; |
||
289 | dst32 += 8; |
||
290 | } |
||
291 | break; |
||
292 | case 1: // YUY2 overlay - reorder table, MMX processes 4 overlay pixels at one op |
||
293 | for (unsigned a = 0; a < 0x100; a++) |
||
294 | for (unsigned pix = 0; pix < 16; pix++) { |
||
295 | *(DWORD*)(dst32+0) = t.sctab16[0][a*4 + (pix >> 2)]; |
||
296 | *(DWORD*)(dst32+4) = t.sctab16[0][a*4 + (pix & 3)]; |
||
297 | dst32 += 8; |
||
298 | } |
||
299 | break; |
||
300 | case 2: |
||
301 | memcpy(dst32, t.sctab8[0], 0x100*16*sizeof(DWORD)), dst32 += 0x100*16*sizeof(DWORD); |
||
302 | break; |
||
303 | default: |
||
304 | __assume(0); |
||
305 | } |
||
306 | |||
307 | double scale = *weights++ * dynamic_range * (1/256.0); |
||
308 | for (unsigned char *ptr = start_frame; ptr < dst32; ptr++) { |
||
309 | double color = scale * *ptr; |
||
310 | if (color > 255) color = 255; |
||
311 | if (color < 0) color += 256; // color = 0; |
||
312 | *ptr = (unsigned char)color; |
||
313 | } |
||
314 | } |
||
315 | } |