Subversion Repositories pentevo

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
716 lvd 1
// http://scale2x.sourceforge.net/algorithm.html
2
 
3
#include "std.h"
4
 
5
#include "emul.h"
6
#include "vars.h"
7
#include "draw.h"
8
#include "dxrend.h"
9
#include "dxrcopy.h"
10
#include "dxrframe.h"
11
#include "dxr_advm.h"
12
 
13
inline void line_8_any(unsigned char *dst, unsigned char *src)
14
{
15
   if (conf.noflic) line8_nf(dst, src, t.sctab8[0]);
16
   else line8(dst, src, t.sctab8[0]);
17
}
18
 
19
inline void line_32_any(unsigned char *dst, unsigned char *src)
20
{
21
   if (conf.noflic) line32_nf(dst, src, t.sctab32[0]);
22
   else line32(dst, src, t.sctab32[0]);
23
}
24
 
25
#if 1   // switch between vectorized and branched code
26
 
27
#ifdef MOD_SSE2
28
 
29
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
30
{
31
   const unsigned char
32
      *u = src + ((y-1) & 7)*sc2lines_width,
33
      *m = src + ((y+0) & 7)*sc2lines_width,
34
      *l = src + ((y+1) & 7)*sc2lines_width;
35
 
36
   for (unsigned i = 0; i < nPix; i += 8) {
37
 
38
      __m64 uu = *(__m64*)(u+i);
39
      __m64 ll = *(__m64*)(l+i);
40
      __m64 cmp = _mm_cmpeq_pi8(uu,ll);
41
 
42
      if (_mm_movemask_pi8(cmp) != 0xFF) {
43
 
44
         __m128i mm = _mm_loadu_si128((__m128i*)(m+i-4));
45
         __m128i uu = _mm_loadu_si128((__m128i*)(u+i-4));
46
         __m128i ll = _mm_loadu_si128((__m128i*)(l+i-4));
47
 
48
         __m128i md = _mm_slli_si128(mm,1);
49
         __m128i mf = _mm_srli_si128(mm,1);
50
         __m128i maskall = _mm_or_si128(_mm_cmpeq_epi8(md,mf), _mm_cmpeq_epi8(uu,ll));
51
 
52
         __m128i e0, e1, v1, v2, v3;
53
 
54
         e0 = _mm_cmpeq_epi8(md,uu);
55
         e0 = _mm_andnot_si128(maskall, e0);
56
         e0 = _mm_srli_si128(e0,4);
57
         e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());
58
 
59
         e1 = _mm_cmpeq_epi8(mf,uu);
60
         e1 = _mm_andnot_si128(maskall, e1);
61
         e1 = _mm_srli_si128(e1,4);
62
         e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);
63
 
64
         e0 = _mm_or_si128(e0, e1);
65
 
66
         v1 = _mm_srli_si128(mm,4);
67
         v1 = _mm_unpacklo_epi8(v1,v1);
68
         v2 = _mm_srli_si128(uu,4);
69
         v2 = _mm_unpacklo_epi8(v2,v2);
70
 
71
         _mm_store_si128((__m128i*)(dst1 + 2*i), _mm_or_si128( _mm_and_si128(e0,v2), _mm_andnot_si128(e0,v1) ) );
72
 
73
         e0 = _mm_cmpeq_epi8(md,ll);
74
         e0 = _mm_andnot_si128(maskall, e0);
75
         e0 = _mm_srli_si128(e0,4);
76
         e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());
77
 
78
         e1 = _mm_cmpeq_epi8(mf,ll);
79
         e1 = _mm_andnot_si128(maskall, e1);
80
         e1 = _mm_srli_si128(e1,4);
81
         e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);
82
 
83
         e0 = _mm_or_si128(e0, e1);
84
 
85
         v3 = _mm_srli_si128(ll,4);
86
         v3 = _mm_unpacklo_epi8(v3,v3);
87
 
88
         _mm_store_si128((__m128i*)(dst2 + 2*i), _mm_or_si128( _mm_and_si128(e0,v3), _mm_andnot_si128(e0,v1) ) );
89
 
90
      } else {
91
 
92
         __m64 v0 = *(__m64*)(m+i);
93
         __m128i v1 = _mm_movpi64_epi64(v0);
94
         v1 = _mm_unpacklo_epi8(v1,v1);
95
         _mm_store_si128((__m128i*)(dst1 + 2*i), v1);
96
         _mm_store_si128((__m128i*)(dst2 + 2*i), v1);
97
      }
98
   }
99
}
100
 
101
#else // MMX vectorized
102
 
103
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
104
{
105
   const unsigned char
106
      *u = src + ((y-1) & 7)*sc2lines_width,
107
      *m = src + ((y+0) & 7)*sc2lines_width,
108
      *l = src + ((y+1) & 7)*sc2lines_width;
109
 
110
   for (unsigned i = 0; i < nPix; i += 4) {
111
 
112
      if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {
113
 
114
         __m64 mm = *(__m64*)(m+i-2);
115
         __m64 uu = *(__m64*)(u+i-2);
116
         __m64 ll = *(__m64*)(l+i-2);
117
         __m64 md = _mm_slli_si64(mm,8);
118
         __m64 mf = _mm_srli_si64(mm,8);
119
         __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf), _mm_cmpeq_pi8(uu,ll));
120
 
121
         __m64 e0, e1, v1, v2;
122
 
123
         e0 = _mm_cmpeq_pi8(md,uu);
124
         e0 = _mm_andnot_si64(maskall, e0);
125
         e0 = _mm_srli_si64(e0,16);
126
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
127
 
128
         e1 = _mm_cmpeq_pi8(mf,uu);
129
         e1 = _mm_andnot_si64(maskall, e1);
130
         e1 = _mm_srli_si64(e1,16);
131
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
132
 
133
         e0 = _mm_or_si64(e0, e1);
134
 
135
         v1 = _m_from_int(*(unsigned*)(m+i));
136
         v2 = _m_from_int(*(unsigned*)(u+i));
137
         v1 = _mm_unpacklo_pi8(v1,v1);
138
         v2 = _mm_unpacklo_pi8(v2,v2);
139
 
140
         *(__m64*)(dst1 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );
141
 
142
         e0 = _mm_cmpeq_pi8(md,ll);
143
         e0 = _mm_andnot_si64(maskall, e0);
144
         e0 = _mm_srli_si64(e0,16);
145
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
146
 
147
         e1 = _mm_cmpeq_pi8(mf,ll);
148
         e1 = _mm_andnot_si64(maskall, e1);
149
         e1 = _mm_srli_si64(e1,16);
150
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
151
 
152
         e0 = _mm_or_si64(e0, e1);
153
 
154
         v1 = _m_from_int(*(unsigned*)(m+i));
155
         v2 = _m_from_int(*(unsigned*)(l+i));
156
         v1 = _mm_unpacklo_pi8(v1,v1);
157
         v2 = _mm_unpacklo_pi8(v2,v2);
158
 
159
         *(__m64*)(dst2 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );
160
 
161
      } else {
162
 
163
         __m64 v1 = _m_from_int(*(unsigned*)(m+i));
164
         v1 = _mm_unpacklo_pi8(v1,v1);
165
         *(__m64*)(dst1 + 2*i) = v1;
166
         *(__m64*)(dst2 + 2*i) = v1;
167
 
168
      }
169
 
170
   }
171
}
172
 
173
#endif // SSE2
174
 
175
#else // MMX branched
176
// src       dst
177
// ABC       e0e1
178
// DEF       e2e3
179
// GHI
180
 
181
/*
182
if(B != H && D != F)
183
{                               E0 = E;
184
    E0 = D == B ? D : E;        E1 = E;
185
    E1 = B == F ? F : E;        E2 = E;
186
    E2 = D == H ? D : E;        E3 = E;
187
    E3 = H == F ? F : E;        if(B != H) continue;
188
}                          =>   if(D != F)
189
else                            {
190
{                                   E0 = D == B ? D : E;
191
    E0 = E;                         E1 = B == F ? F : E;
192
    E1 = E;                         E2 = D == H ? D : E;
193
    E2 = E;                         E3 = H == F ? F : E;
194
    E3 = E;                     }
195
}
196
*/
197
 
198
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
199
{
200
   const unsigned char
201
      *u = src + ((y-1) & 7)*sc2lines_width,
202
      *m = src + ((y+0) & 7)*sc2lines_width,
203
      *l = src + ((y+1) & 7)*sc2lines_width;
204
 
205
   // process 4pix per iteration
206
   for (unsigned i = 0; i < nPix; i += 4)
207
   {
208
      unsigned dw = *(unsigned*)(m+i);
209
      __m64 v1 = _mm_cvtsi32_si64(dw); // v1   =     0|    0|    0|    0|dw[3]|dw[2]|dw[1]|dw[0]
210
      v1 = _mm_unpacklo_pi8(v1,v1);    // v1   = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
211
      *(__m64*)(dst1 + 2*i) = v1;      // e0e1 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
212
      *(__m64*)(dst2 + 2*i) = v1;      // e2e3 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
213
 
214
      dw = *(unsigned*)(u+i) ^ *(unsigned*)(l+i);
215
      if (!dw)
216
          continue; // u == l
217
 
218
   #define process_pix(n)                                       \
219
      if ((dw & (0xFF << (8*n))) && m[i+n-1] != m[i+n+1])       \
220
      {                                                         \
221
         if (u[i+n] == m[i+n-1])                                \
222
             dst1[2*(i+n)] = u[i+n];                            \
223
         if (u[i+n] == m[i+n+1])                                \
224
             dst1[2*(i+n)+1] = u[i+n];                          \
225
         if (l[i+n] == m[i+n-1])                                \
226
             dst2[2*(i+n)] = l[i+n];                            \
227
         if (l[i+n] == m[i+n+1])                                \
228
             dst2[2*(i+n)+1] = l[i+n];                          \
229
      }
230
 
231
      process_pix(0);
232
      process_pix(1);
233
      process_pix(2);
234
      process_pix(3);
235
   #undef process_pix
236
   }
237
}
238
 
239
#endif // MMX branched
240
 
241
void lines_scale2_32(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
242
{
243
   const u32 *s = (u32 *)src;
244
   const u32 *u = s + ((y-1) & 7)*sc2lines_width;
245
   const u32 *m = s + ((y+0) & 7)*sc2lines_width;
246
   const u32 *l = s + ((y+1) & 7)*sc2lines_width;
247
   u32 *d1 = (u32 *)dst1;
248
   u32 *d2 = (u32 *)dst2;
249
 
250
   for (unsigned i = 0; i < nPix; i++)
251
   {
252
      d1[2*i] = d1[2*i+1] = d2[2*i] = d2[2*i+1] = m[i];
253
 
254
      if (u[i] != l[i] && m[i-1] != m[i+1])
255
      {
256
         if (u[i] == m[i-1])
257
             d1[2*i] = u[i];
258
         if (u[i] == m[i+1])
259
             d1[2*i+1] = u[i];
260
         if (l[i] == m[i-1])
261
             d2[2*i] = l[i];
262
         if (l[i] == m[i+1])
263
             d2[2*i+1] = l[i];
264
      }
265
   }
266
}
267
 
268
// 8bpp
269
void render_scale2(unsigned char *dst, unsigned pitch)
270
{
271
   unsigned char *src = rbuf; unsigned delta = temp.scx/4;
272
   line_8_any(t.scale2buf[0], src);
273
   // assume 'above' screen line same as line 0
274
   memcpy(t.scale2buf[(0-1) & 7], t.scale2buf[0], temp.scx);
275
   for (unsigned y = 0; y < temp.scy; y++)
276
   {
277
      src += delta;
278
      line_8_any(t.scale2buf[(y+1) & 7], src);
279
      lines_scale2(t.scale2buf[0], y, dst, dst+pitch, temp.scx);
280
      dst += 2*pitch;
281
   }
282
}
283
 
284
// 32bpp
285
void render_scale2_32(unsigned char *dst, unsigned pitch)
286
{
287
   unsigned char *src = rbuf;
288
   unsigned delta = temp.scx/4;
289
   line_32_any((u8 *)t.scale2buf32[0], src);
290
 
291
   // assume 'above' screen line same as line 0
292
   memcpy(t.scale2buf32[(0-1) & 7], t.scale2buf32[0], temp.scx);
293
   for (unsigned y = 0; y < temp.scy; y++)
294
   {
295
      src += delta;
296
      line_32_any((u8 *)t.scale2buf32[(y+1) & 7], src);
297
      lines_scale2_32((u8 *)t.scale2buf32[0], y, dst, dst+pitch, temp.scx);
298
      dst += 2*pitch;
299
   }
300
}
301
 
302
// MMX-vectorized version is not ready yet :(
303
// 8bpp
304
void lines_scale3(unsigned y, unsigned char *dst, unsigned pitch)
305
{
306
 
307
   const unsigned char
308
      *u = t.scale2buf[(y-1) & 3],
309
      *m = t.scale2buf[(y+0) & 3],
310
      *l = t.scale2buf[(y+1) & 3];
311
 
312
   for (unsigned i = 0; i < temp.scx; i += 4)
313
   {
314
      unsigned char c;
315
 
316
      c = m[i];
317
      dst[3*i+0+0*pitch+ 0] = dst[3*i+1+0*pitch+ 0] = dst[3*i+2+0*pitch+ 0] = c;
318
      dst[3*i+0+1*pitch+ 0] = dst[3*i+1+1*pitch+ 0] = dst[3*i+2+1*pitch+ 0] = c;
319
      dst[3*i+0+2*pitch+ 0] = dst[3*i+1+2*pitch+ 0] = dst[3*i+2+2*pitch+ 0] = c;
320
 
321
      c = m[i+1];
322
      dst[3*i+0+0*pitch+ 3] = dst[3*i+1+0*pitch+ 3] = dst[3*i+2+0*pitch+ 3] = c;
323
      dst[3*i+0+1*pitch+ 3] = dst[3*i+1+1*pitch+ 3] = dst[3*i+2+1*pitch+ 3] = c;
324
      dst[3*i+0+2*pitch+ 3] = dst[3*i+1+2*pitch+ 3] = dst[3*i+2+2*pitch+ 3] = c;
325
 
326
      c = m[i+2];
327
      dst[3*i+0+0*pitch+ 6] = dst[3*i+1+0*pitch+ 6] = dst[3*i+2+0*pitch+ 6] = c;
328
      dst[3*i+0+1*pitch+ 6] = dst[3*i+1+1*pitch+ 6] = dst[3*i+2+1*pitch+ 6] = c;
329
      dst[3*i+0+2*pitch+ 6] = dst[3*i+1+2*pitch+ 6] = dst[3*i+2+2*pitch+ 6] = c;
330
 
331
      c = m[i+3];
332
      dst[3*i+0+0*pitch+ 9] = dst[3*i+1+0*pitch+ 9] = dst[3*i+2+0*pitch+ 9] = c;
333
      dst[3*i+0+1*pitch+ 9] = dst[3*i+1+1*pitch+ 9] = dst[3*i+2+1*pitch+ 9] = c;
334
      dst[3*i+0+2*pitch+ 9] = dst[3*i+1+2*pitch+ 9] = dst[3*i+2+2*pitch+ 9] = c;
335
 
336
      unsigned dw = *(unsigned*)(u+i) ^ *(unsigned*)(l+i);
337
      if (!dw) continue;
338
 
339
   #define process_pix(n)                                                                              \
340
      if ((dw & (0xFF << (8*n))) && m[i+n-1] != m[i+n+1])                                              \
341
      {                                                                                                \
342
         if (u[i+n] == m[i+n-1])                                                                       \
343
             dst[0*pitch+3*(i+n)] = u[i+n];                                                            \
344
         if ((u[i+n] == m[i+n-1] && m[i+n] != u[i+n+1]) || (u[i+n] == m[i+n+1] && m[i+n] != u[i+n-1])) \
345
             dst[0*pitch+3*(i+n)+1] = u[i+n];                                                          \
346
         if (u[i+n] == m[i+n+1])                                                                       \
347
             dst[0*pitch+3*(i+n)+2] = u[i+n];                                                          \
348
         if ((u[i+n] == m[i+n-1] && m[i+n] != l[i+n-1]) || (l[i+n] == m[i+n-1] && m[i+n] != u[i+n-1])) \
349
             dst[1*pitch+3*(i+n)+0] = m[i+n-1];                                                        \
350
         if ((u[i+n] == m[i+n+1] && m[i+n] != l[i+n+1]) || (l[i+n] == m[i+n+1] && m[i+n] != u[i+n+1])) \
351
             dst[1*pitch+3*(i+n)+2] = m[i+n+1];                                                        \
352
         if (l[i+n] == m[i+n-1])                                                                       \
353
             dst[2*pitch+3*(i+n)] = l[i+n];                                                            \
354
         if ((l[i+n] == m[i+n-1] && m[i+n] != l[i+n+1]) || (l[i+n] == m[i+n+1] && m[i+n] != l[i+n-1])) \
355
             dst[2*pitch+3*(i+n)+1] = l[i+n];                                                          \
356
         if (l[i+n] == m[i+n+1])                                                                       \
357
             dst[2*pitch+3*(i+n)+2] = l[i+n];                                                          \
358
      }
359
 
360
      process_pix(0);
361
      process_pix(1);
362
      process_pix(2);
363
      process_pix(3);
364
   #undef process_pix
365
   }
366
}
367
 
368
// 8bpp
369
void render_scale3(unsigned char *dst, unsigned pitch)
370
{
371
   unsigned char *src = rbuf; unsigned delta = temp.scx/4;
372
   line_8_any(t.scale2buf[0], src);
373
   // assume 'above' screen line same as line 0
374
   memcpy(t.scale2buf[(0-1) & 3], t.scale2buf[0], temp.scx);
375
   for (unsigned y = 0; y < temp.scy; y++) {
376
      src += delta;
377
      line_8_any(t.scale2buf[(y+1) & 3], src);
378
      lines_scale3(y, dst, pitch);
379
      dst += 3*pitch;
380
   }
381
}
382
 
383
// 32bpp
384
void lines_scale3_32(unsigned y, unsigned char *dst, unsigned pitch)
385
{
386
   const u32 *u = t.scale2buf32[(y-1) & 3];
387
   const u32 *m = t.scale2buf32[(y+0) & 3];
388
   const u32 *l = t.scale2buf32[(y+1) & 3];
389
   u32 *d = (u32 *)dst;
390
   pitch /= sizeof(u32);
391
 
392
   for (unsigned i = 0; i < temp.scx; i++)
393
   {
394
      d[0*pitch+3*i+0] = d[0*pitch+3*i+1] = d[0*pitch+3*i+2] = m[i];
395
      d[1*pitch+3*i+0] = d[1*pitch+3*i+1] = d[1*pitch+3*i+2] = m[i];
396
      d[2*pitch+3*i+0] = d[2*pitch+3*i+1] = d[2*pitch+3*i+2] = m[i];
397
 
398
      if (u[i] != l[i] && m[i-1] != m[i+1])
399
      {
400
         if (u[i] == m[i-1])
401
             d[0*pitch+3*i+0] = u[i];
402
         if ((u[i] == m[i-1] && m[i] != u[i+1]) || (u[i] == m[i+1] && m[i] != u[i-1]))
403
             d[0*pitch+3*i+1] = u[i];
404
         if (u[i] == m[i+1])
405
             d[0*pitch+3*i+2] = u[i];
406
         if ((u[i] == m[i-1] && m[i] != l[i-1]) || (l[i] == m[i-1] && m[i] != u[i-1]))
407
             d[1*pitch+3*i+0] = m[i-1];
408
         if ((u[i] == m[i+1] && m[i] != l[i+1]) || (l[i] == m[i+1] && m[i] != u[i+1]))
409
             d[1*pitch+3*i+2] = m[i+1];
410
         if (l[i] == m[i-1])
411
             d[2*pitch+3*i+0] = l[i];
412
         if ((l[i] == m[i-1] && m[i] != l[i+1]) || (l[i] == m[i+1] && m[i] != l[i-1]))
413
             d[2*pitch+3*i+1] = l[i];
414
         if (l[i] == m[i+1])
415
             d[2*pitch+3*i+2] = l[i];
416
      }
417
   }
418
}
419
 
420
// 32bpp
421
void render_scale3_32(unsigned char *dst, unsigned pitch)
422
{
423
   unsigned char *src = rbuf; unsigned delta = temp.scx/4;
424
   line_32_any((u8 *)t.scale2buf32[0], src);
425
   // assume 'above' screen line same as line 0
426
   memcpy(t.scale2buf32[(0-1) & 3], t.scale2buf32[0], temp.scx);
427
   for (unsigned y = 0; y < temp.scy; y++)
428
   {
429
      src += delta;
430
      line_32_any((u8 *)t.scale2buf32[(y+1) & 3], src);
431
      lines_scale3_32(y, dst, pitch);
432
      dst += 3*pitch;
433
   }
434
}
435
 
436
void render_scale4(unsigned char *dst, unsigned pitch)
437
{
438
   unsigned char *src = rbuf; unsigned delta = temp.scx/4;
439
 
440
   line_8_any(t.scale2buf[0], src); src += delta;
441
   line_8_any(t.scale2buf[1], src); src += delta;
442
   // assume 'above' screen line same as line 0
443
   memcpy(t.scale2buf[(0-1) & 7], t.scale2buf[0], temp.scx);
444
   lines_scale2(t.scale2buf[0], 0, t.scale4buf[0], t.scale4buf[1], temp.scx);
445
 
446
   for (unsigned y = 0; y < temp.scy; y++) {
447
 
448
      line_8_any(t.scale2buf[(y+2) & 7], src); src += delta;
449
 
450
      unsigned char *dst1 = t.scale4buf[(2*y+2) & 7];
451
      unsigned char *dst2 = t.scale4buf[(2*y+3) & 7];
452
      lines_scale2(t.scale2buf[0], y+1, dst1, dst2, temp.scx);
453
 
454
      lines_scale2(t.scale4buf[0], 2*y,   dst+0*pitch, dst+1*pitch, temp.scx*2);
455
      lines_scale2(t.scale4buf[0], 2*y+1, dst+2*pitch, dst+3*pitch, temp.scx*2);
456
 
457
      dst += 4*pitch;
458
   }
459
}
460
 
461
// Entry point of the AdvMAME (Scale2x/3x/4x) renderer.
//   dst   : top-left of the output surface
//   pitch : distance in bytes between two output rows
// Dispatches on conf.videoscale and temp.obpp. Scales other than 2, 3 and 4
// are handed to render_small and return immediately. For scales 2 and 3 only
// output depths 8 and 32 are drawn; any other depth draws nothing. After
// drawing, rbuf is copied to rbuf_s when conf.noflic is set, and the MMX
// state is cleared with _mm_empty().
void __fastcall render_advmame(unsigned char *dst, unsigned pitch)
{
   switch (conf.videoscale)
   {
      default:
          render_small(dst, pitch);
          return; // skip noflic test

      case 2:
          switch (temp.obpp)
          {
             case 8:  render_scale2(dst, pitch);    break;
             case 32: render_scale2_32(dst, pitch); break;
          }
          break;

      case 3:
          switch (temp.obpp)
          {
             case 8:  render_scale3(dst, pitch);    break;
             case 32: render_scale3_32(dst, pitch); break;
          }
          break;

      case 4:
          render_scale4(dst, pitch);
          break;
   }

   if (conf.noflic)
   {
       memcpy(rbuf_s, rbuf, temp.scy*temp.scx/4);
   }

   _mm_empty();
}