2 * DSP utils mmx functions are compiled twice for rnd/no_rnd
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
21 * and improved by Zdenek Kabelac <kabi@users.sf.net>
/*
 * put pixels8_x2: combines each 8-byte source row with the same row read one
 * byte further on (offset 1) via PAVGBP and stores the result to block.
 * Operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI).
 * NOTE(review): %3 is used as the row offset and is presumably line_size;
 * PAVGBP is defined elsewhere and presumably yields the rnd/no_rnd byte
 * average of each input pair -- confirm both.
 */
25 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
// EAX = 2 * %3, i.e. the distance covered by two rows.
29 "lea (%3, %3), %%eax \n\t"
// Row pair: (mm0, mm1) are row 0 at offsets 0/1, (mm2, mm3) are row 1.
32 "movq (%1), %%mm0 \n\t"
33 "movq 1(%1), %%mm1 \n\t"
34 "movq (%1, %3), %%mm2 \n\t"
35 "movq 1(%1, %3), %%mm3 \n\t"
36 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
// mm4 / mm5 carry the results for row 0 / row 1.
37 "movq %%mm4, (%2) \n\t"
38 "movq %%mm5, (%2, %3) \n\t"
// Same sequence again for another row pair.
41 "movq (%1), %%mm0 \n\t"
42 "movq 1(%1), %%mm1 \n\t"
43 "movq (%1, %3), %%mm2 \n\t"
44 "movq 1(%1, %3), %%mm3 \n\t"
45 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
46 "movq %%mm4, (%2) \n\t"
47 "movq %%mm5, (%2, %3) \n\t"
52 :"+g"(h), "+S"(pixels), "+D"(block)
/*
 * put pixels8_l2: combines 8-byte rows taken from src1 and src2 with
 * PAVGB/PAVGBP and stores them to dst.
 * Operands: %0 = h (memory operand when PIC is defined, EBX otherwise),
 * %1 = src1 (EAX), %2 = src2 (ECX), %3 = dst (EDX),
 * %4 = src1Stride (ESI), %5 = dstStride (EDI).
 * src2 is read at fixed offsets 0, 8, 16 and 24 from %2; there is no stride
 * argument for it.
 * NOTE(review): the leading single-row PAVGB group looks like separate
 * handling of one row before the paired groups -- confirm.
 */
57 static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
// Single row: one src1 row and one src2 row combined into mm4.
63 "movq (%1), %%mm0 \n\t"
64 "movq (%2), %%mm1 \n\t"
67 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
68 "movq %%mm4, (%3) \n\t"
// Two rows at a time: src2 rows at offsets 0 and 8.
73 "movq (%1), %%mm0 \n\t"
74 "movq (%2), %%mm1 \n\t"
76 "movq (%1), %%mm2 \n\t"
77 "movq 8(%2), %%mm3 \n\t"
79 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
80 "movq %%mm4, (%3) \n\t"
82 "movq %%mm5, (%3) \n\t"
// Next two rows: src2 rows at offsets 16 and 24.
84 "movq (%1), %%mm0 \n\t"
85 "movq 16(%2), %%mm1 \n\t"
87 "movq (%1), %%mm2 \n\t"
88 "movq 24(%2), %%mm3 \n\t"
91 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
92 "movq %%mm4, (%3) \n\t"
94 "movq %%mm5, (%3) \n\t"
98 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
99 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
101 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
103 :"S"(src1Stride), "D"(dstStride)
/*
 * put pixels16_x2: 16-byte-wide variant of the x2 routine. For every row it
 * combines the bytes at offsets 0 and 1 (left 8 columns) and at offsets 8 and
 * 9 (right 8 columns) via PAVGBP and stores the results to block and block+8.
 * Operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI).
 * NOTE(review): %3 is used as the row offset and is presumably line_size
 * -- confirm.
 */
107 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
// EAX = 2 * %3; both pointers are advanced by it after each row pair.
111 "lea (%3, %3), %%eax \n\t"
// Row pair, left 8 columns.
114 "movq (%1), %%mm0 \n\t"
115 "movq 1(%1), %%mm1 \n\t"
116 "movq (%1, %3), %%mm2 \n\t"
117 "movq 1(%1, %3), %%mm3 \n\t"
118 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
119 "movq %%mm4, (%2) \n\t"
120 "movq %%mm5, (%2, %3) \n\t"
// Same row pair, right 8 columns.
121 "movq 8(%1), %%mm0 \n\t"
122 "movq 9(%1), %%mm1 \n\t"
123 "movq 8(%1, %3), %%mm2 \n\t"
124 "movq 9(%1, %3), %%mm3 \n\t"
125 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
126 "movq %%mm4, 8(%2) \n\t"
127 "movq %%mm5, 8(%2, %3) \n\t"
// Step source and destination down by two rows.
128 "addl %%eax, %1 \n\t"
129 "addl %%eax, %2 \n\t"
// Second row pair, left 8 columns.
130 "movq (%1), %%mm0 \n\t"
131 "movq 1(%1), %%mm1 \n\t"
132 "movq (%1, %3), %%mm2 \n\t"
133 "movq 1(%1, %3), %%mm3 \n\t"
134 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
135 "movq %%mm4, (%2) \n\t"
136 "movq %%mm5, (%2, %3) \n\t"
// Second row pair, right 8 columns.
137 "movq 8(%1), %%mm0 \n\t"
138 "movq 9(%1), %%mm1 \n\t"
139 "movq 8(%1, %3), %%mm2 \n\t"
140 "movq 9(%1, %3), %%mm3 \n\t"
141 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
142 "movq %%mm4, 8(%2) \n\t"
143 "movq %%mm5, 8(%2, %3) \n\t"
144 "addl %%eax, %1 \n\t"
145 "addl %%eax, %2 \n\t"
148 :"+g"(h), "+S"(pixels), "+D"(block)
/*
 * put pixels16_l2: combines 16-byte rows taken from src1 and src2 with PAVGBP
 * and stores them to dst (two 8-byte halves per row, at offsets 0 and 8).
 * Operands: %0 = h (memory operand when PIC is defined, EBX otherwise),
 * %1 = src1 (EAX), %2 = src2 (ECX), %3 = dst (EDX),
 * %4 = src1Stride (ESI), %5 = dstStride (EDI).
 * src2 is read at fixed offsets 0, 8, 16 and 24 from %2; there is no stride
 * argument for it.
 * NOTE(review): the first group looks like separate handling of a single row
 * ahead of the two-row groups -- confirm.
 */
153 static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
// One 16-byte row: left half in (mm0, mm1), right half in (mm2, mm3).
159 "movq (%1), %%mm0 \n\t"
160 "movq (%2), %%mm1 \n\t"
161 "movq 8(%1), %%mm2 \n\t"
162 "movq 8(%2), %%mm3 \n\t"
165 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
166 "movq %%mm4, (%3) \n\t"
167 "movq %%mm5, 8(%3) \n\t"
// Row using src2 bytes 0..15.
172 "movq (%1), %%mm0 \n\t"
173 "movq (%2), %%mm1 \n\t"
174 "movq 8(%1), %%mm2 \n\t"
175 "movq 8(%2), %%mm3 \n\t"
177 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
178 "movq %%mm4, (%3) \n\t"
179 "movq %%mm5, 8(%3) \n\t"
// Row using src2 bytes 16..31.
181 "movq (%1), %%mm0 \n\t"
182 "movq 16(%2), %%mm1 \n\t"
183 "movq 8(%1), %%mm2 \n\t"
184 "movq 24(%2), %%mm3 \n\t"
186 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
187 "movq %%mm4, (%3) \n\t"
188 "movq %%mm5, 8(%3) \n\t"
193 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
194 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
196 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
198 :"S"(src1Stride), "D"(dstStride)
/*
 * put pixels8_y2: combines each 8-byte source row with the row below it via
 * PAVGBP and stores the result to block. The last row loaded in one group is
 * kept in a register and reused as the upper row of the next group.
 * Operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI).
 * NOTE(review): %3 is used as the row offset and is presumably line_size
 * -- confirm.
 */
202 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
// EAX = 2 * %3; mm0 starts out holding the first source row.
206 "lea (%3, %3), %%eax \n\t"
207 "movq (%1), %%mm0 \n\t"
// mm1 = row + 1, mm2 = row + 2; pairs (mm1, mm0) and (mm2, mm1) are combined.
210 "movq (%1, %3), %%mm1 \n\t"
211 "movq (%1, %%eax),%%mm2 \n\t"
212 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
213 "movq %%mm4, (%2) \n\t"
214 "movq %%mm5, (%2, %3) \n\t"
215 "addl %%eax, %1 \n\t"
216 "addl %%eax, %2 \n\t"
// Roles of mm0 and mm2 swap: mm2 still holds the row that is now on top.
217 "movq (%1, %3), %%mm1 \n\t"
218 "movq (%1, %%eax),%%mm0 \n\t"
219 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
220 "movq %%mm4, (%2) \n\t"
221 "movq %%mm5, (%2, %3) \n\t"
222 "addl %%eax, %1 \n\t"
223 "addl %%eax, %2 \n\t"
226 :"+g"(h), "+S"(pixels), "+D"(block)
/*
 * put pixels8_xy2: for each output byte, sums four source bytes (two
 * horizontally adjacent bytes on one row and the two below them), adds the
 * SET_RND constant, shifts right by 2 and stores the packed result to block.
 * The horizontal sums of the lower row are kept and reused as the upper row
 * of the next output row.
 * Operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * NOTE(review): mm7 is used as the unpack partner and is presumably zero at
 * this point -- confirm where it is initialised.
 */
231 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
234 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
// First row: widen the bytes at offsets 0 and 1 to words and add them
// (mm4 = low four columns, mm5 = high four columns).
236 "movq (%1), %%mm0 \n\t"
237 "movq 1(%1), %%mm4 \n\t"
238 "movq %%mm0, %%mm1 \n\t"
239 "movq %%mm4, %%mm5 \n\t"
240 "punpcklbw %%mm7, %%mm0 \n\t"
241 "punpcklbw %%mm7, %%mm4 \n\t"
242 "punpckhbw %%mm7, %%mm1 \n\t"
243 "punpckhbw %%mm7, %%mm5 \n\t"
244 "paddusw %%mm0, %%mm4 \n\t"
245 "paddusw %%mm1, %%mm5 \n\t"
// EAX is the running byte offset applied to both pixels and block.
246 "xorl %%eax, %%eax \n\t"
// Row at the current offset: horizontal sums go to mm0/mm1.
250 "movq (%1, %%eax), %%mm0 \n\t"
251 "movq 1(%1, %%eax), %%mm2 \n\t"
252 "movq %%mm0, %%mm1 \n\t"
253 "movq %%mm2, %%mm3 \n\t"
254 "punpcklbw %%mm7, %%mm0 \n\t"
255 "punpcklbw %%mm7, %%mm2 \n\t"
256 "punpckhbw %%mm7, %%mm1 \n\t"
257 "punpckhbw %%mm7, %%mm3 \n\t"
258 "paddusw %%mm2, %%mm0 \n\t"
259 "paddusw %%mm3, %%mm1 \n\t"
// Previous sums + rounding constant + current sums, then >> 2 and repack.
260 "paddusw %%mm6, %%mm4 \n\t"
261 "paddusw %%mm6, %%mm5 \n\t"
262 "paddusw %%mm0, %%mm4 \n\t"
263 "paddusw %%mm1, %%mm5 \n\t"
264 "psrlw $2, %%mm4 \n\t"
265 "psrlw $2, %%mm5 \n\t"
266 "packuswb %%mm5, %%mm4 \n\t"
267 "movq %%mm4, (%2, %%eax) \n\t"
268 "addl %3, %%eax \n\t"
// Same step with the register roles exchanged: new sums land in mm4/mm5 and
// are combined with the sums still held in mm0/mm1.
270 "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
271 "movq 1(%1, %%eax), %%mm4 \n\t"
272 "movq %%mm2, %%mm3 \n\t"
273 "movq %%mm4, %%mm5 \n\t"
274 "punpcklbw %%mm7, %%mm2 \n\t"
275 "punpcklbw %%mm7, %%mm4 \n\t"
276 "punpckhbw %%mm7, %%mm3 \n\t"
277 "punpckhbw %%mm7, %%mm5 \n\t"
278 "paddusw %%mm2, %%mm4 \n\t"
279 "paddusw %%mm3, %%mm5 \n\t"
280 "paddusw %%mm6, %%mm0 \n\t"
281 "paddusw %%mm6, %%mm1 \n\t"
282 "paddusw %%mm4, %%mm0 \n\t"
283 "paddusw %%mm5, %%mm1 \n\t"
284 "psrlw $2, %%mm0 \n\t"
285 "psrlw $2, %%mm1 \n\t"
286 "packuswb %%mm1, %%mm0 \n\t"
287 "movq %%mm0, (%2, %%eax) \n\t"
288 "addl %3, %%eax \n\t"
292 :"+g"(h), "+S"(pixels)
293 :"D"(block), "r"(line_size)
298 // in case more speed is needed - unrolling would certainly help
/*
 * avg pixels8: loads 8 bytes from operand %0 and 8 bytes from operand %1,
 * combines them with PAVGB into mm2 and writes mm2 back to %0, so %0 is both
 * read and written.
 * NOTE(review): %0 / %1 are presumably memory operands for *block and
 * *pixels -- confirm against the operand list.
 */
299 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
305 "movq %0, %%mm0 \n\t"
306 "movq %1, %%mm1 \n\t"
307 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
308 "movq %%mm2, %0 \n\t"
/*
 * avg pixels16: 16-byte-wide variant of avg pixels8. Operand %0 is combined
 * with operand %1 via PAVGB and written back to %0, first for bytes 0..7 and
 * then for bytes 8..15 (the "8%0" / "8%1" forms add a displacement of 8 to
 * the memory operand).
 * NOTE(review): %0 / %1 are presumably memory operands for *block and
 * *pixels -- confirm against the operand list.
 */
318 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
// Left 8 bytes.
324 "movq %0, %%mm0 \n\t"
325 "movq %1, %%mm1 \n\t"
326 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
327 "movq %%mm2, %0 \n\t"
// Right 8 bytes.
328 "movq 8%0, %%mm0 \n\t"
329 "movq 8%1, %%mm1 \n\t"
330 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
331 "movq %%mm2, 8%0 \n\t"
/*
 * avg pixels8_x2: combines the 8 bytes at operand %1 with the 8 bytes one
 * position further on (displacement 1) via PAVGB, then combines that result
 * with the current contents of operand %0 via a second PAVGB and writes it
 * back to %0.
 * NOTE(review): %0 / %1 are presumably memory operands for *block and
 * *pixels -- confirm against the operand list.
 */
341 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
347 "movq %1, %%mm0 \n\t"
348 "movq 1%1, %%mm1 \n\t"
349 "movq %0, %%mm3 \n\t"
// mm2 = combination of the two source reads; mm0 = mm2 combined with %0.
350 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
351 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
352 "movq %%mm0, %0 \n\t"
/*
 * avg pixels8_l2: combines 8 bytes of *src1 (%1) with 8 bytes of *src2 (%2)
 * via PAVGB, then combines that result with the current contents of operand
 * %0 via a second PAVGB and writes it back to %0.
 * NOTE(review): %0 is presumably a memory operand for *dst -- confirm
 * against the output operand list.
 */
361 static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
367 "movq %1, %%mm0 \n\t"
368 "movq %2, %%mm1 \n\t"
369 "movq %0, %%mm3 \n\t"
// mm2 = src1 combined with src2; mm0 = mm2 combined with %0.
370 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
371 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
372 "movq %%mm0, %0 \n\t"
374 :"m"(*src1), "m"(*src2)
/*
 * avg pixels16_x2: 16-byte-wide variant of avg pixels8_x2. For bytes 0..7 and
 * then bytes 8..15, the source read at operand %1 is combined with the read
 * one byte further on via PAVGB, the result is combined with the current
 * contents of operand %0 via a second PAVGB, and written back to %0.
 * NOTE(review): %0 / %1 are presumably memory operands for *block and
 * *pixels -- confirm against the operand list.
 */
382 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
// Left 8 bytes: source offsets 0 and 1.
388 "movq %1, %%mm0 \n\t"
389 "movq 1%1, %%mm1 \n\t"
390 "movq %0, %%mm3 \n\t"
391 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
392 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
393 "movq %%mm0, %0 \n\t"
// Right 8 bytes: source offsets 8 and 9.
394 "movq 8%1, %%mm0 \n\t"
395 "movq 9%1, %%mm1 \n\t"
396 "movq 8%0, %%mm3 \n\t"
397 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
398 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
399 "movq %%mm0, 8%0 \n\t"
/*
 * avg pixels16_l2: 16-byte-wide variant of avg pixels8_l2. For bytes 0..7 and
 * then bytes 8..15, *src1 (%1) is combined with *src2 (%2) via PAVGB, the
 * result is combined with the current contents of operand %0 via a second
 * PAVGB, and written back to %0.
 * NOTE(review): %0 is presumably a memory operand for *dst -- confirm
 * against the output operand list.
 */
408 static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
// Left 8 bytes.
414 "movq %1, %%mm0 \n\t"
415 "movq %2, %%mm1 \n\t"
416 "movq %0, %%mm3 \n\t"
417 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
418 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
419 "movq %%mm0, %0 \n\t"
// Right 8 bytes.
420 "movq 8%1, %%mm0 \n\t"
421 "movq 8%2, %%mm1 \n\t"
422 "movq 8%0, %%mm3 \n\t"
423 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
424 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
425 "movq %%mm0, 8%0 \n\t"
427 :"m"(*src1), "m"(*src2)
/*
 * avg pixels8_y2: combines each 8-byte source row with the row below it via
 * PAVGBP, then combines that result with the bytes already in block via PAVGB
 * and stores it back to block. The last row loaded in one group is kept in a
 * register and reused as the upper row of the next group.
 * Operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI).
 * NOTE(review): %3 is used as the row offset and is presumably line_size
 * -- confirm.
 */
435 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
// EAX = 2 * %3; mm0 starts out holding the first source row.
439 "lea (%3, %3), %%eax \n\t"
440 "movq (%1), %%mm0 \n\t"
// mm1 = row + 1, mm2 = row + 2; vertical results land in mm4 / mm5.
443 "movq (%1, %3), %%mm1 \n\t"
444 "movq (%1, %%eax), %%mm2 \n\t"
445 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
// Blend each vertical result with the existing destination row.
446 "movq (%2), %%mm3 \n\t"
447 PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
448 "movq (%2, %3), %%mm3 \n\t"
449 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
450 "movq %%mm0, (%2) \n\t"
451 "movq %%mm1, (%2, %3) \n\t"
452 "addl %%eax, %1 \n\t"
453 "addl %%eax, %2 \n\t"
// Roles of mm0 and mm2 swap: mm2 still holds the row that is now on top.
455 "movq (%1, %3), %%mm1 \n\t"
456 "movq (%1, %%eax), %%mm0 \n\t"
457 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
458 "movq (%2), %%mm3 \n\t"
459 PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
460 "movq (%2, %3), %%mm3 \n\t"
461 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
462 "movq %%mm2, (%2) \n\t"
463 "movq %%mm1, (%2, %3) \n\t"
464 "addl %%eax, %1 \n\t"
465 "addl %%eax, %2 \n\t"
469 :"+g"(h), "+S"(pixels), "+D"(block)
474 // this routine is 'slightly' suboptimal but mostly unused
/*
 * avg pixels8_xy2: for each output byte, sums four source bytes (two
 * horizontally adjacent bytes on one row and the two below them), adds the
 * SET_RND constant and shifts right by 2; the packed result is then combined
 * with the bytes already in block via PAVGB and stored back to block.
 * Operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * NOTE(review): mm7 is used as the unpack partner and is presumably zero at
 * this point -- confirm where it is initialised.
 */
475 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
478 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
// First row: widen the bytes at offsets 0 and 1 to words and add them
// (mm4 = low four columns, mm5 = high four columns).
480 "movq (%1), %%mm0 \n\t"
481 "movq 1(%1), %%mm4 \n\t"
482 "movq %%mm0, %%mm1 \n\t"
483 "movq %%mm4, %%mm5 \n\t"
484 "punpcklbw %%mm7, %%mm0 \n\t"
485 "punpcklbw %%mm7, %%mm4 \n\t"
486 "punpckhbw %%mm7, %%mm1 \n\t"
487 "punpckhbw %%mm7, %%mm5 \n\t"
488 "paddusw %%mm0, %%mm4 \n\t"
489 "paddusw %%mm1, %%mm5 \n\t"
// EAX is the running byte offset applied to both pixels and block.
490 "xorl %%eax, %%eax \n\t"
// Row at the current offset: horizontal sums go to mm0/mm1.
494 "movq (%1, %%eax), %%mm0 \n\t"
495 "movq 1(%1, %%eax), %%mm2 \n\t"
496 "movq %%mm0, %%mm1 \n\t"
497 "movq %%mm2, %%mm3 \n\t"
498 "punpcklbw %%mm7, %%mm0 \n\t"
499 "punpcklbw %%mm7, %%mm2 \n\t"
500 "punpckhbw %%mm7, %%mm1 \n\t"
501 "punpckhbw %%mm7, %%mm3 \n\t"
502 "paddusw %%mm2, %%mm0 \n\t"
503 "paddusw %%mm3, %%mm1 \n\t"
// Previous sums + rounding constant + current sums, then >> 2.
504 "paddusw %%mm6, %%mm4 \n\t"
505 "paddusw %%mm6, %%mm5 \n\t"
506 "paddusw %%mm0, %%mm4 \n\t"
507 "paddusw %%mm1, %%mm5 \n\t"
508 "psrlw $2, %%mm4 \n\t"
509 "psrlw $2, %%mm5 \n\t"
// mm3 = existing destination bytes; mm4 = packed interpolated bytes.
510 "movq (%2, %%eax), %%mm3 \n\t"
511 "packuswb %%mm5, %%mm4 \n\t"
// Set every byte of mm2 to 0xFE (all ones, then each byte added to itself).
// mm6 holds the SET_RND constant here, so the fourth PAVGB argument is built
// in mm2 instead.
512 "pcmpeqd %%mm2, %%mm2 \n\t"
513 "paddb %%mm2, %%mm2 \n\t"
514 PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
515 "movq %%mm5, (%2, %%eax) \n\t"
516 "addl %3, %%eax \n\t"
// Same step with the register roles exchanged: new sums land in mm4/mm5 and
// are combined with the sums still held in mm0/mm1.
518 "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
519 "movq 1(%1, %%eax), %%mm4 \n\t"
520 "movq %%mm2, %%mm3 \n\t"
521 "movq %%mm4, %%mm5 \n\t"
522 "punpcklbw %%mm7, %%mm2 \n\t"
523 "punpcklbw %%mm7, %%mm4 \n\t"
524 "punpckhbw %%mm7, %%mm3 \n\t"
525 "punpckhbw %%mm7, %%mm5 \n\t"
526 "paddusw %%mm2, %%mm4 \n\t"
527 "paddusw %%mm3, %%mm5 \n\t"
528 "paddusw %%mm6, %%mm0 \n\t"
529 "paddusw %%mm6, %%mm1 \n\t"
530 "paddusw %%mm4, %%mm0 \n\t"
531 "paddusw %%mm5, %%mm1 \n\t"
532 "psrlw $2, %%mm0 \n\t"
533 "psrlw $2, %%mm1 \n\t"
534 "movq (%2, %%eax), %%mm3 \n\t"
535 "packuswb %%mm1, %%mm0 \n\t"
// Rebuild the 0xFE byte pattern in mm2 (it was overwritten by the load above).
536 "pcmpeqd %%mm2, %%mm2 \n\t"
537 "paddb %%mm2, %%mm2 \n\t"
538 PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
539 "movq %%mm1, (%2, %%eax) \n\t"
540 "addl %3, %%eax \n\t"
544 :"+g"(h), "+S"(pixels)
545 :"D"(block), "r"(line_size)
/*
 * put pixels16_y2: 16-byte-wide version built from the 8-byte-wide routine.
 * Calls put pixels8_y2 once for the left half (block, pixels) and once for
 * the right half (block+8, pixels+8), with the same line_size and h.
 */
550 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
551 DEF(put, pixels8_y2)(block , pixels , line_size, h);
552 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
/*
 * put pixels16_xy2: 16-byte-wide version built from the 8-byte-wide routine.
 * Calls put pixels8_xy2 once for the left half (block, pixels) and once for
 * the right half (block+8, pixels+8), with the same line_size and h.
 */
555 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
556 DEF(put, pixels8_xy2)(block , pixels , line_size, h);
557 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
/*
 * avg pixels16_y2: 16-byte-wide version built from the 8-byte-wide routine.
 * Calls avg pixels8_y2 once for the left half (block, pixels) and once for
 * the right half (block+8, pixels+8), with the same line_size and h.
 */
560 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
561 DEF(avg, pixels8_y2)(block , pixels , line_size, h);
562 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
/*
 * avg pixels16_xy2: 16-byte-wide version built from the 8-byte-wide routine.
 * Calls avg pixels8_xy2 once for the left half (block, pixels) and once for
 * the right half (block+8, pixels+8), with the same line_size and h.
 */
565 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
566 DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
567 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);