Fix bugs in previous commit that caused FTBFS in synfig and ETL FTBFS with older...
[synfig.git] / synfig-core / tags / synfig_0_61_05 / synfig-core / src / modules / mod_libavcodec / libavcodec / i386 / simple_idct_mmx.c
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  */
20 #include "../dsputil.h"
21
/*
 * Fixed-point cosine constants for the IDCT:
 *     Ck = cos(k*M_PI/16) * sqrt(2) * (1<<14)        for k = 0..7
 * The comment that follows lists the unrounded values in that order; the
 * #defines below are those values converted to integers.
 */
22 /*
23 23170.475006
24 22725.260826
25 21406.727617
26 19265.545870
27 16384.000000
28 12872.826198
29 8866.956905
30 4520.335430
31 */
32 #define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
33 #define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34 #define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
35 #define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
// C4: the exact value 16384 is compiled out; the active definition is 16383.
// NOTE(review): the reason for choosing 16383 is not stated in this file - confirm before changing.
36 #if 0
37 #define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (disabled)
38 #else
39 #define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (active)
40 #endif
41 #define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define C6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define C7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44
/*
 * Right-shift amounts used after the row pass and after the column pass.
 * ROW_SHIFT also sizes the rounding terms stored at the start of coeffs[],
 * and both are used by the disabled C reference functions.
 * The ROW_IDCT / DC_COND_ROW_IDCT / COL_IDCT invocations visible in the
 * "#if 0" asm variant pass the literals 11 and 20 rather than these names,
 * so the two must be kept in sync by hand.
 * NOTE(review): the meaning of the trailing "6" on COL_SHIFT is not evident
 * from this file.
 */
45 #define ROW_SHIFT 11
46 #define COL_SHIFT 20 // 6
47
/*
 * 64-bit constants referenced by name (through MANGLE) from the asm macros.
 *
 * wm1010: mask that keeps the upper 16-bit word of each 32-bit half and
 *         clears the lower one. The DC_COND_* macros AND it with mm0 before
 *         OR-ing in mm1..mm3 to test whether everything except the two
 *         lowest-word-of-dword inputs is zero.
 * d40000: 0x40000 in the low dword, 0 in the high dword. DC_COND_IDCT adds
 *         it to mm0 in its DC-only path before the arithmetic right shift
 *         by 13.
 */
48 static const uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
49 static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
50
/*
 * Constant table for the MMX code: two rounding rows followed by twelve
 * rows of cosine coefficients, four int16_t (8 bytes) per row.
 *
 * The asm macros read memory at byte offsets 0, 8 and 16..104 from asm
 * operand %2; those offsets match the rows below one-to-one.
 * NOTE(review): the operand list of the asm statement is outside this view -
 * presumably %2 is this table; confirm.
 *
 * The row order and the pairing of values inside each row are part of the
 * contract with the asm (each pmaddwd multiplies adjacent pairs), so rows
 * must not be reordered or resized.
 */
51 static const int16_t __attribute__((aligned(8))) coeffs[]= {
52         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // byte offset 0: row rounder, used as "paddd (%2)"
53 //      1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
54 //      0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
55         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // byte offset 8: row rounder with an extra 1, used as "paddd 8(%2)"
56         // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
57 //      0, 0, 0, 0,
58 //      0, 0, 0, 0,
59
60  C4,  C4,  C4,  C4, // byte offset 16
61  C4, -C4,  C4, -C4, // byte offset 24
62  
63  C2,  C6,  C2,  C6, // byte offset 32
64  C6, -C2,  C6, -C2, // byte offset 40
65  
66  C1,  C3,  C1,  C3, // byte offset 48
67  C5,  C7,  C5,  C7, // byte offset 56
68  
69  C3, -C7,  C3, -C7, // byte offset 64
70 -C1, -C5, -C1, -C5, // byte offset 72
71  
72  C5, -C1,  C5, -C1, // byte offset 80
73  C7,  C3,  C7,  C3, // byte offset 88
74  
75  C7, -C5,  C7, -C5, // byte offset 96
76  C3, -C1,  C3, -C1  // byte offset 104
77 };
78
79 #if 0
/*
 * Dummy function that reads wm1010 and d40000 and stores their sum
 * (converted to int) into temp[0]; judging by its name it presumably exists
 * only to silence unused-variable warnings. It sits inside a "#if 0" region
 * and is not compiled.
 * NOTE(review): no file-scope `temp` is visible in this part of the file
 * (the only `temp` in view is a local of idct()), so this would likely not
 * compile if the region were enabled.
 */
80 static void unused_var_killer(){
81         int a= wm1010 + d40000;
82         temp[0]=a;
83 }
84
/*
 * idctCol: plain-C version of the column pass. It is inside a "#if 0"
 * region and therefore not compiled.
 *
 * col   - in/out buffer; only col[8*k] for k = 0..7 is accessed (stride of
 *         eight elements). It is overwritten from `input` before use.
 * input - source buffer; eight values are gathered from it (see below).
 *
 * On return col[8*0] .. col[8*7] hold the eight outputs, each shifted right
 * by COL_SHIFT after adding the rounding term 1<<(COL_SHIFT-1).
 */
85 static void inline idctCol (int16_t * col, int16_t *input)
86 {
/* Remove the file-level C0..C7 macros so the same names can be declared as
 * local constants below. Preprocessor directives are not scoped to this
 * function: the macros stay undefined for the rest of the file. */
87 #undef C0
88 #undef C1
89 #undef C2
90 #undef C3
91 #undef C4
92 #undef C5
93 #undef C6
94 #undef C7
95         int a0, a1, a2, a3, b0, b1, b2, b3;
        /* Same values as the file-level macros; C0 is declared but not used in this function. */
96         const int C0 = 23170; // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
97         const int C1 = 22725; // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98         const int C2 = 21407; // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99         const int C3 = 19266; // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100         const int C4 = 16383; // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
101         const int C5 = 12873; // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102         const int C6 = 8867; // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103         const int C7 = 4520; // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Disabled shortcut for a column whose entries 1..7 are all zero. */
104 /*
105         if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
106                 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
107                         col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
108                 return;
109         }*/
110
/* Gather the eight inputs: input indices 0, 16, 1, 17, 32, 48, 33, 49 go to
 * col[8*0] .. col[8*7] in that order. */
111 col[8*0] = input[8*0 + 0];
112 col[8*1] = input[8*2 + 0];
113 col[8*2] = input[8*0 + 1];
114 col[8*3] = input[8*2 + 1];
115 col[8*4] = input[8*4 + 0];
116 col[8*5] = input[8*6 + 0];
117 col[8*6] = input[8*4 + 1];
118 col[8*7] = input[8*6 + 1];
119
        /* Even part: entries 0, 2, 4, 6 plus the rounding term. */
120         a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
121         a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
122         a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
123         a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
124
        /* Odd part: entries 1, 3, 5, 7. */
125         b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
126         b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
127         b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
128         b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
129
        /* Output k is a_k + b_k and output 7-k is a_k - b_k. */
130         col[8*0] = (a0 + b0) >> COL_SHIFT;
131         col[8*1] = (a1 + b1) >> COL_SHIFT;
132         col[8*2] = (a2 + b2) >> COL_SHIFT;
133         col[8*3] = (a3 + b3) >> COL_SHIFT;
134         col[8*4] = (a3 - b3) >> COL_SHIFT;
135         col[8*5] = (a2 - b2) >> COL_SHIFT;
136         col[8*6] = (a1 - b1) >> COL_SHIFT;
137         col[8*7] = (a0 - b0) >> COL_SHIFT;
138 }
139
/*
 * idctRow: plain-C version of the row pass. It is inside a "#if 0" region
 * and therefore not compiled.
 *
 * output - destination; results are written to output[0], output[2], ...,
 *          output[14] (every second element).
 * input  - source; eight values are gathered from indices
 *          0, 1, 4, 5, 8, 9, 12, 13 (see below).
 *
 * If entries 1..7 of the gathered row are all zero, every output is
 * row[0]<<3; otherwise the full transform is computed with rounding term
 * 1<<(ROW_SHIFT-1) and a right shift by ROW_SHIFT.
 */
140 static void inline idctRow (int16_t * output, int16_t * input)
141 {
142         int16_t row[8];
143
144         int a0, a1, a2, a3, b0, b1, b2, b3;
        /* These declarations only parse because idctCol() above #undef's the
         * C0..C7 macros. C0 is declared but not used in this function. */
145         const int C0 = 23170; // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
146         const int C1 = 22725; // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147         const int C2 = 21407; // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148         const int C3 = 19266; // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149         const int C4 = 16383; // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
150         const int C5 = 12873; // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151         const int C6 = 8867; // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152         const int C7 = 4520; // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
/* Gather the row: even entries come from input[0], [1], [4], [5], odd
 * entries from input[8], [9], [12], [13]. */
154 row[0] = input[0];
155 row[2] = input[1];
156 row[4] = input[4];
157 row[6] = input[5];
158 row[1] = input[8];
159 row[3] = input[9];
160 row[5] = input[12];
161 row[7] = input[13];
162
        /* DC-only shortcut: all outputs equal row[0]<<3.
         * NOTE(review): row[0] may be negative; left-shifting a negative
         * value is undefined behaviour in ISO C. */
163         if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
164                 row[0] = row[1] = row[2] = row[3] = row[4] =
165                         row[5] = row[6] = row[7] = row[0]<<3;
166         output[0] = row[0];
167         output[2] = row[1];
168         output[4] = row[2];
169         output[6] = row[3];
170         output[8] = row[4];
171         output[10] = row[5];
172         output[12] = row[6];
173         output[14] = row[7];
174                 return;
175         }
176
        /* Even part: entries 0, 2, 4, 6 plus the rounding term. */
177         a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
178         a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
179         a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
180         a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
181
        /* Odd part: entries 1, 3, 5, 7. */
182         b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
183         b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
184         b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
185         b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
186
        /* Output k is a_k + b_k and output 7-k is a_k - b_k. */
187         row[0] = (a0 + b0) >> ROW_SHIFT;
188         row[1] = (a1 + b1) >> ROW_SHIFT;
189         row[2] = (a2 + b2) >> ROW_SHIFT;
190         row[3] = (a3 + b3) >> ROW_SHIFT;
191         row[4] = (a3 - b3) >> ROW_SHIFT;
192         row[5] = (a2 - b2) >> ROW_SHIFT;
193         row[6] = (a1 - b1) >> ROW_SHIFT;
194         row[7] = (a0 - b0) >> ROW_SHIFT;
195
        /* Scatter the results to every second element of output. */
196         output[0] = row[0];
197         output[2] = row[1];
198         output[4] = row[2];
199         output[6] = row[3];
200         output[8] = row[4];
201         output[10] = row[5];
202         output[12] = row[6];
203         output[14] = row[7];
204 }
205 #endif
206
207 static inline void idct(int16_t *block)
208 {
209         int64_t __attribute__((aligned(8))) align_tmp[16];
210         int16_t * const temp= (int16_t*)align_tmp;
211
212         asm volatile(
213 #if 0 //Alternative, simpler variant
214
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to a run of assembly-text string literals for use inside the asm
 * template of idct(). It belongs to the "#if 0" alternative variant, so it
 * is not compiled.
 *
 *   src0, src4, src1, src5 - memory operands loaded into mm0, mm1, mm2, mm3.
 *             Per the inline comments they hold the pairs (r0,r4), (r2,r6),
 *             (r1,r3), (r5,r7) for two independent lanes, written in lower
 *             and upper case.
 *   dst     - memory operand; quadwords are stored at dst, 8+dst, 16+dst
 *             and 24+dst.
 *   rounder - instruction text pasted in front of ", %%mm4" and ", %%mm0"
 *             (the visible invocation passes "paddd 8(%2)").
 *   shift   - immediate count for the final psrad instructions.
 *
 * Coefficients are read at byte offsets 16..104 from asm operand %2.
 * NOTE(review): the operand list is outside this view - presumably %2 is
 * the coeffs[] table; confirm.
 *
 * Stores, per the inline comments (each value present for both lanes):
 *   dst    <- A0+B0, A1+B1        8+dst  <- A2+B2, A3+B3
 *   16+dst <- A3-B3, A2-B2        24+dst <- A1-B1, A0-B0
 * Values are packed to 16 bits with signed saturation (packssdw).
 * All eight MMX registers mm0..mm7 are overwritten.
 */
215 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
216         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
217         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
218         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
219         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
220         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
221         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
222         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
223         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
224         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
225         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
226         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
227         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
228         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
229         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
230         #rounder ", %%mm4                       \n\t"\
231         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
232         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
233         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
234         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
235         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
236         #rounder ", %%mm0                       \n\t"\
237         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
238         "paddd %%mm0, %%mm0                     \n\t" \
239         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
240         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
241         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
242         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
243         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
244         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
245         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
246         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
247         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
248         "psrad $" #shift ", %%mm7               \n\t"\
249         "psrad $" #shift ", %%mm4               \n\t"\
250         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
251         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
252         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
253         "psrad $" #shift ", %%mm1               \n\t"\
254         "psrad $" #shift ", %%mm2               \n\t"\
255         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
256         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
257         "movq %%mm7, " #dst "                   \n\t"\
258         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
259         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
260         "movq %%mm2, 24+" #dst "                \n\t"\
261         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
262         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
263         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
264         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
265         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
266         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
267         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
268         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
269         "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
270         "psrad $" #shift ", %%mm2               \n\t"\
271         "psrad $" #shift ", %%mm0               \n\t"\
272         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
273         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
274         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
275         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
276         "psrad $" #shift ", %%mm6               \n\t"\
277         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
278         "movq %%mm2, 8+" #dst "                 \n\t"\
279         "psrad $" #shift ", %%mm4               \n\t"\
280         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
281         "movq %%mm4, 16+" #dst "                \n\t"\
282
/*
 * COL_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Column-pass counterpart of ROW_IDCT; expands to assembly-text string
 * literals for the asm template of idct(). It belongs to the "#if 0"
 * alternative variant, so it is not compiled.
 *
 *   src0, src4, src1, src5 - memory operands loaded into mm0, mm1, mm2, mm3
 *             (two independent lanes, lower and upper case in the inline
 *             comments).
 *   dst     - memory operand; 32-bit stores (movd) go to dst, 16+dst,
 *             32+dst, ..., 112+dst.
 *   rounder - instruction text pasted in front of ", %%mm4" and ", %%mm0".
 *             The visible invocations pass "/nop". NOTE(review): presumably
 *             the assembler treats a line starting with "/" as a comment,
 *             making this a no-op - confirm.
 *   shift   - immediate count for the final psrad instructions.
 *
 * Coefficients are read at byte offsets 16..104 from asm operand %2.
 * NOTE(review): the operand list is outside this view - presumably %2 is
 * the coeffs[] table; confirm.
 *
 * Each result is packed with "packssdw reg, reg" (signed saturation), and
 * the low 32 bits - one 16-bit value per lane - are stored:
 *   dst    <- A0+B0     16+dst <- A1+B1     32+dst <- A2+B2     48+dst  <- A3+B3
 *   64+dst <- A3-B3     80+dst <- A2-B2     96+dst <- A1-B1     112+dst <- A0-B0
 * All eight MMX registers mm0..mm7 are overwritten.
 */
283 #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
284         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
285         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
286         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
287         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
288         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
289         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
290         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
291         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
292         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
293         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
294         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
295         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
296         #rounder ", %%mm4                       \n\t"\
297         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
298         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
299         #rounder ", %%mm0                       \n\t"\
300         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
301         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
302         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
303         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
304         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
305         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
306         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
307         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
308         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
309         "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
310         "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
311         "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
312         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
313         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
314         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
315         "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
316         "psrad $" #shift ", %%mm7               \n\t"\
317         "psrad $" #shift ", %%mm4               \n\t"\
318         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
319         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
320         "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
321         "psrad $" #shift ", %%mm0               \n\t"\
322         "psrad $" #shift ", %%mm2               \n\t"\
323         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
324         "movd %%mm7, " #dst "                   \n\t"\
325         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
326         "movd %%mm0, 16+" #dst "                \n\t"\
327         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
328         "movd %%mm2, 96+" #dst "                \n\t"\
329         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
330         "movd %%mm4, 112+" #dst "               \n\t"\
331         "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
332         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
333         "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
334         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
335         "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
336         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
337         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
338         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
339         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
340         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
341         "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
342         "psrad $" #shift ", %%mm2               \n\t"\
343         "psrad $" #shift ", %%mm5               \n\t"\
344         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
345         "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
346         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
347         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
348         "psrad $" #shift ", %%mm6               \n\t"\
349         "psrad $" #shift ", %%mm4               \n\t"\
350         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
351         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
352         "movd %%mm2, 32+" #dst "                \n\t"\
353         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
354         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
355         "movd %%mm6, 48+" #dst "                \n\t"\
356         "movd %%mm4, 64+" #dst "                \n\t"\
357         "movd %%mm5, 80+" #dst "                \n\t"\
358
359         
/*
 * DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Same computation and store layout as ROW_IDCT, preceded by a test that
 * selects a DC-only shortcut. It belongs to the "#if 0" alternative
 * variant, so it is not compiled.
 *
 * Test: mm0 is masked with wm1010 (which clears the low 16-bit word of each
 * dword), OR-ed with mm1, mm2 and mm3, packed and moved to eax. If the
 * result is zero, control jumps to local label 1; otherwise the full
 * transform runs and then jumps over the shortcut to local label 2.
 *
 * Shortcut (label 1): mm0 is shifted left by 16 and arithmetically right by
 * 13, packed with itself, and the same quadword is stored to dst, 8+dst,
 * 16+dst and 24+dst. In this macro the "paddd d40000" line carries a
 * leading "#" inside the assembly text, so that addition is disabled.
 * NOTE(review): this relies on "#" starting an assembler comment - confirm
 * for the target assembler.
 *
 * Parameters are as for ROW_IDCT; note that `shift` is not used by the
 * shortcut path, which hard-codes 16 and 13.
 * eax is written by the test. NOTE(review): the clobber list of the
 * enclosing asm statement is outside this view - confirm it covers eax.
 * MANGLE is defined elsewhere (not visible here).
 */
360 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
361         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
362         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
363         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
364         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
365         "movq "MANGLE(wm1010)", %%mm4           \n\t"\
366         "pand %%mm0, %%mm4                      \n\t"\
367         "por %%mm1, %%mm4                       \n\t"\
368         "por %%mm2, %%mm4                       \n\t"\
369         "por %%mm3, %%mm4                       \n\t"\
370         "packssdw %%mm4,%%mm4                   \n\t"\
371         "movd %%mm4, %%eax                      \n\t"\
372         "orl %%eax, %%eax                       \n\t"\
373         "jz 1f                                  \n\t"\
374         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
375         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
376         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
377         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
378         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
379         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
380         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
381         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
382         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
383         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
384         #rounder ", %%mm4                       \n\t"\
385         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
386         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
387         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
388         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
389         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
390         #rounder ", %%mm0                       \n\t"\
391         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
392         "paddd %%mm0, %%mm0                     \n\t" \
393         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
394         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
395         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
396         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
397         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
398         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
399         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
400         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
401         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
402         "psrad $" #shift ", %%mm7               \n\t"\
403         "psrad $" #shift ", %%mm4               \n\t"\
404         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
405         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
406         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
407         "psrad $" #shift ", %%mm1               \n\t"\
408         "psrad $" #shift ", %%mm2               \n\t"\
409         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
410         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
411         "movq %%mm7, " #dst "                   \n\t"\
412         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
413         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
414         "movq %%mm2, 24+" #dst "                \n\t"\
415         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
416         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
417         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
418         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
419         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
420         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
421         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
422         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
423         "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
424         "psrad $" #shift ", %%mm2               \n\t"\
425         "psrad $" #shift ", %%mm0               \n\t"\
426         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
427         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
428         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
429         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
430         "psrad $" #shift ", %%mm6               \n\t"\
431         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
432         "movq %%mm2, 8+" #dst "                 \n\t"\
433         "psrad $" #shift ", %%mm4               \n\t"\
434         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
435         "movq %%mm4, 16+" #dst "                \n\t"\
436         "jmp 2f                                 \n\t"\
437         "1:                                     \n\t"\
438         "pslld $16, %%mm0                       \n\t"\
439         "#paddd "MANGLE(d40000)", %%mm0         \n\t"\
440         "psrad $13, %%mm0                       \n\t"\
441         "packssdw %%mm0, %%mm0                  \n\t"\
442         "movq %%mm0, " #dst "                   \n\t"\
443         "movq %%mm0, 8+" #dst "                 \n\t"\
444         "movq %%mm0, 16+" #dst "                \n\t"\
445         "movq %%mm0, 24+" #dst "                \n\t"\
446         "2:                                     \n\t"
447
448
449 //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
450 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
451 /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
452 ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
453 ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
454
455 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
456 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
457 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
458
459
460 //IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
461 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
462 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
463 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
464 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
465
466 #else
467
/*
 * DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Row transform with a DC-only shortcut, defined in the "#else" branch,
 * i.e. the variant that is compiled. Expands to assembly-text string
 * literals for the asm template of idct().
 *
 *   src0, src4, src1, src5 - memory operands loaded into mm0, mm1, mm2, mm3.
 *             Per the inline comments they hold the pairs (r0,r4), (r2,r6),
 *             (r1,r3), (r5,r7) for two independent lanes, written in lower
 *             and upper case.
 *   dst     - memory operand; quadwords are stored at dst, 8+dst, 16+dst
 *             and 24+dst.
 *   rounder - instruction text pasted in front of ", %%mm4" and ", %%mm0".
 *   shift   - immediate count for the psrad instructions of the full path.
 *
 * Test: mm0 is masked with wm1010 (which clears the low 16-bit word of each
 * dword), OR-ed with mm1, mm2 and mm3, packed and moved to eax. If the
 * result is zero, control jumps to local label 1; otherwise the full
 * transform runs and then jumps over the shortcut to local label 2.
 *
 * Full path stores, per the inline comments (each value for both lanes):
 *   dst    <- A0+B0, A1+B1        8+dst  <- A2+B2, A3+B3
 *   16+dst <- A3-B3, A2-B2        24+dst <- A1-B1, A0-B0
 *
 * Shortcut (label 1): mm0 is shifted left by 16, d40000 is added (0x40000
 * in the low dword only), the result is shifted arithmetically right by 13
 * and packed with itself, and the same quadword is stored to dst, 8+dst,
 * 16+dst and 24+dst. `rounder` and `shift` are not used on this path.
 *
 * Coefficients are read at byte offsets 16..104 from asm operand %2.
 * NOTE(review): the operand and clobber lists are outside this view -
 * presumably %2 is the coeffs[] table and eax is declared clobbered; confirm.
 * MANGLE is defined elsewhere (not visible here).
 */
468 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
469         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
470         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
471         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
472         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
473         "movq "MANGLE(wm1010)", %%mm4           \n\t"\
474         "pand %%mm0, %%mm4                      \n\t"\
475         "por %%mm1, %%mm4                       \n\t"\
476         "por %%mm2, %%mm4                       \n\t"\
477         "por %%mm3, %%mm4                       \n\t"\
478         "packssdw %%mm4,%%mm4                   \n\t"\
479         "movd %%mm4, %%eax                      \n\t"\
480         "orl %%eax, %%eax                       \n\t"\
481         "jz 1f                                  \n\t"\
482         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
483         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
484         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
485         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
486         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
487         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
488         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
489         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
490         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
491         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
492         #rounder ", %%mm4                       \n\t"\
493         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
494         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
495         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
496         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
497         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
498         #rounder ", %%mm0                       \n\t"\
499         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
500         "paddd %%mm0, %%mm0                     \n\t" \
501         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
502         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
503         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
504         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
505         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
506         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
507         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
508         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
509         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
510         "psrad $" #shift ", %%mm7               \n\t"\
511         "psrad $" #shift ", %%mm4               \n\t"\
512         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
513         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
514         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
515         "psrad $" #shift ", %%mm1               \n\t"\
516         "psrad $" #shift ", %%mm2               \n\t"\
517         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
518         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
519         "movq %%mm7, " #dst "                   \n\t"\
520         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
521         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
522         "movq %%mm2, 24+" #dst "                \n\t"\
523         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
524         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
525         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
526         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
527         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
528         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
529         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
530         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
531         "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
532         "psrad $" #shift ", %%mm2               \n\t"\
533         "psrad $" #shift ", %%mm0               \n\t"\
534         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
535         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
536         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
537         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
538         "psrad $" #shift ", %%mm6               \n\t"\
539         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
540         "movq %%mm2, 8+" #dst "                 \n\t"\
541         "psrad $" #shift ", %%mm4               \n\t"\
542         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
543         "movq %%mm4, 16+" #dst "                \n\t"\
544         "jmp 2f                                 \n\t"\
545         "1:                                     \n\t"\
546         "pslld $16, %%mm0                       \n\t"\
547         "paddd "MANGLE(d40000)", %%mm0          \n\t"\
548         "psrad $13, %%mm0                       \n\t"\
549         "packssdw %%mm0, %%mm0                  \n\t"\
550         "movq %%mm0, " #dst "                   \n\t"\
551         "movq %%mm0, 8+" #dst "                 \n\t"\
552         "movq %%mm0, 16+" #dst "                \n\t"\
553         "movq %%mm0, 24+" #dst "                \n\t"\
554         "2:                                     \n\t"
555
/*
 * Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt)
 *
 * Asm-string fragment: one row-pass step with an all-zero shortcut.
 *
 *   src0/src4/src1/src5  memory operands loaded into mm0/mm1/mm2/mm3
 *   dst                  memory operand; quadwords are stored at dst,
 *                        8+dst, 16+dst and 24+dst
 *   rounder              instruction text without its destination, e.g.
 *                        paddd (%2); ", %%mm4" and ", %%mm0" are appended
 *   shift                immediate count for the final psrad
 *   bt                   label to jump to when all four inputs are zero
 *
 * The four inputs are ORed into mm4 and packed with packssdw (signed
 * saturation keeps any non-zero dword non-zero), so eax == 0 exactly when
 * all 256 input bits are zero.  In that case control leaves through `bt`
 * before anything has been stored to dst.
 *
 * Otherwise the inputs are combined with the constant quadwords at
 * 16(%2)..104(%2) via pmaddwd (the per-line comments name them C1..C7),
 * the sums are shifted right arithmetically by `shift`, packed with signed
 * saturation and stored.
 *
 * Writes eax and mm0-mm7.
 * NOTE(review): the instruction order looks hand-scheduled; re-check
 * register lifetimes before reordering anything.
 */
556 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
557         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
558         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
559         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
560         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
561         "movq %%mm0, %%mm4                      \n\t"\
562         "por %%mm1, %%mm4                       \n\t"\
563         "por %%mm2, %%mm4                       \n\t"\
564         "por %%mm3, %%mm4                       \n\t"\
565         "packssdw %%mm4,%%mm4                   \n\t"\
566         "movd %%mm4, %%eax                      \n\t"\
567         "orl %%eax, %%eax                       \n\t"\
568         "jz " #bt "                             \n\t"\
569         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
570         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
571         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
572         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
573         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
574         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
575         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
576         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
577         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
578         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
579         #rounder ", %%mm4                       \n\t"\
580         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
581         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
582         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
583         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
584         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
585         #rounder ", %%mm0                       \n\t"\
586         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
587         "paddd %%mm0, %%mm0                     \n\t" \
588         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
589         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
590         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
591         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
592         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
593         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
594         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
595         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
596         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
597         "psrad $" #shift ", %%mm7               \n\t"\
598         "psrad $" #shift ", %%mm4               \n\t"\
599         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
600         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
601         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
602         "psrad $" #shift ", %%mm1               \n\t"\
603         "psrad $" #shift ", %%mm2               \n\t"\
604         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
605         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
606         "movq %%mm7, " #dst "                   \n\t"\
607         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
608         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
609         "movq %%mm2, 24+" #dst "                \n\t"\
610         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
611         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
612         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
613         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
614         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
615         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
616         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
617         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
618         "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
619         "psrad $" #shift ", %%mm2               \n\t"\
620         "psrad $" #shift ", %%mm0               \n\t"\
621         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
622         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
623         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
624         "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
625         "psrad $" #shift ", %%mm6               \n\t"\
626         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
627         "movq %%mm2, 8+" #dst "                 \n\t"\
628         "psrad $" #shift ", %%mm4               \n\t"\
629         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
630         "movq %%mm4, 16+" #dst "                \n\t"\
631
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Asm-string fragment: one unconditional row-pass step.  The arithmetic is
 * the same instruction sequence as Z_COND_IDCT, but there is no zero test,
 * no branch, and eax is not touched.
 *
 *   src0/src4/src1/src5  memory operands loaded into mm0/mm1/mm2/mm3
 *   dst                  memory operand; quadwords are stored at dst,
 *                        8+dst, 16+dst and 24+dst
 *   rounder              instruction text without its destination;
 *                        ", %%mm4" and ", %%mm0" are appended
 *   shift                immediate count for the final psrad
 *
 * Constants are read from 16(%2)..104(%2).  Writes mm0-mm7.
 */
632 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
633         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
634         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
635         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
636         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
637         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
638         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
639         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
640         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
641         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
642         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
643         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
644         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
645         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
646         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
647         #rounder ", %%mm4                       \n\t"\
648         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
649         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
650         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
651         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
652         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
653         #rounder ", %%mm0                       \n\t"\
654         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
655         "paddd %%mm0, %%mm0                     \n\t" \
656         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
657         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
658         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
659         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
660         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
661         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
662         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
663         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
664         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
665         "psrad $" #shift ", %%mm7               \n\t"\
666         "psrad $" #shift ", %%mm4               \n\t"\
667         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
668         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
669         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
670         "psrad $" #shift ", %%mm1               \n\t"\
671         "psrad $" #shift ", %%mm2               \n\t"\
672         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
673         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
674         "movq %%mm7, " #dst "                   \n\t"\
675         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
676         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
677         "movq %%mm2, 24+" #dst "                \n\t"\
678         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
679         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
680         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
681         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
682         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
683         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
684         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
685         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
686         "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
687         "psrad $" #shift ", %%mm2               \n\t"\
688         "psrad $" #shift ", %%mm0               \n\t"\
689         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
690         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
691         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
692         "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
693         "psrad $" #shift ", %%mm6               \n\t"\
694         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
695         "movq %%mm2, 8+" #dst "                 \n\t"\
696         "psrad $" #shift ", %%mm4               \n\t"\
697         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
698         "movq %%mm4, 16+" #dst "                \n\t"\
699
/*
 * Row pass: four 32-byte groups are read from %0 (offsets 0, 32, 64, 96)
 * and the results are written to the same offsets of %1, with shift 11.
 * The first group goes through DC_COND_IDCT with rounder paddd 8(%2); the
 * other three use Z_COND_IDCT with rounder paddd (%2).
 *
 * Each Z_COND_IDCT leaves through its last argument when its group is all
 * zero, without writing that group's 32 bytes of %1:
 *   group at 32(%0) zero -> 4f
 *   group at 64(%0) zero -> 2f
 *   group at 96(%0) zero -> 1f
 * NOTE(review): the code at each target label is expected to use a column
 * variant that does not read the skipped group(s) -- confirm per label.
 */
700 //IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
701 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
702 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
703 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
704 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
705
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Column-pass asm fragment, full variant: all four inputs are read.  The
 * #undef drops whatever definition of IDCT was active before.
 *
 *   src0/src4/src1/src5  memory operands loaded into mm0/mm1/mm2/mm3
 *                        (src1 is loaded a second time for the B2/B3 terms)
 *   dst                  memory operand; eight 32-bit stores (movd) go to
 *                        dst + 0, 16, 32, 48, 64, 80, 96, 112
 *   rounder              instruction text without its destination;
 *                        ", %%mm4" and ", %%mm0" are appended.  The calls
 *                        that follow pass /nop.
 *   shift                immediate count for the final psrad
 *
 * Store layout (names as in the per-line comments):
 *   dst+0  A0+B0    dst+16  A1+B1    dst+32 A2+B2    dst+48  A3+B3
 *   dst+64 A3-B3    dst+80  A2-B2    dst+96 A1-B1    dst+112 A0-B0
 * Each value is packed with packssdw (signed saturation) before the store.
 *
 * Constants are read from 16(%2)..104(%2).  Writes mm0-mm7.
 * NOTE(review): a rounder of /nop relies on the assembler treating a
 * leading '/' as a comment start -- confirm for every target assembler.
 */
706 #undef IDCT
707 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
708         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
709         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
710         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
711         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
712         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
713         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
714         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
715         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
716         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
717         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
718         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
719         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
720         #rounder ", %%mm4                       \n\t"\
721         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
722         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
723         #rounder ", %%mm0                       \n\t"\
724         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
725         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
726         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
727         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
728         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
729         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
730         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
731         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
732         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
733         "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
734         "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
735         "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
736         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
737         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
738         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
739         "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
740         "psrad $" #shift ", %%mm7               \n\t"\
741         "psrad $" #shift ", %%mm4               \n\t"\
742         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
743         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
744         "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
745         "psrad $" #shift ", %%mm0               \n\t"\
746         "psrad $" #shift ", %%mm2               \n\t"\
747         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
748         "movd %%mm7, " #dst "                   \n\t"\
749         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
750         "movd %%mm0, 16+" #dst "                \n\t"\
751         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
752         "movd %%mm2, 96+" #dst "                \n\t"\
753         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
754         "movd %%mm4, 112+" #dst "               \n\t"\
755         "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
756         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
757         "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
758         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
759         "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
760         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
761         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
762         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
763         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
764         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
765         "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
766         "psrad $" #shift ", %%mm2               \n\t"\
767         "psrad $" #shift ", %%mm5               \n\t"\
768         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
769         "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
770         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
771         "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
772         "psrad $" #shift ", %%mm6               \n\t"\
773         "psrad $" #shift ", %%mm4               \n\t"\
774         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
775         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
776         "movd %%mm2, 32+" #dst "                \n\t"\
777         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
778         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
779         "movd %%mm6, 48+" #dst "                \n\t"\
780         "movd %%mm4, 64+" #dst "                \n\t"\
781         "movd %%mm5, 80+" #dst "                \n\t"
782
783
/*
 * Column pass for the case where no row group was skipped: four IDCT
 * expansions read %1 at base offsets 0, 8, 16 and 24 (plus the 32/64/96
 * strides given per argument) and write to %0 at 0, 4, 8 and 12, with
 * shift 20 and a rounder of /nop.  Control then jumps to label 9.
 *
 * Label 4 follows; it is the target taken when the row group at 32(%0) was
 * all zero.  The two remaining row groups are processed there, leaving via
 * 6f or 5f when they are zero too.  The .balign line begins with '#'
 * inside the asm text.
 * NOTE(review): presumably '#' makes the assembler treat that line as a
 * comment, i.e. the alignment is disabled -- confirm.
 */
784 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
785 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
786 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
787 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
788 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
789         "jmp 9f                                 \n\t"
790
791         "#.balign 16                            \n\t"\
792         "4:                                     \n\t"
793 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
795
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Column-pass asm fragment, variant that never reads src1.  The parameter
 * is still accepted so the call sites keep the same shape.  It is expanded
 * on the path entered at label 4, i.e. when the row group that would have
 * been stored at 32(%1) was all zero and therefore never written.
 *
 *   src0/src4/src5  memory operands loaded into mm0/mm1/mm3
 *   src1            unused
 *   dst             memory operand; eight 32-bit stores (movd) go to
 *                   dst + 0, 16, 32, 48, 64, 80, 96, 112
 *   rounder         instruction text without its destination;
 *                   ", %%mm4" and ", %%mm0" are appended
 *   shift           immediate count for the final psrad
 *
 * The B0..B3 terms are built from src5 alone (constants at 56, 72, 88 and
 * 104(%2)); the A0..A3 terms use src0 and src4 as in the full variant.
 * Writes mm0-mm7.
 */
796 #undef IDCT
797 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
798         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
799         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
800         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
801         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
802         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
803         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
804         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
805         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
806         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
807         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
808         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
809         #rounder ", %%mm4                       \n\t"\
810         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
811         #rounder ", %%mm0                       \n\t"\
812         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
813         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
814         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
815         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
816         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
817         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
818         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
819         "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
820         "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
821         "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
822         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
823         "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
824         "psrad $" #shift ", %%mm1               \n\t"\
825         "psrad $" #shift ", %%mm4               \n\t"\
826         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
827         "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
828         "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
829         "psrad $" #shift ", %%mm0               \n\t"\
830         "psrad $" #shift ", %%mm2               \n\t"\
831         "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
832         "movd %%mm1, " #dst "                   \n\t"\
833         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
834         "movd %%mm0, 16+" #dst "                \n\t"\
835         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
836         "movd %%mm2, 96+" #dst "                \n\t"\
837         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
838         "movd %%mm4, 112+" #dst "               \n\t"\
839         "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
840         "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
841         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
842         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
843         "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
844         "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
845         "psrad $" #shift ", %%mm2               \n\t"\
846         "psrad $" #shift ", %%mm5               \n\t"\
847         "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
848         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
849         "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
850         "psrad $" #shift ", %%mm6               \n\t"\
851         "psrad $" #shift ", %%mm1               \n\t"\
852         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
853         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
854         "movd %%mm2, 32+" #dst "                \n\t"\
855         "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
856         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
857         "movd %%mm6, 48+" #dst "                \n\t"\
858         "movd %%mm1, 64+" #dst "                \n\t"\
859         "movd %%mm5, 80+" #dst "                \n\t"   
860
/*
 * Column pass for the path entered at label 4, using the IDCT definition
 * active at this point (the one that does not read its src1 argument):
 * four expansions read %1 and write to %0 at 0, 4, 8 and 12, with shift 20
 * and a rounder of /nop.  Control then jumps to label 9.
 *
 * Label 6 follows; it is reached when the row groups at 32(%0) and 64(%0)
 * were both all zero.  The last row group is processed there, leaving via
 * 7f when it is zero as well.
 */
861 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
862 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
863 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
864 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
865 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
866         "jmp 9f                                 \n\t"
867
868         "#.balign 16                            \n\t"\
869         "6:                                     \n\t"
870 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
871
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Column-pass asm fragment, variant that reads only src0 and src5.  The
 * src4 and src1 parameters are accepted but never expanded.  It is used on
 * the path entered at label 6, i.e. when the row groups that would have
 * been stored at 32(%1) and 64(%1) were both all zero and never written.
 *
 *   src0/src5  memory operands loaded into mm0/mm3
 *   src4/src1  unused
 *   dst        memory operand; eight 32-bit stores (movd) go to
 *              dst + 0, 16, 32, 48, 64, 80, 96, 112
 *   rounder    instruction text without its destination;
 *              ", %%mm4" and ", %%mm0" are appended
 *   shift      immediate count for the final psrad
 *
 * With no src4 contribution, the A3 term is a plain copy of the A0 term
 * (mm6 = mm4) and the A2 term a plain copy of the A1 term (mm5 = mm0).
 * The B0..B3 terms come from src5 alone.  Writes mm0-mm7.
 */
872 #undef IDCT
873 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
874         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
875         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
876         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
877         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
878         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
879         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
880         #rounder ", %%mm4                       \n\t"\
881         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
882         #rounder ", %%mm0                       \n\t"\
883         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
884         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
885         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
886         "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
887         "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
888         "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
889         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
890         "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
891         "psrad $" #shift ", %%mm1               \n\t"\
892         "psrad $" #shift ", %%mm4               \n\t"\
893         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
894         "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
895         "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
896         "psrad $" #shift ", %%mm0               \n\t"\
897         "psrad $" #shift ", %%mm2               \n\t"\
898         "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
899         "movd %%mm1, " #dst "                   \n\t"\
900         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
901         "movd %%mm0, 16+" #dst "                \n\t"\
902         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
903         "movd %%mm2, 96+" #dst "                \n\t"\
904         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
905         "movd %%mm4, 112+" #dst "               \n\t"\
906         "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
907         "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
908         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
909         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
910         "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
911         "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
912         "psrad $" #shift ", %%mm2               \n\t"\
913         "psrad $" #shift ", %%mm5               \n\t"\
914         "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
915         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
916         "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
917         "psrad $" #shift ", %%mm6               \n\t"\
918         "psrad $" #shift ", %%mm1               \n\t"\
919         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
920         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
921         "movd %%mm2, 32+" #dst "                \n\t"\
922         "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
923         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
924         "movd %%mm6, 48+" #dst "                \n\t"\
925         "movd %%mm1, 64+" #dst "                \n\t"\
926         "movd %%mm5, 80+" #dst "                \n\t"   
927
928
/*
 * Column pass for the path entered at label 6, using the IDCT definition
 * active at this point (the one that reads only src0 and src5): four
 * expansions read %1 and write to %0 at 0, 4, 8 and 12, with shift 20 and
 * a rounder of /nop.  Control then jumps to label 9.
 *
 * Label 2 follows; it is the target taken from the main row pass when the
 * row group at 64(%0) was all zero.  The last row group is processed
 * there, leaving via 3f when it is zero as well.
 */
929 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
930 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
931 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
932 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
933 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
934         "jmp 9f                                 \n\t"
935
936         "#.balign 16                            \n\t"\
937         "2:                                     \n\t"
938 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
939
940 #undef IDCT
941 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
942         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
943         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
944         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
945         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
946         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
947         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
948         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
949         #rounder ", %%mm4                       \n\t"\
950         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
951         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
952         #rounder ", %%mm0                       \n\t"\
953         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
954         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
955         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
956         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
957         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
958         "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
959         "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
960         "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
961         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
962         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
963         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
964         "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
965         "psrad $" #shift ", %%mm7               \n\t"\
966         "psrad $" #shift ", %%mm4               \n\t"\
967         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
968         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
969         "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
970         "psrad $" #shift ", %%mm0               \n\t"\
971         "psrad $" #shift ", %%mm2               \n\t"\
972         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
973         "movd %%mm7, " #dst "                   \n\t"\
974         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
975         "movd %%mm0, 16+" #dst "                \n\t"\
976         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
977         "movd %%mm2, 96+" #dst "                \n\t"\
978         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
979         "movd %%mm4, 112+" #dst "               \n\t"\
980         "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
981         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
982         "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
983         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
984         "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
985         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
986         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
987         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
988         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
989         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
990         "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
991         "psrad $" #shift ", %%mm2               \n\t"\
992         "psrad $" #shift ", %%mm5               \n\t"\
993         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
994         "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
995         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
996         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
997         "psrad $" #shift ", %%mm6               \n\t"\
998         "psrad $" #shift ", %%mm4               \n\t"\
999         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
1000         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1001         "movd %%mm2, 32+" #dst "                \n\t"\
1002         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1003         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1004         "movd %%mm6, 48+" #dst "                \n\t"\
1005         "movd %%mm4, 64+" #dst "                \n\t"\
1006         "movd %%mm5, 80+" #dst "                \n\t"
1007
1008 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1009 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1010 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1011 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1012 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1013         "jmp 9f                                 \n\t"
1014
1015         "#.balign 16                            \n\t"\
1016         "3:                                     \n\t"
1017 #undef IDCT
1018 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1019         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1020         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1021         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1022         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1023         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1024         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1025         #rounder ", %%mm4                       \n\t"\
1026         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1027         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1028         #rounder ", %%mm0                       \n\t"\
1029         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1030         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1031         "movq 64(%2), %%mm3                     \n\t"\
1032         "pmaddwd %%mm2, %%mm3                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1033         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1034         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1035         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1036         "psrad $" #shift ", %%mm7               \n\t"\
1037         "psrad $" #shift ", %%mm4               \n\t"\
1038         "movq %%mm0, %%mm1                      \n\t" /* A1             a1 */\
1039         "paddd %%mm3, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1040         "psubd %%mm3, %%mm1                     \n\t" /* A1-B1          a1-b1 */\
1041         "psrad $" #shift ", %%mm0               \n\t"\
1042         "psrad $" #shift ", %%mm1               \n\t"\
1043         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1044         "movd %%mm7, " #dst "                   \n\t"\
1045         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1046         "movd %%mm0, 16+" #dst "                \n\t"\
1047         "packssdw %%mm1, %%mm1                  \n\t" /* A1-B1  a1-b1 */\
1048         "movd %%mm1, 96+" #dst "                \n\t"\
1049         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1050         "movd %%mm4, 112+" #dst "               \n\t"\
1051         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1052         "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1053         "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1054         "movq %%mm5, %%mm1                      \n\t" /* A2             a2 */\
1055         "paddd %%mm4, %%mm1                     \n\t" /* A2+B2          a2+b2 */\
1056         "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
1057         "psrad $" #shift ", %%mm1               \n\t"\
1058         "psrad $" #shift ", %%mm5               \n\t"\
1059         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1060         "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1061         "psubd %%mm2, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
1062         "psrad $" #shift ", %%mm6               \n\t"\
1063         "psrad $" #shift ", %%mm4               \n\t"\
1064         "packssdw %%mm1, %%mm1                  \n\t" /* A2+B2  a2+b2 */\
1065         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1066         "movd %%mm1, 32+" #dst "                \n\t"\
1067         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1068         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1069         "movd %%mm6, 48+" #dst "                \n\t"\
1070         "movd %%mm4, 64+" #dst "                \n\t"\
1071         "movd %%mm5, 80+" #dst "                \n\t"
1072
1073
1074 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1075 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1076 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1077 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1078 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1079         "jmp 9f                                 \n\t"
1080
1081         "#.balign 16                            \n\t"\
1082         "5:                                     \n\t"
1083 #undef IDCT
1084 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1085         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1086         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1087         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1088         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1089         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1090         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1091         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1092         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1093         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1094         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1095         #rounder ", %%mm4                       \n\t"\
1096         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1097         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1098         #rounder ", %%mm0                       \n\t"\
1099         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1100         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1101         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1102         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1103         "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1104         "movq 8+" #src4 ", %%mm3                \n\t" /* R6     R2      r6      r2 */\
1105         "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1106         "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1107         "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1108         "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1109         "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
1110         "pmaddwd %%mm3, %%mm7                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1111         "pmaddwd 40(%2), %%mm3                  \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1112         #rounder ", %%mm1                       \n\t"\
1113         "paddd %%mm1, %%mm7                     \n\t" /* A0             a0 */\
1114         "paddd %%mm1, %%mm1                     \n\t" /* 2C0            2c0 */\
1115         #rounder ", %%mm2                       \n\t"\
1116         "psubd %%mm7, %%mm1                     \n\t" /* A3             a3 */\
1117         "paddd %%mm2, %%mm3                     \n\t" /* A1             a1 */\
1118         "paddd %%mm2, %%mm2                     \n\t" /* 2C1            2c1 */\
1119         "psubd %%mm3, %%mm2                     \n\t" /* A2             a2 */\
1120         "psrad $" #shift ", %%mm4               \n\t"\
1121         "psrad $" #shift ", %%mm7               \n\t"\
1122         "psrad $" #shift ", %%mm3               \n\t"\
1123         "packssdw %%mm7, %%mm4                  \n\t" /* A0     a0 */\
1124         "movq %%mm4, " #dst "                   \n\t"\
1125         "psrad $" #shift ", %%mm0               \n\t"\
1126         "packssdw %%mm3, %%mm0                  \n\t" /* A1     a1 */\
1127         "movq %%mm0, 16+" #dst "                \n\t"\
1128         "movq %%mm0, 96+" #dst "                \n\t"\
1129         "movq %%mm4, 112+" #dst "               \n\t"\
1130         "psrad $" #shift ", %%mm5               \n\t"\
1131         "psrad $" #shift ", %%mm6               \n\t"\
1132         "psrad $" #shift ", %%mm2               \n\t"\
1133         "packssdw %%mm2, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1134         "movq %%mm5, 32+" #dst "                \n\t"\
1135         "psrad $" #shift ", %%mm1               \n\t"\
1136         "packssdw %%mm1, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1137         "movq %%mm6, 48+" #dst "                \n\t"\
1138         "movq %%mm6, 64+" #dst "                \n\t"\
1139         "movq %%mm5, 80+" #dst "                \n\t"   
1140         
1141
1142 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1143 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1144 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1145 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1146 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1147         "jmp 9f                                 \n\t"
1148
1149
1150         "#.balign 16                            \n\t"\
1151         "1:                                     \n\t"
1152 #undef IDCT
1153 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1154         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1155         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1156         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1157         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1158         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1159         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1160         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1161         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1162         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1163         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1164         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1165         #rounder ", %%mm4                       \n\t"\
1166         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1167         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1168         #rounder ", %%mm0                       \n\t"\
1169         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1170         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1171         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1172         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1173         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1174         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1175         "movq 64(%2), %%mm1                     \n\t"\
1176         "pmaddwd %%mm2, %%mm1                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1177         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1178         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1179         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1180         "psrad $" #shift ", %%mm7               \n\t"\
1181         "psrad $" #shift ", %%mm4               \n\t"\
1182         "movq %%mm0, %%mm3                      \n\t" /* A1             a1 */\
1183         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1184         "psubd %%mm1, %%mm3                     \n\t" /* A1-B1          a1-b1 */\
1185         "psrad $" #shift ", %%mm0               \n\t"\
1186         "psrad $" #shift ", %%mm3               \n\t"\
1187         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1188         "movd %%mm7, " #dst "                   \n\t"\
1189         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1190         "movd %%mm0, 16+" #dst "                \n\t"\
1191         "packssdw %%mm3, %%mm3                  \n\t" /* A1-B1  a1-b1 */\
1192         "movd %%mm3, 96+" #dst "                \n\t"\
1193         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1194         "movd %%mm4, 112+" #dst "               \n\t"\
1195         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1196         "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1197         "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1198         "movq %%mm5, %%mm3                      \n\t" /* A2             a2 */\
1199         "paddd %%mm4, %%mm3                     \n\t" /* A2+B2          a2+b2 */\
1200         "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
1201         "psrad $" #shift ", %%mm3               \n\t"\
1202         "psrad $" #shift ", %%mm5               \n\t"\
1203         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1204         "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1205         "psubd %%mm2, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
1206         "psrad $" #shift ", %%mm6               \n\t"\
1207         "packssdw %%mm3, %%mm3                  \n\t" /* A2+B2  a2+b2 */\
1208         "movd %%mm3, 32+" #dst "                \n\t"\
1209         "psrad $" #shift ", %%mm4               \n\t"\
1210         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1211         "movd %%mm6, 48+" #dst "                \n\t"\
1212         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1213         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1214         "movd %%mm4, 64+" #dst "                \n\t"\
1215         "movd %%mm5, 80+" #dst "                \n\t"
1216         
1217
1218 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1219 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1220 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1221 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1222 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1223         "jmp 9f                                 \n\t"
1224
1225
1226         "#.balign 16                            \n\t"
1227         "7:                                     \n\t"
1228 #undef IDCT
1229 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1230         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1231         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1232         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1233         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1234         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1235         #rounder ", %%mm4                       \n\t"\
1236         #rounder ", %%mm0                       \n\t"\
1237         "psrad $" #shift ", %%mm4               \n\t"\
1238         "psrad $" #shift ", %%mm0               \n\t"\
1239         "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1240         "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1241         "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1242         "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1243         "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1244         "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
1245         #rounder ", %%mm1                       \n\t"\
1246         #rounder ", %%mm2                       \n\t"\
1247         "psrad $" #shift ", %%mm1               \n\t"\
1248         "packssdw %%mm1, %%mm4                  \n\t" /* A0     a0 */\
1249         "movq %%mm4, " #dst "                   \n\t"\
1250         "psrad $" #shift ", %%mm2               \n\t"\
1251         "packssdw %%mm2, %%mm0                  \n\t" /* A1     a1 */\
1252         "movq %%mm0, 16+" #dst "                \n\t"\
1253         "movq %%mm0, 96+" #dst "                \n\t"\
1254         "movq %%mm4, 112+" #dst "               \n\t"\
1255         "movq %%mm0, 32+" #dst "                \n\t"\
1256         "movq %%mm4, 48+" #dst "                \n\t"\
1257         "movq %%mm4, 64+" #dst "                \n\t"\
1258         "movq %%mm0, 80+" #dst "                \n\t"   
1259
1260 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1261 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1262 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1263 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1264 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1265
1266
1267 #endif
1268
1269 /*
1270 Input
1271  00 40 04 44 20 60 24 64
1272  10 30 14 34 50 70 54 74
1273  01 41 03 43 21 61 23 63
1274  11 31 13 33 51 71 53 73
1275  02 42 06 46 22 62 26 66
1276  12 32 16 36 52 72 56 76
1277  05 45 07 47 25 65 27 67
1278  15 35 17 37 55 75 57 77
1279   
1280 Temp
1281  00 04 10 14 20 24 30 34
1282  40 44 50 54 60 64 70 74
1283  01 03 11 13 21 23 31 33
1284  41 43 51 53 61 63 71 73
1285  02 06 12 16 22 26 32 36
1286  42 46 52 56 62 66 72 76
1287  05 07 15 17 25 27 35 37
1288  45 47 55 57 65 67 75 77
1289 */
1290
1291 "9: \n\t"
1292                 :: "r" (block), "r" (temp), "r" (coeffs)
1293                 : "%eax"
1294         );
1295 }
1296
/**
 * Exported entry point for the MMX "simple" IDCT.
 *
 * Forwards straight to the file-local idct() routine. That routine stores
 * its final output through the same pointer it receives, so the contents
 * of @p block are overwritten with the transform result.
 *
 * @param block coefficient block to transform in place
 *              (NOTE(review): presumably 64 int16_t values, 8-byte
 *              aligned for the movq accesses - confirm against callers)
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1301
1302 //FIXME merge add/put into the idct
1303
1304 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1305 {
1306     idct(block);
1307     put_pixels_clamped_mmx(block, dest, line_size);
1308 }
1309 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1310 {
1311     idct(block);
1312     add_pixels_clamped_mmx(block, dest, line_size);
1313 }