synfig-core/tags/synfig_0_61_03/synfig-core/src/modules/mod_libavcodec/libavcodec/simple_idct.c

   1 /*
   2  * Simple IDCT
   3  *
   4  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  */
  20
  21 /**
  22  * @file simple_idct.c
  23  * simpleidct in C.
  24  */
  25
/*
  based upon some commented-out C code from mpeg2dec (idct_mmx.c
  written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
 */
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33
/*
 * Fixed-point IDCT coefficients W1..W7 together with the right-shift
 * amounts applied after the row pass (ROW_SHIFT) and after the column
 * pass (COL_SHIFT).
 *
 * The first set is disabled by "#if 0" and is kept for reference only;
 * the second set (after #else) is the one that is actually compiled.
 */
#if 0
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
#define ROW_SHIFT 8
#define COL_SHIFT 17
#else
/* Wi is documented as cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, i = 1..7. */
#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* NOTE(review): for i = 4 the formula evaluates to 16384, but the value
   used here is 16383 -- presumably deliberate; confirm before changing. */
#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
#endif
  55
/*
 * MUL16 / MAC16: multiply and multiply-accumulate helpers.
 *
 * When ARCH_POWERPC_405 is defined they are implemented with the
 * maclhw / mullhw instructions through inline asm; otherwise they expand
 * to plain C expressions.
 *
 * Usage notes:
 *  - the asm variants already end with a ';', so "MAC16(...);" yields an
 *    additional empty statement;
 *  - the C variants are not wrapped in parentheses as a whole and "rt" is
 *    not parenthesized, so use them only as stand-alone statements with a
 *    plain lvalue as first argument.
 */
#if defined(ARCH_POWERPC_405)

/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) \
    asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));

/* signed 16x16 -> 32 multiply */
#define MUL16(rt, ra, rb) \
    asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb));

#else

/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) rt += (ra) * (rb)

/* signed 16x16 -> 32 multiply */
#define MUL16(rt, ra, rb) rt = (ra) * (rb)

#endif
  75
  76 static inline void idctRowCondDC (DCTELEM * row)
  77 {
  78         int a0, a1, a2, a3, b0, b1, b2, b3;
  79 #ifdef FAST_64BIT
  80         uint64_t temp;
  81 #else
  82         uint32_t temp;
  83 #endif
  84
  85 #ifdef FAST_64BIT
  86 #ifdef WORDS_BIGENDIAN
  87 #define ROW0_MASK 0xffff000000000000LL
  88 #else
  89 #define ROW0_MASK 0xffffLL
  90 #endif
  91         if(sizeof(DCTELEM)==2){
  92             if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
  93                   ((uint64_t *)row)[1]) == 0) {
  94                 temp = (row[0] << 3) & 0xffff;
  95                 temp += temp << 16;
  96                 temp += temp << 32;
  97                 ((uint64_t *)row)[0] = temp;
  98                 ((uint64_t *)row)[1] = temp;
  99                 return;
 100             }
 101         }else{
 102             if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
 103                 row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
 104                 return;
 105             }
 106         }
 107 #else
 108         if(sizeof(DCTELEM)==2){
 109             if (!(((uint32_t*)row)[1] |
 110                   ((uint32_t*)row)[2] |
 111                   ((uint32_t*)row)[3] |
 112                   row[1])) {
 113                 temp = (row[0] << 3) & 0xffff;
 114                 temp += temp << 16;
 115                 ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
 116                 ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
 117                 return;
 118             }
 119         }else{
 120             if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
 121                 row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
 122                 return;
 123             }
 124         }
 125 #endif
 126
 127         a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 128         a1 = a0;
 129         a2 = a0;
 130         a3 = a0;
 131
 132         /* no need to optimize : gcc does it */
 133         a0 += W2 * row[2];
 134         a1 += W6 * row[2];
 135         a2 -= W6 * row[2];
 136         a3 -= W2 * row[2];
 137
 138         MUL16(b0, W1, row[1]);
 139         MAC16(b0, W3, row[3]);
 140         MUL16(b1, W3, row[1]);
 141         MAC16(b1, -W7, row[3]);
 142         MUL16(b2, W5, row[1]);
 143         MAC16(b2, -W1, row[3]);
 144         MUL16(b3, W7, row[1]);
 145         MAC16(b3, -W5, row[3]);
 146
 147 #ifdef FAST_64BIT
 148         temp = ((uint64_t*)row)[1];
 149 #else
 150         temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 151 #endif
 152         if (temp != 0) {
 153             a0 += W4*row[4] + W6*row[6];
 154             a1 += - W4*row[4] - W2*row[6];
 155             a2 += - W4*row[4] + W2*row[6];
 156             a3 += W4*row[4] - W6*row[6];
 157
 158             MAC16(b0, W5, row[5]);
 159             MAC16(b0, W7, row[7]);
 160
 161             MAC16(b1, -W1, row[5]);
 162             MAC16(b1, -W5, row[7]);
 163
 164             MAC16(b2, W7, row[5]);
 165             MAC16(b2, W3, row[7]);
 166
 167             MAC16(b3, W3, row[5]);
 168             MAC16(b3, -W1, row[7]);
 169         }
 170
 171         row[0] = (a0 + b0) >> ROW_SHIFT;
 172         row[7] = (a0 - b0) >> ROW_SHIFT;
 173         row[1] = (a1 + b1) >> ROW_SHIFT;
 174         row[6] = (a1 - b1) >> ROW_SHIFT;
 175         row[2] = (a2 + b2) >> ROW_SHIFT;
 176         row[5] = (a2 - b2) >> ROW_SHIFT;
 177         row[3] = (a3 + b3) >> ROW_SHIFT;
 178         row[4] = (a3 - b3) >> ROW_SHIFT;
 179 }
 180
 181 static inline void idctSparseColPut (uint8_t *dest, int line_size,
 182                                      DCTELEM * col)
 183 {
 184         int a0, a1, a2, a3, b0, b1, b2, b3;
 185         uint8_t *cm = cropTbl + MAX_NEG_CROP;
 186
 187         /* XXX: I did that only to give same values as previous code */
 188         a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
 189         a1 = a0;
 190         a2 = a0;
 191         a3 = a0;
 192
 193         a0 +=  + W2*col[8*2];
 194         a1 +=  + W6*col[8*2];
 195         a2 +=  - W6*col[8*2];
 196         a3 +=  - W2*col[8*2];
 197
 198         MUL16(b0, W1, col[8*1]);
 199         MUL16(b1, W3, col[8*1]);
 200         MUL16(b2, W5, col[8*1]);
 201         MUL16(b3, W7, col[8*1]);
 202
 203         MAC16(b0, + W3, col[8*3]);
 204         MAC16(b1, - W7, col[8*3]);
 205         MAC16(b2, - W1, col[8*3]);
 206         MAC16(b3, - W5, col[8*3]);
 207
 208         if(col[8*4]){
 209             a0 += + W4*col[8*4];
 210             a1 += - W4*col[8*4];
 211             a2 += - W4*col[8*4];
 212             a3 += + W4*col[8*4];
 213         }
 214
 215         if (col[8*5]) {
 216             MAC16(b0, + W5, col[8*5]);
 217             MAC16(b1, - W1, col[8*5]);
 218             MAC16(b2, + W7, col[8*5]);
 219             MAC16(b3, + W3, col[8*5]);
 220         }
 221
 222         if(col[8*6]){
 223             a0 += + W6*col[8*6];
 224             a1 += - W2*col[8*6];
 225             a2 += + W2*col[8*6];
 226             a3 += - W6*col[8*6];
 227         }
 228
 229         if (col[8*7]) {
 230             MAC16(b0, + W7, col[8*7]);
 231             MAC16(b1, - W5, col[8*7]);
 232             MAC16(b2, + W3, col[8*7]);
 233             MAC16(b3, - W1, col[8*7]);
 234         }
 235
 236         dest[0] = cm[(a0 + b0) >> COL_SHIFT];
 237         dest += line_size;
 238         dest[0] = cm[(a1 + b1) >> COL_SHIFT];
 239         dest += line_size;
 240         dest[0] = cm[(a2 + b2) >> COL_SHIFT];
 241         dest += line_size;
 242         dest[0] = cm[(a3 + b3) >> COL_SHIFT];
 243         dest += line_size;
 244         dest[0] = cm[(a3 - b3) >> COL_SHIFT];
 245         dest += line_size;
 246         dest[0] = cm[(a2 - b2) >> COL_SHIFT];
 247         dest += line_size;
 248         dest[0] = cm[(a1 - b1) >> COL_SHIFT];
 249         dest += line_size;
 250         dest[0] = cm[(a0 - b0) >> COL_SHIFT];
 251 }
 252
 253 static inline void idctSparseColAdd (uint8_t *dest, int line_size,
 254                                      DCTELEM * col)
 255 {
 256         int a0, a1, a2, a3, b0, b1, b2, b3;
 257         uint8_t *cm = cropTbl + MAX_NEG_CROP;
 258
 259         /* XXX: I did that only to give same values as previous code */
 260         a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
 261         a1 = a0;
 262         a2 = a0;
 263         a3 = a0;
 264
 265         a0 +=  + W2*col[8*2];
 266         a1 +=  + W6*col[8*2];
 267         a2 +=  - W6*col[8*2];
 268         a3 +=  - W2*col[8*2];
 269
 270         MUL16(b0, W1, col[8*1]);
 271         MUL16(b1, W3, col[8*1]);
 272         MUL16(b2, W5, col[8*1]);
 273         MUL16(b3, W7, col[8*1]);
 274
 275         MAC16(b0, + W3, col[8*3]);
 276         MAC16(b1, - W7, col[8*3]);
 277         MAC16(b2, - W1, col[8*3]);
 278         MAC16(b3, - W5, col[8*3]);
 279
 280         if(col[8*4]){
 281             a0 += + W4*col[8*4];
 282             a1 += - W4*col[8*4];
 283             a2 += - W4*col[8*4];
 284             a3 += + W4*col[8*4];
 285         }
 286
 287         if (col[8*5]) {
 288             MAC16(b0, + W5, col[8*5]);
 289             MAC16(b1, - W1, col[8*5]);
 290             MAC16(b2, + W7, col[8*5]);
 291             MAC16(b3, + W3, col[8*5]);
 292         }
 293
 294         if(col[8*6]){
 295             a0 += + W6*col[8*6];
 296             a1 += - W2*col[8*6];
 297             a2 += + W2*col[8*6];
 298             a3 += - W6*col[8*6];
 299         }
 300
 301         if (col[8*7]) {
 302             MAC16(b0, + W7, col[8*7]);
 303             MAC16(b1, - W5, col[8*7]);
 304             MAC16(b2, + W3, col[8*7]);
 305             MAC16(b3, - W1, col[8*7]);
 306         }
 307
 308         dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
 309         dest += line_size;
 310         dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
 311         dest += line_size;
 312         dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
 313         dest += line_size;
 314         dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
 315         dest += line_size;
 316         dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
 317         dest += line_size;
 318         dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
 319         dest += line_size;
 320         dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
 321         dest += line_size;
 322         dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
 323 }
 324
 325 static inline void idctSparseCol (DCTELEM * col)
 326 {
 327         int a0, a1, a2, a3, b0, b1, b2, b3;
 328
 329         /* XXX: I did that only to give same values as previous code */
 330         a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
 331         a1 = a0;
 332         a2 = a0;
 333         a3 = a0;
 334
 335         a0 +=  + W2*col[8*2];
 336         a1 +=  + W6*col[8*2];
 337         a2 +=  - W6*col[8*2];
 338         a3 +=  - W2*col[8*2];
 339
 340         MUL16(b0, W1, col[8*1]);
 341         MUL16(b1, W3, col[8*1]);
 342         MUL16(b2, W5, col[8*1]);
 343         MUL16(b3, W7, col[8*1]);
 344
 345         MAC16(b0, + W3, col[8*3]);
 346         MAC16(b1, - W7, col[8*3]);
 347         MAC16(b2, - W1, col[8*3]);
 348         MAC16(b3, - W5, col[8*3]);
 349
 350         if(col[8*4]){
 351             a0 += + W4*col[8*4];
 352             a1 += - W4*col[8*4];
 353             a2 += - W4*col[8*4];
 354             a3 += + W4*col[8*4];
 355         }
 356
 357         if (col[8*5]) {
 358             MAC16(b0, + W5, col[8*5]);
 359             MAC16(b1, - W1, col[8*5]);
 360             MAC16(b2, + W7, col[8*5]);
 361             MAC16(b3, + W3, col[8*5]);
 362         }
 363
 364         if(col[8*6]){
 365             a0 += + W6*col[8*6];
 366             a1 += - W2*col[8*6];
 367             a2 += + W2*col[8*6];
 368             a3 += - W6*col[8*6];
 369         }
 370
 371         if (col[8*7]) {
 372             MAC16(b0, + W7, col[8*7]);
 373             MAC16(b1, - W5, col[8*7]);
 374             MAC16(b2, + W3, col[8*7]);
 375             MAC16(b3, - W1, col[8*7]);
 376         }
 377
 378         col[0 ] = ((a0 + b0) >> COL_SHIFT);
 379         col[8 ] = ((a1 + b1) >> COL_SHIFT);
 380         col[16] = ((a2 + b2) >> COL_SHIFT);
 381         col[24] = ((a3 + b3) >> COL_SHIFT);
 382         col[32] = ((a3 - b3) >> COL_SHIFT);
 383         col[40] = ((a2 - b2) >> COL_SHIFT);
 384         col[48] = ((a1 - b1) >> COL_SHIFT);
 385         col[56] = ((a0 - b0) >> COL_SHIFT);
 386 }
 387
 388 void simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
 389 {
 390     int i;
 391     for(i=0; i<8; i++)
 392         idctRowCondDC(block + i*8);
 393
 394     for(i=0; i<8; i++)
 395         idctSparseColPut(dest + i, line_size, block + i);
 396 }
 397
 398 void simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
 399 {
 400     int i;
 401     for(i=0; i<8; i++)
 402         idctRowCondDC(block + i*8);
 403
 404     for(i=0; i<8; i++)
 405         idctSparseColAdd(dest + i, line_size, block + i);
 406 }
 407
 408 void simple_idct(DCTELEM *block)
 409 {
 410     int i;
 411     for(i=0; i<8; i++)
 412         idctRowCondDC(block + i*8);
 413
 414     for(i=0; i<8; i++)
 415         idctSparseCol(block + i);
 416 }
 417
/* 2x4x8 idct */

/* CN_SHIFT: number of fractional bits of the column constants.
   C_FIX(x): x converted to fixed point with CN_SHIFT fractional bits,
   rounded by adding 0.5 before truncation. */
#define CN_SHIFT 12
#define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))
#define C1 C_FIX(0.6532814824)
#define C2 C_FIX(0.2705980501)

/* row idct is multiplied by 16 * sqrt(2.0), col idct4 is normalized,
   and the butterfly must be multiplied by 0.5 * sqrt(2.0) */
#define C_SHIFT (4+1+12)
 428
 429 static inline void idct4col(uint8_t *dest, int line_size, const DCTELEM *col)
 430 {
 431     int c0, c1, c2, c3, a0, a1, a2, a3;
 432     const uint8_t *cm = cropTbl + MAX_NEG_CROP;
 433
 434     a0 = col[8*0];
 435     a1 = col[8*2];
 436     a2 = col[8*4];
 437     a3 = col[8*6];
 438     c0 = ((a0 + a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
 439     c2 = ((a0 - a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
 440     c1 = a1 * C1 + a3 * C2;
 441     c3 = a1 * C2 - a3 * C1;
 442     dest[0] = cm[(c0 + c1) >> C_SHIFT];
 443     dest += line_size;
 444     dest[0] = cm[(c2 + c3) >> C_SHIFT];
 445     dest += line_size;
 446     dest[0] = cm[(c2 - c3) >> C_SHIFT];
 447     dest += line_size;
 448     dest[0] = cm[(c0 - c1) >> C_SHIFT];
 449 }
 450
/* Butterfly on two elements that are 8 apart: ptr[k] is replaced by the
   sum and ptr[8 + k] by the difference of the two original values.
   Expects a variable named "ptr" to be in scope where the macro is used;
   expands to a brace-enclosed block. */
#define BF(k) \
{\
    int a0, a1;\
    a0 = ptr[k];\
    a1 = ptr[8 + k];\
    ptr[k] = a0 + a1;\
    ptr[8 + k] = a0 - a1;\
}
 459
 460 /* only used by DV codec. The input must be interlaced. 128 is added
 461    to the pixels before clamping to avoid systematic error
 462    (1024*sqrt(2)) offset would be needed otherwise. */
 463 /* XXX: I think a 1.0/sqrt(2) normalization should be needed to
 464    compensate the extra butterfly stage - I don't have the full DV
 465    specification */
 466 void simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block)
 467 {
 468     int i;
 469     DCTELEM *ptr;
 470
 471     /* butterfly */
 472     ptr = block;
 473     for(i=0;i<4;i++) {
 474         BF(0);
 475         BF(1);
 476         BF(2);
 477         BF(3);
 478         BF(4);
 479         BF(5);
 480         BF(6);
 481         BF(7);
 482         ptr += 2 * 8;
 483     }
 484
 485     /* IDCT8 on each line */
 486     for(i=0; i<8; i++) {
 487         idctRowCondDC(block + i*8);
 488     }
 489
 490     /* IDCT4 and store */
 491     for(i=0;i<8;i++) {
 492         idct4col(dest + i, 2 * line_size, block + i);
 493         idct4col(dest + line_size + i, 2 * line_size, block + 8 + i);
 494     }
 495 }
 496
/* 8x4 & 4x8 WMV2 IDCT */
/* The column constants are redefined from here on: C_FIX now also
   multiplies by 1.414213562 before converting to fixed point with
   CN_SHIFT fractional bits, and C3 is added. */
#undef CN_SHIFT
#undef C_SHIFT
#undef C_FIX
#undef C1
#undef C2
#define CN_SHIFT 12
#define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
#define C1 C_FIX(0.6532814824)
#define C2 C_FIX(0.2705980501)
#define C3 C_FIX(0.5)
/* total right shift applied to the column results */
#define C_SHIFT (4+1+12)
 509 static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col)
 510 {
 511     int c0, c1, c2, c3, a0, a1, a2, a3;
 512     const uint8_t *cm = cropTbl + MAX_NEG_CROP;
 513
 514     a0 = col[8*0];
 515     a1 = col[8*1];
 516     a2 = col[8*2];
 517     a3 = col[8*3];
 518     c0 = (a0 + a2)*C3 + (1 << (C_SHIFT - 1));
 519     c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1));
 520     c1 = a1 * C1 + a3 * C2;
 521     c3 = a1 * C2 - a3 * C1;
 522     dest[0] = cm[dest[0] + ((c0 + c1) >> C_SHIFT)];
 523     dest += line_size;
 524     dest[0] = cm[dest[0] + ((c2 + c3) >> C_SHIFT)];
 525     dest += line_size;
 526     dest[0] = cm[dest[0] + ((c2 - c3) >> C_SHIFT)];
 527     dest += line_size;
 528     dest[0] = cm[dest[0] + ((c0 - c1) >> C_SHIFT)];
 529 }
 530
/* Constants for the 4-point row IDCT.
   R_FIX(x): x * 1.414213562 converted to fixed point with RN_SHIFT
   fractional bits, rounded by adding 0.5 before truncation.
   R_SHIFT: right shift applied to the row results. */
#define RN_SHIFT 15
#define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
#define R1 R_FIX(0.6532814824)
#define R2 R_FIX(0.2705980501)
#define R3 R_FIX(0.5)
#define R_SHIFT 11
 537 static inline void idct4row(DCTELEM *row)
 538 {
 539     int c0, c1, c2, c3, a0, a1, a2, a3;
 540     //const uint8_t *cm = cropTbl + MAX_NEG_CROP;
 541
 542     a0 = row[0];
 543     a1 = row[1];
 544     a2 = row[2];
 545     a3 = row[3];
 546     c0 = (a0 + a2)*R3 + (1 << (R_SHIFT - 1));
 547     c2 = (a0 - a2)*R3 + (1 << (R_SHIFT - 1));
 548     c1 = a1 * R1 + a3 * R2;
 549     c3 = a1 * R2 - a3 * R1;
 550     row[0]= (c0 + c1) >> R_SHIFT;
 551     row[1]= (c2 + c3) >> R_SHIFT;
 552     row[2]= (c2 - c3) >> R_SHIFT;
 553     row[3]= (c0 - c1) >> R_SHIFT;
 554 }
 555
 556 void simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block)
 557 {
 558     int i;
 559
 560     /* IDCT8 on each line */
 561     for(i=0; i<4; i++) {
 562         idctRowCondDC(block + i*8);
 563     }
 564
 565     /* IDCT4 and store */
 566     for(i=0;i<8;i++) {
 567         idct4col_add(dest + i, line_size, block + i);
 568     }
 569 }
 570
 571 void simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block)
 572 {
 573     int i;
 574
 575     /* IDCT4 on each line */
 576     for(i=0; i<8; i++) {
 577         idct4row(block + i*8);
 578     }
 579
 580     /* IDCT8 and store */
 581     for(i=0; i<4; i++){
 582         idctSparseColAdd(dest + i, line_size, block + i);
 583     }
 584 }
 585