2 Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 doVertDefFilter Ec Ec e e
33 doHorizDefFilter Ec Ec e e
35 Vertical RKAlgo1 E a a
36 Horizontal RKAlgo1 a a
39 LinIpolDeinterlace e E E*
40 CubicIpolDeinterlace a e e*
41 LinBlendDeinterlace e E E*
42 MedianDeinterlace# E Ec Ec
45 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
46 # more or less self-invented filters, so the exactness isn't too meaningful
47 E = Exact implementation
48 e = almost exact implementation (slightly different rounding,...)
49 a = alternative / approximate impl
50 c = checked against the other implementations (-vo md5)
55 reduce the time wasted on the mem transfer
56 unroll stuff if instructions depend too much on the prior one
57 move YScale thing to the end instead of fixing QP
58 write a faster and higher quality deblocking filter :)
59 make the mainloop more flexible (variable number of blocks at once
60 (the if/else stuff per block is slowing things down)
61 compare the quality & speed of all filters
64 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
68 //Changelog: use the CVS log
82 //#define DEBUG_BRIGHTNESS
84 #include "../fastmemcpy.h"
86 #include "postprocess.h"
87 #include "postprocess_internal.h"
89 #include "mangle.h" //FIXME should be supressed
// Fallback that maps memalign() onto plain malloc(); the alignment argument `a` is discarded.
// NOTE(review): the preprocessor guard that selects this fallback is not visible in this listing.
92 #define memalign(a,b) malloc(b)
// Function-like helpers. Every argument is expanded more than once, so callers
// must not pass expressions with side effects.
95 #define MIN(a,b) ((a) > (b) ? (b) : (a))
96 #define MAX(a,b) ((a) < (b) ? (b) : (a))
97 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// SIGN() yields -1 for zero as well as for negative values.
98 #define SIGN(a) ((a) > 0 ? 1 : -1)
// Size of the scratch buffer pp_get_mode_by_name_and_quality() copies the mode string into.
100 #define GET_MODE_BUFFER_SIZE 500
// Capacity of the per-filter option pointer array; one slot is kept for the NULL terminator.
101 #define OPTIONS_ARRAY_SIZE 10
103 #define TEMP_STRIDE 8
104 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
// 8-byte-aligned 64-bit constants. The wNN values repeat a 16-bit word four times,
// the bNN values repeat a byte eight times.
// NOTE(review): presumably consumed by the MMX template code; no use is visible here
// apart from unusedVariableWarningFixer().
107 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
108 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
109 static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
110 static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
111 static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
112 static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
113 static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
// 768-entry lookup table filled by global_init(); clip_tab points at its middle
// third so it can be indexed with values from -256 to 511.
117 static uint8_t clip_table[3*256];
118 static uint8_t * const clip_tab= clip_table + 256;
// Diagnostic level; values above 1 enable the printf() tracing in the mode parser.
120 static int verbose= 0;
122 static const int deringThreshold= 20;
// Table of all filters known to the mode-string parser, terminated by an all-NULL/0 entry.
// Each entry starts with the short name and the long name and ends with the filter's mode bit.
// NOTE(review): the three integers in between are presumably chromDefault, minLumQuality and
// minChromQuality (the fields the parser reads) - confirm the order against struct PPFilter.
125 static struct PPFilter filters[]=
127 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
128 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
129 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
130 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
131 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
132 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
133 {"dr", "dering", 1, 5, 6, DERING},
134 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
135 {"lb", "linblenddeint", 1, 1, 4, INTERPOLATION_LINEAR_BLEND_DEINT_FILTER},
136 {"li", "linipoldeint", 1, 1, 4, INTERPOLATION_LINEAR_IPOL_DEINT_FILTER},
137 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
138 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
139 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
140 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
141 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
142 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
143 {NULL, NULL,0,0,0,0} //End Marker
// Alias table used by the mode-string parser: entries come in pairs, the alias at
// index 2*i and the filter list it expands to at index 2*i+1. The parser stops at
// the first NULL alias (the terminator line is not visible in this listing).
// NOTE(review): every expansion includes "tmpnoise:a:150:200:400", while the help text
// further down describes "default" and "fast" without tmpnoise - confirm which is intended.
146 static char *replaceTable[]=
148 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
149 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
150 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
151 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
// Touches every packed constant so the compiler does not report them as unused.
// The only possible effect is storing 0 into b00, which already holds 0.
156 static inline void unusedVariableWarningFixer()
158 if(w05 + w20 + b00 + b01 + b02 + b08 + b80 == 0) b00=0;
// Emits the x86 "prefetchnta" instruction on the address held in operand 0.
// NOTE(review): the operand list is not visible here; presumably operand 0 is `p`.
164 static inline void prefetchnta(void *p)
166 asm volatile( "prefetchnta (%0)\n\t"
// Emits the x86 "prefetcht0" instruction on the address held in operand 0.
// NOTE(review): the operand list is not visible here; presumably operand 0 is `p`.
171 static inline void prefetcht0(void *p)
173 asm volatile( "prefetcht0 (%0)\n\t"
// Emits the x86 "prefetcht1" instruction on the address held in operand 0.
// NOTE(review): the operand list is not visible here; presumably operand 0 is `p`.
178 static inline void prefetcht1(void *p)
180 asm volatile( "prefetcht1 (%0)\n\t"
// Emits the x86 "prefetcht2" instruction on the address held in operand 0.
// NOTE(review): the operand list is not visible here; presumably operand 0 is `p`.
185 static inline void prefetcht2(void *p)
187 asm volatile( "prefetcht2 (%0)\n\t"
193 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
196 * Check if the given 8x8 Block is mostly "flat"
// Counts horizontally adjacent pixel pairs whose difference is at most dcOffset and
// reports whether that count exceeds c->ppMode.flatnessThreshold.
// Returns non-zero when the block counts as flat, 0 otherwise.
// NOTE(review): the declarations of numEq/y and the per-row advance of src are not
// visible in this listing; presumably src moves by `stride` after each row.
198 static inline int isHorizDC(uint8_t src[], int stride, PPContext *c)
// Allowed neighbour difference, scaled by the quantiser stored in c->nonBQP.
202 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
203 const int dcThreshold= dcOffset*2 + 1;
205 for(y=0; y<BLOCK_SIZE; y++)
// The unsigned compare folds the two-sided test -dcOffset <= diff <= dcOffset into one:
// a negative sum wraps to a large unsigned value and fails the "<".
207 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
208 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
209 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
210 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
211 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
212 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
213 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
216 return numEq > c->ppMode.flatnessThreshold;
220 * Check if the middle 8x8 Block in the given 8x16 block is flat
// Vertical counterpart of isHorizDC(): counts vertically adjacent pixel pairs whose
// difference is at most dcOffset and reports whether the count exceeds
// c->ppMode.flatnessThreshold. Returns non-zero when the block counts as flat.
// NOTE(review): the declarations of numEq/y and the per-row advance of src are not
// visible in this listing.
222 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
225 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
226 const int dcThreshold= dcOffset*2 + 1;
228 src+= stride*4; // src points to begin of the 8x8 Block
// BLOCK_SIZE-1 iterations: each one compares a row with the row directly below it.
229 for(y=0; y<BLOCK_SIZE-1; y++)
// Same unsigned range-check trick as in isHorizDC(), applied to each of the 8 columns.
231 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
232 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
233 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
234 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
235 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
236 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
237 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
238 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
241 return numEq > c->ppMode.flatnessThreshold;
// Rejects a block (returns 0) when sampled pixel pairs in a row differ by more than 2*QP.
// Each test uses the unsigned trick: (unsigned)(a - b + 2*QP) > 4*QP holds exactly
// when the difference lies outside [-2*QP, 2*QP].
// NOTE(review): the loop, the row advance and the success return are not visible in this
// listing; the (0,7) test below is probably an alternative variant selected by a
// preprocessor conditional - confirm against the full file.
244 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
249 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
251 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
253 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
255 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
260 if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0;
// Vertical range check: returns 0 when the pixel spread within the block is too large
// relative to QP.
// NOTE(review): three variants are visible below (sparse sampling, a row 1 against row 8
// comparison, and a full min/max scan). They are presumably alternatives selected by
// preprocessor conditionals that this listing omits; the success return is also not visible.
267 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
// Variant 1: four pixel pairs per group of 4 columns, each pair 5 rows apart.
273 for(x=0; x<BLOCK_SIZE; x+=4)
275 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
276 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
277 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
278 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
// Variant 2: compares row 1 with row 8 in every column.
283 for(x=0; x<BLOCK_SIZE; x++)
285 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
// Variant 3: tracks min and max over the scanned pixels and rejects when max-min > 2*QP.
292 for(x=0; x<BLOCK_SIZE; x++)
298 int v= src[x + y*stride];
302 if(max-min > 2*QP) return 0;
// Classifies a block for the vertical deblocking filters: first tests flatness with
// isVertDC_C(), then, for flat blocks, the value range with isVertMinMaxOk_C() using c->QP.
// NOTE(review): the values returned for each outcome are not visible in this listing.
308 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
309 if( isVertDC_C(src, stride, c) ){
310 if( isVertMinMaxOk_C(src, stride, c->QP) )
// Default horizontal deblocking filter (C version), processing BLOCK_SIZE rows of dst.
// For each row it measures the "energy" of the step across the block edge and, when
// that is below 8*QP, derives a correction d from the middle and the two side energies.
// NOTE(review): the clamping of d, the write-back to dst[3]/dst[4] and the row advance
// are not visible in this listing.
319 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
322 for(y=0; y<BLOCK_SIZE; y++)
324 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
// Only small steps are treated as blocking artifacts; larger ones are left untouched.
326 if(ABS(middleEnergy) < 8*QP)
328 const int q=(dst[3] - dst[4])/2;
329 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
330 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
// Correction magnitude: how much the edge step exceeds the weaker neighbouring step.
332 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
// Give d the sign opposite to middleEnergy (SIGN() returns -1 for 0).
336 d*= SIGN(-middleEnergy);
357 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
358 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
// Horizontal low-pass filter over BLOCK_SIZE rows; rewrites dst[0..7] of each row in place
// and reads dst[-1] and dst[8] as outer neighbours.
// NOTE(review): the declarations of y and sums[] and the per-row advance of dst are not
// visible in this listing.
360 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
364 for(y=0; y<BLOCK_SIZE; y++)
// Use the outer neighbour as edge value only when it is within QP of the edge pixel;
// otherwise replicate the edge pixel itself.
366 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
367 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// Pairwise sums of adjacent pixels, computed before any dst[] entry is overwritten.
370 sums[0] = first + dst[0];
371 sums[1] = dst[0] + dst[1];
372 sums[2] = dst[1] + dst[2];
373 sums[3] = dst[2] + dst[3];
374 sums[4] = dst[3] + dst[4];
375 sums[5] = dst[4] + dst[5];
376 sums[6] = dst[5] + dst[6];
377 sums[7] = dst[6] + dst[7];
378 sums[8] = dst[7] + last;
// Weighted combinations of the sums; "+ 8 ... >>4" rounds the division by 16.
// dst[1]..dst[6] still hold their original values when read on their own lines,
// because each one is written only by its own statement.
380 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
381 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
382 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
383 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
384 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
385 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
386 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
387 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
394 * Experimental Filter 1 (Horizontal)
395 * will not damage linear gradients
396 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
397 * can only smooth blocks at the expected locations (it can't smooth them if they moved)
398 * MMX2 version does correct clipping; the C version doesn't
399 * not identical with the vertical one
// Experimental horizontal deblocking filter 1. On first use it builds a 256-entry table
// of packed 8-byte correction patterns; it then walks BLOCK_SIZE rows of src and derives
// a per-row correction v from three pixel differences.
// NOTE(review): the guard around the table setup, the table-fill loop header, the
// threshold test on d, the write-back to src and the row advance are not visible here.
401 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
// Function-static table, allocated once and never freed in the visible code.
// NOTE(review): the memalign() result is not checked in the visible lines, and no locking
// around this lazy initialisation is visible.
404 static uint64_t *lut= NULL;
408 lut= (uint64_t*)memalign(8, 256*8);
// Interpret the table index as a signed byte and double it.
411 int v= i < 128 ? 2*i : 2*(i-256);
// NOTE(review): the next four declarations duplicate the names declared further down;
// presumably they are a disabled alternative whose comment delimiters this listing dropped.
413 //Simulate 112242211 9-Tap filter
414 uint64_t a= (v/16) & 0xFF;
415 uint64_t b= (v/8) & 0xFF;
416 uint64_t c= (v/4) & 0xFF;
417 uint64_t d= (3*v/8) & 0xFF;
419 //Simulate piecewise linear interpolation
420 uint64_t a= (v/16) & 0xFF;
421 uint64_t b= (v*3/16) & 0xFF;
422 uint64_t c= (v*5/16) & 0xFF;
423 uint64_t d= (7*v/16) & 0xFF;
// A..D are the byte-wise (mod 256) negations of a..d.
424 uint64_t A= (0x100 - a)&0xFF;
425 uint64_t B= (0x100 - b)&0xFF;
426 uint64_t C= (0x100 - c)&0xFF;
// NOTE(review): D is derived from c, making it equal to C; following the A/B/C pattern
// this looks like a typo for (0x100 - d) - confirm before changing.
427 uint64_t D= (0x100 - c)&0xFF;
// Pack the eight byte-sized corrections into one 64-bit entry: a,b,c,d in the high
// bytes, their negations D,C,B,A in the low bytes.
429 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
430 (D<<24) | (C<<16) | (B<<8) | (A);
431 //lut[i] = (v<<32) | (v<<24);
435 for(y=0; y<BLOCK_SIZE; y++)
// Differences at three positions of the row; the middle one, b, is src[3]-src[4].
437 int a= src[1] - src[2];
438 int b= src[3] - src[4];
439 int c= src[5] - src[6];
// How much the middle step exceeds the average of its neighbours, clamped at 0.
441 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
// Correction with the sign opposite to b (SIGN() returns -1 for 0).
445 int v = d * SIGN(-b);
460 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
// Decide which variants of the template get compiled. With RUNTIME_CPUDETECT every
// condition below is true; otherwise exactly one matches the HAVE_* configuration.
// NOTE(review): only the COMPILE_3DNOW define is visible in this listing; the matching
// defines for the other variants and all #endif lines were dropped.
462 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
468 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
472 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
476 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
477 #define COMPILE_3DNOW
// The template is included once per variant; RENAME() appends a suffix to each name
// passed to it, which is how postProcess_C, postProcess_MMX, postProcess_MMX2 and
// postProcess_3DNow come to exist.
491 #define RENAME(a) a ## _C
492 #include "postprocess_template.c"
502 #define RENAME(a) a ## _MMX
503 #include "postprocess_template.c"
513 #define RENAME(a) a ## _MMX2
514 #include "postprocess_template.c"
524 #define RENAME(a) a ## _3DNow
525 #include "postprocess_template.c"
528 // minor note: the HAVE_xyz macros are messed up after that line, so don't use them
// Dispatches one plane to the template variant matching the CPU: chosen at run time from
// c->cpuCaps when RUNTIME_CPUDETECT is defined, otherwise fixed at compile time by HAVE_*.
// vm and vc are the opaque public handles; they are cast back to PPMode and PPContext.
// NOTE(review): several #if/#else/#endif lines are missing from this listing, which is why
// two consecutive postProcess_C() calls appear below; presumably they belong to different
// preprocessor branches.
530 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
531 QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
533 PPContext *c= (PPContext *)vc;
534 PPMode *ppMode= (PPMode *)vm;
// The mode is copied by value into the context, so the template code reads c->ppMode.
535 c->ppMode= *ppMode; //FIXME
537 // useing ifs here as they are faster than function pointers allthough the
538 // difference wouldnt be messureable here but its much better because
539 // someone might exchange the cpu whithout restarting mplayer ;)
540 #ifdef RUNTIME_CPUDETECT
542 // ordered per speed fasterst first
543 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
544 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
545 else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
546 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
547 else if(c->cpuCaps & PP_CPU_CAPS_MMX)
548 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
550 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
552 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
554 #else //RUNTIME_CPUDETECT
556 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
557 #elif defined (HAVE_3DNOW)
558 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
559 #elif defined (HAVE_MMX)
560 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
562 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
564 #endif //!RUNTIME_CPUDETECT
567 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
568 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
570 /* -pp Command line Help
573 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
574 "long form example:\n"
575 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
576 "short form example:\n"
577 "vb:a/hb:a/lb de,-vb\n"
581 "short long name short long option Description\n"
582 "* * a autoq CPU power dependent enabler\n"
583 " c chrom chrominance filtering enabled\n"
584 " y nochrom chrominance filtering disabled\n"
585 "hb hdeblock (2 threshold) horizontal deblocking filter\n"
586 " 1. difference factor: default=32, higher -> more deblocking\n"
587 " 2. flatness threshold: default=39, lower -> more deblocking\n"
588 " the h & v deblocking filters share these\n"
589 " so you can't set different thresholds for h / v\n"
590 "vb vdeblock (2 threshold) vertical deblocking filter\n"
591 "h1 x1hdeblock experimental h deblock filter 1\n"
592 "v1 x1vdeblock experimental v deblock filter 1\n"
593 "dr dering deringing filter\n"
594 "al autolevels automatic brightness / contrast\n"
595 " f fullyrange stretch luminance to (0..255)\n"
596 "lb linblenddeint linear blend deinterlacer\n"
597 "li linipoldeint linear interpolating deinterlace\n"
598 "ci cubicipoldeint cubic interpolating deinterlacer\n"
599 "md mediandeint median deinterlacer\n"
600 "fd ffmpegdeint ffmpeg deinterlacer\n"
601 "de default hb:a,vb:a,dr:a,al\n"
602 "fa fast h1:a,v1:a,dr:a,al\n"
603 "tn tmpnoise (3 threshold) temporal noise reducer\n"
604 " 1. <= 2. <= 3. larger -> stronger filtering\n"
605 "fq forceQuant <quantizer> force quantizer\n"
// Parses a filter description string such as "vb:a/hb:a/lb" into a newly allocated PPMode.
//   name    - filter list; filters are separated by ',' or '/', options by ':'
//   quality - quality level used for filters that carry the "autoq"/"a" option
// Each recognised filter sets its mask bit in lumMode and/or chromMode; unknown filter
// names and unconsumed options are counted in ppMode->error.
// NOTE(review): many lines (local declarations, loop headers, braces, the return paths)
// are missing from this listing; the comments below describe only what is visible.
608 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
610 char temp[GET_MODE_BUFFER_SIZE];
612 char *filterDelimiters= ",/";
613 char *optionDelimiters= ":";
614 struct PPMode *ppMode;
// NOTE(review): no check of the allocation result is visible before the writes below.
617 ppMode= memalign(8, sizeof(PPMode));
// Defaults that apply before any filter option overrides them.
620 ppMode->chromMode= 0;
621 ppMode->maxTmpNoise[0]= 700;
622 ppMode->maxTmpNoise[1]= 1500;
623 ppMode->maxTmpNoise[2]= 3000;
624 ppMode->maxAllowedY= 234;
625 ppMode->minAllowedY= 16;
626 ppMode->baseDcDiff= 256/8;
627 ppMode->flatnessThreshold= 56-16-1;
628 ppMode->maxClippedThreshold= 0.01;
// Work on a private copy because strtok() modifies the string it parses.
// NOTE(review): strncpy() leaves temp without a terminating NUL when name has
// GET_MODE_BUFFER_SIZE or more characters; no explicit termination is visible here.
631 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
633 if(verbose>1) printf("pp: %s\n", name);
// q is the effective quality for the current filter: effectively unlimited unless the
// filter carries the autoq option, in which case the caller's quality is used.
637 int q= 1000000; //PP_QUALITY_MAX;
640 char *options[OPTIONS_ARRAY_SIZE];
643 int numOfUnknownOptions=0;
644 int enable=1; //does the user want us to enabled or disabled the filter
// Split off the next filter. The option-level strtok() call below restarts strtok()'s
// internal state on filterToken, so the position in temp is tracked manually through p.
646 filterToken= strtok(p, filterDelimiters);
647 if(filterToken == NULL) break;
648 p+= strlen(filterToken) + 1; // p points to next filterToken
649 filterName= strtok(filterToken, optionDelimiters);
650 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
// A leading '-' requests disabling the filter (the handling lines are not visible).
652 if(*filterName == '-')
658 for(;;){ //for all options
659 option= strtok(NULL, optionDelimiters);
660 if(option == NULL) break;
662 if(verbose>1) printf("pp: option: %s\n", option);
// Generic options are consumed here; anything else is stored for the filter-specific
// handling further down and provisionally counted as unknown.
663 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
664 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
665 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
668 options[numOfUnknownOptions] = option;
669 numOfUnknownOptions++;
// Stop one short of the array size so the NULL terminator below always fits.
671 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
673 options[numOfUnknownOptions] = NULL;
675 /* replace stuff from the replace Table */
676 for(i=0; replaceTable[2*i]!=NULL; i++)
678 if(!strcmp(replaceTable[2*i], filterName))
680 int newlen= strlen(replaceTable[2*i + 1]);
// Re-join the unparsed remainder: start an empty remainder when this was the last
// filter, otherwise turn the separator before p back into ','.
684 if(p==NULL) p= temp, *p=0; //last filter
685 else p--, *p=','; //not last filter
// Make sure the expansion plus the remainder still fits into temp.
688 spaceLeft= p - temp + plen;
689 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
// Shift the remainder (including its NUL) right and copy the expansion in front of it,
// so the expanded filters are parsed by the following iterations.
694 memmove(p + newlen, p, plen+1);
695 memcpy(p, replaceTable[2*i + 1], newlen);
// Look the name up in the filter table; both the long and the short name match.
700 for(i=0; filters[i].shortName!=NULL; i++)
702 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
703 if( !strcmp(filters[i].longName, filterName)
704 || !strcmp(filters[i].shortName, filterName))
// Clear the filter's bit first so that a later mention overrides an earlier one.
706 ppMode->lumMode &= ~filters[i].mask;
707 ppMode->chromMode &= ~filters[i].mask;
710 if(!enable) break; // user wants to disable it
// Enable per plane only when the effective quality reaches the filter's minimum.
712 if(q >= filters[i].minLumQuality)
713 ppMode->lumMode|= filters[i].mask;
// chrom==1 forces chroma filtering; chrom==-1 falls back to the filter's default.
714 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
715 if(q >= filters[i].minChromQuality)
716 ppMode->chromMode|= filters[i].mask;
// Filter-specific options: each one consumed here is taken back off the unknown count.
718 if(filters[i].mask == LEVEL_FIX)
721 ppMode->minAllowedY= 16;
722 ppMode->maxAllowedY= 234;
723 for(o=0; options[o]!=NULL; o++)
725 if( !strcmp(options[o],"fullyrange")
726 ||!strcmp(options[o],"f"))
728 ppMode->minAllowedY= 0;
729 ppMode->maxAllowedY= 255;
730 numOfUnknownOptions--;
734 else if(filters[i].mask == TEMP_NOISE_FILTER)
// Up to three numeric thresholds, stored in order of appearance.
739 for(o=0; options[o]!=NULL; o++)
742 ppMode->maxTmpNoise[numOfNoises]=
743 strtol(options[o], &tail, 0);
747 numOfUnknownOptions--;
748 if(numOfNoises >= 3) break;
752 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
// First numeric option sets baseDcDiff, second sets flatnessThreshold; both deblock
// filters write the same two fields.
756 for(o=0; options[o]!=NULL && o<2; o++)
759 int val= strtol(options[o], &tail, 0);
// strtol() consumed nothing: not a number, stop parsing numeric options.
760 if(tail==options[o]) break;
762 numOfUnknownOptions--;
763 if(o==0) ppMode->baseDcDiff= val;
764 else ppMode->flatnessThreshold= val;
767 else if(filters[i].mask == FORCE_QUANT)
// Default forced quantiser, replaced by the first numeric option when present.
770 ppMode->forcedQuant= 15;
772 for(o=0; options[o]!=NULL && o<1; o++)
775 int val= strtol(options[o], &tail, 0);
776 if(tail==options[o]) break;
778 numOfUnknownOptions--;
779 ppMode->forcedQuant= val;
// An unrecognised filter name and every option nobody consumed each count as one error.
784 if(!filterNameOk) ppMode->error++;
785 ppMode->error += numOfUnknownOptions;
788 if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
791 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
// Disposes of a mode object.
// NOTE(review): the body is not visible in this listing; presumably it frees `mode`
// as obtained from pp_get_mode_by_name_and_quality() - confirm.
798 void pp_free_mode(pp_mode_t *mode){
// Replaces *p with a newly allocated block of `size` bytes at the requested alignment.
// NOTE(review): freeing of the previous block and any clearing of the new one are not
// visible in this listing; the allocation result is not checked in the visible line.
802 static void reallocAlign(void **p, int alignment, int size){
804 *p= memalign(alignment, size);
// (Re)allocates every work buffer of the context for the given picture size and strides.
//   width, height - picture size in pixels
//   stride        - line size used for the temporary picture buffers
//   qpStride      - line size of the quantiser tables
// NOTE(review): the loop headers around the yHistogram and tempBlured lines and the
// assignment of c->stride are not visible in this listing.
808 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
// Picture size in 16x16 macroblocks, rounded up.
809 int mbWidth = (width+15)>>4;
810 int mbHeight= (height+15)>>4;
814 c->qpStride= qpStride;
816 reallocAlign((void **)&c->tempDst, 8, stride*24);
817 reallocAlign((void **)&c->tempSrc, 8, stride*24);
818 reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
819 reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
// Seeds a histogram bin with a value proportional to the picture area.
821 c->yHistogram[i]= width*height/64*15/256;
825 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
826 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
827 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
830 reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
// Full-picture QP tables, qpStride entries per macroblock row.
831 reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
832 reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
// The forced-QP table holds a single macroblock row only.
833 reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
// Fills clip_table: the first 256 entries are zeroed, entries 256..511 are set by the
// loop, and the last 256 entries are zeroed as well.
// NOTE(review): the loop body is not visible in this listing. For a clamping table one
// would expect the upper range to hold 255 rather than 0 - confirm the intent.
836 static void global_init(void){
838 memset(clip_table, 0, 256);
839 for(i=256; i<512; i++)
841 memset(clip_table+512, 0, 256);
// Allocates and initialises a postprocessing context for pictures of the given size.
//   cpuCaps - flag word; when PP_FORMAT is set it also carries the chroma subsampling
// NOTE(review): the return statement, the storing of cpuCaps and any call to
// global_init() are not visible in this listing; the allocation result is not checked
// in the visible lines before memset() writes through it.
844 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
845 PPContext *c= memalign(32, sizeof(PPContext));
// Initial guesses; pp_postprocess() reallocates when the real strides are larger.
846 int stride= (width+15)&(~15); //assumed / will realloc if needed
847 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
851 memset(c, 0, sizeof(PPContext));
// With PP_FORMAT set, bits 0-1 give the horizontal and bits 4-5 the vertical chroma
// shift; otherwise both default to 1.
853 if(cpuCaps&PP_FORMAT){
854 c->hChromaSubSample= cpuCaps&0x3;
855 c->vChromaSubSample= (cpuCaps>>4)&0x3;
857 c->hChromaSubSample= 1;
858 c->vChromaSubSample= 1;
861 reallocBuffers(c, width, height, stride, qpStride);
// Releases the buffers owned by a context and wipes the context structure.
// NOTE(review): several lines between the loops and the two free() calls below are
// missing from this listing, so the frees of the remaining buffers (for example
// stdQPTable) and of the context itself cannot be confirmed from here.
868 void pp_free_context(void *vc){
869 PPContext *c = (PPContext*)vc;
872 for(i=0; i<3; i++) free(c->tempBlured[i]);
873 for(i=0; i<3; i++) free(c->tempBluredPast[i]);
881 free(c->nonBQPTable);
882 free(c->forcedQPTable);
// Clear all pointers so stale values cannot be reused by accident.
884 memset(c, 0, sizeof(PPContext));
// Public entry point: postprocesses one picture of three planes.
//   src/dst, srcStride/dstStride - plane pointers and line sizes (index 0 is luma)
//   width, height                - luma size in pixels
//   QP_store, QPStride           - per-macroblock quantisers, may be NULL
//   vm, vc                       - mode and context handles
//   pict_type                    - picture type, optionally with PP_PICT_TYPE_QP2 set
// NOTE(review): several conditions, braces and local declarations are missing from this
// listing; comments describe only the visible statements.
889 void pp_postprocess(uint8_t * src[3], int srcStride[3],
890 uint8_t * dst[3], int dstStride[3],
891 int width, int height,
892 QP_STORE_T *QP_store, int QPStride,
893 pp_mode_t *vm, void *vc, int pict_type)
895 int mbWidth = (width+15)>>4;
896 int mbHeight= (height+15)>>4;
897 PPMode *mode = (PPMode*)vm;
898 PPContext *c = (PPContext*)vc;
899 int minStride= MAX(srcStride[0], dstStride[0]);
// Grow the work buffers when this picture needs larger strides than the context has.
901 if(c->stride < minStride || c->qpStride < QPStride)
902 reallocBuffers(c, width, height,
903 MAX(minStride, c->stride),
904 MAX(c->qpStride, QPStride));
// Without a QP table, or with forcequant enabled, use the one-row forced table: filled
// with the forced quantiser, or with 1 when none was forced.
// NOTE(review): forcedQPTable has only mbWidth entries; the code that presumably adjusts
// QPStride for this case is not visible here.
906 if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
909 QP_store= c->forcedQPTable;
911 if(mode->lumMode & FORCE_QUANT)
912 for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
914 for(i=0; i<mbWidth; i++) QP_store[i]= 1;
916 //printf("pict_type:%d\n", pict_type);
// PP_PICT_TYPE_QP2: store halved quantisers in stdQPTable and use that table from here on.
918 if(pict_type & PP_PICT_TYPE_QP2){
920 const int count= mbHeight * QPStride;
// Four entries per step via 32-bit words; the mask clears the bit shifted in from the
// neighbouring byte.
// NOTE(review): assumes QP_STORE_T is one byte wide and that both tables may be accessed
// as uint32_t (alignment and aliasing) - confirm.
921 for(i=0; i<(count>>2); i++){
922 ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
// Remaining entries that do not fill a whole 32-bit word.
924 for(i<<=2; i<count; i++){
925 c->stdQPTable[i] = QP_store[i]>>1;
927 QP_store= c->stdQPTable;
// Dump of the QP table, one macroblock row per pass of the outer loop.
932 for(y=0; y<mbHeight; y++){
933 for(x=0; x<mbWidth; x++){
934 printf("%2d ", QP_store[x + y*QPStride]);
// Copy the quantisers, masked to 5 bits, into nonBQPTable.
// NOTE(review): the guarding condition is not visible; the table name suggests it is
// refreshed only for non-B pictures - confirm.
944 const int count= mbHeight * QPStride;
945 for(i=0; i<(count>>2); i++){
946 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x1F1F1F1F;
948 for(i<<=2; i<count; i++){
949 c->nonBQPTable[i] = QP_store[i] & 0x1F;
955 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
// Luma plane (isColor = 0).
958 postProcess(src[0], srcStride[0], dst[0], dstStride[0],
959 width, height, QP_store, QPStride, 0, mode, c);
// Chroma plane size follows from the subsampling shifts stored in the context.
961 width = (width )>>c->hChromaSubSample;
962 height = (height)>>c->vChromaSubSample;
// Chroma planes are either filtered (isColor = 1 and 2) or copied unchanged.
// NOTE(review): the condition selecting between the two is not visible here.
966 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
967 width, height, QP_store, QPStride, 1, mode, c);
968 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
969 width, height, QP_store, QPStride, 2, mode, c);
// Equal strides: each chroma plane can be copied with a single memcpy().
971 else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
973 memcpy(dst[1], src[1], srcStride[1]*height);
974 memcpy(dst[2], src[2], srcStride[2]*height);
// Different strides: copy line by line, `width` bytes per line.
979 for(y=0; y<height; y++)
981 memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
982 memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);