/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/**
 * @file
 * High quality image resampling with polyphase filters.
 */
/* NOTE(review): this extract is missing lines; only comments were added below. */
29 #include "fastmemcpy.h"
/* number of color components resampled per picture (Y, Cb, Cr) */
32 #define NB_COMPONENTS 3
/* number of polyphase filter phases; PHASE_BITS is defined on a line not visible here */
35 #define NB_PHASES (1 << PHASE_BITS)
37 #define FCENTER 1 /* index of the center of the filter */
38 //#define TEST 1 /* Test it */
/* fixed-point source position format: 16 fractional bits */
40 #define POS_FRAC_BITS 16
41 #define POS_FRAC (1 << POS_FRAC_BITS)
42 /* 6 bits precision is needed for MMX */
/* height (in lines) of the horizontal-resample ring buffer */
45 #define LINE_BUF_HEIGHT (NB_TAPS * 4)
/* Per-instance resampling state.
   NOTE(review): this extract is incomplete — the struct's closing brace and
   further fields (line buffer, h/v increments) are not visible here. */
47 struct ImgReSampleContext {
/* input/output dimensions and the bands cropped from the input edges */
48 int iwidth, iheight, owidth, oheight, topBand, bottomBand, leftBand, rightBand;
/* one NB_TAPS-coefficient filter per phase; 8-byte aligned for the MMX paths */
50 int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
51 int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
55 static inline int get_phase(int pos)
57 return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
60 /* This function must be optimized */
/* Fast horizontal resample: walks the source in POS_FRAC fixed point and
   applies the NB_TAPS polyphase filter selected by the fractional phase.
   Only valid while every tap read stays inside [0, src_width - NB_TAPS];
   boundary positions are handled by h_resample_slow().
   NOTE(review): this extract is missing lines; comments only added. */
61 static void h_resample_fast(uint8_t *dst, int dst_width, uint8_t *src, int src_width,
62 int src_start, int src_incr, int16_t *filters)
64 int src_pos, phase, sum, i;
69 for(i=0;i<dst_width;i++) {
/* TEST-build sanity check: the fast path must never read past the last
   full NB_TAPS window */
72 if ((src_pos >> POS_FRAC_BITS) < 0 ||
73 (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
/* integer part of the position selects the first source tap */
76 s = src + (src_pos >> POS_FRAC_BITS);
/* fractional part selects which of the NB_PHASES filters to apply */
77 phase = get_phase(src_pos);
78 filter = filters + phase * NB_TAPS;
/* unrolled multiply-accumulate (NB_TAPS == 4 case) */
80 sum = s[0] * filter[0] +
/* generic NB_TAPS fallback */
88 for(j=0;j<NB_TAPS;j++)
89 sum += s[j] * filter[j];
/* drop the FILTER_BITS fixed-point scaling of the coefficients */
92 sum = sum >> FILTER_BITS;
103 /* This function must be optimized */
/* Vertical resample of one output line: for each column, combine NB_TAPS
   source lines spaced `wrap` bytes apart using one phase's filter.
   NOTE(review): this extract is missing lines; comments only added. */
104 static void v_resample(uint8_t *dst, int dst_width, uint8_t *src, int wrap,
111 for(i=0;i<dst_width;i++) {
/* unrolled 4-tap multiply-accumulate down the column */
113 sum = s[0 * wrap] * filter[0] +
114 s[1 * wrap] * filter[1] +
115 s[2 * wrap] * filter[2] +
116 s[3 * wrap] * filter[3];
/* generic NB_TAPS fallback */
123 for(j=0;j<NB_TAPS;j++) {
124 sum += s1[0] * filter[j];
/* remove the FILTER_BITS coefficient scaling */
129 sum = sum >> FILTER_BITS;
142 #include "i386/mmx.h"
/* FILTER4(reg): MMX helper used by h_resample_fast4_mmx(). Loads the 4
   source bytes at the current fixed-point position, widens them to 16 bit
   (punpcklbw against a zeroed mm7), multiply-accumulates against the
   4-tap filter of the current phase (pmaddwd), shifts the 32-bit result
   right by FILTER_BITS, and advances src_pos by src_incr.
   NOTE(review): several continuation lines of this macro are missing
   from this extract. */
144 #define FILTER4(reg) \
146 s = src + (src_pos >> POS_FRAC_BITS);\
147 phase = get_phase(src_pos);\
148 filter = filters + phase * NB_TAPS;\
150 punpcklbw_r2r(mm7, reg);\
151 movq_m2r(*filter, mm6);\
152 pmaddwd_r2r(reg, mm6);\
155 paddd_r2r(mm6, reg);\
156 psrad_i2r(FILTER_BITS, reg);\
157 src_pos += src_incr;\
/* debug helper: store an MMX register to memory and print it as 64-bit hex */
160 #define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
162 /* XXX: do four pixels at a time */
/* MMX horizontal fast path: same contract as h_resample_fast(), computes
   each pixel with the FILTER4 macro.
   NOTE(review): the loop bodies are largely missing from this extract. */
163 static void h_resample_fast4_mmx(uint8_t *dst, int dst_width, uint8_t *src, int src_width,
164 int src_start, int src_incr, int16_t *filters)
/* main loop: 4 output pixels per iteration */
174 while (dst_width >= 4) {
/* saturate the four 32-bit accumulators back down to unsigned bytes */
181 packuswb_r2r(mm7, mm0);
182 packuswb_r2r(mm7, mm1);
183 packuswb_r2r(mm7, mm3);
184 packuswb_r2r(mm7, mm2);
/* tail loop: remaining 1-3 pixels, one at a time */
196 while (dst_width > 0) {
198 packuswb_r2r(mm7, mm0);
/* MMX vertical resample, 4-tap case: processes 4 columns per iteration,
   then finishes the remaining columns with scalar code.
   NOTE(review): this extract is missing lines; comments only added. */
207 static void v_resample4_mmx(uint8_t *dst, int dst_width, uint8_t *src, int wrap,
225 while (dst_width >= 4) {
/* load 4+ bytes from each of the 4 source lines and widen to 16 bit */
226 movq_m2r(s[0 * wrap], mm0);
227 punpcklbw_r2r(mm7, mm0);
228 movq_m2r(s[1 * wrap], mm1);
229 punpcklbw_r2r(mm7, mm1);
230 movq_m2r(s[2 * wrap], mm2);
231 punpcklbw_r2r(mm7, mm2);
232 movq_m2r(s[3 * wrap], mm3);
233 punpcklbw_r2r(mm7, mm3);
/* per-line multiply by the (pre-broadcast) filter coefficients */
235 pmullw_m2r(coefs[0], mm0);
236 pmullw_m2r(coefs[1], mm1);
237 pmullw_m2r(coefs[2], mm2);
238 pmullw_m2r(coefs[3], mm3);
/* scale back by FILTER_BITS, saturate to bytes, store 4 output pixels */
243 psraw_i2r(FILTER_BITS, mm0);
245 packuswb_r2r(mm7, mm0);
248 *(uint32_t *)dst = tmp.ud[0];
/* scalar tail for the last <4 columns */
253 while (dst_width > 0) {
254 sum = s[0 * wrap] * filter[0] +
255 s[1 * wrap] * filter[1] +
256 s[2 * wrap] * filter[2] +
257 s[3 * wrap] * filter[3];
258 sum = sum >> FILTER_BITS;
/* Union overlays used to move data between scalar and AltiVec views.
   NOTE(review): the surrounding union/typedef declarations are missing
   from this extract; comments only added below. */
274 vector unsigned char v;
279 vector signed short v;
/* AltiVec vertical resample, 4-tap case: 16 columns per iteration. */
283 void v_resample16_altivec(uint8_t *dst, int dst_width, uint8_t *src, int wrap,
288 vector unsigned char *tv, tmp, dstv, zero;
289 vec_ss_t srchv[4], srclv[4], fv[4];
290 vector signed short zeros, sumhv, sumlv;
296 The vec_madds later on does an implicit >>15 on the result.
297 Since FILTER_BITS is 8, and we have 15 bits of magnitude in
298 a signed short, we have just enough bits to pre-shift our
299 filter constants <<7 to compensate for vec_madds.
/* broadcast each pre-shifted filter coefficient across a vector */
301 fv[i].s[0] = filter[i] << (15-FILTER_BITS);
302 fv[i].v = vec_splat(fv[i].v, 0);
305 zero = vec_splat_u8(0);
306 zeros = vec_splat_s16(0);
310 When we're resampling, we'd ideally like both our input buffers,
311 and output buffers to be 16-byte aligned, so we can do both aligned
312 reads and writes. Sadly we can't always have this at the moment, so
313 we opt for aligned writes, as unaligned writes have a huge overhead.
314 To do this, do enough scalar resamples to get dst 16-byte aligned.
/* number of leading pixels to do in scalar code so dst becomes 16-byte aligned */
316 i = (-(int)dst) & 0xf;
318 sum = s[0 * wrap] * filter[0] +
319 s[1 * wrap] * filter[1] +
320 s[2 * wrap] * filter[2] +
321 s[3 * wrap] * filter[3];
322 sum = sum >> FILTER_BITS;
323 if (sum<0) sum = 0; else if (sum>255) sum=255;
331 /* Do our altivec resampling on 16 pixels at once. */
332 while(dst_width>=16) {
334 Read 16 (potentially unaligned) bytes from each of
335 4 lines into 4 vectors, and split them into shorts.
336 Interleave the multiply/accumulate for the resample
337 filter with the loads to hide the 3 cycle latency
340 tv = (vector unsigned char *) &s[0 * wrap];
/* NOTE(review): `i * wrap` here is inconsistent with the other three
   lines, which use their own fixed line index (`0 * wrap` would be
   expected) — verify against the upstream source. */
341 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
342 srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
343 srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
344 sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
345 sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
347 tv = (vector unsigned char *) &s[1 * wrap];
348 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
349 srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
350 srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
351 sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
352 sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
354 tv = (vector unsigned char *) &s[2 * wrap];
355 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
356 srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
357 srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
358 sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
359 sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
361 tv = (vector unsigned char *) &s[3 * wrap];
362 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
363 srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
364 srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
365 sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
366 sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
369 Pack the results into our destination vector,
370 and do an aligned write of that back to memory.
372 dstv = vec_packsu(sumhv, sumlv) ;
373 vec_st(dstv, 0, (vector unsigned char *) dst);
381 If there are any leftover pixels, resample them
382 with the slow scalar method.
385 sum = s[0 * wrap] * filter[0] +
386 s[1 * wrap] * filter[1] +
387 s[2 * wrap] * filter[2] +
388 s[3 * wrap] * filter[3];
389 sum = sum >> FILTER_BITS;
390 if (sum<0) sum = 0; else if (sum>255) sum=255;
399 /* slow version to handle limit cases. Does not need optimisation */
/* Boundary-safe horizontal resample: each tap address is clamped to
   [src, src_end) before reading, so it is correct at both image edges.
   NOTE(review): this extract is missing lines; comments only added. */
400 static void h_resample_slow(uint8_t *dst, int dst_width, uint8_t *src, int src_width,
401 int src_start, int src_incr, int16_t *filters)
403 int src_pos, phase, sum, j, v, i;
404 uint8_t *s, *src_end;
407 src_end = src + src_width;
409 for(i=0;i<dst_width;i++) {
410 s = src + (src_pos >> POS_FRAC_BITS);
411 phase = get_phase(src_pos);
412 filter = filters + phase * NB_TAPS;
/* per-tap loop with edge replication (v clamped at the boundaries) */
414 for(j=0;j<NB_TAPS;j++) {
417 else if (s >= src_end)
421 sum += v * filter[j];
/* remove the FILTER_BITS coefficient scaling */
424 sum = sum >> FILTER_BITS;
/* Horizontal resample dispatcher: runs h_resample_slow() for the pixels
   whose filter window would fall off the left or right image edge, and
   the fast (scalar or MMX) path for the safe middle region.
   NOTE(review): this extract is missing lines; comments only added. */
435 static void h_resample(uint8_t *dst, int dst_width, uint8_t *src, int src_width,
436 int src_start, int src_incr, int16_t *filters)
/* number of leading pixels whose window starts before the image */
441 n = (0 - src_start + src_incr - 1) / src_incr;
442 h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
445 src_start += n * src_incr;
/* find where the remaining span would overrun the last full tap window */
447 src_end = src_start + dst_width * src_incr;
448 if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
449 n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
/* fast middle region: MMX 4-tap path if available, else plain C */
455 if ((mm_flags & MM_MMX) && NB_TAPS == 4)
456 h_resample_fast4_mmx(dst, n,
457 src, src_width, src_start, src_incr, filters);
460 h_resample_fast(dst, n,
461 src, src_width, src_start, src_incr, filters);
/* trailing edge pixels via the boundary-safe slow path */
465 src_start += n * src_incr;
466 h_resample_slow(dst, dst_width,
467 src, src_width, src_start, src_incr, filters);
/* Resample one image component: horizontally filters input lines into a
   ring buffer of already-resampled lines, then vertically filters
   NB_TAPS of those lines to produce each output line.
   NOTE(review): this extract is missing lines; comments only added. */
471 static void component_resample(ImgReSampleContext *s,
472 uint8_t *output, int owrap, int owidth, int oheight,
473 uint8_t *input, int iwrap, int iwidth, int iheight)
475 int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
476 uint8_t *new_line, *src_line;
478 last_src_y = - FCENTER - 1;
479 /* position of the bottom of the filter in the source image */
480 src_y = (last_src_y + NB_TAPS) * POS_FRAC;
481 ring_y = NB_TAPS; /* position in ring buffer */
482 for(y=0;y<oheight;y++) {
483 /* apply horizontal filter on new lines from input if needed */
484 src_y1 = src_y >> POS_FRAC_BITS;
485 while (last_src_y < src_y1) {
486 if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
489 /* handle limit conditions : replicate line (slightly
490 inefficient because we filter multiple times) */
494 } else if (y1 >= iheight) {
497 src_line = input + y1 * iwrap;
498 new_line = s->line_buf + ring_y * owidth;
499 /* apply filter and handle limit cases correctly */
500 h_resample(new_line, owidth,
501 src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
502 &s->h_filters[0][0]);
503 /* handle ring buffer wrapping: mirror the line into the low region */
504 if (ring_y >= LINE_BUF_HEIGHT) {
505 memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
509 /* apply vertical filter */
510 phase_y = get_phase(src_y)
512 /* MMX path deactivated ("&& 0") because of loss of precision */
513 if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
514 v_resample4_mmx(output, owidth,
515 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
516 &s->v_filters[phase_y][0]);
520 if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS <= 6)
521 v_resample16_altivec(output, owidth,
522 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
523 &s->v_filters[phase_y][0]);
526 v_resample(output, owidth,
527 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
528 &s->v_filters[phase_y][0]);
535 /* XXX: the following filter is quite naive, but it seems to suffice
/* Build the NB_PHASES x NB_TAPS polyphase filter bank for a given scale
   factor; coefficients are fixed point scaled by (1 << FILTER_BITS).
   NOTE(review): this extract is missing lines; comments only added. */
537 static void build_filter(int16_t *filter, float factor)
540 float x, y, tab[NB_TAPS], norm, mult;
542 /* if upsampling, only need to interpolate, no filter */
546 for(ph=0;ph<NB_PHASES;ph++) {
548 for(i=0;i<NB_TAPS;i++) {
/* tap position relative to the filter center, offset by this phase */
550 x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
559 /* normalize so that an uniform color remains the same */
560 mult = (float)(1 << FILTER_BITS) / norm;
561 for(i=0;i<NB_TAPS;i++) {
562 v = (int)(tab[i] * mult);
563 filter[ph * NB_TAPS + i] = v;
568 ImgReSampleContext *img_resample_init(int owidth, int oheight,
569 int iwidth, int iheight)
571 return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0);
/* Create a resampling context with bands cropped from the input edges;
   builds the horizontal/vertical polyphase filter banks and the
   fixed-point position increments.
   NOTE(review): this extract is missing lines (allocation-failure
   handling and the return statement are not visible); comments only. */
574 ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
575 int iwidth, int iheight,
576 int topBand, int bottomBand,
577 int leftBand, int rightBand)
579 ImgReSampleContext *s;
581 s = av_mallocz(sizeof(ImgReSampleContext));
/* ring buffer of horizontally-resampled lines used by component_resample() */
584 s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
589 s->oheight = oheight;
591 s->iheight = iheight;
592 s->topBand = topBand;
593 s->bottomBand = bottomBand;
594 s->leftBand = leftBand;
595 s->rightBand = rightBand;
/* fixed-point source step per output pixel/line (POS_FRAC scale) */
597 s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / owidth;
598 s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / oheight;
/* filter factor is output/input size ratio of the cropped region */
600 build_filter(&s->h_filters[0][0], (float) owidth / (float) (iwidth - leftBand - rightBand));
601 build_filter(&s->v_filters[0][0], (float) oheight / (float) (iheight - topBand - bottomBand));
/* Resample a full picture: component 0 (luma) at full size, the other
   components (chroma) at half size in each dimension (shift = 1),
   consistent with 4:2:0 subsampling as the shift logic shows.
   NOTE(review): this extract is missing lines; comments only added. */
609 void img_resample(ImgReSampleContext *s,
610 AVPicture *output, AVPicture *input)
/* luma uses full resolution, chroma planes are halved */
615 shift = (i == 0) ? 0 : 1;
616 component_resample(s, output->data[i], output->linesize[i],
617 s->owidth >> shift, s->oheight >> shift,
/* skip the cropped top band and left band inside the input plane */
618 input->data[i] + (input->linesize[i] * (s->topBand >> shift)) + (s->leftBand >> shift),
619 input->linesize[i], ((s->iwidth - s->leftBand - s->rightBand) >> shift),
620 (s->iheight - s->topBand - s->bottomBand) >> shift);
624 void img_resample_close(ImgReSampleContext *s)
626 av_free(s->line_buf);
/* Allocate `size` bytes and zero them; returns NULL if the underlying
 * malloc fails. The original unconditionally memset the result, which
 * dereferences NULL on allocation failure — guard it. */
void *av_mallocz(int size)
{
    void *ptr = malloc(size);
    if (ptr)
        memset(ptr, 0, size);
    return ptr;
}
/* Release memory obtained from av_mallocz(). free(NULL) is defined as a
 * no-op by the C standard (C99 7.20.3.2), so no NULL guard is needed —
 * the original's own comment already noted the test was unnecessary. */
void av_free(void *ptr)
{
    free(ptr);
}
/* Static buffers for the TEST build: source image and two resample
   outputs (img2 is used to cross-check the MMX path against C).
   NOTE(review): XSIZE/YSIZE/XSIZE1/YSIZE1 are defined on lines missing
   from this extract. */
650 uint8_t img[XSIZE * YSIZE];
655 uint8_t img1[XSIZE1 * YSIZE1];
656 uint8_t img2[XSIZE1 * YSIZE1];
/* Write an 8-bit grayscale image as a binary PGM (P5) file.
 * Fixes over the original: check fopen() for failure instead of
 * dereferencing a NULL stream, and open with "wb" so the binary pixel
 * payload is not mangled by newline translation in text mode. */
void save_pgm(const char *filename, uint8_t *img, int xsize, int ysize)
{
    FILE *f = fopen(filename, "wb");
    if (!f) {
        perror(filename);
        return;
    }
    /* P5 header: magic, dimensions, maximum gray value */
    fprintf(f, "P5\n%d %d\n%d\n", xsize, ysize, 255);
    fwrite(img, 1, xsize * ysize, f);
    fclose(f);
}
/* Print the filter bank: one row of NB_TAPS coefficients per phase,
   converted back to floating point (coefficients are scaled by 256,
   i.e. 1 << FILTER_BITS for FILTER_BITS == 8).
   NOTE(review): this extract is missing lines; comments only added. */
667 static void dump_filter(int16_t *filter)
671 for(ph=0;ph<NB_PHASES;ph++) {
673 for(i=0;i<NB_TAPS;i++) {
674 printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
/* TEST-build driver: synthesizes a test pattern, resamples it at several
   factors writing PGM outputs to /tmp, then cross-checks the MMX path
   against the C path.
   NOTE(review): this extract is missing lines; comments only added. */
684 int main(int argc, char **argv)
686 int x, y, v, i, xsize, ysize;
687 ImgReSampleContext *s;
688 float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
691 /* build test image */
692 for(y=0;y<YSIZE;y++) {
693 for(x=0;x<XSIZE;x++) {
694 if (x < XSIZE/2 && y < YSIZE/2) {
695 if (x < XSIZE/4 && y < YSIZE/4) {
701 } else if (x < XSIZE/4) {
/* NOTE(review): `y < XSIZE/4` compares y against XSIZE, not YSIZE —
   harmless if XSIZE == YSIZE, but verify intent */
706 } else if (y < XSIZE/4) {
718 if (((x+3) % 4) <= 1 &&
725 } else if (x < XSIZE/2) {
726 v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
/* NOTE(review): same XSIZE/YSIZE mix on the y comparison — verify */
727 } else if (y < XSIZE/2) {
728 v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
730 v = ((x + y - XSIZE) * 255) / XSIZE;
/* NOTE(review): for y == 0 this index is (YSIZE)*XSIZE + ..., i.e. one
   row past the end of img[] — looks out of bounds; verify */
732 img[(YSIZE - y) * XSIZE + (XSIZE - x)] = v;
735 save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
/* resample at each factor, cropping 50 lines top and bottom */
736 for(i=0;i<sizeof(factors)/sizeof(float);i++) {
738 xsize = (int)(XSIZE * fact);
739 ysize = (int)((YSIZE - 100) * fact);
740 s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE, 50 ,50, 0, 0);
741 printf("Factor=%0.2f\n", fact);
742 dump_filter(&s->h_filters[0][0]);
743 component_resample(s, img1, xsize, xsize, ysize,
744 img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
745 img_resample_close(s);
747 sprintf(buf, "/tmp/out%d.pgm", i);
748 save_pgm(buf, img1, xsize, ysize);
/* run the same resample twice (MMX on/off toggled on missing lines)
   and compare the outputs byte for byte */
753 printf("MMX test\n");
755 xsize = (int)(XSIZE * fact);
756 ysize = (int)(YSIZE * fact);
758 s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
759 component_resample(s, img1, xsize, xsize, ysize,
760 img, XSIZE, XSIZE, YSIZE);
763 s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
764 component_resample(s, img2, xsize, xsize, ysize,
765 img, XSIZE, XSIZE, YSIZE);
766 if (memcmp(img1, img2, xsize * ysize) != 0) {
767 fprintf(stderr, "mmx error\n");