1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
5 * libpng version 1.2.5 - October 3, 2002
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2002 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
22 * [runtime MMX configuration, GRR 20010102]
29 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
31 static int mmx_supported=2;
37 int mmx_supported_local = 0;
39 push ebx //CPUID will trash these
43 pushfd //Save Eflag to stack
44 pop eax //Get Eflag from stack into eax
45 mov ecx, eax //Make another copy of Eflag in ecx
46 xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
47 push eax //Save modified Eflag back to stack
49 popfd //Restored modified value back to Eflag reg
50 pushfd //Save Eflag to stack
51 pop eax //Get Eflag from stack
52 push ecx // save original Eflag to stack
53 popfd // restore original Eflag
54 xor eax, ecx //Compare the new Eflag with the original Eflag
55 jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
56 //skip following instructions and jump to
59 xor eax, eax //Set eax to zero
61 _asm _emit 0x0f //CPUID instruction (two bytes opcode)
64 cmp eax, 1 //make sure eax return non-zero value
65 jl NOT_SUPPORTED //If eax is zero, mmx not supported
67 xor eax, eax //set eax to zero
68 inc eax //Now increment eax to 1. This instruction is
69 //faster than the instruction "mov eax, 1"
71 _asm _emit 0x0f //CPUID instruction
74 and edx, 0x00800000 //mask out all bits but mmx bit(24)
75 cmp edx, 0 // 0 = mmx not supported
76 jz NOT_SUPPORTED // non-zero = Yes, mmx IS supported
78 mov mmx_supported_local, 1 //set return value to 1
81 mov eax, mmx_supported_local //move return value to eax
82 pop edx //CPUID trashed these
87 //mmx_supported_local=0; // test code for force don't support MMX
88 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
90 mmx_supported = mmx_supported_local;
91 return mmx_supported_local;
94 /* Combines the row recently read in with the previous row.
95 This routine takes care of alpha and transparency if requested.
96 This routine also handles the two methods of progressive display
97 of interlaced images, depending on the mask value.
98 The mask value describes which pixels are to be combined with
99 the row. The pattern always repeats every 8 pixels, so just 8
100 bits are needed. A one indicates the pixel is to be combined; a
101 zero indicates the pixel is to be skipped. This is in addition
102 to any alpha or transparency value associated with the pixel. If
103 you want all pixels to be combined, pass 0xff (255) in mask. */
105 /* Use this routine for x86 platform - uses faster MMX routine if machine
109 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
111 #ifdef PNG_USE_LOCAL_ARRAYS
112 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
115 png_debug(1,"in png_combine_row_asm\n");
117 if (mmx_supported == 2) {
118 /* this should have happened in png_init_mmx_flags() already */
119 png_warning(png_ptr, "asm_flags may not have been initialized");
125 png_memcpy(row, png_ptr->row_buf + 1,
126 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
128 /* GRR: add "else if (mask == 0)" case?
129 * or does png_combine_row() not even get called in that case? */
132 switch (png_ptr->row_info.pixel_depth)
138 int s_inc, s_start, s_end;
143 sp = png_ptr->row_buf + 1;
146 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
147 if (png_ptr->transformations & PNG_PACKSWAP)
163 for (i = 0; i < png_ptr->width; i++)
169 value = (*sp >> shift) & 0x1;
170 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
171 *dp |= (png_byte)(value << shift);
195 int s_start, s_end, s_inc;
201 sp = png_ptr->row_buf + 1;
204 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
205 if (png_ptr->transformations & PNG_PACKSWAP)
221 for (i = 0; i < png_ptr->width; i++)
225 value = (*sp >> shift) & 0x3;
226 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
227 *dp |= (png_byte)(value << shift);
250 int s_start, s_end, s_inc;
256 sp = png_ptr->row_buf + 1;
259 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
260 if (png_ptr->transformations & PNG_PACKSWAP)
275 for (i = 0; i < png_ptr->width; i++)
279 value = (*sp >> shift) & 0xf;
280 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
281 *dp |= (png_byte)(value << shift);
308 __int64 mask0=0x0102040810204080;
310 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
311 /* && mmx_supported */ )
313 srcptr = png_ptr->row_buf + 1;
317 len = png_ptr->width &~7; //reduce to multiple of 8
318 diff = png_ptr->width & 7; //amount lost
322 movd mm7, unmask //load bit pattern
323 psubb mm6,mm6 //zero mm6
326 punpckldq mm7,mm7 //fill register with 8 masks
330 pand mm0,mm7 //nonzero if keep byte
331 pcmpeqb mm0,mm6 //zeros->1s, v versa
333 mov ecx,len //load length of line (pixels)
334 mov esi,srcptr //load source
335 mov ebx,dstptr //load dest
347 add esi,8 //inc by 8 bytes processed
349 sub ecx,8 //dec by 8 pixels processed
359 sal edx,24 //make low byte the high byte
362 sal edx,1 //move high bit to CF
363 jnc skip8 //if CF = 0
376 else /* mmx not supported - use modified C routine */
378 register unsigned int incr1, initial_val, final_val;
379 png_size_t pixel_bytes;
381 register int disp = png_pass_inc[png_ptr->pass];
382 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
384 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
385 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
387 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
388 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
389 final_val = png_ptr->width*pixel_bytes;
390 incr1 = (disp)*pixel_bytes;
391 for (i = initial_val; i < final_val; i += incr1)
393 png_memcpy(dstptr, srcptr, pixel_bytes);
408 __int64 mask1=0x0101020204040808,
409 mask0=0x1010202040408080;
411 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
412 /* && mmx_supported */ )
414 srcptr = png_ptr->row_buf + 1;
418 len = (png_ptr->width)&~7;
419 diff = (png_ptr->width)&7;
422 movd mm7, unmask //load bit pattern
423 psubb mm6,mm6 //zero mm6
426 punpckldq mm7,mm7 //fill register with 8 masks
437 mov ecx,len //load length of line
438 mov esi,srcptr //load source
439 mov ebx,dstptr //load dest
460 add esi,16 //inc by 16 bytes processed
462 sub ecx,8 //dec by 8 pixels processed
472 sal edx,24 //make low byte the high byte
474 sal edx,1 //move high bit to CF
475 jnc skip16 //if CF = 0
488 else /* mmx not supported - use modified C routine */
490 register unsigned int incr1, initial_val, final_val;
491 png_size_t pixel_bytes;
493 register int disp = png_pass_inc[png_ptr->pass];
494 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
496 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
497 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
499 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
500 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
501 final_val = png_ptr->width*pixel_bytes;
502 incr1 = (disp)*pixel_bytes;
503 for (i = initial_val; i < final_val; i += incr1)
505 png_memcpy(dstptr, srcptr, pixel_bytes);
521 __int64 mask2=0x0101010202020404, //24bpp
522 mask1=0x0408080810101020,
523 mask0=0x2020404040808080;
525 srcptr = png_ptr->row_buf + 1;
529 len = (png_ptr->width)&~7;
530 diff = (png_ptr->width)&7;
532 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
533 /* && mmx_supported */ )
537 movd mm7, unmask //load bit pattern
538 psubb mm6,mm6 //zero mm6
541 punpckldq mm7,mm7 //fill register with 8 masks
555 mov ecx,len //load length of line
556 mov esi,srcptr //load source
557 mov ebx,dstptr //load dest
587 add esi,24 //inc by 24 bytes processed
589 sub ecx,8 //dec by 8 pixels processed
599 sal edx,24 //make low byte the high byte
601 sal edx,1 //move high bit to CF
602 jnc skip24 //if CF = 0
619 else /* mmx not supported - use modified C routine */
621 register unsigned int incr1, initial_val, final_val;
622 png_size_t pixel_bytes;
624 register int disp = png_pass_inc[png_ptr->pass];
625 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
627 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
628 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
630 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
631 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
632 final_val = png_ptr->width*pixel_bytes;
633 incr1 = (disp)*pixel_bytes;
634 for (i = initial_val; i < final_val; i += incr1)
636 png_memcpy(dstptr, srcptr, pixel_bytes);
652 __int64 mask3=0x0101010102020202, //32bpp
653 mask2=0x0404040408080808,
654 mask1=0x1010101020202020,
655 mask0=0x4040404080808080;
657 srcptr = png_ptr->row_buf + 1;
661 len = (png_ptr->width)&~7;
662 diff = (png_ptr->width)&7;
664 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
665 /* && mmx_supported */ )
669 movd mm7, unmask //load bit pattern
670 psubb mm6,mm6 //zero mm6
673 punpckldq mm7,mm7 //fill register with 8 masks
690 mov ecx,len //load length of line
691 mov esi,srcptr //load source
692 mov ebx,dstptr //load dest
730 add esi,32 //inc by 32 bytes processed
732 sub ecx,8 //dec by 8 pixels processed
742 sal edx,24 //make low byte the high byte
744 sal edx,1 //move high bit to CF
745 jnc skip32 //if CF = 0
759 else /* mmx _not supported - Use modified C routine */
761 register unsigned int incr1, initial_val, final_val;
762 png_size_t pixel_bytes;
764 register int disp = png_pass_inc[png_ptr->pass];
765 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
767 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
768 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
770 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
771 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
772 final_val = png_ptr->width*pixel_bytes;
773 incr1 = (disp)*pixel_bytes;
774 for (i = initial_val; i < final_val; i += incr1)
776 png_memcpy(dstptr, srcptr, pixel_bytes);
792 __int64 mask5=0x0101010101010202,
793 mask4=0x0202020204040404,
794 mask3=0x0404080808080808,
795 mask2=0x1010101010102020,
796 mask1=0x2020202040404040,
797 mask0=0x4040808080808080;
799 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
800 /* && mmx_supported */ )
802 srcptr = png_ptr->row_buf + 1;
806 len = (png_ptr->width)&~7;
807 diff = (png_ptr->width)&7;
810 movd mm7, unmask //load bit pattern
811 psubb mm6,mm6 //zero mm6
814 punpckldq mm7,mm7 //fill register with 8 masks
837 mov ecx,len //load length of line
838 mov esi,srcptr //load source
839 mov ebx,dstptr //load dest
887 add esi,48 //inc by 32 bytes processed
889 sub ecx,8 //dec by 8 pixels processed
899 sal edx,24 //make low byte the high byte
902 sal edx,1 //move high bit to CF
903 jnc skip48 //if CF = 0
917 else /* mmx _not supported - Use modified C routine */
919 register unsigned int incr1, initial_val, final_val;
920 png_size_t pixel_bytes;
922 register int disp = png_pass_inc[png_ptr->pass];
923 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
925 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
926 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
928 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
929 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
930 final_val = png_ptr->width*pixel_bytes;
931 incr1 = (disp)*pixel_bytes;
932 for (i = initial_val; i < final_val; i += incr1)
934 png_memcpy(dstptr, srcptr, pixel_bytes);
947 png_size_t pixel_bytes;
948 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
950 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
951 register unsigned int incr1, initial_val, final_val;
953 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
954 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
956 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
957 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
958 final_val = png_ptr->width*pixel_bytes;
959 incr1 = (disp)*pixel_bytes;
960 for (i = initial_val; i < final_val; i += incr1)
962 png_memcpy(dp, sptr, pixel_bytes);
968 } /* end switch (png_ptr->row_info.pixel_depth) */
969 } /* end if (non-trivial mask) */
971 } /* end png_combine_row() */
974 #if defined(PNG_READ_INTERLACING_SUPPORTED)
977 png_do_read_interlace(png_structp png_ptr)
979 png_row_infop row_info = &(png_ptr->row_info);
980 png_bytep row = png_ptr->row_buf + 1;
981 int pass = png_ptr->pass;
982 png_uint_32 transformations = png_ptr->transformations;
983 #ifdef PNG_USE_LOCAL_ARRAYS
984 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
987 png_debug(1,"in png_do_read_interlace\n");
989 if (mmx_supported == 2) {
990 /* this should have happened in png_init_mmx_flags() already */
991 png_warning(png_ptr, "asm_flags may not have been initialized");
995 if (row != NULL && row_info != NULL)
997 png_uint_32 final_width;
999 final_width = row_info->width * png_pass_inc[pass];
1001 switch (row_info->pixel_depth)
1007 int s_start, s_end, s_inc;
1012 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1013 dp = row + (png_size_t)((final_width - 1) >> 3);
1014 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1015 if (transformations & PNG_PACKSWAP)
1017 sshift = (int)((row_info->width + 7) & 7);
1018 dshift = (int)((final_width + 7) & 7);
1026 sshift = 7 - (int)((row_info->width + 7) & 7);
1027 dshift = 7 - (int)((final_width + 7) & 7);
1033 for (i = row_info->width; i; i--)
1035 v = (png_byte)((*sp >> sshift) & 0x1);
1036 for (j = 0; j < png_pass_inc[pass]; j++)
1038 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1039 *dp |= (png_byte)(v << dshift);
1040 if (dshift == s_end)
1048 if (sshift == s_end)
1063 int s_start, s_end, s_inc;
1066 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1067 dp = row + (png_size_t)((final_width - 1) >> 2);
1068 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1069 if (transformations & PNG_PACKSWAP)
1071 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1072 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1080 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1081 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1087 for (i = row_info->width; i; i--)
1092 v = (png_byte)((*sp >> sshift) & 0x3);
1093 for (j = 0; j < png_pass_inc[pass]; j++)
1095 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1096 *dp |= (png_byte)(v << dshift);
1097 if (dshift == s_end)
1105 if (sshift == s_end)
1120 int s_start, s_end, s_inc;
1123 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1124 dp = row + (png_size_t)((final_width - 1) >> 1);
1125 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1126 if (transformations & PNG_PACKSWAP)
1128 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1129 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1137 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1138 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1144 for (i = row_info->width; i; i--)
1149 v = (png_byte)((*sp >> sshift) & 0xf);
1150 for (j = 0; j < png_pass_inc[pass]; j++)
1152 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1153 *dp |= (png_byte)(v << dshift);
1154 if (dshift == s_end)
1162 if (sshift == s_end)
1173 default: // This is the place where the routine is modified
1175 __int64 const4 = 0x0000000000FFFFFF;
1176 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1177 __int64 const6 = 0x00000000000000FF;
1180 png_size_t pixel_bytes;
1181 int width = row_info->width;
1183 pixel_bytes = (row_info->pixel_depth >> 3);
1185 sptr = row + (width - 1) * pixel_bytes;
1186 dp = row + (final_width - 1) * pixel_bytes;
1187 // New code by Nirav Chhatrapati - Intel Corporation
1189 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1191 // use MMX routine if machine supports it
1192 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1193 /* && mmx_supported */ )
1195 if (pixel_bytes == 3)
1197 if (((pass == 0) || (pass == 1)) && width)
1204 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1206 movd mm0, [esi] ; X X X X X v2 v1 v0
1207 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1208 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1209 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1210 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1211 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1212 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1213 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1214 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1215 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1216 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1217 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1218 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1220 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1222 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1232 else if (((pass == 2) || (pass == 3)) && width)
1239 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1241 movd mm0, [esi] ; X X X X X v2 v1 v0
1242 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1243 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1244 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1245 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1246 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1247 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1248 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1249 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1250 movq [edi+4], mm0 ; move to memory
1251 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1252 movd [edi], mm0 ; move to memory
1260 else if (width) /* && ((pass == 4) || (pass == 5)) */
1262 int width_mmx = ((width >> 1) << 1) - 8;
1265 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1276 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1277 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1278 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1279 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1280 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1281 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1282 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1283 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1284 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1285 movq [edi], mm0 ; move quad to memory
1286 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1287 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1288 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1289 movd [edi+8], mm6 ; move double to memory
1298 sptr -= width_mmx*3;
1300 for (i = width; i; i--)
1305 png_memcpy(v, sptr, 3);
1306 for (j = 0; j < png_pass_inc[pass]; j++)
1308 png_memcpy(dp, v, 3);
1314 } /* end of pixel_bytes == 3 */
1316 else if (pixel_bytes == 1)
1318 if (((pass == 0) || (pass == 1)) && width)
1320 int width_mmx = ((width >> 2) << 2);
1332 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1333 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1334 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1335 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1336 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1337 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1338 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1339 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1340 movq [edi], mm0 ; move to memory v3
1341 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1342 movq [edi+8], mm3 ; move to memory v2
1343 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1344 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1345 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1346 movq [edi+16], mm2 ; move to memory v1
1347 movq [edi+24], mm4 ; move to memory v0
1358 for (i = width; i; i--)
1362 /* I simplified this part in version 1.0.4e
1363 * here and in several other instances where
1364 * pixel_bytes == 1 -- GR-P
1369 * png_memcpy(v, sptr, pixel_bytes);
1370 * for (j = 0; j < png_pass_inc[pass]; j++)
1372 * png_memcpy(dp, v, pixel_bytes);
1373 * dp -= pixel_bytes;
1375 * sptr -= pixel_bytes;
1377 * Replacement code is in the next three lines:
1380 for (j = 0; j < png_pass_inc[pass]; j++)
1385 else if (((pass == 2) || (pass == 3)) && width)
1387 int width_mmx = ((width >> 2) << 2);
1399 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1400 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1401 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1402 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1403 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1404 movq [edi], mm0 ; move to memory v2 and v3
1406 movq [edi+8], mm1 ; move to memory v1 and v0
1416 for (i = width; i; i--)
1420 for (j = 0; j < png_pass_inc[pass]; j++)
1427 else if (width) /* && ((pass == 4) || (pass == 5))) */
1429 int width_mmx = ((width >> 3) << 3);
1441 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1442 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1443 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1444 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1445 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1446 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1448 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1459 for (i = width; i; i--)
1463 for (j = 0; j < png_pass_inc[pass]; j++)
1470 } /* end of pixel_bytes == 1 */
1472 else if (pixel_bytes == 2)
1474 if (((pass == 0) || (pass == 1)) && width)
1476 int width_mmx = ((width >> 1) << 1);
1488 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1489 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1490 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1491 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1492 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1495 movq [edi + 16], mm1
1496 movq [edi + 24], mm1
1505 sptr -= (width_mmx*2 - 2); // sign fixed
1506 dp -= (width_mmx*16 - 2); // sign fixed
1507 for (i = width; i; i--)
1512 png_memcpy(v, sptr, 2);
1513 for (j = 0; j < png_pass_inc[pass]; j++)
1516 png_memcpy(dp, v, 2);
1520 else if (((pass == 2) || (pass == 3)) && width)
1522 int width_mmx = ((width >> 1) << 1) ;
1534 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1535 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1536 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1537 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1538 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1550 sptr -= (width_mmx*2 - 2); // sign fixed
1551 dp -= (width_mmx*8 - 2); // sign fixed
1552 for (i = width; i; i--)
1557 png_memcpy(v, sptr, 2);
1558 for (j = 0; j < png_pass_inc[pass]; j++)
1561 png_memcpy(dp, v, 2);
1565 else if (width) // pass == 4 or 5
1567 int width_mmx = ((width >> 1) << 1) ;
1579 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1580 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1590 sptr -= (width_mmx*2 - 2); // sign fixed
1591 dp -= (width_mmx*4 - 2); // sign fixed
1592 for (i = width; i; i--)
1597 png_memcpy(v, sptr, 2);
1598 for (j = 0; j < png_pass_inc[pass]; j++)
1601 png_memcpy(dp, v, 2);
1605 } /* end of pixel_bytes == 2 */
1607 else if (pixel_bytes == 4)
1609 if (((pass == 0) || (pass == 1)) && width)
1611 int width_mmx = ((width >> 1) << 1) ;
1623 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1624 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1625 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1626 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1629 movq [edi + 16], mm0
1630 movq [edi + 24], mm0
1632 movq [edi + 40], mm1
1635 movq [edi + 56], mm1
1643 sptr -= (width_mmx*4 - 4); // sign fixed
1644 dp -= (width_mmx*32 - 4); // sign fixed
1645 for (i = width; i; i--)
1650 png_memcpy(v, sptr, 4);
1651 for (j = 0; j < png_pass_inc[pass]; j++)
1654 png_memcpy(dp, v, 4);
1658 else if (((pass == 2) || (pass == 3)) && width)
1660 int width_mmx = ((width >> 1) << 1) ;
1672 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1673 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1674 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1675 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1679 movq [edi + 24], mm1
1688 sptr -= (width_mmx*4 - 4); // sign fixed
1689 dp -= (width_mmx*16 - 4); // sign fixed
1690 for (i = width; i; i--)
1695 png_memcpy(v, sptr, 4);
1696 for (j = 0; j < png_pass_inc[pass]; j++)
1699 png_memcpy(dp, v, 4);
1703 else if (width) // pass == 4 or 5
1705 int width_mmx = ((width >> 1) << 1) ;
1717 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1718 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1719 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1720 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1731 sptr -= (width_mmx*4 - 4); // sign fixed
1732 dp -= (width_mmx*8 - 4); // sign fixed
1733 for (i = width; i; i--)
1738 png_memcpy(v, sptr, 4);
1739 for (j = 0; j < png_pass_inc[pass]; j++)
1742 png_memcpy(dp, v, 4);
1747 } /* end of pixel_bytes == 4 */
1749 else if (pixel_bytes == 6)
1751 for (i = width; i; i--)
1755 png_memcpy(v, sptr, 6);
1756 for (j = 0; j < png_pass_inc[pass]; j++)
1758 png_memcpy(dp, v, 6);
1763 } /* end of pixel_bytes == 6 */
1767 for (i = width; i; i--)
1771 png_memcpy(v, sptr, pixel_bytes);
1772 for (j = 0; j < png_pass_inc[pass]; j++)
1774 png_memcpy(dp, v, pixel_bytes);
1780 } /* end of mmx_supported */
1782 else /* MMX not supported: use modified C code - takes advantage
1783 * of inlining of memcpy for a constant */
1785 if (pixel_bytes == 1)
1787 for (i = width; i; i--)
1790 for (j = 0; j < png_pass_inc[pass]; j++)
1795 else if (pixel_bytes == 3)
1797 for (i = width; i; i--)
1801 png_memcpy(v, sptr, pixel_bytes);
1802 for (j = 0; j < png_pass_inc[pass]; j++)
1804 png_memcpy(dp, v, pixel_bytes);
1807 sptr -= pixel_bytes;
1810 else if (pixel_bytes == 2)
1812 for (i = width; i; i--)
1816 png_memcpy(v, sptr, pixel_bytes);
1817 for (j = 0; j < png_pass_inc[pass]; j++)
1819 png_memcpy(dp, v, pixel_bytes);
1822 sptr -= pixel_bytes;
1825 else if (pixel_bytes == 4)
1827 for (i = width; i; i--)
1831 png_memcpy(v, sptr, pixel_bytes);
1832 for (j = 0; j < png_pass_inc[pass]; j++)
1834 png_memcpy(dp, v, pixel_bytes);
1837 sptr -= pixel_bytes;
1840 else if (pixel_bytes == 6)
1842 for (i = width; i; i--)
1846 png_memcpy(v, sptr, pixel_bytes);
1847 for (j = 0; j < png_pass_inc[pass]; j++)
1849 png_memcpy(dp, v, pixel_bytes);
1852 sptr -= pixel_bytes;
1857 for (i = width; i; i--)
1861 png_memcpy(v, sptr, pixel_bytes);
1862 for (j = 0; j < png_pass_inc[pass]; j++)
1864 png_memcpy(dp, v, pixel_bytes);
1867 sptr -= pixel_bytes;
1871 } /* end of MMX not supported */
1874 } /* end switch (row_info->pixel_depth) */
1876 row_info->width = final_width;
1877 row_info->rowbytes = ((final_width *
1878 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1883 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1886 // These variables are utilized in the functions below. They are declared
1887 // globally here to ensure alignment on 8-byte boundaries.
1892 } LBCarryMask = {0x0101010101010101},
1893 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1894 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1897 // Optimized code for PNG Average filter decoder
1899 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1900 , png_bytep prev_row)
1903 png_uint_32 FullLength;
1904 png_uint_32 MMXLength;
1908 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1909 FullLength = row_info->rowbytes; // # of bytes to filter
1911 // Init address pointers and offset
1912 mov edi, row // edi ==> Avg(x)
1913 xor ebx, ebx // ebx ==> x
1915 mov esi, prev_row // esi ==> Prior(x)
1916 sub edx, bpp // edx ==> Raw(x-bpp)
1919 // Compute the Raw value for the first bpp bytes
1920 // Raw(x) = Avg(x) + (Prior(x)/2)
1922 mov al, [esi + ebx] // Load al with Prior(x)
1924 shr al, 1 // divide by 2
1925 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1927 mov [edi+ebx-1], al // Write back Raw(x);
1928 // mov does not affect flags; -1 to offset inc ebx
1930 // get # of bytes to alignment
1931 mov diff, edi // take start of row
1932 add diff, ebx // add bpp
1933 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1934 and diff, 0xfffffff8 // mask to alignment boundary
1935 sub diff, edi // subtract from start ==> value ebx at alignment
1938 // Compute the Raw value for the bytes upto the alignment boundary
1939 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1943 mov cl, [esi + ebx] // load cl with Prior(x)
1944 mov al, [edx + ebx] // load al with Raw(x-bpp)
1947 shr ax, 1 // divide by 2
1948 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1949 cmp ebx, diff // Check if at alignment boundary
1950 mov [edi+ebx-1], al // Write back Raw(x);
1951 // mov does not affect flags; -1 to offset inc ebx
1952 jb davglp1 // Repeat until at alignment boundary
1956 sub eax, ebx // subtract alignment fix
1957 and eax, 0x00000007 // calc bytes over mult of 8
1958 sub ecx, eax // drop over bytes from original length
1961 // Now do the math for the rest of the row
1966 ActiveMask.use = 0x0000000000ffffff;
1967 ShiftBpp.use = 24; // == 3 * 8
1968 ShiftRem.use = 40; // == 64 - 24
1970 // Re-init address pointers and offset
1971 movq mm7, ActiveMask
1972 mov ebx, diff // ebx ==> x = offset to alignment boundary
1973 movq mm5, LBCarryMask
1974 mov edi, row // edi ==> Avg(x)
1975 movq mm4, HBClearMask
1976 mov esi, prev_row // esi ==> Prior(x)
1977 // PRIME the pump (load the first Raw(x-bpp) data set
1978 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1979 // (we correct position in loop below)
1981 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1982 // Add (Prev_row/2) to Average
1984 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1985 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1987 pand mm3, mm1 // get lsb for each prev_row byte
1988 psrlq mm1, 1 // divide prev_row bytes by 2
1989 pand mm1, mm4 // clear invalid bit 7 of each byte
1990 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1991 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
1992 movq mm1, mm3 // now use mm1 for getting LBCarrys
1993 pand mm1, mm2 // get LBCarrys for each byte where both
1994 // lsb's were == 1 (Only valid for active group)
1995 psrlq mm2, 1 // divide raw bytes by 2
1996 pand mm2, mm4 // clear invalid bit 7 of each byte
1997 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1998 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
1999 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2001 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2002 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2003 movq mm2, mm0 // mov updated Raws to mm2
2004 psllq mm2, ShiftBpp // shift data to position correctly
2005 movq mm1, mm3 // now use mm1 for getting LBCarrys
2006 pand mm1, mm2 // get LBCarrys for each byte where both
2007 // lsb's were == 1 (Only valid for active group)
2008 psrlq mm2, 1 // divide raw bytes by 2
2009 pand mm2, mm4 // clear invalid bit 7 of each byte
2010 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2011 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2012 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2015 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2016 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2018 movq mm2, mm0 // mov updated Raws to mm2
2019 psllq mm2, ShiftBpp // shift data to position correctly
2020 // Data only needs to be shifted once here to
2021 // get the correct x-bpp offset.
2022 movq mm1, mm3 // now use mm1 for getting LBCarrys
2023 pand mm1, mm2 // get LBCarrys for each byte where both
2024 // lsb's were == 1 (Only valid for active group)
2025 psrlq mm2, 1 // divide raw bytes by 2
2026 pand mm2, mm4 // clear invalid bit 7 of each byte
2027 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2028 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2030 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2033 // Now ready to write back to memory
2034 movq [edi + ebx - 8], mm0
2035 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2037 movq mm2, mm0 // mov updated Raw(x) to mm2
2048 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2049 // appropriate inactive bytes
2050 ShiftBpp.use = bpp << 3;
2051 ShiftRem.use = 64 - ShiftBpp.use;
2053 movq mm4, HBClearMask
2054 // Re-init address pointers and offset
2055 mov ebx, diff // ebx ==> x = offset to alignment boundary
2056 // Load ActiveMask and clear all bytes except for 1st active group
2057 movq mm7, ActiveMask
2058 mov edi, row // edi ==> Avg(x)
2060 mov esi, prev_row // esi ==> Prior(x)
2062 movq mm5, LBCarryMask
2063 psllq mm6, ShiftBpp // Create mask for 2nd active group
2064 // PRIME the pump (load the first Raw(x-bpp) data set
2065 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2066 // (we correct position in loop below)
2068 movq mm0, [edi + ebx]
2069 psrlq mm2, ShiftRem // shift data to position correctly
2070 movq mm1, [esi + ebx]
2071 // Add (Prev_row/2) to Average
2073 pand mm3, mm1 // get lsb for each prev_row byte
2074 psrlq mm1, 1 // divide prev_row bytes by 2
2075 pand mm1, mm4 // clear invalid bit 7 of each byte
2076 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2077 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2078 movq mm1, mm3 // now use mm1 for getting LBCarrys
2079 pand mm1, mm2 // get LBCarrys for each byte where both
2080 // lsb's were == 1 (Only valid for active group)
2081 psrlq mm2, 1 // divide raw bytes by 2
2082 pand mm2, mm4 // clear invalid bit 7 of each byte
2083 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2084 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2085 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2087 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2088 movq mm2, mm0 // mov updated Raws to mm2
2089 psllq mm2, ShiftBpp // shift data to position correctly
2091 movq mm1, mm3 // now use mm1 for getting LBCarrys
2092 pand mm1, mm2 // get LBCarrys for each byte where both
2093 // lsb's were == 1 (Only valid for active group)
2094 psrlq mm2, 1 // divide raw bytes by 2
2095 pand mm2, mm4 // clear invalid bit 7 of each byte
2096 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2097 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2098 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2101 // Now ready to write back to memory
2102 movq [edi + ebx - 8], mm0
2103 // Prep Raw(x-bpp) for next loop
2104 movq mm2, mm0 // mov updated Raws to mm2
2111 ActiveMask.use = 0x000000000000ffff;
2112 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2113 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2116 movq mm7, ActiveMask
2117 // Re-init address pointers and offset
2118 mov ebx, diff // ebx ==> x = offset to alignment boundary
2119 movq mm5, LBCarryMask
2120 mov edi, row // edi ==> Avg(x)
2121 movq mm4, HBClearMask
2122 mov esi, prev_row // esi ==> Prior(x)
2123 // PRIME the pump (load the first Raw(x-bpp) data set
2124 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2125 // (we correct position in loop below)
2127 movq mm0, [edi + ebx]
2128 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2129 movq mm1, [esi + ebx]
2130 // Add (Prev_row/2) to Average
2132 pand mm3, mm1 // get lsb for each prev_row byte
2133 psrlq mm1, 1 // divide prev_row bytes by 2
2134 pand mm1, mm4 // clear invalid bit 7 of each byte
2136 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2137 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2138 movq mm1, mm3 // now use mm1 for getting LBCarrys
2139 pand mm1, mm2 // get LBCarrys for each byte where both
2140 // lsb's were == 1 (Only valid for active group)
2141 psrlq mm2, 1 // divide raw bytes by 2
2142 pand mm2, mm4 // clear invalid bit 7 of each byte
2143 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2144 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2145 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2146 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2147 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2148 movq mm2, mm0 // mov updated Raws to mm2
2149 psllq mm2, ShiftBpp // shift data to position correctly
2150 movq mm1, mm3 // now use mm1 for getting LBCarrys
2151 pand mm1, mm2 // get LBCarrys for each byte where both
2152 // lsb's were == 1 (Only valid for active group)
2153 psrlq mm2, 1 // divide raw bytes by 2
2154 pand mm2, mm4 // clear invalid bit 7 of each byte
2155 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2156 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2157 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2159 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2160 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2161 movq mm2, mm0 // mov updated Raws to mm2
2162 psllq mm2, ShiftBpp // shift data to position correctly
2163 // Data only needs to be shifted once here to
2164 // get the correct x-bpp offset.
2165 movq mm1, mm3 // now use mm1 for getting LBCarrys
2166 pand mm1, mm2 // get LBCarrys for each byte where both
2167 // lsb's were == 1 (Only valid for active group)
2168 psrlq mm2, 1 // divide raw bytes by 2
2169 pand mm2, mm4 // clear invalid bit 7 of each byte
2170 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2171 pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2172 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2174 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2175 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2176 movq mm2, mm0 // mov updated Raws to mm2
2177 psllq mm2, ShiftBpp // shift data to position correctly
2178 // Data only needs to be shifted once here to
2179 // get the correct x-bpp offset.
2181 movq mm1, mm3 // now use mm1 for getting LBCarrys
2182 pand mm1, mm2 // get LBCarrys for each byte where both
2183 // lsb's were == 1 (Only valid for active group)
2184 psrlq mm2, 1 // divide raw bytes by 2
2185 pand mm2, mm4 // clear invalid bit 7 of each byte
2186 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2187 pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
2188 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2191 // Now ready to write back to memory
2192 movq [edi + ebx - 8], mm0
2193 // Prep Raw(x-bpp) for next loop
2194 movq mm2, mm0 // mov updated Raws to mm2
2203 // Re-init address pointers and offset
2204 mov ebx, diff // ebx ==> x = offset to alignment boundary
2205 mov edi, row // edi ==> Avg(x)
2206 cmp ebx, FullLength // Test if offset at end of array
2208 // Do Paeth decode for remaining bytes
2209 mov esi, prev_row // esi ==> Prior(x)
2211 xor ecx, ecx // zero ecx before using cl & cx in loop below
2212 sub edx, bpp // edx ==> Raw(x-bpp)
2214 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2216 mov cl, [esi + ebx] // load cl with Prior(x)
2217 mov al, [edx + ebx] // load al with Raw(x-bpp)
2220 shr ax, 1 // divide by 2
2221 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2222 cmp ebx, FullLength // Check if at end of array
2223 mov [edi+ebx-1], al // Write back Raw(x);
2224 // mov does not affect flags; -1 to offset inc ebx
2234 // Re-init address pointers and offset
2235 mov ebx, diff // ebx ==> x = offset to alignment boundary
2236 movq mm5, LBCarryMask
2237 mov edi, row // edi ==> Avg(x)
2238 movq mm4, HBClearMask
2239 mov esi, prev_row // esi ==> Prior(x)
2240 // PRIME the pump (load the first Raw(x-bpp) data set
2241 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2242 // (NO NEED to correct position in loop below)
2244 movq mm0, [edi + ebx]
2246 movq mm1, [esi + ebx]
2248 pand mm3, mm1 // get lsb for each prev_row byte
2249 psrlq mm1, 1 // divide prev_row bytes by 2
2250 pand mm3, mm2 // get LBCarrys for each byte where both
2252 psrlq mm2, 1 // divide raw bytes by 2
2253 pand mm1, mm4 // clear invalid bit 7 of each byte
2254 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2255 pand mm2, mm4 // clear invalid bit 7 of each byte
2256 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2257 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2259 movq [edi + ebx - 8], mm0
2260 movq mm2, mm0 // reuse as Raw(x-bpp)
2265 default: // bpp greater than 8
2268 movq mm5, LBCarryMask
2269 // Re-init address pointers and offset
2270 mov ebx, diff // ebx ==> x = offset to alignment boundary
2271 mov edi, row // edi ==> Avg(x)
2272 movq mm4, HBClearMask
2274 mov esi, prev_row // esi ==> Prior(x)
2275 sub edx, bpp // edx ==> Raw(x-bpp)
2277 movq mm0, [edi + ebx]
2279 movq mm1, [esi + ebx]
2280 pand mm3, mm1 // get lsb for each prev_row byte
2281 movq mm2, [edx + ebx]
2282 psrlq mm1, 1 // divide prev_row bytes by 2
2283 pand mm3, mm2 // get LBCarrys for each byte where both
2285 psrlq mm2, 1 // divide raw bytes by 2
2286 pand mm1, mm4 // clear invalid bit 7 of each byte
2287 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2288 pand mm2, mm4 // clear invalid bit 7 of each byte
2289 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2291 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2293 movq [edi + ebx - 8], mm0
2298 } // end switch ( bpp )
2301 // MMX acceleration complete now do clean-up
2302 // Check if any remaining bytes left to decode
2303 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2304 mov edi, row // edi ==> Avg(x)
2305 cmp ebx, FullLength // Test if offset at end of array
2307 // Do Paeth decode for remaining bytes
2308 mov esi, prev_row // esi ==> Prior(x)
2310 xor ecx, ecx // zero ecx before using cl & cx in loop below
2311 sub edx, bpp // edx ==> Raw(x-bpp)
2313 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2315 mov cl, [esi + ebx] // load cl with Prior(x)
2316 mov al, [edx + ebx] // load al with Raw(x-bpp)
2319 shr ax, 1 // divide by 2
2320 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2321 cmp ebx, FullLength // Check if at end of array
2322 mov [edi+ebx-1], al // Write back Raw(x);
2323 // mov does not affect flags; -1 to offset inc ebx
2326 emms // End MMX instructions; prep for possible FP instrs.
2330 // Optimized code for PNG Paeth filter decoder
2332 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2335 png_uint_32 FullLength;
2336 png_uint_32 MMXLength;
2341 int patemp, pbtemp, pctemp;
2343 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2344 FullLength = row_info->rowbytes; // # of bytes to filter
2347 xor ebx, ebx // ebx ==> x offset
2349 xor edx, edx // edx ==> x-bpp offset
2353 // Compute the Raw value for the first bpp bytes
2354 // Note: the formula works out to be always
2355 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2361 mov [edi + ebx - 1], al
2363 // get # of bytes to alignment
2364 mov diff, edi // take start of row
2365 add diff, ebx // add bpp
2367 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2368 and diff, 0xfffffff8 // mask to alignment boundary
2369 sub diff, edi // subtract from start ==> value ebx at alignment
2374 // pav = p - a = (a + b - c) - a = b - c
2375 mov al, [esi + ebx] // load Prior(x) into al
2376 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2377 sub eax, ecx // subtract Prior(x-bpp)
2378 mov patemp, eax // Save pav for later use
2380 // pbv = p - b = (a + b - c) - b = a - c
2381 mov al, [edi + edx] // load Raw(x-bpp) into al
2382 sub eax, ecx // subtract Prior(x-bpp)
2384 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2385 add eax, patemp // pcv = pav + pbv
2387 test eax, 0x80000000
2389 neg eax // reverse sign of neg values
2391 mov pctemp, eax // save pc for later use
2393 test ecx, 0x80000000
2395 neg ecx // reverse sign of neg values
2397 mov pbtemp, ecx // save pb for later use
2400 test eax, 0x80000000
2402 neg eax // reverse sign of neg values
2404 mov patemp, eax // save pa for later use
2408 // pa > pb; now test if pb <= pc
2411 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2412 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2415 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2416 mov cl, [esi + ebx] // load Prior(x) into cl
2419 // pa <= pb; now test if pa <= pc
2422 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2423 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2426 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2427 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2431 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2432 add [edi + ebx - 1], cl
2438 sub eax, ebx // subtract alignment fix
2439 and eax, 0x00000007 // calc bytes over mult of 8
2440 sub ecx, eax // drop over bytes from original length
2443 // Now do the math for the rest of the row
2448 ActiveMask.use = 0x0000000000ffffff;
2449 ActiveMaskEnd.use = 0xffff000000000000;
2450 ShiftBpp.use = 24; // == bpp(3) * 8
2451 ShiftRem.use = 40; // == 64 - 24
2458 // PRIME the pump (load the first Raw(x-bpp) data set
2459 movq mm1, [edi+ebx-8]
2461 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2462 movq mm2, [esi + ebx] // load b=Prior(x)
2463 punpcklbw mm1, mm0 // Unpack Low bytes of a
2464 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2465 punpcklbw mm2, mm0 // Unpack Low bytes of b
2466 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2467 // pav = p - a = (a + b - c) - a = b - c
2469 punpcklbw mm3, mm0 // Unpack Low bytes of c
2470 // pbv = p - b = (a + b - c) - b = a - c
2474 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2478 // pa = abs(p-a) = abs(pav)
2479 // pb = abs(p-b) = abs(pbv)
2480 // pc = abs(p-c) = abs(pcv)
2481 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2483 pand mm0, mm4 // Only pav bytes < 0 in mm7
2484 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2486 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2490 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2491 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2497 pcmpgtw mm7, mm5 // pa > pb?
2499 // use mm7 mask to merge pa & pb
2501 // use mm0 mask copy to merge a & b
2507 // test ((pa <= pb)? pa:pb) <= pc
2508 pcmpgtw mm7, mm6 // pab > pc?
2515 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2516 pand mm7, ActiveMask
2517 movq mm2, mm3 // load b=Prior(x) step 1
2518 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2519 punpcklbw mm3, mm0 // Unpack Low bytes of c
2520 movq [edi + ebx], mm7 // write back updated value
2521 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2522 // Now do Paeth for 2nd set of bytes (3-5)
2523 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2524 punpcklbw mm1, mm0 // Unpack Low bytes of a
2526 punpcklbw mm2, mm0 // Unpack Low bytes of b
2527 // pbv = p - b = (a + b - c) - b = a - c
2529 // pav = p - a = (a + b - c) - a = b - c
2533 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2534 // pav + pbv = pbv + pav
2538 // pa = abs(p-a) = abs(pav)
2539 // pb = abs(p-b) = abs(pbv)
2540 // pc = abs(p-c) = abs(pcv)
2541 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2542 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2543 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2544 pand mm7, mm4 // Only pav bytes < 0 in mm7
2550 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2551 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2556 pcmpgtw mm7, mm5 // pa > pb?
2558 // use mm7 mask to merge pa & pb
2560 // use mm0 mask copy to merge a & b
2566 // test ((pa <= pb)? pa:pb) <= pc
2567 pcmpgtw mm7, mm6 // pab > pc?
2568 movq mm2, [esi + ebx] // load b=Prior(x)
2575 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2576 pand mm7, ActiveMask
2577 punpckhbw mm2, mm0 // Unpack High bytes of b
2578 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2579 // pav = p - a = (a + b - c) - a = b - c
2581 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2582 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2583 movq [edi + ebx], mm7 // write back updated value
2585 punpckhbw mm3, mm0 // Unpack High bytes of c
2586 psllq mm1, ShiftBpp // Shift bytes
2587 // Now mm1 will be used as Raw(x-bpp)
2588 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2590 punpckhbw mm1, mm0 // Unpack High bytes of a
2592 // pbv = p - b = (a + b - c) - b = a - c
2594 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2600 // pa = abs(p-a) = abs(pav)
2601 // pb = abs(p-b) = abs(pbv)
2602 // pc = abs(p-c) = abs(pcv)
2603 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2604 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2605 pand mm0, mm4 // Only pav bytes < 0 in mm7
2606 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2612 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2613 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2618 pcmpgtw mm7, mm5 // pa > pb?
2620 // use mm0 mask copy to merge a & b
2622 // use mm7 mask to merge pa & pb
2628 // test ((pa <= pb)? pa:pb) <= pc
2629 pcmpgtw mm7, mm6 // pab > pc?
2635 // Step ebx to next set of 8 bytes and repeat loop til done
2637 pand mm1, ActiveMaskEnd
2638 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2641 pxor mm0, mm0 // pxor does not affect flags
2642 movq [edi + ebx - 8], mm1 // write back updated value
2643 // mm1 will be used as Raw(x-bpp) next loop
2644 // mm3 ready to be used as Prior(x-bpp) next loop
2654 ActiveMask.use = 0x00000000ffffffff;
2655 ActiveMask2.use = 0xffffffff00000000;
2656 ShiftBpp.use = bpp << 3; // == bpp * 8
2657 ShiftRem.use = 64 - ShiftBpp.use;
2663 // PRIME the pump (load the first Raw(x-bpp) data set
2664 movq mm1, [edi+ebx-8]
2667 // Must shift to position Raw(x-bpp) data
2669 // Do first set of 4 bytes
2670 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2671 punpcklbw mm1, mm0 // Unpack Low bytes of a
2672 movq mm2, [esi + ebx] // load b=Prior(x)
2673 punpcklbw mm2, mm0 // Unpack Low bytes of b
2674 // Must shift to position Prior(x-bpp) data
2676 // pav = p - a = (a + b - c) - a = b - c
2678 punpcklbw mm3, mm0 // Unpack Low bytes of c
2679 // pbv = p - b = (a + b - c) - b = a - c
2683 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2686 // pa = abs(p-a) = abs(pav)
2687 // pb = abs(p-b) = abs(pbv)
2688 // pc = abs(p-c) = abs(pcv)
2689 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2691 pand mm0, mm4 // Only pav bytes < 0 in mm7
2692 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2694 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2698 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2699 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2705 pcmpgtw mm7, mm5 // pa > pb?
2707 // use mm7 mask to merge pa & pb
2709 // use mm0 mask copy to merge a & b
2715 // test ((pa <= pb)? pa:pb) <= pc
2716 pcmpgtw mm7, mm6 // pab > pc?
2723 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2724 pand mm7, ActiveMask
2726 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2727 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2729 movq [edi + ebx], mm7 // write back updated value
2730 movq mm1, [edi+ebx-8]
2736 punpckhbw mm3, mm0 // Unpack High bytes of c
2738 // Do second set of 4 bytes
2739 punpckhbw mm2, mm0 // Unpack High bytes of b
2740 punpckhbw mm1, mm0 // Unpack High bytes of a
2741 // pav = p - a = (a + b - c) - a = b - c
2743 // pbv = p - b = (a + b - c) - b = a - c
2747 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2750 // pa = abs(p-a) = abs(pav)
2751 // pb = abs(p-b) = abs(pbv)
2752 // pc = abs(p-c) = abs(pcv)
2753 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2755 pand mm0, mm4 // Only pav bytes < 0 in mm7
2756 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2758 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2762 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2763 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2769 pcmpgtw mm7, mm5 // pa > pb?
2771 // use mm7 mask to merge pa & pb
2773 // use mm0 mask copy to merge a & b
2779 // test ((pa <= pb)? pa:pb) <= pc
2780 pcmpgtw mm7, mm6 // pab > pc?
2787 // Step ebx to next set of 8 bytes and repeat loop til done
2790 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2792 movq [edi + ebx - 8], mm1 // write back updated value
2793 // mm1 will be used as Raw(x-bpp) next loop
2801 ActiveMask.use = 0x00000000ffffffff;
2807 // PRIME the pump (load the first Raw(x-bpp) data set
2808 movq mm1, [edi+ebx-8] // Only time should need to read
2809 // a=Raw(x-bpp) bytes
2811 // Do first set of 4 bytes
2812 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2813 punpckhbw mm1, mm0 // Unpack High bytes of a
2814 movq mm2, [esi + ebx] // load b=Prior(x)
2815 punpcklbw mm2, mm0 // Unpack Low bytes of b
2816 // pav = p - a = (a + b - c) - a = b - c
2818 punpckhbw mm3, mm0 // Unpack High bytes of c
2819 // pbv = p - b = (a + b - c) - b = a - c
2823 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2826 // pa = abs(p-a) = abs(pav)
2827 // pb = abs(p-b) = abs(pbv)
2828 // pc = abs(p-c) = abs(pcv)
2829 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2831 pand mm0, mm4 // Only pav bytes < 0 in mm7
2832 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2834 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2838 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2839 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2845 pcmpgtw mm7, mm5 // pa > pb?
2847 // use mm7 mask to merge pa & pb
2849 // use mm0 mask copy to merge a & b
2855 // test ((pa <= pb)? pa:pb) <= pc
2856 pcmpgtw mm7, mm6 // pab > pc?
2863 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2864 pand mm7, ActiveMask
2865 movq mm2, mm3 // load b=Prior(x) step 1
2866 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2867 punpcklbw mm3, mm0 // Unpack Low bytes of c
2868 movq [edi + ebx], mm7 // write back updated value
2869 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2870 // Do second set of 4 bytes
2871 punpckhbw mm2, mm0 // Unpack High bytes of b
2872 punpcklbw mm1, mm0 // Unpack Low bytes of a
2873 // pav = p - a = (a + b - c) - a = b - c
2875 // pbv = p - b = (a + b - c) - b = a - c
2879 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2882 // pa = abs(p-a) = abs(pav)
2883 // pb = abs(p-b) = abs(pbv)
2884 // pc = abs(p-c) = abs(pcv)
2885 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2887 pand mm0, mm4 // Only pav bytes < 0 in mm7
2888 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2890 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2894 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2895 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2901 pcmpgtw mm7, mm5 // pa > pb?
2903 // use mm7 mask to merge pa & pb
2905 // use mm0 mask copy to merge a & b
2911 // test ((pa <= pb)? pa:pb) <= pc
2912 pcmpgtw mm7, mm6 // pab > pc?
2919 // Step ebx to next set of 8 bytes and repeat loop til done
2922 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2924 movq [edi + ebx - 8], mm1 // write back updated value
2925 // mm1 will be used as Raw(x-bpp) next loop
2932 ActiveMask.use = 0x00000000ffffffff;
2938 // PRIME the pump (load the first Raw(x-bpp) data set
2939 movq mm1, [edi+ebx-8] // Only time should need to read
2940 // a=Raw(x-bpp) bytes
2942 // Do first set of 4 bytes
2943 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2944 punpcklbw mm1, mm0 // Unpack Low bytes of a
2945 movq mm2, [esi + ebx] // load b=Prior(x)
2946 punpcklbw mm2, mm0 // Unpack Low bytes of b
2947 // pav = p - a = (a + b - c) - a = b - c
2949 punpcklbw mm3, mm0 // Unpack Low bytes of c
2950 // pbv = p - b = (a + b - c) - b = a - c
2954 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2957 // pa = abs(p-a) = abs(pav)
2958 // pb = abs(p-b) = abs(pbv)
2959 // pc = abs(p-c) = abs(pcv)
2960 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2962 pand mm0, mm4 // Only pav bytes < 0 in mm7
2963 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2965 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2969 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2970 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2976 pcmpgtw mm7, mm5 // pa > pb?
2978 // use mm7 mask to merge pa & pb
2980 // use mm0 mask copy to merge a & b
2986 // test ((pa <= pb)? pa:pb) <= pc
2987 pcmpgtw mm7, mm6 // pab > pc?
2994 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2995 pand mm7, ActiveMask
2996 movq mm2, [esi + ebx] // load b=Prior(x)
2997 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2998 punpckhbw mm3, mm0 // Unpack High bytes of c
2999 movq [edi + ebx], mm7 // write back updated value
3000 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3002 // Do second set of 4 bytes
3003 punpckhbw mm2, mm0 // Unpack High bytes of b
3004 punpckhbw mm1, mm0 // Unpack High bytes of a
3005 // pav = p - a = (a + b - c) - a = b - c
3007 // pbv = p - b = (a + b - c) - b = a - c
3011 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3014 // pa = abs(p-a) = abs(pav)
3015 // pb = abs(p-b) = abs(pbv)
3016 // pc = abs(p-c) = abs(pcv)
3017 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3019 pand mm0, mm4 // Only pav bytes < 0 in mm7
3020 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3022 pand mm7, mm5 // Only pbv bytes < 0 in mm0
3026 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3027 pand mm0, mm6 // Only pcv bytes < 0 in mm0
3033 pcmpgtw mm7, mm5 // pa > pb?
3035 // use mm7 mask to merge pa & pb
3037 // use mm0 mask copy to merge a & b
3043 // test ((pa <= pb)? pa:pb) <= pc
3044 pcmpgtw mm7, mm6 // pab > pc?
3051 // Step ebx to next set of 8 bytes and repeat loop til done
3054 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3056 movq [edi + ebx - 8], mm1 // write back updated value
3057 // mm1 will be used as Raw(x-bpp) next loop
3073 // Do Paeth decode for remaining bytes
3075 xor ecx, ecx // zero ecx before using cl & cx in loop below
3076 sub edx, bpp // Set edx = ebx - bpp
3079 // pav = p - a = (a + b - c) - a = b - c
3080 mov al, [esi + ebx] // load Prior(x) into al
3081 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3082 sub eax, ecx // subtract Prior(x-bpp)
3083 mov patemp, eax // Save pav for later use
3085 // pbv = p - b = (a + b - c) - b = a - c
3086 mov al, [edi + edx] // load Raw(x-bpp) into al
3087 sub eax, ecx // subtract Prior(x-bpp)
3089 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3090 add eax, patemp // pcv = pav + pbv
3092 test eax, 0x80000000
3094 neg eax // reverse sign of neg values
3096 mov pctemp, eax // save pc for later use
3098 test ecx, 0x80000000
3100 neg ecx // reverse sign of neg values
3102 mov pbtemp, ecx // save pb for later use
3105 test eax, 0x80000000
3107 neg eax // reverse sign of neg values
3109 mov patemp, eax // save pa for later use
3113 // pa > pb; now test if pb <= pc
3116 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3117 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3120 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3121 mov cl, [esi + ebx] // load Prior(x) into cl
3124 // pa <= pb; now test if pa <= pc
3127 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3128 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3131 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3132 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3136 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3137 add [edi + ebx - 1], cl
3143 return; // No need to go further with this one
3144 } // end switch ( bpp )
3147 // MMX acceleration complete now do clean-up
3148 // Check if any remaining bytes left to decode
3154 // Do Paeth decode for remaining bytes
3156 xor ecx, ecx // zero ecx before using cl & cx in loop below
3157 sub edx, bpp // Set edx = ebx - bpp
3160 // pav = p - a = (a + b - c) - a = b - c
3161 mov al, [esi + ebx] // load Prior(x) into al
3162 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3163 sub eax, ecx // subtract Prior(x-bpp)
3164 mov patemp, eax // Save pav for later use
3166 // pbv = p - b = (a + b - c) - b = a - c
3167 mov al, [edi + edx] // load Raw(x-bpp) into al
3168 sub eax, ecx // subtract Prior(x-bpp)
3170 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3171 add eax, patemp // pcv = pav + pbv
3173 test eax, 0x80000000
3175 neg eax // reverse sign of neg values
3177 mov pctemp, eax // save pc for later use
3179 test ecx, 0x80000000
3181 neg ecx // reverse sign of neg values
3183 mov pbtemp, ecx // save pb for later use
3186 test eax, 0x80000000
3188 neg eax // reverse sign of neg values
3190 mov patemp, eax // save pa for later use
3194 // pa > pb; now test if pb <= pc
3197 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3198 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3201 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3202 mov cl, [esi + ebx] // load Prior(x) into cl
3205 // pa <= pb; now test if pa <= pc
3208 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3209 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3212 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3213 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3217 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3218 add [edi + ebx - 1], cl
3222 emms // End MMX instructions; prep for possible FP instrs.
3226 // Optimized code for PNG Sub filter decoder
3228 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3232 png_uint_32 FullLength;
3233 png_uint_32 MMXLength;
3236 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3237 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3240 mov esi, edi // lp = row
3241 add edi, bpp // rp = row + bpp
3243 // get # of bytes to alignment
3244 mov diff, edi // take start of row
3245 add diff, 0xf // add 7 + 8 to incr past
3246 // alignment boundary
3248 and diff, 0xfffffff8 // mask to alignment boundary
3249 sub diff, edi // subtract from start ==> value
3262 sub edx, ebx // subtract alignment fix
3263 and edx, 0x00000007 // calc bytes over mult of 8
3264 sub ecx, edx // drop over bytes from length
3268 // Now do the math for the rest of the row
3273 ActiveMask.use = 0x0000ffffff000000;
3274 ShiftBpp.use = 24; // == 3 * 8
3275 ShiftRem.use = 40; // == 64 - 24
3278 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3279 mov esi, edi // lp = row
3280 add edi, bpp // rp = row + bpp
3283 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3285 // PRIME the pump (load the first Raw(x-bpp) data set
3286 movq mm1, [edi+ebx-8]
3288 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3289 // no need for mask; shift clears inactive bytes
3290 // Add 1st active group
3293 // Add 2nd active group
3294 movq mm1, mm0 // mov updated Raws to mm1
3295 psllq mm1, ShiftBpp // shift data to position correctly
3296 pand mm1, mm7 // mask to use only 2nd active group
3298 // Add 3rd active group
3299 movq mm1, mm0 // mov updated Raws to mm1
3300 psllq mm1, ShiftBpp // shift data to position correctly
3301 pand mm1, mm6 // mask to use only 3rd active group
// NOTE(review): interior of the MMX "Sub" filter decoder
// (png_read_filter_row_mmx_sub). The enclosing function begins and ends
// outside this excerpt, and the excerpt itself is elided (the embedded
// original line numbers jump), so these fragments are NOT contiguous code.
// Sub filter reconstruction: Raw(x) = Sub(x) + Raw(x-bpp).
3305 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3306 // Prep for doing 1st add at top of loop
3315 // Placed here just in case this is a duplicate of the
3316 // non-MMX code for the SUB filter in png_read_filter_row below
3321 // bpp = (row_info->pixel_depth + 7) >> 3;
3322 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3323 // i < row_info->rowbytes; i++, rp++, lp++)
3325 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
// Pointer setup repeated in each bpp case below: esi walks Raw(x-bpp)
// ("lp") while edi walks the bytes being reconstructed ("rp" = row + bpp).
3332 mov esi, edi // lp = row
3334 add edi, bpp // rp = row + bpp
// ShiftBpp/ShiftRem are 64-bit shift counts derived from bpp: shifting by
// ShiftBpp moves data to the next bpp-byte group inside a quadword, and
// shifting by ShiftRem (= 64 - ShiftBpp) clears the inactive bytes, so no
// explicit mask is needed in this case.
3351 ShiftBpp.use = bpp << 3;
3352 ShiftRem.use = 64 - ShiftBpp.use;
3356 mov esi, edi // lp = row
3357 add edi, bpp // rp = row + bpp
3358 // PRIME the pump (load the first Raw(x-bpp) data set
3359 movq mm1, [edi+ebx-8]
3361 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3362 // no need for mask; shift clears inactive bytes
3365 // Add 2nd active group
3366 movq mm1, mm0 // mov updated Raws to mm1
3367 psllq mm1, ShiftBpp // shift data to position correctly
3368 // there is no need for any mask
3369 // since shift clears inactive bits/bytes
3373 movq [edi+ebx-8], mm0
3374 movq mm1, mm0 // Prep for doing 1st add at top of loop
// bpp == 2 case (presumably — TODO confirm against the elided `case 2:`
// label): a quadword holds four 2-byte pixel groups; ActiveMask selects
// the 2nd group and is shifted left 16 bits at a time (into mm6, mm5) to
// cover the 3rd and 4th groups.
3382 ActiveMask.use = 0x00000000ffff0000;
3383 ShiftBpp.use = 16; // == 2 * 8
3384 ShiftRem.use = 48; // == 64 - 16
3386 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3390 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3392 mov esi, edi // lp = row
3394 add edi, bpp // rp = row + bpp
3395 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3397 // PRIME the pump (load the first Raw(x-bpp) data set
3398 movq mm1, [edi+ebx-8]
3400 // Add 1st active group
3401 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3402 // no need for mask; shift clears inactive
3406 // Add 2nd active group
3407 movq mm1, mm0 // mov updated Raws to mm1
3408 psllq mm1, ShiftBpp // shift data to position correctly
3409 pand mm1, mm7 // mask to use only 2nd active group
3411 // Add 3rd active group
3412 movq mm1, mm0 // mov updated Raws to mm1
3413 psllq mm1, ShiftBpp // shift data to position correctly
3414 pand mm1, mm6 // mask to use only 3rd active group
3416 // Add 4th active group
3417 movq mm1, mm0 // mov updated Raws to mm1
3418 psllq mm1, ShiftBpp // shift data to position correctly
3419 pand mm1, mm5 // mask to use only 4th active group
3423 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3424 movq mm1, mm0 // Prep for doing 1st add at top of loop
// bpp == 8 case (presumably — the `case 8:` label is elided): one pixel
// per quadword, so the loop is unrolled 8 quadwords (64 bytes) deep, with
// each load interleaved against the store of the previous group; the
// paddb instructions between these lines are elided from this excerpt.
3434 mov esi, edi // lp = row
3435 add edi, bpp // rp = row + bpp
3437 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3438 // Raw(x-bpp) data set
3439 and ecx, 0x0000003f // calc bytes over mult of 64
3441 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3443 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3444 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3445 // Now mm0 will be used as Raw(x-bpp) for
3446 // the 2nd group of 8 bytes. This will be
3447 // repeated for each group of 8 bytes with
3448 // the 8th group being used as the Raw(x-bpp)
3449 // for the 1st group of the next loop.
3451 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3452 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3454 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3455 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3457 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3458 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3460 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3461 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3463 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3464 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3466 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3467 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3471 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3480 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
// NOTE(review): original comment said "to mm1" but the instruction writes
// mm7 — corrected below.
3481 movq mm7, mm0 // Move calculated Raw(x) data to mm7 to
3482 // be the new Raw(x-bpp) for the next loop
3489 default: // bpp greater than 8 bytes
3494 mov esi, edi // lp = row
3495 add edi, bpp // rp = row + bpp
3502 movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3509 } // end switch ( bpp )
// Scalar (non-MMX) tail: handles the bytes left over after the MMX loop.
3516 mov esi, edi // lp = row
3518 add edi, bpp // rp = row + bpp
3526 emms // End MMX instructions; prep for possible FP instrs.
3530 // Optimized code for PNG Up filter decoder
// NOTE(review): the return-type line and most of the body (labels, paddb
// instructions, loop branches) are elided from this excerpt — the embedded
// original line numbers jump. Do not treat the fragments below as
// contiguous code.
// Up filter reconstruction: Raw(x) = Up(x) + Prior(x). From the load/store
// pattern below, esi appears to index prev_row and edi the current row —
// TODO confirm against the elided register setup.
3532 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3536 len = row_info->rowbytes; // # of bytes to filter
3539 // get # of bytes to alignment
// Scalar pre-loop byte store (aligning edi to an 8-byte boundary before
// the MMX loops; the loop branch itself is elided).
3554 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3559 sub edx, ebx // subtract alignment fix
3560 and edx, 0x0000003f // calc bytes over mult of 64
3561 sub ecx, edx // drop over bytes from length
3562 // Unrolled loop - use all MMX registers and interleave to reduce
3563 // number of branch instructions (loops) and reduce partial stalls
// 64-bytes-per-iteration main loop: each pair of movq loads (prev row into
// an odd register, current row into an even one) is followed — on elided
// lines — by a paddb and the store seen interleaved below.
3567 movq mm3, [esi+ebx+8]
3569 movq mm2, [edi+ebx+8]
3572 movq mm5, [esi+ebx+16]
3573 movq [edi+ebx+8], mm2
3574 movq mm4, [edi+ebx+16]
3575 movq mm7, [esi+ebx+24]
3577 movq mm6, [edi+ebx+24]
3578 movq [edi+ebx+16], mm4
3580 movq mm1, [esi+ebx+32]
3581 movq [edi+ebx+24], mm6
3582 movq mm0, [edi+ebx+32]
3583 movq mm3, [esi+ebx+40]
3585 movq mm2, [edi+ebx+40]
3586 movq [edi+ebx+32], mm0
3588 movq mm5, [esi+ebx+48]
3589 movq [edi+ebx+40], mm2
3590 movq mm4, [edi+ebx+48]
3591 movq mm7, [esi+ebx+56]
3593 movq mm6, [edi+ebx+56]
3594 movq [edi+ebx+48], mm4
3598 movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3599 // -8 to offset add ebx
3600 // (ebx was advanced by 64 on an elided line; -8 re-addresses the +56 slot)
3602 cmp edx, 0 // Test for bytes over mult of 64
3606 // 2 lines added by lcreeve@netins.net
3607 // (mail 11 Jul 98 in png-implement list)
3608 cmp edx, 8 //test for less than 8 bytes
3613 and edx, 0x00000007 // calc bytes over mult of 8
3614 sub ecx, edx // drop over bytes from length
3616 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3623 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3625 cmp edx, 0 // Test for bytes over mult of 8
3629 add ecx, edx // move over byte count into counter
3630 // Loop using x86 registers to update remaining bytes
3636 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3639 // Conversion of filtered row completed
3640 emms // End MMX instructions; prep for possible FP instrs.
3645 // Optimized png_read_filter_row routines
// Dispatcher: undoes the PNG row filter named by `filter` in place on
// `row`, choosing the MMX routine when the per-filter asm flag is set AND
// the row meets the configured pixel-depth / rowbytes thresholds; otherwise
// falls back to the portable C loop.
// NOTE(review): this excerpt is elided (switch scaffolding, braces and
// break statements are on missing lines); fragments below are not
// contiguous code.
3647 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3648 row, png_bytep prev_row, int filter)
// mmx_supported is initialized to 2 at file scope; seeing 2 here means
// png_init_mmx_flags() never ran, so only warn and continue.
3654 if (mmx_supported == 2) {
3655 /* this should have happened in png_init_mmx_flags() already */
3656 png_warning(png_ptr, "asm_flags may not have been initialized");
// Debug-only (inside PNG_DEBUG, per the #endif below): build a short
// label naming the filter and whether the MMX or x86 path will be taken.
3661 png_debug(1, "in png_read_filter_row\n");
3664 case 0: sprintf(filnm, "none");
3666 case 1: sprintf(filnm, "sub-%s",
3667 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3669 case 2: sprintf(filnm, "up-%s",
3670 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3672 case 3: sprintf(filnm, "avg-%s",
3673 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3675 case 4: sprintf(filnm, "Paeth-%s",
3676 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3678 default: sprintf(filnm, "unknw");
3681 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3682 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3683 (int)((row_info->pixel_depth + 7) >> 3));
3684 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3685 #endif /* PNG_DEBUG */
3689 case PNG_FILTER_VALUE_NONE:
// NONE: nothing to undo (break is on an elided line).
3692 case PNG_FILTER_VALUE_SUB:
3694 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3695 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3696 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3698 png_read_filter_row_mmx_sub(row_info, row);
// C fallback for Sub: Raw(x) = Sub(x) + Raw(x-bpp); the first bpp bytes
// need no adjustment (Raw(x-bpp) is taken as 0 left of the row).
3703 png_uint_32 istop = row_info->rowbytes;
3704 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3705 png_bytep rp = row + bpp;
3708 for (i = bpp; i < istop; i++)
3710 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3717 case PNG_FILTER_VALUE_UP:
3719 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3720 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3721 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3723 png_read_filter_row_mmx_up(row_info, row, prev_row);
// C fallback for Up: Raw(x) = Up(x) + Prior(x) for every byte.
3728 png_uint_32 istop = row_info->rowbytes;
3730 png_bytep pp = prev_row;
3732 for (i = 0; i < istop; ++i)
3734 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3741 case PNG_FILTER_VALUE_AVG:
3743 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3744 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3745 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3747 png_read_filter_row_mmx_avg(row_info, row, prev_row);
// C fallback for Average: first bpp bytes use Prior(x)/2 only (no left
// neighbor); remaining bytes add floor((Raw(x-bpp)+Prior(x))/2).
3753 png_bytep pp = prev_row;
3755 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3756 png_uint_32 istop = row_info->rowbytes - bpp;
3758 for (i = 0; i < bpp; i++)
3760 *rp = (png_byte)(((int)(*rp) +
3761 ((int)(*pp++) >> 1)) & 0xff);
3765 for (i = 0; i < istop; i++)
3767 *rp = (png_byte)(((int)(*rp) +
3768 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3775 case PNG_FILTER_VALUE_PAETH:
3777 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3778 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3779 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3781 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
// C fallback for Paeth: first bpp bytes reduce to the Up filter (left and
// upper-left neighbors are 0, so the predictor is Prior(x)).
3787 png_bytep pp = prev_row;
3789 png_bytep cp = prev_row;
3790 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3791 png_uint_32 istop=row_info->rowbytes - bpp;
3793 for (i = 0; i < bpp; i++)
3795 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3799 for (i = 0; i < istop; i++) // use leftover rp,pp
3801 int a, b, c, pa, pb, pc, p;
// Paeth predictor distances (assignments of p and pc are on elided lines;
// presumably p = b - c and pc = a - c as in the standard algorithm —
// TODO confirm): pa = |p|, pb = |pc|, pc = |p + pc|.
3815 pa = p < 0 ? -p : p;
3816 pb = pc < 0 ? -pc : pc;
3817 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3821 if (pa <= pb && pa <= pc)
// Compact predictor form (ties resolve a, then b, then c per the PNG spec):
3829 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3831 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
// Unknown filter value: warn and leave the row unfiltered.
3839 png_warning(png_ptr, "Ignoring bad row filter type");
3845 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */