Fix bugs in previous commit that caused FTBFS in synfig and ETL FTBFS with older...
[synfig.git] / synfig-core / tags / synfig_0_61_04 / synfig-core / src / modules / mod_libavcodec / libavcodec / h264.c
1 /*
2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public
16  * License along with this library; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  *
19  */
20  
21 /**
22  * @file h264.c
23  * H.264 / AVC / MPEG4 part10 codec.
24  * @author Michael Niedermayer <michaelni@gmx.at>
25  */
26
27 #include "common.h"
28 #include "dsputil.h"
29 #include "avcodec.h"
30 #include "mpegvideo.h"
31 #include "h264data.h"
32 #include "golomb.h"
33
34 #undef NDEBUG
35 #include <assert.h>
36
/*
 * Rename two identifiers for the remainder of this file: every later use of
 * interlaced_dct or mb_intra is replaced by the long name on the right.
 * NOTE(review): presumably this is meant to make accidental use of the
 * corresponding MpegEncContext fields stand out or fail to compile -- confirm.
 */
#define interlaced_dct interlaced_dct_is_a_bad_name
#define mb_intra mb_intra_isnt_initalized_see_mb_type

/* Block indices used for the luma DC and chroma DC blocks (users are outside this part of the file). */
#define LUMA_DC_BLOCK_INDEX   25
#define CHROMA_DC_BLOCK_INDEX 26

/*
 * Bit counts associated with the VLC tables of the same names.
 * NOTE(review): presumably the first-level lookup width used when the tables
 * are built and read -- confirm at the init_vlc()/get_vlc2() call sites.
 */
#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
#define COEFF_TOKEN_VLC_BITS           8
#define TOTAL_ZEROS_VLC_BITS           9
#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
#define RUN_VLC_BITS                   3
#define RUN7_VLC_BITS                  6

/* Sizes of H264Context.sps_buffer[] and H264Context.pps_buffer[]. */
#define MAX_SPS_COUNT 32
#define MAX_PPS_COUNT 256

/* Size of H264Context.mmco[]. */
#define MAX_MMCO_COUNT 66
54
/**
 * Sequence parameter set.
 * The trailing comment of each field names the bitstream syntax element the
 * value corresponds to (including any fixed offset applied to it).
 */
typedef struct SPS{
    
    int profile_idc;
    int level_idc;
    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
    int poc_type;                      ///< pic_order_cnt_type
    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
    int delta_pic_order_always_zero_flag;
    int offset_for_non_ref_pic;
    int offset_for_top_to_bottom_field;
    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
    int ref_frame_count;               ///< num_ref_frames
    int gaps_in_frame_num_allowed_flag;
    int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
    int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
    int frame_mbs_only_flag;
    int mb_aff;                        ///< mb_adaptive_frame_field_flag
    int direct_8x8_inference_flag;
    int crop;                   ///< frame_cropping_flag
    int crop_left;              ///< frame_cropping_rect_left_offset
    int crop_right;             ///< frame_cropping_rect_right_offset
    int crop_top;               ///< frame_cropping_rect_top_offset
    int crop_bottom;            ///< frame_cropping_rect_bottom_offset
    int vui_parameters_present_flag;
    int sar_width;              ///< NOTE(review): presumably sample aspect ratio numerator from the VUI -- confirm
    int sar_height;             ///< NOTE(review): presumably sample aspect ratio denominator from the VUI -- confirm
    short offset_for_ref_frame[256]; //FIXME allocate dynamically?
}SPS;
86
/**
 * Picture parameter set.
 * The trailing comment of each field names the bitstream syntax element the
 * value corresponds to (including any fixed offset applied to it).
 */
typedef struct PPS{
    int sps_id;                 ///< NOTE(review): presumably the id of the SPS this PPS refers to -- confirm in the PPS parser
    int cabac;                  ///< entropy_coding_mode_flag
    int pic_order_present;      ///< pic_order_present_flag
    int slice_group_count;      ///< num_slice_groups_minus1 + 1
    int mb_slice_group_map_type;
    int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
    int weighted_pred;          ///< weighted_pred_flag
    int weighted_bipred_idc;
    int init_qp;                ///< pic_init_qp_minus26 + 26
    int init_qs;                ///< pic_init_qs_minus26 + 26
    int chroma_qp_index_offset;
    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
    int constrained_intra_pred; ///< constrained_intra_pred_flag
    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
}PPS;
106
/**
 * Memory management control operation opcode.
 * The enumerators take the consecutive values 0 (MMCO_END) to 6 (MMCO_LONG).
 * NOTE(review): presumably these values are the operation codes as they
 * appear in the bitstream -- confirm against the slice header parser.
 */
typedef enum MMCOOpcode{
    MMCO_END=0,
    MMCO_SHORT2UNUSED,
    MMCO_LONG2UNUSED,
    MMCO_SHORT2LONG,
    MMCO_SET_MAX_LONG,
    MMCO_RESET, 
    MMCO_LONG,
} MMCOOpcode;
119
/**
 * Memory management control operation.
 * One entry of the H264Context.mmco[] buffer: an opcode plus its operands.
 * NOTE(review): which operand is meaningful presumably depends on the
 * opcode -- confirm where the operations are parsed and executed.
 */
typedef struct MMCO{
    MMCOOpcode opcode;
    int short_frame_num;
    int long_index;
} MMCO;
128
/**
 * H264Context
 * Per-codec state for H.264. The generic MpegEncContext is embedded as the
 * first member; the functions in this file reach it through &h->s.
 */
typedef struct H264Context{
    MpegEncContext s;
    int nal_ref_idc;    
    int nal_unit_type;
/* values for nal_unit_type */
#define NAL_SLICE               1
#define NAL_DPA                 2
#define NAL_DPB                 3
#define NAL_DPC                 4
#define NAL_IDR_SLICE           5
#define NAL_SEI                 6
#define NAL_SPS                 7
#define NAL_PPS                 8
#define NAL_PICTURE_DELIMITER   9
#define NAL_FILTER_DATA         10
    uint8_t *rbsp_buffer;
    int rbsp_buffer_size;

    int chroma_qp; //QPc

    int prev_mb_skiped; //FIXME remove (IMHO not used)

    //prediction stuff
    int chroma_pred_mode;
    int intra16x16_pred_mode;
    
    /**
     * intra 4x4 prediction mode cache, 8 entries per row.
     * fill_caches() loads the top and left border from the neighbouring
     * macroblocks; write_back_intra_pred_mode() stores the bottom row and
     * right column into intra4x4_pred_mode[mb_xy].
     */
    int8_t intra4x4_pred_mode_cache[5*8];
    int8_t (*intra4x4_pred_mode)[8];
    void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
    void (*pred8x8  [4+3])(uint8_t *src, int stride);
    void (*pred16x16[4+3])(uint8_t *src, int stride);
    /* sample availability bit masks, set by fill_caches() for intra macroblocks */
    unsigned int topleft_samples_available;
    unsigned int top_samples_available;
    unsigned int topright_samples_available;
    unsigned int left_samples_available;

    /**
     * non zero coeff count cache.
     * is 64 if not available.
     */
    uint8_t non_zero_count_cache[6*8];
    uint8_t (*non_zero_count)[16];

    /**
     * Motion vector cache.
     * Indexed [list][position][x/y component], 8 positions per row;
     * ref_cache holds the matching reference index per position.
     */
    int16_t mv_cache[2][5*8][2];
    int8_t ref_cache[2][5*8];
/* special values stored in ref_cache */
#define LIST_NOT_USED -1 //FIXME rename?
#define PART_NOT_AVAILABLE -2
    
    /**
     * is 1 if the specific list MV&references are set to 0,0,-2.
     */
    int mv_cache_clean[2];

    int block_offset[16+8];
    int chroma_subblock_offset[16]; //FIXME remove
    
    /* per macroblock index into the motion_val (mb2b_xy) and ref_index (mb2b8_xy) arrays of a picture */
    uint16_t *mb2b_xy; //FIXME are these 4 a good idea?
    uint16_t *mb2b8_xy;
    int b_stride;       // row stride used together with mb2b_xy
    int b8_stride;      // row stride used together with mb2b8_xy

    int halfpel_flag;
    int thirdpel_flag;

    int unknown_svq3_flag;
    int next_slice_index;

    SPS sps_buffer[MAX_SPS_COUNT];
    SPS sps; ///< current sps
    
    PPS pps_buffer[MAX_PPS_COUNT];
    /**
     * current pps
     */
    PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?

    int slice_num;
    uint8_t *slice_table_base;
    uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1
    int slice_type;
    int slice_type_fixed;
    
    //interlacing specific flags
    int mb_field_decoding_flag;
    
    int sub_mb_type[4];
    
    //POC stuff
    int poc_lsb;
    int poc_msb;
    int delta_poc_bottom;
    int delta_poc[2];
    int frame_num;
    int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
    int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
    int frame_num_offset;         ///< for POC type 2
    int prev_frame_num_offset;    ///< for POC type 2
    int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2

    /**
     * frame_num for frames or 2*frame_num for field pics.
     */
    int curr_pic_num;
    
    /**
     * max_frame_num or 2*max_frame_num for field pics.
     */
    int max_pic_num;

    //Weighted pred stuff
    int luma_log2_weight_denom;
    int chroma_log2_weight_denom;
    int luma_weight[2][16];
    int luma_offset[2][16];
    int chroma_weight[2][16][2];
    int chroma_offset[2][16][2];
   
    //deblock
    int disable_deblocking_filter_idc;
    int slice_alpha_c0_offset_div2;
    int slice_beta_offset_div2;
     
    int redundant_pic_count;
    
    int direct_spatial_mv_pred;

    /**
     * num_ref_idx_l0/1_active_minus1 + 1
     */
    int ref_count[2];// FIXME split for AFF
    Picture *short_ref[16];
    Picture *long_ref[16];
    Picture default_ref_list[2][32];
    Picture ref_list[2][32]; //FIXME size?
    Picture field_ref_list[2][32]; //FIXME size?
    
    /**
     * memory management control operations buffer.
     */
    MMCO mmco[MAX_MMCO_COUNT];
    int mmco_index;
    
    int long_ref_count;  ///< number of actual long term references
    int short_ref_count; ///< number of actual short term references
    
    //data partitioning
    GetBitContext intra_gb;
    GetBitContext inter_gb;
    GetBitContext *intra_gb_ptr;
    GetBitContext *inter_gb_ptr;
    
    /* coefficient storage: 24 blocks of 16 coefficients */
    DCTELEM mb[16*24] __align8;
}H264Context;
287
/*
 * File-scope VLC tables. They are neither initialized nor used in this part
 * of the file.
 * NOTE(review): going by the names and the *_VLC_BITS macros these are
 * presumably the coefficient token, total zeros and run tables of the
 * residual decoder -- confirm at the code that builds them.
 */
static VLC coeff_token_vlc[4];
static VLC chroma_dc_coeff_token_vlc;

static VLC total_zeros_vlc[15];
static VLC chroma_dc_total_zeros_vlc[3];

static VLC run_vlc[6];
static VLC run7_vlc;

/* Forward declarations of the SVQ3 transform routines; their definitions are not in this part of the file. */
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
299
/**
 * Packs two 16 bit values into one 32 bit word.
 * The halves are arranged so that, once the word is stored to memory, a
 * occupies the two bytes at the lower address and b the two bytes at the
 * higher address, on either endianness.
 * Only the low 16 bits of each argument are kept. The shift is performed on
 * an unsigned type because left shifting a negative int is undefined.
 * @param a value for the lower addressed half
 * @param b value for the higher addressed half
 * @return the packed 32 bit word
 */
static inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
   return ((uint32_t)b&0xFFFF) + ((uint32_t)a<<16);
#else
   return ((uint32_t)a&0xFFFF) + ((uint32_t)b<<16);
#endif
}
307
/**
 * Fills a rectangle of equally sized elements with one value.
 * Only a fixed set of shapes is supported (row length in bytes x rows):
 * 2x2, 2x4, 4x1, 4x2, 4x4, 8x1, 8x2, 8x4, 16x2 and 16x4; anything else
 * trips assert(0).
 * @param vp address of the top left element
 * @param w width of the rectangle in elements, should be a constant
 * @param h height of the rectangle in rows, should be a constant
 * @param stride distance between the starts of two consecutive rows, in elements
 * @param val the value written to every element
 * @param size the size of val (1 or 4), should be a constant
 */
static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ //FIXME ensure this IS inlined
    uint8_t * const base= (uint8_t*)vp;
    const int row_bytes= w*size;      // bytes written per row
    const int pitch    = stride*size; // bytes between two rows
    int y;

    assert(size==1 || size==4);

//FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit version of it
    if(row_bytes==2 && (h==2 || h==4)){
        // one 16 bit store per row
        const uint16_t v16= size==4 ? val : val*0x0101;
        for(y=0; y<h; y++)
            *(uint16_t*)(base + y*pitch)= v16;
    }else if(row_bytes==4 && (h==1 || h==2 || h==4)){
        // one 32 bit store per row
        const uint32_t v32= size==4 ? val : val*0x01010101;
        for(y=0; y<h; y++)
            *(uint32_t*)(base + y*pitch)= v32;
    }else if(row_bytes==8 && (h==1 || h==2)){
        // two 32 bit stores per row
        const uint32_t v32= size==4 ? val : val*0x01010101;
        for(y=0; y<h; y++){
            uint8_t * const row= base + y*pitch;
            *(uint32_t*)(row + 0)= v32;
            *(uint32_t*)(row + 4)= v32;
        }
    }else if(row_bytes==8 && h==4){
        // one 64 bit store per row
        const uint64_t v64= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
        for(y=0; y<h; y++)
            *(uint64_t*)(base + y*pitch)= v64;
    }else if(row_bytes==16 && (h==2 || h==4)){
        // two 64 bit stores per row
        const uint64_t v64= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
        for(y=0; y<h; y++){
            uint8_t * const row= base + y*pitch;
            *(uint64_t*)(row + 0)= v64;
            *(uint64_t*)(row + 8)= v64;
        }
    }else
        assert(0);
}
370
371 static inline void fill_caches(H264Context *h, int mb_type){
372     MpegEncContext * const s = &h->s;
373     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
374     int topleft_xy, top_xy, topright_xy, left_xy[2];
375     int topleft_type, top_type, topright_type, left_type[2];
376     int left_block[4];
377     int i;
378
379     //wow what a mess, why didnt they simplify the interlacing&intra stuff, i cant imagine that these complex rules are worth it 
380     
381     if(h->sps.mb_aff){
382     //FIXME
383         topleft_xy = 0; /* avoid warning */
384         top_xy = 0; /* avoid warning */
385         topright_xy = 0; /* avoid warning */
386     }else{
387         topleft_xy = mb_xy-1 - s->mb_stride;
388         top_xy     = mb_xy   - s->mb_stride;
389         topright_xy= mb_xy+1 - s->mb_stride;
390         left_xy[0]   = mb_xy-1;
391         left_xy[1]   = mb_xy-1;
392         left_block[0]= 0;
393         left_block[1]= 1;
394         left_block[2]= 2;
395         left_block[3]= 3;
396     }
397
398     topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
399     top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
400     topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
401     left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
402     left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
403
404     if(IS_INTRA(mb_type)){
405         h->topleft_samples_available= 
406         h->top_samples_available= 
407         h->left_samples_available= 0xFFFF;
408         h->topright_samples_available= 0xEEEA;
409
410         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
411             h->topleft_samples_available= 0xB3FF;
412             h->top_samples_available= 0x33FF;
413             h->topright_samples_available= 0x26EA;
414         }
415         for(i=0; i<2; i++){
416             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
417                 h->topleft_samples_available&= 0xDF5F;
418                 h->left_samples_available&= 0x5F5F;
419             }
420         }
421         
422         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
423             h->topleft_samples_available&= 0x7FFF;
424         
425         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
426             h->topright_samples_available&= 0xFBFF;
427     
428         if(IS_INTRA4x4(mb_type)){
429             if(IS_INTRA4x4(top_type)){
430                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
431                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
432                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
433                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
434             }else{
435                 int pred;
436                 if(IS_INTRA16x16(top_type) || (IS_INTER(top_type) && !h->pps.constrained_intra_pred))
437                     pred= 2;
438                 else{
439                     pred= -1;
440                 }
441                 h->intra4x4_pred_mode_cache[4+8*0]=
442                 h->intra4x4_pred_mode_cache[5+8*0]=
443                 h->intra4x4_pred_mode_cache[6+8*0]=
444                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
445             }
446             for(i=0; i<2; i++){
447                 if(IS_INTRA4x4(left_type[i])){
448                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
449                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
450                 }else{
451                     int pred;
452                     if(IS_INTRA16x16(left_type[i]) || (IS_INTER(left_type[i]) && !h->pps.constrained_intra_pred))
453                         pred= 2;
454                     else{
455                         pred= -1;
456                     }
457                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
458                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
459                 }
460             }
461         }
462     }
463     
464     
465 /*
466 0 . T T. T T T T 
467 1 L . .L . . . . 
468 2 L . .L . . . . 
469 3 . T TL . . . . 
470 4 L . .L . . . . 
471 5 L . .. . . . . 
472 */
473 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
474     if(top_type){
475         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][0];
476         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][1];
477         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][2];
478         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
479     
480         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][7];
481         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
482     
483         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][10];
484         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
485     }else{
486         h->non_zero_count_cache[4+8*0]=      
487         h->non_zero_count_cache[5+8*0]=
488         h->non_zero_count_cache[6+8*0]=
489         h->non_zero_count_cache[7+8*0]=
490     
491         h->non_zero_count_cache[1+8*0]=
492         h->non_zero_count_cache[2+8*0]=
493     
494         h->non_zero_count_cache[1+8*3]=
495         h->non_zero_count_cache[2+8*3]= 64;
496     }
497     
498     if(left_type[0]){
499         h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][6];
500         h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][5];
501         h->non_zero_count_cache[0+8*1]= h->non_zero_count[left_xy[0]][9]; //FIXME left_block
502         h->non_zero_count_cache[0+8*4]= h->non_zero_count[left_xy[0]][12];
503     }else{
504         h->non_zero_count_cache[3+8*1]= 
505         h->non_zero_count_cache[3+8*2]= 
506         h->non_zero_count_cache[0+8*1]= 
507         h->non_zero_count_cache[0+8*4]= 64;
508     }
509     
510     if(left_type[1]){
511         h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[1]][4];
512         h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[1]][3];
513         h->non_zero_count_cache[0+8*2]= h->non_zero_count[left_xy[1]][8];
514         h->non_zero_count_cache[0+8*5]= h->non_zero_count[left_xy[1]][11];
515     }else{
516         h->non_zero_count_cache[3+8*3]= 
517         h->non_zero_count_cache[3+8*4]= 
518         h->non_zero_count_cache[0+8*2]= 
519         h->non_zero_count_cache[0+8*5]= 64;
520     }
521     
522 #if 1
523     if(IS_INTER(mb_type)){
524         int list;
525         for(list=0; list<2; list++){
526             if((!IS_8X8(mb_type)) && !USES_LIST(mb_type, list)){
527                 /*if(!h->mv_cache_clean[list]){
528                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
529                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
530                     h->mv_cache_clean[list]= 1;
531                 }*/
532                 continue; //FIXME direct mode ...
533             }
534             h->mv_cache_clean[list]= 0;
535             
536             if(IS_INTER(topleft_type)){
537                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
538                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
539                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
540                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
541             }else{
542                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
543                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
544             }
545             
546             if(IS_INTER(top_type)){
547                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
548                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
549                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
550                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
551                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
552                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
553                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
554                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
555                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
556                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
557             }else{
558                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]= 
559                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]= 
560                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 
561                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
562                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
563             }
564
565             if(IS_INTER(topright_type)){
566                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
567                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
568                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
569                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
570             }else{
571                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
572                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
573             }
574             
575             //FIXME unify cleanup or sth
576             if(IS_INTER(left_type[0])){
577                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
578                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
579                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
580                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
581                 h->ref_cache[list][scan8[0] - 1 + 0*8]= 
582                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
583             }else{
584                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
585                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
586                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
587                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
588             }
589             
590             if(IS_INTER(left_type[1])){
591                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
592                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
593                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
594                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
595                 h->ref_cache[list][scan8[0] - 1 + 2*8]= 
596                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
597             }else{
598                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
599                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
600                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
601                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
602             }
603
604             h->ref_cache[list][scan8[5 ]+1] = 
605             h->ref_cache[list][scan8[7 ]+1] = 
606             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewher else)
607             h->ref_cache[list][scan8[4 ]] = 
608             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
609             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
610             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
611             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
612             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
613             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
614         }
615 //FIXME
616
617     }
618 #endif
619 }
620
621 static inline void write_back_intra_pred_mode(H264Context *h){
622     MpegEncContext * const s = &h->s;
623     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
624
625     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
626     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
627     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
628     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
629     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
630     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
631     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
632 }
633
634 /**
635  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
636  */
637 static inline int check_intra4x4_pred_mode(H264Context *h){
638     MpegEncContext * const s = &h->s;
639     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
640     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
641     int i;
642     
643     if(!(h->top_samples_available&0x8000)){
644         for(i=0; i<4; i++){
645             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
646             if(status<0){
647                 fprintf(stderr, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
648                 return -1;
649             } else if(status){
650                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
651             }
652         }
653     }
654     
655     if(!(h->left_samples_available&0x8000)){
656         for(i=0; i<4; i++){
657             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
658             if(status<0){
659                 fprintf(stderr, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
660                 return -1;
661             } else if(status){
662                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
663             }
664         }
665     }
666
667     return 0;
668 } //FIXME cleanup like next
669
670 /**
671  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
672  */
673 static inline int check_intra_pred_mode(H264Context *h, int mode){
674     MpegEncContext * const s = &h->s;
675     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
676     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
677     
678     if(!(h->top_samples_available&0x8000)){
679         mode= top[ mode ];
680         if(mode<0){
681             fprintf(stderr, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
682             return -1;
683         }
684     }
685     
686     if(!(h->left_samples_available&0x8000)){
687         mode= left[ mode ];
688         if(mode<0){
689             fprintf(stderr, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
690             return -1;
691         } 
692     }
693
694     return mode;
695 }
696
697 /**
698  * gets the predicted intra4x4 prediction mode.
699  */
700 static inline int pred_intra_mode(H264Context *h, int n){
701     const int index8= scan8[n];
702     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
703     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
704     const int min= FFMIN(left, top);
705
706     tprintf("mode:%d %d min:%d\n", left ,top, min);
707
708     if(min<0) return DC_PRED;
709     else      return min;
710 }
711
712 static inline void write_back_non_zero_count(H264Context *h){
713     MpegEncContext * const s = &h->s;
714     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
715
716     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[4+8*4];
717     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[5+8*4];
718     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[6+8*4];
719     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
720     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[7+8*3];
721     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[7+8*2];
722     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[7+8*1];
723     
724     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[1+8*2];
725     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
726     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[2+8*1];
727
728     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[1+8*5];
729     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
730     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[2+8*4];
731 }
732
733 /**
734  * gets the predicted number of non zero coefficients.
735  * @param n block index
736  */
737 static inline int pred_non_zero_count(H264Context *h, int n){
738     const int index8= scan8[n];
739     const int left= h->non_zero_count_cache[index8 - 1];
740     const int top = h->non_zero_count_cache[index8 - 8];
741     int i= left + top;
742     
743     if(i<64) i= (i+1)>>1;
744
745     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
746
747     return i&31;
748 }
749
750 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
751     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
752
753     if(topright_ref != PART_NOT_AVAILABLE){
754         *C= h->mv_cache[list][ i - 8 + part_width ];
755         return topright_ref;
756     }else{
757         tprintf("topright MV not available\n");
758
759         *C= h->mv_cache[list][ i - 8 - 1 ];
760         return h->ref_cache[list][ i - 8 - 1 ];
761     }
762 }
763
764 /**
765  * gets the predicted MV.
766  * @param n the block index
767  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
768  * @param mx the x component of the predicted motion vector
769  * @param my the y component of the predicted motion vector
770  */
771 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
772     const int index8= scan8[n];
773     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
774     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
775     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
776     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
777     const int16_t * C;
778     int diagonal_ref, match_count;
779
780     assert(part_width==1 || part_width==2 || part_width==4);
781
782 /* mv_cache
783   B . . A T T T T 
784   U . . L . . , .
785   U . . L . . . .
786   U . . L . . , .
787   . . . L . . . .
788 */
789
790     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
791     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
792     if(match_count > 1){ //most common
793         *mx= mid_pred(A[0], B[0], C[0]);
794         *my= mid_pred(A[1], B[1], C[1]);
795     }else if(match_count==1){
796         if(left_ref==ref){
797             *mx= A[0];
798             *my= A[1];        
799         }else if(top_ref==ref){
800             *mx= B[0];
801             *my= B[1];        
802         }else{
803             *mx= C[0];
804             *my= C[1];        
805         }
806     }else{
807         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
808             *mx= A[0];
809             *my= A[1];        
810         }else{
811             *mx= mid_pred(A[0], B[0], C[0]);
812             *my= mid_pred(A[1], B[1], C[1]);
813         }
814     }
815         
816     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
817 }
818
819 /**
820  * gets the directionally predicted 16x8 MV.
821  * @param n the block index
822  * @param mx the x component of the predicted motion vector
823  * @param my the y component of the predicted motion vector
824  */
825 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
826     if(n==0){
827         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
828         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
829
830         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
831         
832         if(top_ref == ref){
833             *mx= B[0];
834             *my= B[1];
835             return;
836         }
837     }else{
838         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
839         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
840         
841         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
842
843         if(left_ref == ref){
844             *mx= A[0];
845             *my= A[1];
846             return;
847         }
848     }
849
850     //RARE
851     pred_motion(h, n, 4, list, ref, mx, my);
852 }
853
854 /**
855  * gets the directionally predicted 8x16 MV.
856  * @param n the block index
857  * @param mx the x component of the predicted motion vector
858  * @param my the y component of the predicted motion vector
859  */
860 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
861     if(n==0){
862         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
863         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
864         
865         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
866
867         if(left_ref == ref){
868             *mx= A[0];
869             *my= A[1];
870             return;
871         }
872     }else{
873         const int16_t * C;
874         int diagonal_ref;
875
876         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
877         
878         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
879
880         if(diagonal_ref == ref){ 
881             *mx= C[0];
882             *my= C[1];
883             return;
884         }
885     }
886
887     //RARE
888     pred_motion(h, n, 2, list, ref, mx, my);
889 }
890
891 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
892     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
893     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
894
895     tprintf("pred_pskip: (%d) (%d) at %2d %2d", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
896
897     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
898        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
899        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
900        
901         *mx = *my = 0;
902         return;
903     }
904         
905     pred_motion(h, 0, 4, 0, 0, mx, my);
906
907     return;
908 }
909
910 static inline void write_back_motion(H264Context *h, int mb_type){
911     MpegEncContext * const s = &h->s;
912     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
913     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
914     int list;
915
916     for(list=0; list<2; list++){
917         int y;
918         if((!IS_8X8(mb_type)) && !USES_LIST(mb_type, list)){
919             if(1){ //FIXME skip or never read if mb_type doesnt use it
920                 for(y=0; y<4; y++){
921                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
922                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
923                 }
924                 for(y=0; y<2; y++){
925                     *(uint16_t*)s->current_picture.motion_val[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101;
926                 }
927             }
928             continue; //FIXME direct mode ...
929         }
930         
931         for(y=0; y<4; y++){
932             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
933             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
934         }
935         for(y=0; y<2; y++){
936             s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
937             s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
938         }
939     }
940 }
941
942 /**
943  * Decodes a network abstraction layer unit.
944  * @param consumed is the number of bytes used as input
945  * @param length is the length of the array
946  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp ttailing?
947  * @returns decoded bytes, might be src+1 if no escapes 
948  */
949 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
950     int i, si, di;
951     uint8_t *dst;
952
953 //    src[0]&0x80;              //forbidden bit
954     h->nal_ref_idc= src[0]>>5;
955     h->nal_unit_type= src[0]&0x1F;
956
957     src++; length--;
958 #if 0    
959     for(i=0; i<length; i++)
960         printf("%2X ", src[i]);
961 #endif
962     for(i=0; i+1<length; i+=2){
963         if(src[i]) continue;
964         if(i>0 && src[i-1]==0) i--;
965         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
966             if(src[i+2]!=3){
967                 /* startcode, so we must be past the end */
968                 length=i;
969             }
970             break;
971         }
972     }
973
974     if(i>=length-1){ //no escaped 0
975         *dst_length= length;
976         *consumed= length+1; //+1 for the header
977         return src; 
978     }
979
980     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
981     dst= h->rbsp_buffer;
982
983 //printf("deoding esc\n");
984     si=di=0;
985     while(si<length){ 
986         //remove escapes (very rare 1:2^22)
987         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
988             if(src[si+2]==3){ //escape
989                 dst[di++]= 0;
990                 dst[di++]= 0;
991                 si+=3;
992             }else //next start code
993                 break;
994         }
995
996         dst[di++]= src[si++];
997     }
998
999     *dst_length= di;
1000     *consumed= si + 1;//+1 for the header
1001 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1002     return dst;
1003 }
1004
1005 /**
1006  * @param src the data which should be escaped
1007  * @param dst the target buffer, dst+1 == src is allowed as a special case
1008  * @param length the length of the src data
1009  * @param dst_length the length of the dst array
1010  * @returns length of escaped data in bytes or -1 if an error occured
1011  */
1012 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1013     int i, escape_count, si, di;
1014     uint8_t *temp;
1015     
1016     assert(length>=0);
1017     assert(dst_length>0);
1018     
1019     dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1020
1021     if(length==0) return 1;
1022
1023     escape_count= 0;
1024     for(i=0; i<length; i+=2){
1025         if(src[i]) continue;
1026         if(i>0 && src[i-1]==0) 
1027             i--;
1028         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1029             escape_count++;
1030             i+=2;
1031         }
1032     }
1033     
1034     if(escape_count==0){ 
1035         if(dst+1 != src)
1036             memcpy(dst+1, src, length);
1037         return length + 1;
1038     }
1039     
1040     if(length + escape_count + 1> dst_length)
1041         return -1;
1042
1043     //this should be damn rare (hopefully)
1044
1045     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1046     temp= h->rbsp_buffer;
1047 //printf("encoding esc\n");
1048     
1049     si= 0;
1050     di= 0;
1051     while(si < length){
1052         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1053             temp[di++]= 0; si++;
1054             temp[di++]= 0; si++;
1055             temp[di++]= 3; 
1056             temp[di++]= src[si++];
1057         }
1058         else
1059             temp[di++]= src[si++];
1060     }
1061     memcpy(dst+1, temp, length+escape_count);
1062     
1063     assert(di == length+escape_count);
1064     
1065     return di + 1;
1066 }
1067
1068 /**
1069  * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1070  */
1071 static void encode_rbsp_trailing(PutBitContext *pb){
1072     int length;
1073     put_bits(pb, 1, 1);
1074     length= (-get_bit_count(pb))&7;
1075     if(length) put_bits(pb, length, 0);
1076 }
1077
/**
 * identifies the exact end of the bitstream
 * The byte at src is searched for its lowest set bit.
 * @return the length of the trailing, or 0 if damaged
 *         (1 if bit 0 is set up to 8 if only bit 7 is set, 0 if no bit is set)
 */
static int decode_rbsp_trailing(uint8_t *src){
    const int v= *src;
    int bit;

    tprintf("rbsp trailing %X\n", v);

    for(bit=0; bit<8; bit++){
        if((v>>bit)&1)
            return bit+1;
    }
    return 0;
}
1094
1095 /**
1096  * idct tranforms the 16 dc values and dequantize them.
1097  * @param qp quantization parameter
1098  */
1099 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
1100     const int qmul= dequant_coeff[qp][0];
1101 #define stride 16
1102     int i;
1103     int temp[16]; //FIXME check if this is a good idea
1104     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1105     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1106
1107 //memset(block, 64, 2*256);
1108 //return;
1109     for(i=0; i<4; i++){
1110         const int offset= y_offset[i];
1111         const int z0= block[offset+stride*0] + block[offset+stride*4];
1112         const int z1= block[offset+stride*0] - block[offset+stride*4];
1113         const int z2= block[offset+stride*1] - block[offset+stride*5];
1114         const int z3= block[offset+stride*1] + block[offset+stride*5];
1115
1116         temp[4*i+0]= z0+z3;
1117         temp[4*i+1]= z1+z2;
1118         temp[4*i+2]= z1-z2;
1119         temp[4*i+3]= z0-z3;
1120     }
1121
1122     for(i=0; i<4; i++){
1123         const int offset= x_offset[i];
1124         const int z0= temp[4*0+i] + temp[4*2+i];
1125         const int z1= temp[4*0+i] - temp[4*2+i];
1126         const int z2= temp[4*1+i] - temp[4*3+i];
1127         const int z3= temp[4*1+i] + temp[4*3+i];
1128
1129         block[stride*0 +offset]= ((z0 + z3)*qmul + 2)>>2; //FIXME think about merging this into decode_resdual
1130         block[stride*2 +offset]= ((z1 + z2)*qmul + 2)>>2;
1131         block[stride*8 +offset]= ((z1 - z2)*qmul + 2)>>2;
1132         block[stride*10+offset]= ((z0 - z3)*qmul + 2)>>2;
1133     }
1134 }
1135
1136 /**
1137  * dct tranforms the 16 dc values.
1138  * @param qp quantization parameter ??? FIXME
1139  */
1140 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1141 //    const int qmul= dequant_coeff[qp][0];
1142     int i;
1143     int temp[16]; //FIXME check if this is a good idea
1144     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1145     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1146
1147     for(i=0; i<4; i++){
1148         const int offset= y_offset[i];
1149         const int z0= block[offset+stride*0] + block[offset+stride*4];
1150         const int z1= block[offset+stride*0] - block[offset+stride*4];
1151         const int z2= block[offset+stride*1] - block[offset+stride*5];
1152         const int z3= block[offset+stride*1] + block[offset+stride*5];
1153
1154         temp[4*i+0]= z0+z3;
1155         temp[4*i+1]= z1+z2;
1156         temp[4*i+2]= z1-z2;
1157         temp[4*i+3]= z0-z3;
1158     }
1159
1160     for(i=0; i<4; i++){
1161         const int offset= x_offset[i];
1162         const int z0= temp[4*0+i] + temp[4*2+i];
1163         const int z1= temp[4*0+i] - temp[4*2+i];
1164         const int z2= temp[4*1+i] - temp[4*3+i];
1165         const int z3= temp[4*1+i] + temp[4*3+i];
1166
1167         block[stride*0 +offset]= (z0 + z3)>>1;
1168         block[stride*2 +offset]= (z1 + z2)>>1;
1169         block[stride*8 +offset]= (z1 - z2)>>1;
1170         block[stride*10+offset]= (z0 - z3)>>1;
1171     }
1172 }
1173 #undef xStride
1174 #undef stride
1175
1176 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){
1177     const int qmul= dequant_coeff[qp][0];
1178     const int stride= 16*2;
1179     const int xStride= 16;
1180     int a,b,c,d,e;
1181
1182     a= block[stride*0 + xStride*0];
1183     b= block[stride*0 + xStride*1];
1184     c= block[stride*1 + xStride*0];
1185     d= block[stride*1 + xStride*1];
1186
1187     e= a-b;
1188     a= a+b;
1189     b= c-d;
1190     c= c+d;
1191
1192     block[stride*0 + xStride*0]= ((a+c)*qmul + 0)>>1;
1193     block[stride*0 + xStride*1]= ((e+b)*qmul + 0)>>1;
1194     block[stride*1 + xStride*0]= ((a-c)*qmul + 0)>>1;
1195     block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1;
1196 }
1197
1198 static void chroma_dc_dct_c(DCTELEM *block){
1199     const int stride= 16*2;
1200     const int xStride= 16;
1201     int a,b,c,d,e;
1202
1203     a= block[stride*0 + xStride*0];
1204     b= block[stride*0 + xStride*1];
1205     c= block[stride*1 + xStride*0];
1206     d= block[stride*1 + xStride*1];
1207
1208     e= a-b;
1209     a= a+b;
1210     b= c-d;
1211     c= c+d;
1212
1213     block[stride*0 + xStride*0]= (a+c);
1214     block[stride*0 + xStride*1]= (e+b);
1215     block[stride*1 + xStride*0]= (a-c);
1216     block[stride*1 + xStride*1]= (e-b);
1217 }
1218
1219 /**
1220  * gets the chroma qp.
1221  */
1222 static inline int get_chroma_qp(H264Context *h, int qscale){
1223     
1224     return chroma_qp[clip(qscale + h->pps.chroma_qp_index_offset, 0, 51)];
1225 }
1226
1227
1228 /**
1229  *
1230  */
1231 static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
1232     int i;
1233     uint8_t *cm = cropTbl + MAX_NEG_CROP;
1234
1235     block[0] += 32;
1236 #if 1
1237     for(i=0; i<4; i++){
1238         const int z0=  block[i + 4*0]     +  block[i + 4*2];
1239         const int z1=  block[i + 4*0]     -  block[i + 4*2];
1240         const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
1241         const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
1242
1243         block[i + 4*0]= z0 + z3;
1244         block[i + 4*1]= z1 + z2;
1245         block[i + 4*2]= z1 - z2;
1246         block[i + 4*3]= z0 - z3;
1247     }
1248
1249     for(i=0; i<4; i++){
1250         const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
1251         const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
1252         const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
1253         const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
1254
1255         dst[0 + i*stride]= cm[ dst[0 + i*stride] + ((z0 + z3) >> 6) ];
1256         dst[1 + i*stride]= cm[ dst[1 + i*stride] + ((z1 + z2) >> 6) ];
1257         dst[2 + i*stride]= cm[ dst[2 + i*stride] + ((z1 - z2) >> 6) ];
1258         dst[3 + i*stride]= cm[ dst[3 + i*stride] + ((z0 - z3) >> 6) ];
1259     }
1260 #else
1261     for(i=0; i<4; i++){
1262         const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
1263         const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
1264         const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
1265         const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
1266
1267         block[0 + 4*i]= z0 + z3;
1268         block[1 + 4*i]= z1 + z2;
1269         block[2 + 4*i]= z1 - z2;
1270         block[3 + 4*i]= z0 - z3;
1271     }
1272
1273     for(i=0; i<4; i++){
1274         const int z0=  block[i + 4*0]     +  block[i + 4*2];
1275         const int z1=  block[i + 4*0]     -  block[i + 4*2];
1276         const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
1277         const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
1278
1279         dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ];
1280         dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ];
1281         dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ];
1282         dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ];
1283     }
1284 #endif
1285 }
1286
1287 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
1288     int i;
1289     //FIXME try int temp instead of block
1290     
1291     for(i=0; i<4; i++){
1292         const int d0= src1[0 + i*stride] - src2[0 + i*stride];
1293         const int d1= src1[1 + i*stride] - src2[1 + i*stride];
1294         const int d2= src1[2 + i*stride] - src2[2 + i*stride];
1295         const int d3= src1[3 + i*stride] - src2[3 + i*stride];
1296         const int z0= d0 + d3;
1297         const int z3= d0 - d3;
1298         const int z1= d1 + d2;
1299         const int z2= d1 - d2;
1300         
1301         block[0 + 4*i]=   z0 +   z1;
1302         block[1 + 4*i]= 2*z3 +   z2;
1303         block[2 + 4*i]=   z0 -   z1;
1304         block[3 + 4*i]=   z3 - 2*z2;
1305     }    
1306
1307     for(i=0; i<4; i++){
1308         const int z0= block[0*4 + i] + block[3*4 + i];
1309         const int z3= block[0*4 + i] - block[3*4 + i];
1310         const int z1= block[1*4 + i] + block[2*4 + i];
1311         const int z2= block[1*4 + i] - block[2*4 + i];
1312         
1313         block[0*4 + i]=   z0 +   z1;
1314         block[1*4 + i]= 2*z3 +   z2;
1315         block[2*4 + i]=   z0 -   z1;
1316         block[3*4 + i]=   z3 - 2*z2;
1317     }
1318 }
1319
1320 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, iam not sure, its very close
1321 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1322 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1323     int i;
1324     const int * const quant_table= quant_coeff[qscale];
1325     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1326     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1327     const unsigned int threshold2= (threshold1<<1);
1328     int last_non_zero;
1329
1330     if(seperate_dc){
1331         if(qscale<=18){
1332             //avoid overflows
1333             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1334             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1335             const unsigned int dc_threshold2= (dc_threshold1<<1);
1336
1337             int level= block[0]*quant_coeff[qscale+18][0];
1338             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1339                 if(level>0){
1340                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1341                     block[0]= level;
1342                 }else{
1343                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1344                     block[0]= -level;
1345                 }
1346 //                last_non_zero = i;
1347             }else{
1348                 block[0]=0;
1349             }
1350         }else{
1351             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1352             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1353             const unsigned int dc_threshold2= (dc_threshold1<<1);
1354
1355             int level= block[0]*quant_table[0];
1356             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1357                 if(level>0){
1358                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1359                     block[0]= level;
1360                 }else{
1361                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1362                     block[0]= -level;
1363                 }
1364 //                last_non_zero = i;
1365             }else{
1366                 block[0]=0;
1367             }
1368         }
1369         last_non_zero= 0;
1370         i=1;
1371     }else{
1372         last_non_zero= -1;
1373         i=0;
1374     }
1375
1376     for(; i<16; i++){
1377         const int j= scantable[i];
1378         int level= block[j]*quant_table[j];
1379
1380 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1381 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1382         if(((unsigned)(level+threshold1))>threshold2){
1383             if(level>0){
1384                 level= (bias + level)>>QUANT_SHIFT;
1385                 block[j]= level;
1386             }else{
1387                 level= (bias - level)>>QUANT_SHIFT;
1388                 block[j]= -level;
1389             }
1390             last_non_zero = i;
1391         }else{
1392             block[j]=0;
1393         }
1394     }
1395
1396     return last_non_zero;
1397 }
1398
/**
 * vertical 4x4 prediction, the 4 samples above the block are copied into
 * each of its 4 rows.
 * @param src the top left sample of the block
 * @param topright unused
 * @param stride the distance between 2 rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    uint8_t top[4];
    int y;

    memcpy(top, src-stride, 4);
    for(y=0; y<4; y++)
        memcpy(src + y*stride, top, 4);
}
1406
/**
 * horizontal 4x4 prediction, each row is filled with the sample to its left.
 * The rows are filled bytewise; the former sample*0x01010101 was a signed
 * multiplication which overflows for samples >= 128.
 * @param src the top left sample of the block
 * @param topright unused
 * @param stride the distance between 2 rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        uint8_t * const row= src + y*stride;

        memset(row, row[-1], 4);
    }
}
1413
/**
 * dc 4x4 prediction, the block is filled with the rounded mean of the
 * 4 samples above it and the 4 samples to its left.
 * The block is filled bytewise; the former dc*0x01010101 was a signed
 * multiplication which overflows for dc >= 128.
 * @param src the top left sample of the block
 * @param topright unused
 * @param stride the distance between 2 rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int sum= 0;
    int i, dc;

    for(i=0; i<4; i++)
        sum+= src[i - stride] + src[-1 + i*stride];
    dc= (sum + 4)>>3;

    for(i=0; i<4; i++)
        memset(src + i*stride, dc, 4);
}
1423
/**
 * dc 4x4 prediction from the left edge only, the block is filled with the
 * rounded mean of the 4 samples to its left.
 * The block is filled bytewise; the former dc*0x01010101 was a signed
 * multiplication which overflows for dc >= 128.
 * @param src the top left sample of the block
 * @param topright unused
 * @param stride the distance between 2 rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int sum= 0;
    int y, dc;

    for(y=0; y<4; y++)
        sum+= src[-1 + y*stride];
    dc= (sum + 2)>>2;

    for(y=0; y<4; y++)
        memset(src + y*stride, dc, 4);
}
1432
/**
 * dc 4x4 prediction from the top edge only, the block is filled with the
 * rounded mean of the 4 samples above it.
 * The block is filled bytewise; the former dc*0x01010101 was a signed
 * multiplication which overflows for dc >= 128.
 * @param src the top left sample of the block
 * @param topright unused
 * @param stride the distance between 2 rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int sum= 0;
    int i, dc;

    for(i=0; i<4; i++)
        sum+= src[i - stride];
    dc= (sum + 2)>>2;

    for(i=0; i<4; i++)
        memset(src + i*stride, dc, 4);
}
1441
/**
 * dc 4x4 prediction without any edge, the block is filled with 128.
 * @param src the top left sample of the block
 * @param topright unused
 * @param stride the distance between 2 rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++)
        memset(src + y*stride, 128, 4);
}
1448
1449
/* declares t4..t7, the 4 samples read through topright.
   expects topright to be in scope; the empty line after the last
   backslash ends the macro */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];\

/* declares l0..l3, the samples directly left of rows 0..3 of the block.
   expects src and stride to be in scope */
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];\

/* declares t0..t3, the samples directly above columns 0..3 of the block.
   expects src and stride to be in scope */
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];\
1467
1468 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
1469     const int lt= src[-1-1*stride];
1470     LOAD_TOP_EDGE
1471     LOAD_LEFT_EDGE
1472
1473     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; 
1474     src[0+2*stride]=
1475     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; 
1476     src[0+1*stride]=
1477     src[1+2*stride]=
1478     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; 
1479     src[0+0*stride]=
1480     src[1+1*stride]=
1481     src[2+2*stride]=
1482     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; 
1483     src[1+0*stride]=
1484     src[2+1*stride]=
1485     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
1486     src[2+0*stride]=
1487     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1488     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1489 }
1490
1491 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
1492     LOAD_TOP_EDGE    
1493     LOAD_TOP_RIGHT_EDGE    
1494 //    LOAD_LEFT_EDGE    
1495
1496     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
1497     src[1+0*stride]=
1498     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
1499     src[2+0*stride]=
1500     src[1+1*stride]=
1501     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
1502     src[3+0*stride]=
1503     src[2+1*stride]=
1504     src[1+2*stride]=
1505     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
1506     src[3+1*stride]=
1507     src[2+2*stride]=
1508     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
1509     src[3+2*stride]=
1510     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
1511     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
1512 }
1513
1514 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
1515     const int lt= src[-1-1*stride];
1516     LOAD_TOP_EDGE    
1517     LOAD_LEFT_EDGE    
1518     const __attribute__((unused)) int unu= l3;
1519
1520     src[0+0*stride]=
1521     src[1+2*stride]=(lt + t0 + 1)>>1;
1522     src[1+0*stride]=
1523     src[2+2*stride]=(t0 + t1 + 1)>>1;
1524     src[2+0*stride]=
1525     src[3+2*stride]=(t1 + t2 + 1)>>1;
1526     src[3+0*stride]=(t2 + t3 + 1)>>1;
1527     src[0+1*stride]=
1528     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1529     src[1+1*stride]=
1530     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
1531     src[2+1*stride]=
1532     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1533     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1534     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1535     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1536 }
1537
1538 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
1539     LOAD_TOP_EDGE    
1540     LOAD_TOP_RIGHT_EDGE    
1541     const __attribute__((unused)) int unu= t7;
1542
1543     src[0+0*stride]=(t0 + t1 + 1)>>1;
1544     src[1+0*stride]=
1545     src[0+2*stride]=(t1 + t2 + 1)>>1;
1546     src[2+0*stride]=
1547     src[1+2*stride]=(t2 + t3 + 1)>>1;
1548     src[3+0*stride]=
1549     src[2+2*stride]=(t3 + t4+ 1)>>1;
1550     src[3+2*stride]=(t4 + t5+ 1)>>1;
1551     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1552     src[1+1*stride]=
1553     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1554     src[2+1*stride]=
1555     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
1556     src[3+1*stride]=
1557     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
1558     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
1559 }
1560
1561 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
1562     LOAD_LEFT_EDGE    
1563
1564     src[0+0*stride]=(l0 + l1 + 1)>>1;
1565     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1566     src[2+0*stride]=
1567     src[0+1*stride]=(l1 + l2 + 1)>>1;
1568     src[3+0*stride]=
1569     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1570     src[2+1*stride]=
1571     src[0+2*stride]=(l2 + l3 + 1)>>1;
1572     src[3+1*stride]=
1573     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
1574     src[3+2*stride]=
1575     src[1+3*stride]=
1576     src[0+3*stride]=
1577     src[2+2*stride]=
1578     src[2+3*stride]=
1579     src[3+3*stride]=l3;
1580 }
1581     
1582 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
1583     const int lt= src[-1-1*stride];
1584     LOAD_TOP_EDGE    
1585     LOAD_LEFT_EDGE    
1586     const __attribute__((unused)) int unu= t3;
1587
1588     src[0+0*stride]=
1589     src[2+1*stride]=(lt + l0 + 1)>>1;
1590     src[1+0*stride]=
1591     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
1592     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
1593     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1594     src[0+1*stride]=
1595     src[2+2*stride]=(l0 + l1 + 1)>>1;
1596     src[1+1*stride]=
1597     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1598     src[0+2*stride]=
1599     src[2+3*stride]=(l1 + l2+ 1)>>1;
1600     src[1+2*stride]=
1601     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1602     src[0+3*stride]=(l2 + l3 + 1)>>1;
1603     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1604 }
1605
/**
 * vertical 16x16 prediction, the 16 samples above the block are copied into
 * each of its 16 rows.
 * @param src the top left sample of the block
 * @param stride the distance between 2 rows
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    uint8_t top[16];
    int y;

    memcpy(top, src-stride, 16);
    for(y=0; y<16; y++)
        memcpy(src + y*stride, top, 16);
}
1620
/**
 * horizontal 16x16 prediction, each row is filled with the sample to its left.
 * The rows are filled bytewise; the former sample*0x01010101 was a signed
 * multiplication which overflows for samples >= 128.
 * @param src the top left sample of the block
 * @param stride the distance between 2 rows
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int y;

    for(y=0; y<16; y++){
        uint8_t * const row= src + y*stride;

        memset(row, row[-1], 16);
    }
}
1631
/**
 * 16x16 intra DC prediction from both edges: the 16 left and the 16 top
 * neighbours are averaged (with rounding) and the result fills the block.
 * Rows are written through uint32_t pointers, four words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* replicate the mean into all four bytes; done unsigned because
       0x01010101*255 overflows a signed int */
    dc= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
1652
/**
 * 16x16 intra DC prediction from the left edge only: the rounded mean of
 * the 16 left neighbours fills the block.
 * Rows are written through uint32_t pointers, four words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    /* unsigned splat, a signed multiply would overflow for means >= 128 */
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
1669
/**
 * 16x16 intra DC prediction from the top edge only: the rounded mean of
 * the 16 top neighbours fills the block.
 * Rows are written through uint32_t pointers, four words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }
    /* unsigned splat, a signed multiply would overflow for means >= 128 */
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
1685
/**
 * 16x16 intra DC prediction without neighbours: fills the whole block
 * with the constant 128.
 * Rows are written through uint32_t pointers, four words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y, w;

    for(y=0; y<16; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);
        for(w=0; w<4; w++)
            row[w]= grey;
    }
}
1696
1697 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
1698   int i, j, k;
1699   int a;
1700   uint8_t *cm = cropTbl + MAX_NEG_CROP;
1701   const uint8_t * const src0 = src+7-stride;
1702   const uint8_t *src1 = src+8*stride-1;
1703   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
1704   int H = src0[1] - src0[-1];
1705   int V = src1[0] - src2[ 0];
1706   for(k=2; k<=8; ++k) {
1707     src1 += stride; src2 -= stride;
1708     H += k*(src0[k] - src0[-k]);
1709     V += k*(src1[0] - src2[ 0]);
1710   }
1711   if(svq3){
1712     H = ( 5*(H/4) ) / 16;
1713     V = ( 5*(V/4) ) / 16;
1714
1715     /* required for 100% accuracy */
1716     i = H; H = V; V = i;
1717   }else{
1718     H = ( 5*H+32 ) >> 6;
1719     V = ( 5*V+32 ) >> 6;
1720   }
1721
1722   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
1723   for(j=16; j>0; --j) {
1724     int b = a;
1725     a += V;
1726     for(i=-16; i<0; i+=4) {
1727       src[16+i] = cm[ (b    ) >> 5 ];
1728       src[17+i] = cm[ (b+  H) >> 5 ];
1729       src[18+i] = cm[ (b+2*H) >> 5 ];
1730       src[19+i] = cm[ (b+3*H) >> 5 ];
1731       b += 4*H;
1732     }
1733     src += stride;
1734   }
1735 }
1736
/**
 * 16x16 intra plane prediction entry stored in the pred16x16 table;
 * forwards to pred16x16_plane_compat_c() with svq3 disabled.
 */
static void pred16x16_plane_c(uint8_t *src, int stride)
{
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
1740
/**
 * 8x8 intra vertical prediction: every row of the block becomes a copy of
 * the 8 samples directly above it.
 * Rows are read and written through uint32_t pointers, two words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (const uint32_t*)(src-stride);
    const uint32_t lo= above[0];
    const uint32_t hi= above[1];
    int y= 0;

    while(y<8){
        uint32_t * const row= (uint32_t*)(src+y*stride);
        row[0]= lo;
        row[1]= hi;
        y++;
    }
}
1751
/**
 * 8x8 intra horizontal prediction: every row of the block is filled with
 * the sample immediately to its left.
 * Rows are written through uint32_t pointers, two words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* unsigned constant: a sample >= 128 times 0x01010101 does not fit
           in a signed int */
        const uint32_t left= src[-1+i*stride]*0x01010101U;

        row[0]=
        row[1]= left;
    }
}
1760
/**
 * 8x8 intra DC prediction without neighbours: fills the whole block with
 * the constant 128.
 * Rows are written through uint32_t pointers, two words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y;

    /* upper and lower half receive the same value, so one pass suffices */
    for(y=0; y<8; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);
        row[0]= grey;
        row[1]= grey;
    }
}
1773
/**
 * 8x8 intra DC prediction from the left edge only.
 * The upper four rows are filled with the rounded mean of the upper four
 * left neighbours, the lower four rows with the mean of the lower four.
 * Rows are written through uint32_t pointers, two words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum2;
    uint32_t dc0, dc2;

    sum0=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride];
        sum2+= src[-1+(i+4)*stride];
    }
    /* unsigned splat, a signed multiply would overflow for means >= 128 */
    dc0= 0x01010101U*((sum0 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
1795
/**
 * 8x8 intra DC prediction from the top edge only.
 * The left four columns are filled with the rounded mean of the left four
 * top neighbours, the right four columns with the mean of the right four.
 * Rows are written through uint32_t pointers, two words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1;
    uint32_t dc0, dc1;

    sum0=sum1=0;
    for(i=0;i<4; i++){
        sum0+= src[i-stride];
        sum1+= src[4+i-stride];
    }
    /* unsigned splat, a signed multiply would overflow for means >= 128 */
    dc0= 0x01010101U*((sum0 + 2)>>2);
    dc1= 0x01010101U*((sum1 + 2)>>2);

    /* all eight rows are identical */
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
1817
1818
/**
 * 8x8 intra DC prediction using both edges, computed per 4x4 quadrant:
 *  - top-left     : mean of the upper 4 left and the left 4 top neighbours
 *  - top-right    : mean of the right 4 top neighbours
 *  - bottom-left  : mean of the lower 4 left neighbours
 *  - bottom-right : mean of the right 4 top and the lower 4 left neighbours
 * Rows are written through uint32_t pointers, two words per row.
 * @param src    top-left sample of the block
 * @param stride distance in bytes between two rows
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1, sum2;
    uint32_t dc0, dc1, dc2, dc3;

    sum0=sum1=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride] + src[i-stride];
        sum1+= src[4+i-stride];
        sum2+= src[-1+(i+4)*stride];
    }
    /* unsigned splats, a signed multiply would overflow for means >= 128 */
    dc3= 0x01010101U*((sum1 + sum2 + 4)>>3);
    dc0= 0x01010101U*((sum0 + 4)>>3);
    dc1= 0x01010101U*((sum1 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
1843
1844 static void pred8x8_plane_c(uint8_t *src, int stride){
1845   int j, k;
1846   int a;
1847   uint8_t *cm = cropTbl + MAX_NEG_CROP;
1848   const uint8_t * const src0 = src+3-stride;
1849   const uint8_t *src1 = src+4*stride-1;
1850   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
1851   int H = src0[1] - src0[-1];
1852   int V = src1[0] - src2[ 0];
1853   for(k=2; k<=4; ++k) {
1854     src1 += stride; src2 -= stride;
1855     H += k*(src0[k] - src0[-k]);
1856     V += k*(src1[0] - src2[ 0]);
1857   }
1858   H = ( 17*H+16 ) >> 5;
1859   V = ( 17*V+16 ) >> 5;
1860
1861   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
1862   for(j=8; j>0; --j) {
1863     int b = a;
1864     a += V;
1865     src[0] = cm[ (b    ) >> 5 ];
1866     src[1] = cm[ (b+  H) >> 5 ];
1867     src[2] = cm[ (b+2*H) >> 5 ];
1868     src[3] = cm[ (b+3*H) >> 5 ];
1869     src[4] = cm[ (b+4*H) >> 5 ];
1870     src[5] = cm[ (b+5*H) >> 5 ];
1871     src[6] = cm[ (b+6*H) >> 5 ];
1872     src[7] = cm[ (b+7*H) >> 5 ];
1873     src += stride;
1874   }
1875 }
1876
1877 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1878                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1879                            int src_x_offset, int src_y_offset,
1880                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1881     MpegEncContext * const s = &h->s;
1882     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1883     const int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1884     const int luma_xy= (mx&3) + ((my&3)<<2);
1885     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*s->linesize;
1886     uint8_t * src_cb= pic->data[1] + (mx>>3) + (my>>3)*s->uvlinesize;
1887     uint8_t * src_cr= pic->data[2] + (mx>>3) + (my>>3)*s->uvlinesize;
1888     int extra_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; //FIXME increase edge?, IMHO not worth it
1889     int extra_height= extra_width;
1890     int emu=0;
1891     const int full_mx= mx>>2;
1892     const int full_my= my>>2;
1893     
1894     assert(pic->data[0]);
1895     
1896     if(mx&7) extra_width -= 3;
1897     if(my&7) extra_height -= 3;
1898     
1899     if(   full_mx < 0-extra_width 
1900        || full_my < 0-extra_height 
1901        || full_mx + 16/*FIXME*/ > s->width + extra_width 
1902        || full_my + 16/*FIXME*/ > s->height + extra_height){
1903         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*s->linesize, s->linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, s->width, s->height);
1904             src_y= s->edge_emu_buffer + 2 + 2*s->linesize;
1905         emu=1;
1906     }
1907     
1908     qpix_op[luma_xy](dest_y, src_y, s->linesize); //FIXME try variable height perhaps?
1909     if(!square){
1910         qpix_op[luma_xy](dest_y + delta, src_y + delta, s->linesize);
1911     }
1912     
1913     if(s->flags&CODEC_FLAG_GRAY) return;
1914     
1915     if(emu){
1916         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
1917             src_cb= s->edge_emu_buffer;
1918     }
1919     chroma_op(dest_cb, src_cb, s->uvlinesize, chroma_height, mx&7, my&7);
1920
1921     if(emu){
1922         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
1923             src_cr= s->edge_emu_buffer;
1924     }
1925     chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
1926 }
1927
1928 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1929                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1930                            int x_offset, int y_offset,
1931                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1932                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1933                            int list0, int list1){
1934     MpegEncContext * const s = &h->s;
1935     qpel_mc_func *qpix_op=  qpix_put;
1936     h264_chroma_mc_func chroma_op= chroma_put;
1937     
1938     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
1939     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
1940     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
1941     x_offset += 8*s->mb_x;
1942     y_offset += 8*s->mb_y;
1943     
1944     if(list0){
1945         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1946         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1947                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1948                            qpix_op, chroma_op);
1949
1950         qpix_op=  qpix_avg;
1951         chroma_op= chroma_avg;
1952     }
1953
1954     if(list1){
1955         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1956         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1957                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1958                            qpix_op, chroma_op);
1959     }
1960 }
1961
1962 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1963                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1964                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg)){
1965     MpegEncContext * const s = &h->s;
1966     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1967     const int mb_type= s->current_picture.mb_type[mb_xy];
1968     
1969     assert(IS_INTER(mb_type));
1970     
1971     if(IS_16X16(mb_type)){
1972         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1973                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1974                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1975     }else if(IS_16X8(mb_type)){
1976         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1977                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1978                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1979         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1980                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1981                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1982     }else if(IS_8X16(mb_type)){
1983         mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
1984                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1985                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1986         mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
1987                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1988                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1989     }else{
1990         int i;
1991         
1992         assert(IS_8X8(mb_type));
1993
1994         for(i=0; i<4; i++){
1995             const int sub_mb_type= h->sub_mb_type[i];
1996             const int n= 4*i;
1997             int x_offset= (i&1)<<2;
1998             int y_offset= (i&2)<<1;
1999
2000             if(IS_SUB_8X8(sub_mb_type)){
2001                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2002                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2003                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2004             }else if(IS_SUB_8X4(sub_mb_type)){
2005                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2006                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2007                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2008                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2009                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2010                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2011             }else if(IS_SUB_4X8(sub_mb_type)){
2012                 mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2013                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2014                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2015                 mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2016                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2017                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2018             }else{
2019                 int j;
2020                 assert(IS_SUB_4X4(sub_mb_type));
2021                 for(j=0; j<4; j++){
2022                     int sub_x_offset= x_offset + 2*(j&1);
2023                     int sub_y_offset= y_offset +   (j&2);
2024                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2025                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2026                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2027                 }
2028             }
2029         }
2030     }
2031 }
2032
2033 static void decode_init_vlc(H264Context *h){
2034     static int done = 0;
2035
2036     if (!done) {
2037         int i;
2038         done = 1;
2039
2040         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5, 
2041                  &chroma_dc_coeff_token_len [0], 1, 1,
2042                  &chroma_dc_coeff_token_bits[0], 1, 1);
2043
2044         for(i=0; i<4; i++){
2045             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17, 
2046                      &coeff_token_len [i][0], 1, 1,
2047                      &coeff_token_bits[i][0], 1, 1);
2048         }
2049
2050         for(i=0; i<3; i++){
2051             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2052                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2053                      &chroma_dc_total_zeros_bits[i][0], 1, 1);
2054         }
2055         for(i=0; i<15; i++){
2056             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16, 
2057                      &total_zeros_len [i][0], 1, 1,
2058                      &total_zeros_bits[i][0], 1, 1);
2059         }
2060
2061         for(i=0; i<6; i++){
2062             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7, 
2063                      &run_len [i][0], 1, 1,
2064                      &run_bits[i][0], 1, 1);
2065         }
2066         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16, 
2067                  &run_len [6][0], 1, 1,
2068                  &run_bits[6][0], 1, 1);
2069     }
2070 }
2071
2072 /**
2073  * Sets the intra prediction function pointers.
2074  */
2075 static void init_pred_ptrs(H264Context *h){
2076 //    MpegEncContext * const s = &h->s;
2077
2078     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2079     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2080     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2081     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2082     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2083     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2084     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2085     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2086     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2087     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2088     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2089     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2090
2091     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
2092     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
2093     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
2094     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
2095     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
2096     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
2097     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
2098
2099     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
2100     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
2101     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
2102     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
2103     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
2104     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
2105     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
2106 }
2107
2108 static void free_tables(H264Context *h){
2109     av_freep(&h->intra4x4_pred_mode);
2110     av_freep(&h->non_zero_count);
2111     av_freep(&h->slice_table_base);
2112     h->slice_table= NULL;
2113     
2114     av_freep(&h->mb2b_xy);
2115     av_freep(&h->mb2b8_xy);
2116 }
2117
2118 /**
2119  * allocates tables.
2120  * needs widzh/height
2121  */
2122 static int alloc_tables(H264Context *h){
2123     MpegEncContext * const s = &h->s;
2124     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2125     int x,y;
2126
2127     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2128     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2129     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
2130
2131     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
2132     h->slice_table= h->slice_table_base + s->mb_stride + 1;
2133
2134     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint16_t));
2135     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint16_t));
2136     for(y=0; y<s->mb_height; y++){
2137         for(x=0; x<s->mb_width; x++){
2138             const int mb_xy= x + y*s->mb_stride;
2139             const int b_xy = 4*x + 4*y*h->b_stride;
2140             const int b8_xy= 2*x + 2*y*h->b8_stride;
2141         
2142             h->mb2b_xy [mb_xy]= b_xy;
2143             h->mb2b8_xy[mb_xy]= b8_xy;
2144         }
2145     }
2146     
2147     return 0;
2148 fail:
2149     free_tables(h);
2150     return -1;
2151 }
2152
2153 static void common_init(H264Context *h){
2154     MpegEncContext * const s = &h->s;
2155
2156     s->width = s->avctx->width;
2157     s->height = s->avctx->height;
2158     s->codec_id= s->avctx->codec->id;
2159     
2160     init_pred_ptrs(h);
2161
2162     s->decode=1; //FIXME
2163 }
2164
2165 static int decode_init(AVCodecContext *avctx){
2166     H264Context *h= avctx->priv_data;
2167     MpegEncContext * const s = &h->s;
2168
2169     s->avctx = avctx;
2170     common_init(h);
2171
2172     s->out_format = FMT_H264;
2173     s->workaround_bugs= avctx->workaround_bugs;
2174
2175     // set defaults
2176     s->progressive_sequence=1;
2177 //    s->decode_mb= ff_h263_decode_mb;
2178     s->low_delay= 1;
2179     avctx->pix_fmt= PIX_FMT_YUV420P;
2180
2181     decode_init_vlc(h);
2182     
2183     return 0;
2184 }
2185
2186 static void frame_start(H264Context *h){
2187     MpegEncContext * const s = &h->s;
2188     int i;
2189
2190     MPV_frame_start(s, s->avctx);
2191     ff_er_frame_start(s);
2192     h->mmco_index=0;
2193
2194     assert(s->linesize && s->uvlinesize);
2195
2196     for(i=0; i<16; i++){
2197         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2198         h->chroma_subblock_offset[i]= 2*((scan8[i] - scan8[0])&7) + 2*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2199     }
2200     for(i=0; i<4; i++){
2201         h->block_offset[16+i]=
2202         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2203     }
2204
2205 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2206 }
2207
2208 static void hl_decode_mb(H264Context *h){
2209     MpegEncContext * const s = &h->s;
2210     const int mb_x= s->mb_x;
2211     const int mb_y= s->mb_y;
2212     const int mb_xy= mb_x + mb_y*s->mb_stride;
2213     const int mb_type= s->current_picture.mb_type[mb_xy];
2214     uint8_t  *dest_y, *dest_cb, *dest_cr;
2215     int linesize, uvlinesize /*dct_offset*/;
2216     int i;
2217
2218     if(!s->decode)
2219         return;
2220
2221     if(s->mb_skiped){
2222     }
2223
2224     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2225     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2226     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2227
2228     if (h->mb_field_decoding_flag) {
2229         linesize = s->linesize * 2;
2230         uvlinesize = s->uvlinesize * 2;
2231         if(mb_y&1){ //FIXME move out of this func?
2232             dest_y -= s->linesize*15;
2233             dest_cb-= s->linesize*7;
2234             dest_cr-= s->linesize*7;
2235         }
2236     } else {
2237         linesize = s->linesize;
2238         uvlinesize = s->uvlinesize;
2239 //        dct_offset = s->linesize * 16;
2240     }
2241
2242     if(IS_INTRA(mb_type)){
2243         if(!(s->flags&CODEC_FLAG_GRAY)){
2244             h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2245             h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2246         }
2247
2248         if(IS_INTRA4x4(mb_type)){
2249             if(!s->encoding){
2250                 for(i=0; i<16; i++){
2251                     uint8_t * const ptr= dest_y + h->block_offset[i];
2252                     uint8_t *topright= ptr + 4 - linesize;
2253                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2254                     const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2255                     int tr;
2256
2257                     if(!topright_avail){
2258                         tr= ptr[3 - linesize]*0x01010101;
2259                         topright= (uint8_t*) &tr;
2260                     }
2261
2262                     h->pred4x4[ dir ](ptr, topright, linesize);
2263                     if(h->non_zero_count_cache[ scan8[i] ]){
2264                         if(s->codec_id == CODEC_ID_H264)
2265                             h264_add_idct_c(ptr, h->mb + i*16, linesize);
2266                         else
2267                             svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2268                     }
2269                 }
2270             }
2271         }else{
2272             h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2273             if(s->codec_id == CODEC_ID_H264)
2274                 h264_luma_dc_dequant_idct_c(h->mb, s->qscale);
2275             else
2276                 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2277         }
2278     }else if(s->codec_id == CODEC_ID_H264){
2279         hl_motion(h, dest_y, dest_cb, dest_cr,
2280                   s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab, 
2281                   s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab);
2282     }
2283
2284
2285     if(!IS_INTRA4x4(mb_type)){
2286         if(s->codec_id == CODEC_ID_H264){
2287             for(i=0; i<16; i++){
2288                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2289                     uint8_t * const ptr= dest_y + h->block_offset[i];
2290                     h264_add_idct_c(ptr, h->mb + i*16, linesize);
2291                 }
2292             }
2293         }else{
2294             for(i=0; i<16; i++){
2295                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2296                     uint8_t * const ptr= dest_y + h->block_offset[i];
2297                     svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2298                 }
2299             }
2300         }
2301     }
2302
2303     if(!(s->flags&CODEC_FLAG_GRAY)){
2304         chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp);
2305         chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp);
2306         if(s->codec_id == CODEC_ID_H264){
2307             for(i=16; i<16+4; i++){
2308                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2309                     uint8_t * const ptr= dest_cb + h->block_offset[i];
2310                     h264_add_idct_c(ptr, h->mb + i*16, uvlinesize);
2311                 }
2312             }
2313             for(i=20; i<20+4; i++){
2314                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2315                     uint8_t * const ptr= dest_cr + h->block_offset[i];
2316                     h264_add_idct_c(ptr, h->mb + i*16, uvlinesize);
2317                 }
2318             }
2319         }else{
2320             for(i=16; i<16+4; i++){
2321                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2322                     uint8_t * const ptr= dest_cb + h->block_offset[i];
2323                     svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2324                 }
2325             }
2326             for(i=20; i<20+4; i++){
2327                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2328                     uint8_t * const ptr= dest_cr + h->block_offset[i];
2329                     svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2330                 }
2331             }
2332         }
2333     }
2334 }
2335
2336 static void decode_mb_cabac(H264Context *h){
2337 //    MpegEncContext * const s = &h->s;
2338 }
2339
2340 /**
2341  * fills the default_ref_list.
2342  */
2343 static int fill_default_ref_list(H264Context *h){
2344     MpegEncContext * const s = &h->s;
2345     int i;
2346     Picture sorted_short_ref[16];
2347     
2348     if(h->slice_type==B_TYPE){
2349         int out_i;
2350         int limit= -1;
2351
2352         for(out_i=0; out_i<h->short_ref_count; out_i++){
2353             int best_i=-1;
2354             int best_poc=-1;
2355
2356             for(i=0; i<h->short_ref_count; i++){
2357                 const int poc= h->short_ref[i]->poc;
2358                 if(poc > limit && poc < best_poc){
2359                     best_poc= poc;
2360                     best_i= i;
2361                 }
2362             }
2363             
2364             assert(best_i != -1);
2365             
2366             limit= best_poc;
2367             sorted_short_ref[out_i]= *h->short_ref[best_i];
2368         }
2369     }
2370
2371     if(s->picture_structure == PICT_FRAME){
2372         if(h->slice_type==B_TYPE){
2373             const int current_poc= s->current_picture_ptr->poc;
2374             int list;
2375
2376             for(list=0; list<2; list++){
2377                 int index=0;
2378
2379                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++){
2380                     const int i2= list ? h->short_ref_count - i - 1 : i;
2381                     const int poc= sorted_short_ref[i2].poc;
2382                     
2383                     if(sorted_short_ref[i2].reference != 3) continue; //FIXME refernce field shit
2384
2385                     if((list==1 && poc > current_poc) || (list==0 && poc < current_poc)){
2386                         h->default_ref_list[list][index  ]= sorted_short_ref[i2];
2387                         h->default_ref_list[list][index++].pic_id= sorted_short_ref[i2].frame_num;
2388                     }
2389                 }
2390
2391                 for(i=0; i<h->long_ref_count && index < h->ref_count[ list ]; i++){
2392                     if(h->long_ref[i]->reference != 3) continue;
2393
2394                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
2395                     h->default_ref_list[ list ][index++].pic_id= i;;
2396                 }
2397                 
2398                 if(h->long_ref_count > 1 && h->short_ref_count==0){
2399                     Picture temp= h->default_ref_list[1][0];
2400                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
2401                     h->default_ref_list[1][0] = temp;
2402                 }
2403
2404                 if(index < h->ref_count[ list ])
2405                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
2406             }
2407         }else{
2408             int index=0;
2409             for(i=0; i<h->short_ref_count && index < h->ref_count[0]; i++){
2410                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
2411                 h->default_ref_list[0][index  ]= *h->short_ref[i];
2412                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2413             }
2414             for(i=0; i<h->long_ref_count && index < h->ref_count[0]; i++){
2415                 if(h->long_ref[i]->reference != 3) continue;
2416                 h->default_ref_list[0][index  ]= *h->long_ref[i];
2417                 h->default_ref_list[0][index++].pic_id= i;;
2418             }
2419             if(index < h->ref_count[0])
2420                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2421         }
2422     }else{ //FIELD
2423         if(h->slice_type==B_TYPE){
2424         }else{
2425             //FIXME second field balh
2426         }
2427     }
2428     return 0;
2429 }
2430
2431 static int decode_ref_pic_list_reordering(H264Context *h){
2432     MpegEncContext * const s = &h->s;
2433     int list;
2434     
2435     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move beofre func
2436     
2437     for(list=0; list<2; list++){
2438         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2439
2440         if(get_bits1(&s->gb)){
2441             int pred= h->curr_pic_num;
2442             int index;
2443
2444             for(index=0; ; index++){
2445                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2446                 int pic_id;
2447                 int i;
2448                 
2449                 
2450                 if(index >= h->ref_count[list]){
2451                     fprintf(stderr, "reference count overflow\n");
2452                     return -1;
2453                 }
2454                 
2455                 if(reordering_of_pic_nums_idc<3){
2456                     if(reordering_of_pic_nums_idc<2){
2457                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2458
2459                         if(abs_diff_pic_num >= h->max_pic_num){
2460                             fprintf(stderr, "abs_diff_pic_num overflow\n");
2461                             return -1;
2462                         }
2463
2464                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2465                         else                                pred+= abs_diff_pic_num;
2466                         pred &= h->max_pic_num - 1;
2467                     
2468                         for(i= h->ref_count[list]-1; i>=index; i--){
2469                             if(h->ref_list[list][i].pic_id == pred && h->ref_list[list][i].long_ref==0)
2470                                 break;
2471                         }
2472                     }else{
2473                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2474
2475                         for(i= h->ref_count[list]-1; i>=index; i--){
2476                             if(h->ref_list[list][i].pic_id == pic_id && h->ref_list[list][i].long_ref==1)
2477                                 break;
2478                         }
2479                     }
2480
2481                     if(i < index){
2482                         fprintf(stderr, "reference picture missing during reorder\n");
2483                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2484                     }else if(i > index){
2485                         Picture tmp= h->ref_list[list][i];
2486                         for(; i>index; i--){
2487                             h->ref_list[list][i]= h->ref_list[list][i-1];
2488                         }
2489                         h->ref_list[list][index]= tmp;
2490                     }
2491                 }else if(reordering_of_pic_nums_idc==3) 
2492                     break;
2493                 else{
2494                     fprintf(stderr, "illegal reordering_of_pic_nums_idc\n");
2495                     return -1;
2496                 }
2497             }
2498         }
2499
2500         if(h->slice_type!=B_TYPE) break;
2501     }
2502     return 0;    
2503 }
2504
2505 static int pred_weight_table(H264Context *h){
2506     MpegEncContext * const s = &h->s;
2507     int list, i;
2508     
2509     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2510     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2511
2512     for(list=0; list<2; list++){
2513         for(i=0; i<h->ref_count[list]; i++){
2514             int luma_weight_flag, chroma_weight_flag;
2515             
2516             luma_weight_flag= get_bits1(&s->gb);
2517             if(luma_weight_flag){
2518                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2519                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2520             }
2521
2522             chroma_weight_flag= get_bits1(&s->gb);
2523             if(chroma_weight_flag){
2524                 int j;
2525                 for(j=0; j<2; j++){
2526                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2527                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2528                 }
2529             }
2530         }
2531         if(h->slice_type != B_TYPE) break;
2532     }
2533     return 0;
2534 }
2535
2536 /**
2537  * instantaneos decoder refresh.
2538  */
2539 static void idr(H264Context *h){
2540     int i;
2541
2542     for(i=0; i<h->long_ref_count; i++){
2543         h->long_ref[i]->reference=0;
2544         h->long_ref[i]= NULL;
2545     }
2546     h->long_ref_count=0;
2547
2548     for(i=0; i<h->short_ref_count; i++){
2549         h->short_ref[i]->reference=0;
2550         h->short_ref[i]= NULL;
2551     }
2552     h->short_ref_count=0;
2553 }
2554
2555 /**
2556  *
2557  * @return the removed picture or NULL if an error occures
2558  */
2559 static Picture * remove_short(H264Context *h, int frame_num){
2560     MpegEncContext * const s = &h->s;
2561     int i;
2562     
2563     if(s->avctx->debug&FF_DEBUG_MMCO)
2564         printf("remove short %d count %d\n", frame_num, h->short_ref_count);
2565     
2566     for(i=0; i<h->short_ref_count; i++){
2567         Picture *pic= h->short_ref[i];
2568         if(s->avctx->debug&FF_DEBUG_MMCO)
2569             printf("%d %d %p\n", i, pic->frame_num, pic);
2570         if(pic->frame_num == frame_num){
2571             h->short_ref[i]= NULL;
2572             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
2573             h->short_ref_count--;
2574             return pic;
2575         }
2576     }
2577     return NULL;
2578 }
2579
2580 /**
2581  *
2582  * @return the removed picture or NULL if an error occures
2583  */
2584 static Picture * remove_long(H264Context *h, int i){
2585     Picture *pic;
2586
2587     if(i >= h->long_ref_count) return NULL;
2588     pic= h->long_ref[i];
2589     if(pic==NULL) return NULL;
2590     
2591     h->long_ref[i]= NULL;
2592     memmove(&h->long_ref[i], &h->long_ref[i+1], (h->long_ref_count - i - 1)*sizeof(Picture*));
2593     h->long_ref_count--;
2594
2595     return pic;
2596 }
2597
2598 /**
2599  * Executes the reference picture marking (memory management control operations).
2600  */
2601 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
2602     MpegEncContext * const s = &h->s;
2603     int i;
2604     int current_is_long=0;
2605     Picture *pic;
2606     
2607     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
2608         printf("no mmco here\n");
2609         
2610     for(i=0; i<mmco_count; i++){
2611         if(s->avctx->debug&FF_DEBUG_MMCO)
2612             printf("mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
2613
2614         switch(mmco[i].opcode){
2615         case MMCO_SHORT2UNUSED:
2616             pic= remove_short(h, mmco[i].short_frame_num);
2617             if(pic==NULL) return -1;
2618             pic->reference= 0;
2619             break;
2620         case MMCO_SHORT2LONG:
2621             pic= remove_long(h, mmco[i].long_index);
2622             if(pic) pic->reference=0;
2623             
2624             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
2625             h->long_ref[ mmco[i].long_index ]->long_ref=1;
2626             break;
2627         case MMCO_LONG2UNUSED:
2628             pic= remove_long(h, mmco[i].long_index);
2629             if(pic==NULL) return -1;
2630             pic->reference= 0;
2631             break;
2632         case MMCO_LONG:
2633             pic= remove_long(h, mmco[i].long_index);
2634             if(pic) pic->reference=0;
2635             
2636             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
2637             h->long_ref[ mmco[i].long_index ]->long_ref=1;
2638             h->long_ref_count++;
2639             
2640             current_is_long=1;
2641             break;
2642         case MMCO_SET_MAX_LONG:
2643             assert(mmco[i].long_index <= 16);
2644             while(mmco[i].long_index < h->long_ref_count){
2645                 pic= remove_long(h, mmco[i].long_index);
2646                 pic->reference=0;
2647             }
2648             while(mmco[i].long_index > h->long_ref_count){
2649                 h->long_ref[ h->long_ref_count++ ]= NULL;
2650             }
2651             break;
2652         case MMCO_RESET:
2653             while(h->short_ref_count){
2654                 pic= remove_short(h, h->short_ref[0]->frame_num);
2655                 pic->reference=0;
2656             }
2657             while(h->long_ref_count){
2658                 pic= remove_long(h, h->long_ref_count-1);
2659                 pic->reference=0;
2660             }
2661             break;
2662         default: assert(0);
2663         }
2664     }
2665     
2666     if(!current_is_long){
2667         pic= remove_short(h, s->current_picture_ptr->frame_num);
2668         if(pic){
2669             pic->reference=0;
2670             fprintf(stderr, "illegal short term buffer state detected\n");
2671         }
2672         
2673         if(h->short_ref_count)
2674             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
2675
2676         h->short_ref[0]= s->current_picture_ptr;
2677         h->short_ref[0]->long_ref=0;
2678         h->short_ref_count++;
2679     }
2680     
2681     return 0; 
2682 }
2683
2684 static int decode_ref_pic_marking(H264Context *h){
2685     MpegEncContext * const s = &h->s;
2686     int i;
2687     
2688     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
2689         s->broken_link= get_bits1(&s->gb) -1;
2690         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
2691         if(h->mmco[0].long_index == -1)
2692             h->mmco_index= 0;
2693         else{
2694             h->mmco[0].opcode= MMCO_LONG;
2695             h->mmco_index= 1;
2696         } 
2697     }else{
2698         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
2699             for(i= h->mmco_index; i<MAX_MMCO_COUNT; i++) { 
2700                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
2701
2702                 h->mmco[i].opcode= opcode;
2703                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
2704                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
2705 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
2706                         fprintf(stderr, "illegal short ref in memory management control operation %d\n", mmco);
2707                         return -1;
2708                     }*/
2709                 }
2710                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
2711                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
2712                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
2713                         fprintf(stderr, "illegal long ref in memory management control operation %d\n", opcode);
2714                         return -1;
2715                     }
2716                 }
2717                     
2718                 if(opcode > MMCO_LONG){
2719                     fprintf(stderr, "illegal memory management control operation %d\n", opcode);
2720                     return -1;
2721                 }
2722             }
2723             h->mmco_index= i;
2724         }else{
2725             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
2726
2727             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
2728                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
2729                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
2730                 h->mmco_index= 1;
2731             }else
2732                 h->mmco_index= 0;
2733         }
2734     }
2735     
2736     return 0; 
2737 }
2738
2739 static int init_poc(H264Context *h){
2740     MpegEncContext * const s = &h->s;
2741     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
2742     int field_poc[2];
2743
2744     if(h->nal_unit_type == NAL_IDR_SLICE){
2745         h->frame_num_offset= 0;
2746     }else{
2747         if(h->frame_num < h->prev_frame_num)
2748             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
2749         else
2750             h->frame_num_offset= h->prev_frame_num_offset;
2751     }
2752
2753     if(h->sps.poc_type==0){
2754         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
2755
2756         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
2757             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
2758         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
2759             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
2760         else
2761             h->poc_msb = h->prev_poc_msb;
2762 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
2763         field_poc[0] = 
2764         field_poc[1] = h->poc_msb + h->poc_lsb;
2765         if(s->picture_structure == PICT_FRAME) 
2766             field_poc[1] += h->delta_poc_bottom;
2767     }else if(h->sps.poc_type==1){
2768         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
2769         int i;
2770
2771         if(h->sps.poc_cycle_length != 0)
2772             abs_frame_num = h->frame_num_offset + h->frame_num;
2773         else
2774             abs_frame_num = 0;
2775
2776         if(h->nal_ref_idc==0 && abs_frame_num > 0)
2777             abs_frame_num--;
2778             
2779         expected_delta_per_poc_cycle = 0;
2780         for(i=0; i < h->sps.poc_cycle_length; i++)
2781             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
2782
2783         if(abs_frame_num > 0){
2784             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
2785             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
2786
2787             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
2788             for(i = 0; i <= frame_num_in_poc_cycle; i++)
2789                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
2790         } else
2791             expectedpoc = 0;
2792
2793         if(h->nal_ref_idc == 0) 
2794             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
2795         
2796         field_poc[0] = expectedpoc + h->delta_poc[0];
2797         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
2798
2799         if(s->picture_structure == PICT_FRAME)
2800             field_poc[1] += h->delta_poc[1];
2801     }else{
2802         int poc;
2803         if(h->nal_unit_type == NAL_IDR_SLICE){
2804             poc= 0;
2805         }else{
2806             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
2807             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
2808         }
2809         field_poc[0]= poc;
2810         field_poc[1]= poc;
2811     }
2812     
2813     if(s->picture_structure != PICT_BOTTOM_FIELD)
2814         s->current_picture_ptr->field_poc[0]= field_poc[0];
2815     if(s->picture_structure != PICT_TOP_FIELD)
2816         s->current_picture_ptr->field_poc[1]= field_poc[1];
2817     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
2818         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
2819
2820     return 0;
2821 }
2822
2823 /**
2824  * decodes a slice header.
2825  * this will allso call MPV_common_init() and frame_start() as needed
2826  */
2827 static int decode_slice_header(H264Context *h){
2828     MpegEncContext * const s = &h->s;
2829     int first_mb_in_slice, pps_id;
2830     int num_ref_idx_active_override_flag;
2831     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
2832     float new_aspect;
2833
2834     s->current_picture.reference= h->nal_ref_idc != 0;
2835
2836     first_mb_in_slice= get_ue_golomb(&s->gb);
2837
2838     h->slice_type= get_ue_golomb(&s->gb);
2839     if(h->slice_type > 9){
2840         fprintf(stderr, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
2841     }
2842     if(h->slice_type > 4){
2843         h->slice_type -= 5;
2844         h->slice_type_fixed=1;
2845     }else
2846         h->slice_type_fixed=0;
2847     
2848     h->slice_type= slice_type_map[ h->slice_type ];
2849     
2850     s->pict_type= h->slice_type; // to make a few old func happy, its wrong though
2851         
2852     pps_id= get_ue_golomb(&s->gb);
2853     if(pps_id>255){
2854         fprintf(stderr, "pps_id out of range\n");
2855         return -1;
2856     }
2857     h->pps= h->pps_buffer[pps_id];
2858     if(h->pps.slice_group_count == 0){
2859         fprintf(stderr, "non existing PPS referenced\n");
2860         return -1;
2861     }
2862
2863     h->sps= h->sps_buffer[ h->pps.sps_id ];
2864     if(h->sps.log2_max_frame_num == 0){
2865         fprintf(stderr, "non existing SPS referenced\n");
2866         return -1;
2867     }
2868     
2869     s->mb_width= h->sps.mb_width;
2870     s->mb_height= h->sps.mb_height;
2871     
2872     h->b_stride=  s->mb_width*4;
2873     h->b8_stride= s->mb_width*2;
2874
2875     s->mb_x = first_mb_in_slice % s->mb_width;
2876     s->mb_y = first_mb_in_slice / s->mb_width; //FIXME AFFW
2877     
2878     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
2879     if(h->sps.frame_mbs_only_flag)
2880         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
2881     else
2882         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
2883     
2884     if(s->aspected_height) //FIXME emms at end of slice ?
2885         new_aspect= h->sps.sar_width*s->width / (float)(s->height*h->sps.sar_height);
2886     else
2887         new_aspect=0;
2888
2889     if (s->context_initialized 
2890         && (   s->width != s->avctx->width || s->height != s->avctx->height 
2891             || ABS(new_aspect - s->avctx->aspect_ratio) > 0.001)) {
2892         free_tables(h);
2893         MPV_common_end(s);
2894     }
2895     if (!s->context_initialized) {
2896         if (MPV_common_init(s) < 0)
2897             return -1;
2898
2899         alloc_tables(h);
2900
2901         s->avctx->width = s->width;
2902         s->avctx->height = s->height;
2903         s->avctx->aspect_ratio= new_aspect;
2904     }
2905
2906     if(first_mb_in_slice == 0){
2907         frame_start(h);
2908     }
2909
2910     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
2911     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
2912
2913     if(h->sps.frame_mbs_only_flag){
2914         s->picture_structure= PICT_FRAME;
2915     }else{
2916         if(get_bits1(&s->gb)) //field_pic_flag
2917             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
2918         else
2919             s->picture_structure= PICT_FRAME;
2920     }
2921
2922     if(s->picture_structure==PICT_FRAME){
2923         h->curr_pic_num=   h->frame_num;
2924         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
2925     }else{
2926         h->curr_pic_num= 2*h->frame_num;
2927         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
2928     }
2929         
2930     if(h->nal_unit_type == NAL_IDR_SLICE){
2931         get_ue_golomb(&s->gb); /* idr_pic_id */
2932     }
2933    
2934     if(h->sps.poc_type==0){
2935         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
2936         
2937         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
2938             h->delta_poc_bottom= get_se_golomb(&s->gb);
2939         }
2940     }
2941     
2942     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
2943         h->delta_poc[0]= get_se_golomb(&s->gb);
2944         
2945         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
2946             h->delta_poc[1]= get_se_golomb(&s->gb);
2947     }
2948     
2949     init_poc(h);
2950     
2951     if(h->pps.redundant_pic_cnt_present){
2952         h->redundant_pic_count= get_ue_golomb(&s->gb);
2953     }
2954
2955     //set defaults, might be overriden a few line later
2956     h->ref_count[0]= h->pps.ref_count[0];
2957     h->ref_count[1]= h->pps.ref_count[1];
2958
2959     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
2960         if(h->slice_type == B_TYPE){
2961             h->direct_spatial_mv_pred= get_bits1(&s->gb);
2962         }
2963         num_ref_idx_active_override_flag= get_bits1(&s->gb);
2964     
2965         if(num_ref_idx_active_override_flag){
2966             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
2967             if(h->slice_type==B_TYPE)
2968                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
2969
2970             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
2971                 fprintf(stderr, "reference overflow\n");
2972                 return -1;
2973             }
2974         }
2975     }
2976
2977     if(first_mb_in_slice == 0){
2978         fill_default_ref_list(h);
2979     }
2980
2981     decode_ref_pic_list_reordering(h);
2982
2983     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE )) 
2984        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
2985         pred_weight_table(h);
2986     
2987     if(s->current_picture.reference)
2988         decode_ref_pic_marking(h);
2989     //FIXME CABAC stuff
2990
2991     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb); //slice_qp_delta
2992     //FIXME qscale / qp ... stuff
2993     if(h->slice_type == SP_TYPE){
2994         get_bits1(&s->gb); /* sp_for_switch_flag */
2995     }
2996     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
2997         get_se_golomb(&s->gb); /* slice_qs_delta */
2998     }
2999
3000     if( h->pps.deblocking_filter_parameters_present ) {
3001         h->disable_deblocking_filter_idc= get_ue_golomb(&s->gb);
3002         if( h->disable_deblocking_filter_idc  !=  1 ) {
3003             h->slice_alpha_c0_offset_div2= get_se_golomb(&s->gb);
3004             h->slice_beta_offset_div2= get_se_golomb(&s->gb);
3005         }
3006     }else
3007         h->disable_deblocking_filter_idc= 0;
3008
3009 #if 0 //FMO
3010     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3011         slice_group_change_cycle= get_bits(&s->gb, ?);
3012 #endif
3013
3014     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3015         printf("mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d\n", 
3016                first_mb_in_slice, 
3017                av_get_pict_type_char(h->slice_type),
3018                pps_id, h->frame_num,
3019                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
3020                h->ref_count[0], h->ref_count[1],
3021                s->qscale,
3022                h->disable_deblocking_filter_idc
3023                );
3024     }
3025
3026     return 0;
3027 }
3028
3029 /**
3030  *
3031  */
3032 static inline int get_level_prefix(GetBitContext *gb){
3033     unsigned int buf;
3034     int log;
3035     
3036     OPEN_READER(re, gb);
3037     UPDATE_CACHE(re, gb);
3038     buf=GET_CACHE(re, gb);
3039     
3040     log= 32 - av_log2(buf);
3041 #ifdef TRACE
3042     print_bin(buf>>(32-log), log);
3043     printf("%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
3044 #endif
3045
3046     LAST_SKIP_BITS(re, gb, log);
3047     CLOSE_READER(re, gb);
3048
3049     return log-1;
3050 }
3051
3052 /**
3053  * decodes a residual block.
3054  * @param n block index
3055  * @param scantable scantable
3056  * @param max_coeff number of coefficients in the block
3057  * @return <0 if an error occured
3058  */
3059 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, int qp, int max_coeff){
3060     MpegEncContext * const s = &h->s;
3061     const uint16_t *qmul= dequant_coeff[qp];
3062     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
3063     int level[16], run[16];
3064     int suffix_length, zeros_left, coeff_num, coeff_token, total_coeff, i, trailing_ones;
3065
3066     //FIXME put trailing_onex into the context
3067
3068     if(n == CHROMA_DC_BLOCK_INDEX){
3069         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
3070         total_coeff= coeff_token>>2;
3071     }else{    
3072         if(n == LUMA_DC_BLOCK_INDEX){
3073             total_coeff= pred_non_zero_count(h, 0);
3074             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
3075             total_coeff= coeff_token>>2;
3076         }else{
3077             total_coeff= pred_non_zero_count(h, n);
3078             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
3079             total_coeff= coeff_token>>2;
3080             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
3081         }
3082     }
3083
3084     //FIXME set last_non_zero?
3085
3086     if(total_coeff==0)
3087         return 0;
3088         
3089     trailing_ones= coeff_token&3;
3090     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
3091     assert(total_coeff<=16);
3092     
3093     for(i=0; i<trailing_ones; i++){
3094         level[i]= 1 - 2*get_bits1(gb);
3095     }
3096
3097     suffix_length= total_coeff > 10 && trailing_ones < 3;
3098
3099     for(; i<total_coeff; i++){
3100         const int prefix= get_level_prefix(gb);
3101         int level_code, mask;
3102
3103         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
3104             if(suffix_length)
3105                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
3106             else
3107                 level_code= (prefix<<suffix_length); //part
3108         }else if(prefix==14){
3109             if(suffix_length)
3110                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
3111             else
3112                 level_code= prefix + get_bits(gb, 4); //part
3113         }else if(prefix==15){
3114             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
3115             if(suffix_length==0) level_code+=15; //FIXME doesnt make (much)sense
3116         }else{
3117             fprintf(stderr, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
3118             return -1;
3119         }
3120
3121         if(i==trailing_ones && i<3) level_code+= 2; //FIXME split first iteration
3122
3123         mask= -(level_code&1);
3124         level[i]= (((2+level_code)>>1) ^ mask) - mask;
3125
3126         if(suffix_length==0) suffix_length=1; //FIXME split first iteration
3127
3128 #if 1
3129         if(ABS(level[i]) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
3130 #else        
3131         if((2+level_code)>>1) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
3132         ? == prefix > 2 or sth
3133 #endif
3134         tprintf("level: %d suffix_length:%d\n", level[i], suffix_length);
3135     }
3136
3137     if(total_coeff == max_coeff)
3138         zeros_left=0;
3139     else{
3140         if(n == CHROMA_DC_BLOCK_INDEX)
3141             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
3142         else
3143             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
3144     }
3145     
3146     for(i=0; i<total_coeff-1; i++){
3147         if(zeros_left <=0)
3148             break;
3149         else if(zeros_left < 7){
3150             run[i]= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
3151         }else{
3152             run[i]= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
3153         }
3154         zeros_left -= run[i];
3155     }
3156
3157     if(zeros_left<0){
3158         fprintf(stderr, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
3159         return -1;
3160     }
3161     
3162     for(; i<total_coeff-1; i++){
3163         run[i]= 0;
3164     }
3165
3166     run[i]= zeros_left;
3167
3168     coeff_num=-1;
3169     if(n > 24){
3170         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into rundecode?
3171             int j;
3172
3173             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
3174             j= scantable[ coeff_num ];
3175
3176             block[j]= level[i];
3177         }
3178     }else{
3179         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into  rundecode?
3180             int j;
3181
3182             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
3183             j= scantable[ coeff_num ];
3184
3185             block[j]= level[i] * qmul[j];
3186 //            printf("%d %d  ", block[j], qmul[j]);
3187         }
3188     }
3189     return 0;
3190 }
3191
3192 /**
3193  * decodes a macroblock
3194  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
3195  */
3196 static int decode_mb(H264Context *h){
3197     MpegEncContext * const s = &h->s;
3198     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
3199     int mb_type, partition_count, cbp;
3200
3201     s->dsp.clear_blocks(h->mb); //FIXME avoid if allready clear (move after skip handlong?    
3202
3203     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
3204     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
3205                 down the code */
3206     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
3207         if(s->mb_skip_run==-1)
3208             s->mb_skip_run= get_ue_golomb(&s->gb);
3209         
3210         if (s->mb_skip_run--) {
3211             int mx, my;
3212             /* skip mb */
3213 //FIXME b frame
3214             mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0;
3215
3216             memset(h->non_zero_count[mb_xy], 0, 16);
3217             memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
3218
3219             if(h->sps.mb_aff && s->mb_skip_run==0 && (s->mb_y&1)==0){
3220                 h->mb_field_decoding_flag= get_bits1(&s->gb);
3221             }
3222
3223             if(h->mb_field_decoding_flag)
3224                 mb_type|= MB_TYPE_INTERLACED;
3225             
3226             fill_caches(h, mb_type); //FIXME check what is needed and what not ...
3227             pred_pskip_motion(h, &mx, &my);
3228             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
3229             fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
3230             write_back_motion(h, mb_type);
3231
3232             s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type
3233             h->slice_table[ mb_xy ]= h->slice_num;
3234
3235             h->prev_mb_skiped= 1;
3236             return 0;
3237         }
3238     }
3239     if(h->sps.mb_aff /* && !field pic FIXME needed? */){
3240         if((s->mb_y&1)==0)
3241             h->mb_field_decoding_flag = get_bits1(&s->gb);
3242     }else
3243         h->mb_field_decoding_flag=0; //FIXME som ed note ?!
3244     
3245     h->prev_mb_skiped= 0;
3246     
3247     mb_type= get_ue_golomb(&s->gb);
3248     if(h->slice_type == B_TYPE){
3249         if(mb_type < 23){
3250             partition_count= b_mb_type_info[mb_type].partition_count;
3251             mb_type=         b_mb_type_info[mb_type].type;
3252         }else{
3253             mb_type -= 23;
3254             goto decode_intra_mb;
3255         }
3256     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
3257         if(mb_type < 5){
3258             partition_count= p_mb_type_info[mb_type].partition_count;
3259             mb_type=         p_mb_type_info[mb_type].type;
3260         }else{
3261             mb_type -= 5;
3262             goto decode_intra_mb;
3263         }
3264     }else{
3265        assert(h->slice_type == I_TYPE);
3266 decode_intra_mb:
3267         if(mb_type > 25){
3268             fprintf(stderr, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
3269             return -1;
3270         }
3271         partition_count=0;
3272         cbp= i_mb_type_info[mb_type].cbp;
3273         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
3274         mb_type= i_mb_type_info[mb_type].type;
3275     }
3276
3277     if(h->mb_field_decoding_flag)
3278         mb_type |= MB_TYPE_INTERLACED;
3279
3280     s->current_picture.mb_type[mb_xy]= mb_type;
3281     h->slice_table[ mb_xy ]= h->slice_num;
3282     
3283     if(IS_INTRA_PCM(mb_type)){
3284         const uint8_t *ptr;
3285         int x, y;
3286         
3287         // we assume these blocks are very rare so we dont optimize it
3288         align_get_bits(&s->gb);
3289         
3290         ptr= s->gb.buffer + get_bits_count(&s->gb);
3291     
3292         for(y=0; y<16; y++){
3293             const int index= 4*(y&3) + 64*(y>>2);
3294             for(x=0; x<16; x++){
3295                 h->mb[index + (x&3) + 16*(x>>2)]= *(ptr++);
3296             }
3297         }
3298         for(y=0; y<8; y++){
3299             const int index= 256 + 4*(y&3) + 32*(y>>2);
3300             for(x=0; x<8; x++){
3301                 h->mb[index + (x&3) + 16*(x>>2)]= *(ptr++);
3302             }
3303         }
3304         for(y=0; y<8; y++){
3305             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
3306             for(x=0; x<8; x++){
3307                 h->mb[index + (x&3) + 16*(x>>2)]= *(ptr++);
3308             }
3309         }
3310     
3311         skip_bits(&s->gb, 384); //FIXME check /fix the bitstream readers
3312         
3313         memset(h->non_zero_count[mb_xy], 16, 16);
3314         
3315         return 0;
3316     }
3317         
3318     fill_caches(h, mb_type);
3319
3320     //mb_pred
3321     if(IS_INTRA(mb_type)){
3322 //            init_top_left_availability(h);
3323             if(IS_INTRA4x4(mb_type)){
3324                 int i;
3325
3326 //                fill_intra4x4_pred_table(h);
3327                 for(i=0; i<16; i++){
3328                     const int mode_coded= !get_bits1(&s->gb);
3329                     const int predicted_mode=  pred_intra_mode(h, i);
3330                     int mode;
3331
3332                     if(mode_coded){
3333                         const int rem_mode= get_bits(&s->gb, 3);
3334                         if(rem_mode<predicted_mode)
3335                             mode= rem_mode;
3336                         else
3337                             mode= rem_mode + 1;
3338                     }else{
3339                         mode= predicted_mode;
3340                     }
3341                     
3342                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
3343                 }
3344                 write_back_intra_pred_mode(h);
3345                 if( check_intra4x4_pred_mode(h) < 0)
3346                     return -1;
3347             }else{
3348                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
3349                 if(h->intra16x16_pred_mode < 0)
3350                     return -1;
3351             }
3352             h->chroma_pred_mode= get_ue_golomb(&s->gb);
3353
3354             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
3355             if(h->chroma_pred_mode < 0)
3356                 return -1;
3357     }else if(partition_count==4){
3358         int i, j, sub_partition_count[4], list, ref[2][4];
3359         
3360         if(h->slice_type == B_TYPE){
3361             for(i=0; i<4; i++){
3362                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
3363                 if(h->sub_mb_type[i] >=13){
3364                     fprintf(stderr, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
3365                     return -1;
3366                 }
3367                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
3368                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
3369             }
3370         }else{
3371             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
3372             for(i=0; i<4; i++){
3373                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
3374                 if(h->sub_mb_type[i] >=4){
3375                     fprintf(stderr, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
3376                     return -1;
3377                 }
3378                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
3379                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
3380             }
3381         }
3382         
3383         for(list=0; list<2; list++){
3384             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
3385             if(ref_count == 0) continue;
3386             for(i=0; i<4; i++){
3387                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
3388                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
3389                 }else{
3390                  //FIXME
3391                     ref[list][i] = -1;
3392                 }
3393             }
3394         }
3395         
3396         for(list=0; list<2; list++){
3397             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
3398             if(ref_count == 0) continue;
3399
3400             for(i=0; i<4; i++){
3401                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
3402                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
3403
3404                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
3405                     const int sub_mb_type= h->sub_mb_type[i];
3406                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
3407                     for(j=0; j<sub_partition_count[i]; j++){
3408                         int mx, my;
3409                         const int index= 4*i + block_width*j;
3410                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
3411                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
3412                         mx += get_se_golomb(&s->gb);
3413                         my += get_se_golomb(&s->gb);
3414                         tprintf("final mv:%d %d\n", mx, my);
3415
3416                         if(IS_SUB_8X8(sub_mb_type)){
3417                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= 
3418                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
3419                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= 
3420                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
3421                         }else if(IS_SUB_8X4(sub_mb_type)){
3422                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
3423                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
3424                         }else if(IS_SUB_4X8(sub_mb_type)){
3425                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
3426                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
3427                         }else{
3428                             assert(IS_SUB_4X4(sub_mb_type));
3429                             mv_cache[ 0 ][0]= mx;
3430                             mv_cache[ 0 ][1]= my;
3431                         }
3432                     }
3433                 }else{
3434                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
3435                     p[0] = p[1]=
3436                     p[8] = p[9]= 0;
3437                 }
3438             }
3439         }
3440     }else if(!IS_DIRECT(mb_type)){
3441         int list, mx, my, i;
3442          //FIXME we should set ref_idx_l? to 0 if we use that later ...
3443         if(IS_16X16(mb_type)){
3444             for(list=0; list<2; list++){
3445                 if(h->ref_count[0]>0){
3446                     if(IS_DIR(mb_type, 0, list)){
3447                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
3448                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
3449                     }
3450                 }
3451             }
3452             for(list=0; list<2; list++){
3453                 if(IS_DIR(mb_type, 0, list)){
3454                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
3455                     mx += get_se_golomb(&s->gb);
3456                     my += get_se_golomb(&s->gb);
3457                     tprintf("final mv:%d %d\n", mx, my);
3458
3459                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
3460                 }
3461             }
3462         }
3463         else if(IS_16X8(mb_type)){
3464             for(list=0; list<2; list++){
3465                 if(h->ref_count[list]>0){
3466                     for(i=0; i<2; i++){
3467                         if(IS_DIR(mb_type, i, list)){
3468                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
3469                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
3470                         }
3471                     }
3472                 }
3473             }
3474             for(list=0; list<2; list++){
3475                 for(i=0; i<2; i++){
3476                     if(IS_DIR(mb_type, i, list)){
3477                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
3478                         mx += get_se_golomb(&s->gb);
3479                         my += get_se_golomb(&s->gb);
3480                         tprintf("final mv:%d %d\n", mx, my);
3481
3482                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
3483                     }
3484                 }
3485             }
3486         }else{
3487             assert(IS_8X16(mb_type));
3488             for(list=0; list<2; list++){
3489                 if(h->ref_count[list]>0){
3490                     for(i=0; i<2; i++){
3491                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
3492                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
3493                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
3494                         }
3495                     }
3496                 }
3497             }
3498             for(list=0; list<2; list++){
3499                 for(i=0; i<2; i++){
3500                     if(IS_DIR(mb_type, i, list)){
3501                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
3502                         mx += get_se_golomb(&s->gb);
3503                         my += get_se_golomb(&s->gb);
3504                         tprintf("final mv:%d %d\n", mx, my);
3505
3506                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
3507                     }
3508                 }
3509             }
3510         }
3511     }
3512     
3513     if(IS_INTER(mb_type))
3514         write_back_motion(h, mb_type);
3515     
3516     if(!IS_INTRA16x16(mb_type)){
3517         cbp= get_ue_golomb(&s->gb);
3518         if(cbp > 47){
3519             fprintf(stderr, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
3520             return -1;
3521         }
3522         
3523         if(IS_INTRA4x4(mb_type))
3524             cbp= golomb_to_intra4x4_cbp[cbp];
3525         else
3526             cbp= golomb_to_inter_cbp[cbp];
3527     }
3528
3529     if(cbp || IS_INTRA16x16(mb_type)){
3530         int i8x8, i4x4, chroma_idx;
3531         int chroma_qp, dquant;
3532         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
3533         const uint8_t *scan, *dc_scan;
3534         
3535 //        fill_non_zero_count_cache(h);
3536
3537         if(IS_INTERLACED(mb_type)){
3538             scan= field_scan;
3539             dc_scan= luma_dc_field_scan;
3540         }else{
3541             scan= zigzag_scan;
3542             dc_scan= luma_dc_zigzag_scan;
3543         }
3544
3545         dquant= get_se_golomb(&s->gb);
3546
3547         if( dquant > 25 || dquant < -26 ){
3548             fprintf(stderr, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
3549             return -1;
3550         }
3551         
3552         s->qscale += dquant;
3553         if(((unsigned)s->qscale) > 51){
3554             if(s->qscale<0) s->qscale+= 52;
3555             else            s->qscale-= 52;
3556         }
3557         
3558         h->chroma_qp= chroma_qp= get_chroma_qp(h, s->qscale);
3559         if(IS_INTRA16x16(mb_type)){
3560             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, s->qscale, 16) < 0){
3561                 return -1; //FIXME continue if partotioned and other retirn -1 too
3562             }
3563
3564             assert((cbp&15) == 0 || (cbp&15) == 15);
3565
3566             if(cbp&15){
3567                 for(i8x8=0; i8x8<4; i8x8++){
3568                     for(i4x4=0; i4x4<4; i4x4++){
3569                         const int index= i4x4 + 4*i8x8;
3570                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, s->qscale, 15) < 0 ){
3571                             return -1;
3572                         }
3573                     }
3574                 }
3575             }else{
3576                 memset(&h->non_zero_count_cache[8], 0, 8*4); //FIXME stupid & slow
3577             }
3578         }else{
3579             for(i8x8=0; i8x8<4; i8x8++){
3580                 if(cbp & (1<<i8x8)){
3581                     for(i4x4=0; i4x4<4; i4x4++){
3582                         const int index= i4x4 + 4*i8x8;
3583                         
3584                         if( decode_residual(h, gb, h->mb + 16*index, index, scan, s->qscale, 16) <0 ){
3585                             return -1;
3586                         }
3587                     }
3588                 }else{
3589                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
3590                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
3591                 }
3592             }
3593         }
3594         
3595         if(cbp&0x30){
3596             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
3597                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, chroma_qp, 4) < 0){
3598                     return -1;
3599                 }
3600         }
3601
3602         if(cbp&0x20){
3603             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
3604                 for(i4x4=0; i4x4<4; i4x4++){
3605                     const int index= 16 + 4*chroma_idx + i4x4;
3606                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, chroma_qp, 15) < 0){
3607                         return -1;
3608                     }
3609                 }
3610             }
3611         }else{
3612             uint8_t * const nnz= &h->non_zero_count_cache[0];
3613             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
3614             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
3615         }
3616     }else{
3617         memset(&h->non_zero_count_cache[8], 0, 8*5);
3618     }
3619     write_back_non_zero_count(h);
3620
3621     return 0;
3622 }
3623
3624 static int decode_slice(H264Context *h){
3625     MpegEncContext * const s = &h->s;
3626     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
3627
3628     s->mb_skip_run= -1;
3629     
3630 #if 1
3631     for(;;){
3632         int ret= decode_mb(h);
3633             
3634         hl_decode_mb(h);
3635         
3636         if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ?
3637             s->mb_y++;
3638             ret= decode_mb(h);
3639             
3640             hl_decode_mb(h);
3641             s->mb_y--;
3642         }
3643
3644         if(ret<0){
3645             fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
3646             ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
3647
3648             return -1;
3649         }
3650         
3651         if(++s->mb_x >= s->mb_width){
3652             s->mb_x=0;
3653             ff_draw_horiz_band(s, 16*s->mb_y, 16);
3654             if(++s->mb_y >= s->mb_height){
3655                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
3656
3657                 if(get_bits_count(&s->gb) == s->gb.size_in_bits){
3658                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
3659
3660                     return 0;
3661                 }else{
3662                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
3663
3664                     return -1;
3665                 }
3666             }
3667         }
3668         
3669         if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
3670             if(get_bits_count(&s->gb) == s->gb.size_in_bits){
3671                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
3672
3673                 return 0;
3674             }else{
3675                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
3676
3677                 return -1;
3678             }
3679         }
3680     }
3681 #endif
3682 #if 0
3683     for(;s->mb_y < s->mb_height; s->mb_y++){
3684         for(;s->mb_x < s->mb_width; s->mb_x++){
3685             int ret= decode_mb(h);
3686             
3687             hl_decode_mb(h);
3688
3689             if(ret<0){
3690                 fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
3691                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
3692
3693                 return -1;
3694             }
3695         
3696             if(++s->mb_x >= s->mb_width){
3697                 s->mb_x=0;
3698                 if(++s->mb_y >= s->mb_height){
3699                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
3700                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
3701
3702                         return 0;
3703                     }else{
3704                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
3705
3706                         return -1;
3707                     }
3708                 }
3709             }
3710         
3711             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
3712                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
3713                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
3714
3715                     return 0;
3716                 }else{
3717                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
3718
3719                     return -1;
3720                 }
3721             }
3722         }
3723         s->mb_x=0;
3724         ff_draw_horiz_band(s, 16*s->mb_y, 16);
3725     }
3726 #endif
3727     return -1; //not reached
3728 }
3729
3730 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
3731     MpegEncContext * const s = &h->s;
3732     int aspect_ratio_info_present_flag, aspect_ratio_idc;
3733
3734     aspect_ratio_info_present_flag= get_bits1(&s->gb);
3735     
3736     if( aspect_ratio_info_present_flag ) {
3737         aspect_ratio_idc= get_bits(&s->gb, 8);
3738         if( aspect_ratio_idc == EXTENDED_SAR ) {
3739             sps->sar_width= get_bits(&s->gb, 16);
3740             sps->sar_height= get_bits(&s->gb, 16);
3741         }else if(aspect_ratio_idc < 16){
3742             sps->sar_width=  pixel_aspect[aspect_ratio_idc][0];
3743             sps->sar_height= pixel_aspect[aspect_ratio_idc][1];
3744         }else{
3745             fprintf(stderr, "illegal aspect ratio\n");
3746             return -1;
3747         }
3748     }else{
3749         sps->sar_width= 
3750         sps->sar_height= 0;
3751     }
3752 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
3753 #if 0
3754 | overscan_info_present_flag                        |0  |u(1)    |
3755 | if( overscan_info_present_flag )                  |   |        |
3756 |  overscan_appropriate_flag                        |0  |u(1)    |
3757 | video_signal_type_present_flag                    |0  |u(1)    |
3758 | if( video_signal_type_present_flag ) {            |   |        |
3759 |  video_format                                     |0  |u(3)    |
3760 |  video_full_range_flag                            |0  |u(1)    |
3761 |  colour_description_present_flag                  |0  |u(1)    |
3762 |  if( colour_description_present_flag ) {          |   |        |
3763 |   colour_primaries                                |0  |u(8)    |
3764 |   transfer_characteristics                        |0  |u(8)    |
3765 |   matrix_coefficients                             |0  |u(8)    |
3766 |  }                                                |   |        |
3767 | }                                                 |   |        |
3768 | chroma_location_info_present_flag                 |0  |u(1)    |
3769 | if ( chroma_location_info_present_flag ) {        |   |        |
3770 |  chroma_sample_location_type_top_field            |0  |ue(v)   |
3771 |  chroma_sample_location_type_bottom_field         |0  |ue(v)   |
3772 | }                                                 |   |        |
3773 | timing_info_present_flag                          |0  |u(1)    |
3774 | if( timing_info_present_flag ) {                  |   |        |
3775 |  num_units_in_tick                                |0  |u(32)   |
3776 |  time_scale                                       |0  |u(32)   |
3777 |  fixed_frame_rate_flag                            |0  |u(1)    |
3778 | }                                                 |   |        |
3779 | nal_hrd_parameters_present_flag                   |0  |u(1)    |
3780 | if( nal_hrd_parameters_present_flag  = =  1)      |   |        |
3781 |  hrd_parameters( )                                |   |        |
3782 | vcl_hrd_parameters_present_flag                   |0  |u(1)    |
3783 | if( vcl_hrd_parameters_present_flag  = =  1)      |   |        |
3784 |  hrd_parameters( )                                |   |        |
3785 | if( ( nal_hrd_parameters_present_flag  = =  1  | ||   |        |
3786 |                                                   |   |        |
3787 |( vcl_hrd_parameters_present_flag  = =  1 ) )      |   |        |
3788 |  low_delay_hrd_flag                               |0  |u(1)    |
3789 | bitstream_restriction_flag                        |0  |u(1)    |
3790 | if( bitstream_restriction_flag ) {                |0  |u(1)    |
3791 |  motion_vectors_over_pic_boundaries_flag          |0  |u(1)    |
3792 |  max_bytes_per_pic_denom                          |0  |ue(v)   |
3793 |  max_bits_per_mb_denom                            |0  |ue(v)   |
3794 |  log2_max_mv_length_horizontal                    |0  |ue(v)   |
3795 |  log2_max_mv_length_vertical                      |0  |ue(v)   |
3796 |  num_reorder_frames                               |0  |ue(v)   |
3797 |  max_dec_frame_buffering                          |0  |ue(v)   |
3798 | }                                                 |   |        |
3799 |}                                                  |   |        |
3800 #endif
3801     return 0;
3802 }
3803
3804 static inline int decode_seq_parameter_set(H264Context *h){
3805     MpegEncContext * const s = &h->s;
3806     int profile_idc, level_idc;
3807     int sps_id, i;
3808     SPS *sps;
3809     
3810     profile_idc= get_bits(&s->gb, 8);
3811     get_bits1(&s->gb);   //constraint_set0_flag
3812     get_bits1(&s->gb);   //constraint_set1_flag
3813     get_bits1(&s->gb);   //constraint_set2_flag
3814     get_bits(&s->gb, 5); // reserved
3815     level_idc= get_bits(&s->gb, 8);
3816     sps_id= get_ue_golomb(&s->gb);
3817     
3818     sps= &h->sps_buffer[ sps_id ];
3819     sps->profile_idc= profile_idc;
3820     sps->level_idc= level_idc;
3821     
3822     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
3823     sps->poc_type= get_ue_golomb(&s->gb);
3824     
3825     if(sps->poc_type == 0){ //FIXME #define
3826         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
3827     } else if(sps->poc_type == 1){//FIXME #define
3828         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
3829         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
3830         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
3831         sps->poc_cycle_length= get_ue_golomb(&s->gb);
3832         
3833         for(i=0; i<sps->poc_cycle_length; i++)
3834             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
3835     }
3836     if(sps->poc_type > 2){
3837         fprintf(stderr, "illegal POC type %d\n", sps->poc_type);
3838         return -1;
3839     }
3840
3841     sps->ref_frame_count= get_ue_golomb(&s->gb);
3842     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
3843     sps->mb_width= get_ue_golomb(&s->gb) + 1;
3844     sps->mb_height= get_ue_golomb(&s->gb) + 1;
3845     sps->frame_mbs_only_flag= get_bits1(&s->gb);
3846     if(!sps->frame_mbs_only_flag)
3847         sps->mb_aff= get_bits1(&s->gb);
3848     else
3849         sps->mb_aff= 0;
3850
3851     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
3852
3853     sps->crop= get_bits1(&s->gb);
3854     if(sps->crop){
3855         sps->crop_left  = get_ue_golomb(&s->gb);
3856         sps->crop_right = get_ue_golomb(&s->gb);
3857         sps->crop_top   = get_ue_golomb(&s->gb);
3858         sps->crop_bottom= get_ue_golomb(&s->gb);
3859         if(sps->crop_left || sps->crop_top){
3860             fprintf(stderr, "insane croping not completly supported, this could look slightly wrong ...\n");
3861         }
3862     }else{
3863         sps->crop_left  = 
3864         sps->crop_right = 
3865         sps->crop_top   = 
3866         sps->crop_bottom= 0;
3867     }
3868
3869     sps->vui_parameters_present_flag= get_bits1(&s->gb);
3870     if( sps->vui_parameters_present_flag )
3871         decode_vui_parameters(h, sps);
3872     
3873     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3874         printf("sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n", 
3875                sps_id, sps->profile_idc, sps->level_idc,
3876                sps->poc_type,
3877                sps->ref_frame_count,
3878                sps->mb_width, sps->mb_height,
3879                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
3880                sps->direct_8x8_inference_flag ? "8B8" : "",
3881                sps->crop_left, sps->crop_right, 
3882                sps->crop_top, sps->crop_bottom, 
3883                sps->vui_parameters_present_flag ? "VUI" : ""
3884                );
3885     }
3886     return 0;
3887 }
3888
3889 static inline int decode_picture_parameter_set(H264Context *h){
3890     MpegEncContext * const s = &h->s;
3891     int pps_id= get_ue_golomb(&s->gb);
3892     PPS *pps= &h->pps_buffer[pps_id];
3893     
3894     pps->sps_id= get_ue_golomb(&s->gb);
3895     pps->cabac= get_bits1(&s->gb);
3896     pps->pic_order_present= get_bits1(&s->gb);
3897     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
3898     if(pps->slice_group_count > 1 ){
3899         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
3900 fprintf(stderr, "FMO not supported\n");
3901         switch(pps->mb_slice_group_map_type){
3902         case 0:
3903 #if 0
3904 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
3905 |    run_length[ i ]                                |1  |ue(v)   |
3906 #endif
3907             break;
3908         case 2:
3909 #if 0
3910 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
3911 |{                                                  |   |        |
3912 |    top_left_mb[ i ]                               |1  |ue(v)   |
3913 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
3914 |   }                                               |   |        |
3915 #endif
3916             break;
3917         case 3:
3918         case 4:
3919         case 5:
3920 #if 0
3921 |   slice_group_change_direction_flag               |1  |u(1)    |
3922 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
3923 #endif
3924             break;
3925         case 6:
3926 #if 0
3927 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
3928 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
3929 |)                                                  |   |        |
3930 |    slice_group_id[ i ]                            |1  |u(v)    |
3931 #endif
3932             break;
3933         }
3934     }
3935     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3936     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3937     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
3938         fprintf(stderr, "reference overflow (pps)\n");
3939         return -1;
3940     }
3941     
3942     pps->weighted_pred= get_bits1(&s->gb);
3943     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
3944     pps->init_qp= get_se_golomb(&s->gb) + 26;
3945     pps->init_qs= get_se_golomb(&s->gb) + 26;
3946     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
3947     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
3948     pps->constrained_intra_pred= get_bits1(&s->gb);
3949     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
3950     
3951     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3952         printf("pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s\n", 
3953                pps_id, pps->sps_id,
3954                pps->cabac ? "CABAC" : "CAVLC",
3955                pps->slice_group_count,
3956                pps->ref_count[0], pps->ref_count[1],
3957                pps->weighted_pred ? "weighted" : "",
3958                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
3959                pps->deblocking_filter_parameters_present ? "LPAR" : "",
3960                pps->constrained_intra_pred ? "CONSTR" : "",
3961                pps->redundant_pic_cnt_present ? "REDU" : ""
3962                );
3963     }
3964     
3965     return 0;
3966 }
3967
3968 /**
3969  * finds the end of the current frame in the bitstream.
3970  * @return the position of the first byte of the next frame, or -1
3971  */
3972 static int find_frame_end(MpegEncContext *s, uint8_t *buf, int buf_size){
3973     ParseContext *pc= &s->parse_context;
3974     int i;
3975     uint32_t state;
3976 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
3977 //    mb_addr= pc->mb_addr - 1;
3978     state= pc->state;
3979     //FIXME this will fail with slices
3980     for(i=0; i<buf_size; i++){
3981         state= (state<<8) | buf[i];
3982         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
3983             if(pc->frame_start_found){
3984                 pc->state=-1; 
3985                 pc->frame_start_found= 0;
3986                 return i-3;
3987             }
3988             pc->frame_start_found= 1;
3989         }
3990     }
3991     
3992     pc->state= state;
3993     return END_NOT_FOUND;
3994 }
3995
3996 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
3997     MpegEncContext * const s = &h->s;
3998     AVCodecContext * const avctx= s->avctx;
3999     int buf_index=0;
4000 #if 0
4001     int i;
4002     for(i=0; i<32; i++){
4003         printf("%X ", buf[i]);
4004     }
4005 #endif
4006     for(;;){
4007         int consumed;
4008         int dst_length;
4009         int bit_length;
4010         uint8_t *ptr;
4011         
4012         // start code prefix search
4013         for(; buf_index + 3 < buf_size; buf_index++){
4014             // this should allways succeed in the first iteration
4015             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
4016                 break;
4017         }
4018         
4019         if(buf_index+3 >= buf_size) break;
4020         
4021         buf_index+=3;
4022         
4023         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, buf_size - buf_index);
4024         if(ptr[dst_length - 1] == 0) dst_length--;
4025         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
4026
4027         if(s->avctx->debug&FF_DEBUG_STARTCODE){
4028             printf("NAL %d at %d length %d\n", h->nal_unit_type, buf_index, dst_length);
4029         }
4030         
4031         buf_index += consumed;
4032
4033         if(h->nal_ref_idc < s->hurry_up)
4034             continue;
4035         
4036         switch(h->nal_unit_type){
4037         case NAL_IDR_SLICE:
4038             idr(h); //FIXME ensure we dont loose some frames if there is reordering
4039         case NAL_SLICE:
4040             init_get_bits(&s->gb, ptr, bit_length);
4041             h->intra_gb_ptr=
4042             h->inter_gb_ptr= &s->gb;
4043             s->data_partitioning = 0;
4044             
4045             if(decode_slice_header(h) < 0) return -1;
4046             if(h->redundant_pic_count==0)
4047                 decode_slice(h);
4048             break;
4049         case NAL_DPA:
4050             init_get_bits(&s->gb, ptr, bit_length);
4051             h->intra_gb_ptr=
4052             h->inter_gb_ptr= NULL;
4053             s->data_partitioning = 1;
4054             
4055             if(decode_slice_header(h) < 0) return -1;
4056             break;
4057         case NAL_DPB:
4058             init_get_bits(&h->intra_gb, ptr, bit_length);
4059             h->intra_gb_ptr= &h->intra_gb;
4060             break;
4061         case NAL_DPC:
4062             init_get_bits(&h->inter_gb, ptr, bit_length);
4063             h->inter_gb_ptr= &h->inter_gb;
4064
4065             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning)
4066                 decode_slice(h);
4067             break;
4068         case NAL_SEI:
4069             break;
4070         case NAL_SPS:
4071             init_get_bits(&s->gb, ptr, bit_length);
4072             decode_seq_parameter_set(h);
4073             
4074             if(s->flags& CODEC_FLAG_LOW_DELAY)
4075                 s->low_delay=1;
4076       
4077             avctx->has_b_frames= !s->low_delay;
4078             break;
4079         case NAL_PPS:
4080             init_get_bits(&s->gb, ptr, bit_length);
4081             
4082             decode_picture_parameter_set(h);
4083
4084             break;
4085         case NAL_PICTURE_DELIMITER:
4086             break;
4087         case NAL_FILTER_DATA:
4088             break;
4089         }        
4090
4091         //FIXME move after where irt is set
4092         s->current_picture.pict_type= s->pict_type;
4093         s->current_picture.key_frame= s->pict_type == I_TYPE;
4094     }
4095     
4096     if(!s->current_picture_ptr) return buf_index; //no frame
4097     
4098     h->prev_frame_num_offset= h->frame_num_offset;
4099     h->prev_frame_num= h->frame_num;
4100     if(s->current_picture_ptr->reference){
4101         h->prev_poc_msb= h->poc_msb;
4102         h->prev_poc_lsb= h->poc_lsb;
4103     }
4104     if(s->current_picture_ptr->reference)
4105         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
4106     else
4107         assert(h->mmco_index==0);
4108
4109     ff_er_frame_end(s);
4110     MPV_frame_end(s);
4111
4112     return buf_index;
4113 }
4114
4115 /**
4116  * retunrs the number of bytes consumed for building the current frame
4117  */
4118 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
4119     if(s->flags&CODEC_FLAG_TRUNCATED){
4120         pos -= s->parse_context.last_index;
4121         if(pos<0) pos=0; // FIXME remove (uneeded?)
4122         
4123         return pos;
4124     }else{
4125         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
4126         if(pos+10>buf_size) pos=buf_size; // oops ;)
4127
4128         return pos;
4129     }
4130 }
4131
4132 static int decode_frame(AVCodecContext *avctx, 
4133                              void *data, int *data_size,
4134                              uint8_t *buf, int buf_size)
4135 {
4136     H264Context *h = avctx->priv_data;
4137     MpegEncContext *s = &h->s;
4138     AVFrame *pict = data; 
4139     int buf_index;
4140     
4141     s->flags= avctx->flags;
4142
4143     *data_size = 0;
4144    
4145    /* no supplementary picture */
4146     if (buf_size == 0) {
4147         return 0;
4148     }
4149     
4150     if(s->flags&CODEC_FLAG_TRUNCATED){
4151         int next= find_frame_end(s, buf, buf_size);
4152         
4153         if( ff_combine_frame(s, next, &buf, &buf_size) < 0 )
4154             return buf_size;
4155 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
4156     }
4157
4158     if(s->avctx->extradata_size && s->picture_number==0){
4159         if(0 < decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) ) 
4160             return -1;
4161     }
4162
4163     buf_index=decode_nal_units(h, buf, buf_size);
4164     if(buf_index < 0) 
4165         return -1;
4166
4167     //FIXME do something with unavailable reference frames    
4168  
4169 //    if(ret==FRAME_SKIPED) return get_consumed_bytes(s, buf_index, buf_size);
4170 #if 0
4171     if(s->pict_type==B_TYPE || s->low_delay){
4172         *pict= *(AVFrame*)&s->current_picture;
4173     } else {
4174         *pict= *(AVFrame*)&s->last_picture;
4175     }
4176 #endif
4177     if(!s->current_picture_ptr){
4178         fprintf(stderr, "error, NO frame\n");
4179         return -1;
4180     }
4181
4182     *pict= *(AVFrame*)&s->current_picture; //FIXME 
4183     ff_print_debug_info(s, s->current_picture_ptr);
4184     assert(pict->data[0]);
4185 //printf("out %d\n", (int)pict->data[0]);
4186 #if 0 //?
4187
4188     /* Return the Picture timestamp as the frame number */
4189     /* we substract 1 because it is added on utils.c    */
4190     avctx->frame_number = s->picture_number - 1;
4191 #endif
4192 #if 0
4193     /* dont output the last pic after seeking */
4194     if(s->last_picture_ptr || s->low_delay)
4195     //Note this isnt a issue as a IDR pic should flush teh buffers
4196 #endif
4197         *data_size = sizeof(AVFrame);
4198     return get_consumed_bytes(s, buf_index, buf_size);
4199 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock (disabled code).
 *
 * Entries 0..2 are set for the top-left, top and top-right positions and
 * entry 3 for the left position; a position counts as available when it
 * lies inside the picture and its slice_table entry equals the current
 * slice number. Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        // first macroblock row: nothing above
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
4219
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self test.
 *
 * Runs, in order: an unsigned exp golomb write/read round trip, a signed
 * exp golomb round trip, a 4x4 forward DCT / inverse DCT error measurement
 * and an encode_nal() / decode_nal() round trip on random data with an
 * increasing number of zero bytes.
 *
 * @return 0 on success, -1 if the NAL round trip fails
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;
    
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE, NULL, NULL);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);
    
    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;
        
        // peek at the raw bits so a mismatch can be reported with them
        s= show_bits(&gb, 24);
        
        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }
    
    
    init_put_bits(&pb, temp, SIZE, NULL, NULL);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);
    
    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;
        
        s= show_bits(&gb, 24);
        
        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            // the value written at index i was i - COUNT/2, report that one
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");
    
    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);
        
        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");
        
        h264_add_idct_c(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/
            
        // accumulate squared and maximum reconstruction error
        for(j=0; j<16; j++){
            int diff= ABS(src[j] - ref[j]);
            
            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;
        
    }
#endif
    printf("Testing NAL layer\n");
    
    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));
    
    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;
        
        // start from a payload without any zero byte ...
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }
        
        // ... then zero exactly `zeros` distinct positions
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }
        
        START_TIMER
        
        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }
        
        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL");
        
        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }
        
        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }
        
        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }
    
    printf("Testing RBSP\n");
    
    
    return 0;
}
#endif
4392
4393
4394 static int decode_end(AVCodecContext *avctx)
4395 {
4396     H264Context *h = avctx->priv_data;
4397     MpegEncContext *s = &h->s;
4398     
4399     free_tables(h); //FIXME cleanup init stuff perhaps
4400     MPV_common_end(s);
4401
4402 //    memset(h, 0, sizeof(H264Context));
4403         
4404     return 0;
4405 }
4406
4407
4408 AVCodec h264_decoder = {
4409     "h264",
4410     CODEC_TYPE_VIDEO,
4411     CODEC_ID_H264,
4412     sizeof(H264Context),
4413     decode_init,
4414     NULL,
4415     decode_end,
4416     decode_frame,
4417     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
4418 };
4419
4420 #include "svq3.c"