* all: re-import of the CVS.

git-svn-id: svn://svn.videolan.org/x264/trunk@1 df754926-b1dd-0310-bc7b-ec298dee348c
2004-06-03 19:29:33 +00:00 · 2004-06-03 19:29:33 +00:00 · 5dc0aae2f9
commit 5dc0aae2f9
95 changed files with 32275 additions and 0 deletions
--- a/.cvsignore
+++ b/.cvsignore
@ -0,0 +1,3 @@
+.depend
+x264
+checkasm
--- a/35
+++ b/35
@ -0,0 +1,35 @@
+# $Id: AUTHORS,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+# 
+# The format of this file was inspired by the Linux kernel CREDITS file.
+# Authors are listed alphabetically.
+# 
+# The fields are: name (N), email (E), web-address (W), CVS account login (C),
+# PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
+
+N: Laurent Aimar
+E: fenrir AT via.ecp DOT fr
+C: fenrir
+D: Initial import, maintainer
+D: i386 asm (mmx/mmx2)
+S: France
+
+N: Eric Petit
+E: titer AT videolan DOT org
+C: titer
+D: Altivec
+D: BeOS and MacOS X ports.
+S: France
+
+N: Min Chen
+E: chenm001 AT 163 DOT com
+C: chenm001
+D: Win32/VC 6.0 port
+D: gcc asm to nasm conversion
+D: vfw interface
+S: China
+
+N: Justin Clay
+E: justin_clay AT hotmail DOT com
+C: wheatgerm
+D: Initial work on vfw
+S: Nova Scotia, Canada
--- a/340
+++ b/340
@ -0,0 +1,340 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year  name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
--- a/67
+++ b/67
@ -0,0 +1,67 @@
+# $Id: Jamfile,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+#
+# Jam build description for libx264, the x264 front end and checkasm.
+
+# Compiler and assembler
+CC = gcc ;
+AS = nasm ;
+
+# Compiler flags
+# To profile: -fprofile-arcs
+# Once done : -fbranch-probabilities
+CCFLAGS = -g -Wall -W ;
+
+# Global preprocessor defines
+DEFINES = DEBUG __X264__ ;
+
+# Optimisation flags
+OPTIM = -O3 -funroll-loops ;
+
+# Header search directories.
+# core/i386 is the directory that holds the x86 sources listed in
+# SOURCES_X86 / SOURCES_MMX below.
+HDRS = . core core/i386 decoder encoder ;
+
+# Portable C sources (built on every platform)
+SOURCES_C = core/mc.c core/predict.c core/pixel.c core/macroblock.c
+            core/frame.c core/dct.c core/cpu.c core/cabac.c
+            core/common.c core/mdate.c core/csp.c
+            encoder/analyse.c encoder/me.c encoder/ratecontrol.c
+            encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c
+            encoder/encoder.c ;
+
+# x86-only sources: CPU detection (nasm) and the MMX C wrappers + nasm kernels
+SOURCES_X86 = core/i386/cpu.asm ;
+SOURCES_MMX = core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c core/i386/dct.asm core/i386/pixel.asm core/i386/mc.asm ;
+
+# PowerPC/AltiVec-only sources
+SOURCES_ALTIVEC = core/ppc/mc.c core/ppc/pixel.c ;
+
+# libx264: start from the portable sources, then add per-OS extras.
+SOURCES_X264 = $(SOURCES_C) ;
+if $(OS) = LINUX
+{
+    # NOTE(review): a Linux host is treated as x86 with MMXEXT here;
+    # no other Linux architecture is handled.
+    DEFINES      += ARCH_X86 HAVE_MMXEXT HAVE_MALLOC_H ;
+    SOURCES_X264 += $(SOURCES_MMX) ;
+    SOURCES_X264 += $(SOURCES_X86) ;
+    ASFLAGS = -f elf ;
+
+    # Don't ask why
+    NOARUPDATE = false ;
+}
+if $(OS) = MACOSX
+{
+    DEFINES      += HAVE_ALTIVEC ;
+    SOURCES_X264 += $(SOURCES_ALTIVEC) ;
+    CCFLAGS      += -faltivec ;
+#    OPTIM        += -falign-loops=16 ;
+}
+Library libx264 : $(SOURCES_X264) ;
+
+# x264 command-line front end, linked against libx264 and libm
+LINKLIBS += -lm ;
+LinkLibraries x264 : libx264.a ;
+Main x264 : x264.c ;
+
+# checkasm test program, linked against libx264
+LinkLibraries checkasm : libx264.a ;
+Main checkasm : testing/checkasm.c ;
+
+# XXX Do not remove *.o files:
+# RmTemps is redefined with an empty action body so it does nothing.
+actions quietly updated piecemeal together RmTemps
+{
+}
--- a/58
+++ b/58
@ -0,0 +1,58 @@
+# Makefile: tuned for i386/MMX system only
+# For ppc append
+#  SRCS: core/ppc/mc.c core/ppc/pixel.c
+#  Defines: HAVE_ALTIVEC
+#  CFLAGS: -faltivec
+#
+# Targets:
+#   default   - generate .depend, then build the x264 executable
+#   libx264.a - static library (C objects + nasm objects)
+#   checkasm  - test program linked against libx264.a
+#   depend    - (re)generate .depend
+#   clean / distclean - remove every generated file
+#
+CC=gcc
+CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+
+# Library sources only.  x264.c (the command-line front end, which provides
+# main) is deliberately NOT listed here so that x264.o is not archived into
+# libx264.a; it is compiled and linked separately by the x264 target.
+SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c
+
+AS= nasm
+# for linux
+ASFLAGS=-f elf
+# for cygwin
+#ASFLAGS=-f gnuwin32 -DPREFIX
+
+ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm  core/i386/mc.asm
+OBJASM= $(ASMSRC:%.asm=%.o)
+
+OBJS = $(SRCS:%.c=%.o)
+DEP  = depend
+
+# None of these names correspond to a file that the rule creates.
+.PHONY: default depend clean distclean
+
+default: $(DEP) x264
+
+libx264.a: $(OBJS) $(OBJASM)
+	ar rc libx264.a $(OBJS) $(OBJASM)
+
+x264: libx264.a x264.o
+	$(CC) $(CFLAGS) -o x264 x264.o libx264.a -lm
+
+checkasm: testing/checkasm.c libx264.a
+	$(CC) $(CFLAGS) -o checkasm $< libx264.a -lm
+
+%.o: %.asm
+	$(AS) $(ASFLAGS) -o $@ $<
+
+# NOTE(review): gcc -MM names each target without its source directory
+# (e.g. "mc.o:" for core/mc.c), so the generated rules may not match the
+# objects under core/ and encoder/ -- confirm, and consider -MT if so.
+.depend: $(SRCS) x264.c
+	$(CC) -MM $(CFLAGS) $(SRCS) x264.c 1> .depend
+
+depend: .depend
+ifneq ($(wildcard .depend),)
+include .depend
+endif
+
+clean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264 checkasm
+
+# Nothing extra is generated beyond what clean removes.
+distclean: clean
+
--- a/Makefile.cygwin
+++ b/Makefile.cygwin
@ -0,0 +1,52 @@
+# Makefile: tuned for i386/MMX cygwin system only
+#
+# Targets:
+#   default   - generate .depend, then build the x264 executable
+#   libx264.a - static library (C objects + nasm objects)
+#   checkasm  - test program linked against libx264.a
+#   depend    - (re)generate .depend
+#   clean / distclean - remove every generated file
+#
+CC=gcc
+CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
+
+# Library sources only.  x264.c (the command-line front end, which provides
+# main) is deliberately NOT listed here so that x264.o is not archived into
+# libx264.a; it is compiled and linked separately by the x264 target.
+SRCS=  core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c
+
+AS= nasm
+#for cygwin
+ASFLAGS=-f win32 -DPREFIX
+
+ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
+OBJASM= $(ASMSRC:%.asm=%.o)
+
+OBJS = $(SRCS:%.c=%.o)
+DEP  = depend
+
+# None of these names correspond to a file that the rule creates.
+.PHONY: default depend clean distclean
+
+default: $(DEP) x264
+
+libx264.a: $(OBJS) $(OBJASM)
+	ar rc libx264.a $(OBJS) $(OBJASM)
+
+x264: libx264.a x264.o
+	$(CC) $(CFLAGS) -o x264 x264.o libx264.a -lm
+
+checkasm: testing/checkasm.c libx264.a
+	$(CC) $(CFLAGS) -o checkasm $< libx264.a -lm
+
+%.o: %.asm
+	$(AS) $(ASFLAGS) -o $@ $<
+
+# NOTE(review): gcc -MM names each target without its source directory
+# (e.g. "mc.o:" for core/mc.c), so the generated rules may not match the
+# objects under core/ and encoder/ -- confirm, and consider -MT if so.
+.depend: $(SRCS) x264.c
+	$(CC) -MM $(CFLAGS) $(SRCS) x264.c 1> .depend
+
+depend: .depend
+ifneq ($(wildcard .depend),)
+include .depend
+endif
+
+clean:
+	rm -f $(OBJS) $(OBJASM) *.a x264.o .depend x264 checkasm
+
+# Nothing extra is generated beyond what clean removes.
+distclean: clean
+
--- a/65
+++ b/65
@ -0,0 +1,65 @@
+ It is far from complete, anyway :
+
+General:
+--------
+ Encoder:
+ ########
+
+ * CABAC: check if adaptive model is really working. (I didn't see any improvements)
+
+ * Field support : no and I probably won't do it.
+
+ * Slice A/B/C (partition): is there any interest in doing it ? (Shouldn't be hard).
+    - extend x264_t
+    - review x264_macroblock_write_cavlc
+
+ * Intra encoding:
+    - in I_4x4 mode, some prediction modes are neither tested nor supported :
+    when some surrounding pixels are unavailable but could be predicted from
+    others. (see the standard)
+
+ * Inter coding:
+    - D_4x8 D_8x4 and D_4x4 ME P block -> done but too slow.
+    - B_ types.
+    - scene change detection.
+    - long term ?
+    - ...
+
+ * B frame: B_L0/L1/BI work in cavlc (need more testing).
+    -> need to do all others mb type (first B_DIRECT and B_SKIP)
+    -> cabac
+    -> look at weighted prediction (should give better results)
+    -> better analyse algo (as always ;)
+
+ * Speed issue (oprofile is your friend)
+    - mc.c:30% and pixel.c:20% (mc is used by ME)
+    - Motion Estimation -> try better/faster algos.
+    - loop filter
+    - stream writing (bs)
+    - ...
+
+ * Time spent: (test file: 720x576, mmx, mmxext)
+    CAVLC: analyse=73% encode=15% write=4% filter=6%
+    CABAC: analyse=69% encode=16% write=8% filter=5%
+
+ * Limitations:
+    - frame width/height %16 == 0 only.
+
+ * ...
+
+ Decoder:
+ ########
+
+ * Currently decoder/* won't even compile, and anyway is unusable.
+
+ Build:
+ ######
+ * Port gcc inlined asm to nasm files (BUT without any speed loss, otherwise
+   it will be rejected).
+
+Coding issue:
+-------------
+ * tables : some are duplicated -> find a solution (easy).
+ * documentation ? (mouaaaarfff ;)
+ * ...
+
--- a/build/cygwin/Makefile
+++ b/build/cygwin/Makefile
@ -0,0 +1,102 @@
+##############################################################################
+#
+# Makefile for libx264.a and x264
+#
+# Author: x264 by Laurent Aimar <fenrir@via.ecp.fr>
+#
+# Objects are written below $(DIR_BUILD); the finished libx264.a and
+# x264.exe are then copied to the source root $(DIR_SRC).
+#
+# $Id: Makefile,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+##############################################################################
+
+# Current dir
+DIR_CUR=$(shell pwd)
+
+# Path to src
+DIR_SRC=$(DIR_CUR)/../..
+
+# Where objects, the library and the executable are built
+DIR_BUILD= $(DIR_CUR)/bin
+
+# Sources
+SRC_C= core/mc.c core/predict.c core/pixel.c core/macroblock.c \
+       core/frame.c core/dct.c core/cpu.c core/cabac.c \
+       core/common.c core/mdate.c core/csp.c \
+       encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
+       encoder/set.c encoder/macroblock.c encoder/cabac.c encoder/cavlc.c \
+       encoder/encoder.c \
+       core/i386/mc-c.c core/i386/dct-c.c core/i386/predict.c
+
+SRC_ASM= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
+
+# Alias
+RM= rm -rf
+
+##############################################################################
+# CFLAGS
+##############################################################################
+
+# Constants which should not be modified
+# The `mingw-runtime` package is required when building with -mno-cygwin
+CFLAGS += -I$(DIR_SRC)
+CFLAGS += -mno-cygwin
+CFLAGS += -D__X264__ -DARCH_X86 -DHAVE_MMXEXT -D_CYGWIN
+
+# Optional Compiler options
+CFLAGS += -g -Wall -DDEBUG
+CFLAGS += -O3
+CFLAGS += -finline-functions
+CFLAGS += -funroll-loops
+CFLAGS += -ffast-math
+
+
+##############################################################################
+# Compiler flags for linking stage
+##############################################################################
+
+# Points at the directory where libx264.a is actually built.
+# NOTE(review): no rule in this file references LDFLAGS; the x264.exe rule
+# names libx264.a explicitly instead.
+LDFLAGS += -L$(DIR_BUILD) -lx264
+
+##############################################################################
+# ASM
+##############################################################################
+AS= nasm
+# NOTE(review): the top-level Makefile.cygwin uses "-f win32 -DPREFIX";
+# confirm which nasm output format is intended here.
+ASFLAGS= -f gnuwin32 -DPREFIX
+##############################################################################
+# Rules
+##############################################################################
+
+OBJECTS = $(SRC_C:.c=.obj)
+OBJECTS+= $(SRC_ASM:.asm=.obj)
+
+.SUFFIXES: .obj .asm .c
+
+# Sources are looked up in the source root, built objects in $(DIR_BUILD).
+VPATH = $(DIR_SRC):$(DIR_BUILD)
+
+# These names are commands, not files produced by their rules.
+.PHONY: all clean
+
+all: libx264.a x264.exe
+
+$(DIR_BUILD):
+	@echo " D: $(DIR_BUILD)"
+	@mkdir -p $(DIR_BUILD)
+
+# Assemble: the object is written under $(DIR_BUILD), mirroring the source tree.
+.asm.obj:
+	@echo " A: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(AS) $(ASFLAGS) -o $(DIR_BUILD)/$@ $<
+
+# Compile: the object is written under $(DIR_BUILD), mirroring the source tree.
+.c.obj:
+	@echo " C: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(CC) $(CFLAGS) -c -o $(DIR_BUILD)/$@ $<
+
+# Archive all objects, then copy the library to the source root.
+libx264.a: $(DIR_BUILD) $(OBJECTS)
+	@echo " L: $(@F)"
+	@cd $(DIR_BUILD) && \
+	ar rc libx264.a $(OBJECTS) && \
+	cp -f libx264.a $(DIR_SRC)
+
+# Link the front end.  libx264.a is a prerequisite because the link line
+# consumes it; the output name is spelled x264.exe so that it matches the
+# file copied afterwards.
+x264.exe: $(DIR_BUILD) libx264.a $(OBJECTS) x264.obj
+	@echo " L: $(@F)"
+	@cd $(DIR_BUILD) && \
+	$(CC) $(CFLAGS) -o x264.exe x264.obj libx264.a -lm && \
+	cp -f x264.exe $(DIR_SRC)
+
+clean:
+	@echo " Cl: Object files and target lib"
+	@$(RM) $(DIR_BUILD)
+
--- a/build/win32/libx264.dsp
+++ b/build/win32/libx264.dsp
@ -0,0 +1,742 @@
+# Microsoft Developer Studio Project File - Name="libx264" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Static Library" 0x0104
+
+CFG=libx264 - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "libx264.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "libx264.mak" CFG="libx264 - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "libx264 - Win32 Release" (based on "Win32 (x86) Static Library")
+!MESSAGE "libx264 - Win32 Debug" (based on "Win32 (x86) Static Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "NDEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /D "ARCH_X86" /FD /c
+# SUBTRACT CPP /YX
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"bin/libx264.lib"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "_DEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /D "ARCH_X86" /FD /GZ /c
+# SUBTRACT CPP /YX
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"bin/libx264.lib"
+
+!ENDIF 
+
+# Begin Target
+
+# Name "libx264 - Win32 Release"
+# Name "libx264 - Win32 Debug"
+# Begin Group "Enc"
+
+# PROP Default_Filter ".c"
+# Begin Group "enc_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\encoder\analyse.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\macroblock.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\me.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\ratecontrol.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\set.h
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\encoder\analyse.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\cabac.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\cavlc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\encoder.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\me.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\ratecontrol.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\encoder\set.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/enc_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/enc_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "Core"
+
+# PROP Default_Filter ".c;.h;"
+# Begin Group "core_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\core\bs.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cabac.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\clip1.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\common.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cpu.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\csp.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\dct.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\frame.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\macroblock.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\pixel.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\predict.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\set.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\vlc.h
+# End Source File
+# End Group
+# Begin Group "I386"
+
+# PROP Default_Filter "*.h,*.c,*.asm"
+# Begin Source File
+
+SOURCE=..\..\core\i386\cpu.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\cpu.asm
+InputName=cpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\cpu.asm
+InputName=cpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE="..\..\core\i386\dct-c.c"
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\dct.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\dct.asm
+InputName=dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\dct.asm
+InputName=dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\dct.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\pixel.asm
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\Release
+InputPath=..\..\core\i386\pixel.asm
+InputName=pixel
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\Debug
+InputPath=..\..\core\i386\pixel.asm
+InputName=pixel
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\i386\pixel.h
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\core\cabac.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\common.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\cpu.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\csp.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\dct.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\frame.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\mdate.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\pixel.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\core\predict.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/core_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/core_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "Dec"
+
+# PROP Default_Filter ".c"
+# Begin Group "dec_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\decoder\macroblock.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\set.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\vlc.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\decoder\decoder.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\macroblock.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\set.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\decoder\vlc.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/dec_release"
+# PROP Exclude_From_Build 1
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/dec_debug"
+# PROP Exclude_From_Build 1
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "extras"
+
+# PROP Default_Filter ".c"
+# Begin Group "extras_h"
+
+# PROP Default_Filter ".h"
+# Begin Source File
+
+SOURCE=..\..\extras\getopt.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\extras\stdint.h
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Source File
+
+SOURCE=..\..\extras\getopt.c
+
+!IF  "$(CFG)" == "libx264 - Win32 Release"
+
+# PROP Intermediate_Dir "obj/util_release"
+
+!ELSEIF  "$(CFG)" == "libx264 - Win32 Debug"
+
+# PROP Intermediate_Dir "obj/util_debug"
+
+!ENDIF 
+
+# End Source File
+# End Group
+# End Target
+# End Project
--- a/build/win32/x264.dsp
+++ b/build/win32/x264.dsp
@ -0,0 +1,94 @@
+# Microsoft Developer Studio Project File - Name="x264" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=x264 - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "x264.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "x264.mak" CFG="x264 - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "x264 - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "x264 - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "x264 - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "obj/x264_Release"
+# PROP Intermediate_Dir "obj/x264_Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /I "./core" /I "./encode" /I "./decode" /I "../../extras" /I "../.." /D "NDEBUG" /D "_CONSOLE" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"bin/x264.exe"
+
+!ELSEIF  "$(CFG)" == "x264 - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "obj/x264_Debug"
+# PROP Intermediate_Dir "obj/x264_Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /I "../../core" /I "../../encode" /I "../../decode" /I "../../extras" /I "../.." /D "_DEBUG" /D "_CONSOLE" /D "WIN32" /D "_MBCS" /D "__X264__" /D "HAVE_MMXEXT" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"bin/x264.exe" /pdbtype:sept
+
+!ENDIF 
+
+# Begin Target
+
+# Name "x264 - Win32 Release"
+# Name "x264 - Win32 Debug"
+# Begin Source File
+
+SOURCE=..\..\x264.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\x264.h
+# End Source File
+# End Target
+# End Project
--- a/build/win32/x264.dsw
+++ b/build/win32/x264.dsw
@ -0,0 +1,44 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "libx264"=.\libx264.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Project: "x264"=.\x264.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name libx264
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
--- a/core/bs.h
+++ b/core/bs.h
@ -0,0 +1,423 @@
+/*****************************************************************************
+ * bs.h :
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: bs.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifdef _BS_H
+#warning FIXME Multiple inclusion of bs.h
+#else
+#define _BS_H
+
+/* Bit-stream cursor over a caller-supplied byte buffer (set up by bs_init). */
+typedef struct bs_s
+{
+    uint8_t *p_start;   /* first byte of the buffer */
+    uint8_t *p;         /* byte currently being read or written */
+    uint8_t *p_end;     /* one past the last byte of the buffer */
+
+    int     i_left;    /* number of bits still available in *p (8 = untouched byte) */
+} bs_t;
+
+/* Attach the bit stream s to the i_data bytes starting at p_data and place
+ * the cursor on the first bit of the first byte. */
+static inline void bs_init( bs_t *s, void *p_data, int i_data )
+{
+    uint8_t *p_base = p_data;
+
+    s->i_left  = 8;
+    s->p_start = p_base;
+    s->p       = p_base;
+    s->p_end   = p_base + i_data;
+}
+/* Return the cursor position, in bits, counted from the start of the buffer. */
+static inline int bs_pos( bs_t *s )
+{
+    const int i_bits_used = 8 - s->i_left;
+
+    return( 8 * ( s->p - s->p_start ) + i_bits_used );
+}
+/* Return 1 when the cursor has reached (or passed) the end of the buffer,
+ * 0 otherwise. */
+static inline int bs_eof( bs_t *s )
+{
+    if( s->p < s->p_end )
+    {
+        return 0;
+    }
+    return 1;
+}
+/* Read i_count bits, most significant first, and return them right-aligned.
+ * i_count indexes the 33-entry mask table, so it must lie in [0,32].
+ * Reading stops at p_end: the bits gathered so far are returned in the
+ * positions they would occupy in a complete read, the missing ones being 0. */
+static inline uint32_t bs_read( bs_t *s, int i_count )
+{
+    /* i_mask[n] has the n low bits set.  const: the table is never modified,
+     * so it should be read-only data rather than a writable static in every
+     * file that includes this header. */
+    static const uint32_t i_mask[33] ={0x00,
+                                  0x01,      0x03,      0x07,      0x0f,
+                                  0x1f,      0x3f,      0x7f,      0xff,
+                                  0x1ff,     0x3ff,     0x7ff,     0xfff,
+                                  0x1fff,    0x3fff,    0x7fff,    0xffff,
+                                  0x1ffff,   0x3ffff,   0x7ffff,   0xfffff,
+                                  0x1fffff,  0x3fffff,  0x7fffff,  0xffffff,
+                                  0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff,
+                                  0x1fffffff,0x3fffffff,0x7fffffff,0xffffffff};
+    int      i_shr;
+    uint32_t i_result = 0;
+
+    while( i_count > 0 )
+    {
+        if( s->p >= s->p_end )
+        {
+            break;
+        }
+
+        if( ( i_shr = s->i_left - i_count ) >= 0 )
+        {
+            /* the current byte holds at least as many bits as requested */
+            i_result |= ( *s->p >> i_shr )&i_mask[i_count];
+            s->i_left -= i_count;
+            if( s->i_left == 0 )
+            {
+                s->p++;
+                s->i_left = 8;
+            }
+            return( i_result );
+        }
+        else
+        {
+            /* take what is left of the current byte, shifted up to leave
+             * room for the -i_shr bits still to come, and continue with
+             * the next byte */
+            i_result |= (*s->p&i_mask[s->i_left]) << -i_shr;
+            i_count  -= s->i_left;
+            s->p++;
+            s->i_left = 8;
+        }
+    }
+
+    return( i_result );
+}
+
+#if 0
+/* Only > i386 */
+static uint32_t bswap32( uint32_t x )
+{
+    asm( "bswap   %0": "=r" (x):"0" (x));
+    return x;
+}
+/* work only for i_count <= 32 - 7 */
+static inline uint32_t bs_read( bs_t *s, int i_count )
+{
+    if( s->p < s->p_end && i_count > 0 )
+    {
+#if 0
+        uint32_t i_cache = ((s->p[0] << 24)+(s->p[1] << 16)+(s->p[2] << 8)+s->p[3]) << (8-s->i_left);
+#else
+        uint32_t i_cache = bswap32( *((uint32_t*)s->p) ) << (8-s->i_left);
+#endif
+        uint32_t i_ret = i_cache >> ( 32 - i_count);
+
+        s->i_left -= i_count;
+#if 0
+        if( s->i_left <= 0 )
+        {
+            int i_skip = (8-s->i_left) >> 3;
+
+            s->p += i_skip;
+
+            s->i_left += i_skip << 3;
+        }
+#else
+        while( s->i_left <= 0 )
+        {
+            s->p++;
+            s->i_left += 8;
+        }
+#endif
+        return i_ret;
+    }
+    return 0;
+}
+
+#endif
+/* Read a single bit.  Returns 0 without moving once the end of the buffer
+ * has been reached. */
+static inline uint32_t bs_read1( bs_t *s )
+{
+    unsigned int i_bit;
+
+    if( s->p >= s->p_end )
+    {
+        return 0;
+    }
+
+    s->i_left -= 1;
+    i_bit = ( *s->p >> s->i_left ) & 0x01;
+    if( s->i_left == 0 )
+    {
+        /* byte exhausted: continue with the next one */
+        s->i_left = 8;
+        s->p++;
+    }
+    return i_bit;
+}
+/* Return the next i_count bits without advancing the cursor.
+ * The look-ahead window is the 4 bytes starting at s->p, shifted left by the
+ * bits already consumed in the first byte, so at most 32 - (8 - i_left) bits
+ * are meaningful.  Returns 0 at end of buffer or when i_count <= 0. */
+static inline uint32_t bs_show( bs_t *s, int i_count )
+{
+    if( s->p < s->p_end && i_count > 0 )
+    {
+        uint32_t i_cache = 0;
+        int      i;
+
+        /* Assemble the window in unsigned arithmetic (a byte >= 0x80 shifted
+         * by 24 as an int overflows) and substitute 0 for bytes that lie
+         * beyond p_end instead of reading past the buffer. */
+        for( i = 0; i < 4; i++ )
+        {
+            i_cache <<= 8;
+            if( s->p_end - s->p > i )
+            {
+                i_cache |= s->p[i];
+            }
+        }
+        i_cache <<= 8 - s->i_left;
+
+        return( i_cache >> ( 32 - i_count) );
+    }
+    return 0;
+}
+
+/* TODO optimize */
+/* Advance the cursor by i_count bits.  No check is made against p_end. */
+static inline void bs_skip( bs_t *s, int i_count )
+{
+    /* consume the bits, then step over every byte that was used up */
+    for( s->i_left -= i_count; s->i_left <= 0; s->i_left += 8 )
+    {
+        s->p++;
+    }
+}
+
+
+/* Read an unsigned Exp-Golomb code: count the leading zero bits (at most 32,
+ * stopping early at the end of the buffer), then read that many suffix bits
+ * and return 2^zeros - 1 + suffix. */
+static inline int bs_read_ue( bs_t *s )
+{
+    int      i = 0;
+    uint32_t i_base;
+
+    while( bs_read1( s ) == 0 && s->p < s->p_end && i < 32 )
+    {
+        i++;
+    }
+
+    /* 2^i - 1 computed in unsigned arithmetic: the loop can leave i at 31 or
+     * 32, and shifting the int constant 1 that far is undefined behaviour */
+    i_base = i < 32 ? ( (uint32_t)1 << i ) - 1 : 0xffffffff;
+
+    return( (int)( i_base + bs_read( s, i ) ) );
+}
+/* Read a signed Exp-Golomb code: odd code numbers map to positive values,
+ * even ones to zero or negative values. */
+static inline int bs_read_se( bs_t *s )
+{
+    const int i_code = bs_read_ue( s );
+
+    if( i_code & 0x01 )
+    {
+        return ( i_code + 1 ) / 2;
+    }
+    return -( i_code / 2 );
+}
+
+/* Read a truncated Exp-Golomb code whose largest possible value is x:
+ * a single inverted bit when x == 1, a regular ue code when x > 1, and
+ * nothing (result 0) otherwise. */
+static inline int bs_read_te( bs_t *s, int x )
+{
+    if( x > 1 )
+    {
+        return bs_read_ue( s );
+    }
+    if( x == 1 )
+    {
+        return 1 - bs_read1( s );
+    }
+    return 0;
+}
+
+/* TODO optimize (write x bits at once) */
+/* Write the i_count low bits of i_bits, most significant first.  Writing
+ * stops silently when the end of the buffer is reached. */
+static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
+{
+    while( i_count > 0 && s->p < s->p_end )
+    {
+        /* position, in the current byte, of the bit about to be written */
+        const int i_bit = 1 << ( s->i_left - 1 );
+
+        i_count--;
+        if( ( i_bits >> i_count ) & 0x01 )
+        {
+            *s->p |= i_bit;
+        }
+        else
+        {
+            *s->p &= ~i_bit;
+        }
+
+        if( --s->i_left == 0 )
+        {
+            s->i_left = 8;
+            s->p++;
+        }
+    }
+}
+
+/* Write the lowest bit of i_bits.  Does nothing once the buffer is full. */
+static inline void bs_write1( bs_t *s, uint32_t i_bits )
+{
+    int i_bit;
+
+    if( s->p >= s->p_end )
+    {
+        return;
+    }
+
+    s->i_left -= 1;
+    i_bit = 1 << s->i_left;
+
+    if( i_bits & 0x01 )
+    {
+        *s->p |= i_bit;
+    }
+    else
+    {
+        *s->p &= ~i_bit;
+    }
+
+    if( s->i_left == 0 )
+    {
+        /* byte complete: continue with the next one */
+        s->i_left = 8;
+        s->p++;
+    }
+}
+
+/* Move the cursor to the next byte boundary, leaving the skipped bits as
+ * they are.  No-op when already aligned. */
+static inline void bs_align( bs_t *s )
+{
+    if( s->i_left == 8 )
+    {
+        return;
+    }
+    s->p++;
+    s->i_left = 8;
+}
+/* Pad with zero bits up to the next byte boundary.  No-op when aligned. */
+static inline void bs_align_0( bs_t *s )
+{
+    if( s->i_left == 8 )
+    {
+        return;
+    }
+    bs_write( s, s->i_left, 0 );
+}
+/* Pad with one bits up to the next byte boundary.  No-op when aligned. */
+static inline void bs_align_1( bs_t *s )
+{
+    if( s->i_left != 8 )
+    {
+        /* all i_left remaining bits must be 1; passing the value 1 would
+         * write zeros followed by a single 1 */
+        bs_write( s, s->i_left, ( 1 << s->i_left ) - 1 );
+    }
+}
+
+
+
+/* golomb functions */
+
+/* Write val as an unsigned Exp-Golomb code: i_size - 1 zero bits followed by
+ * the i_size significant bits of val + 1. */
+static inline void bs_write_ue( bs_t *s, unsigned int val )
+{
+    int i_size = 0;
+    /* i_size0_255[n]: number of significant bits of n (1 for n == 0) */
+    static const int i_size0_255[256] =
+    {
+        1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+    };
+
+    if( val == 0 )
+    {
+        bs_write( s, 1, 1 );
+    }
+    else
+    {
+        unsigned int tmp = ++val;
+
+        /* i_size = number of significant bits of val + 1 */
+        if( tmp >= 0x00010000 )
+        {
+            i_size += 16;
+            tmp >>= 16;
+        }
+        if( tmp >= 0x100 )
+        {
+            i_size += 8;
+            tmp >>= 8;
+        }
+        i_size += i_size0_255[tmp];
+
+        /* Prefix and value are written separately: a single bs_write() of
+         * 2 * i_size - 1 bits exceeds 32 bits as soon as i_size > 16, which
+         * makes bs_write() shift its uint32_t argument by 32 or more. */
+        bs_write( s, i_size - 1, 0 );
+        bs_write( s, i_size, val );
+    }
+}
+
+/* Write val as a signed Exp-Golomb code: positive values use the odd code
+ * numbers, zero and negative values the even ones. */
+static inline void bs_write_se( bs_t *s, int val )
+{
+    int i_code;
+
+    if( val > 0 )
+    {
+        i_code = val * 2 - 1;
+    }
+    else
+    {
+        i_code = -val * 2;
+    }
+    bs_write_ue( s, i_code );
+}
+
+/* Write val as a truncated Exp-Golomb code whose largest possible value is
+ * x: one inverted bit when x == 1, a regular ue code when x > 1, nothing
+ * otherwise. */
+static inline void bs_write_te( bs_t *s, int x, int val )
+{
+    if( x > 1 )
+    {
+        bs_write_ue( s, val );
+        return;
+    }
+    if( x == 1 )
+    {
+        /* only the lowest bit of ~val is written */
+        bs_write( s, 1, ~val );
+    }
+}
+
+/* Write the trailing bits: a single 1 bit, then zero bits up to the next
+ * byte boundary. */
+static inline void bs_rbsp_trailing( bs_t *s )
+{
+    bs_write( s, 1, 1 );
+
+    if( s->i_left == 8 )
+    {
+        /* the stop bit happened to end the byte */
+        return;
+    }
+    bs_write( s, s->i_left, 0x00 );
+}
+
+/* Return the number of bits of the unsigned Exp-Golomb code of val, i.e.
+ * 2 * n - 1 where n is the number of significant bits of val + 1. */
+static inline int bs_size_ue( unsigned int val )
+{
+    /* i_size0_254[v]: code size for v in [0,254] */
+    static const int i_size0_254[255] =
+    {
+        1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
+        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+        11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
+        11,11,11,11,11,11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+        13,13,13,13,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+    };
+
+    if( val < 255 )
+    {
+        return i_size0_254[val];
+    }
+    else
+    {
+        int          i_size = 0;
+        unsigned int tmp = val + 1;
+
+        if( tmp == 0 )
+        {
+            /* val + 1 wrapped around (32-bit unsigned int): the code has
+             * 33 significant bits preceded by 32 zeros */
+            return 65;
+        }
+
+        /* Reduce val + 1 to [1,255], adding 2 bits to the size for every
+         * bit shifted out.  The "- 1" that turns it back into a table index
+         * is applied only once, after both range tests: subtracting before
+         * the second test skipped it for val + 1 == 0x1000000 (index 255,
+         * out of bounds) and dropped a bit for e.g. val + 1 == 0x2000000. */
+        if( tmp >= 0x10000 )
+        {
+            i_size += 32;
+            tmp >>= 16;
+        }
+        if( tmp >= 0x100 )
+        {
+            i_size += 16;
+            tmp >>= 8;
+        }
+        return i_size0_254[tmp - 1] + i_size;
+    }
+}
+
+/* Return the number of bits of the signed Exp-Golomb code of val. */
+static inline int bs_size_se( int val )
+{
+    int i_code;
+
+    if( val > 0 )
+    {
+        i_code = val * 2 - 1;
+    }
+    else
+    {
+        i_code = -val * 2;
+    }
+    return bs_size_ue( i_code );
+}
+
+/* Return the number of bits of the truncated Exp-Golomb code of val when the
+ * largest possible value is x: 1 for x == 1, the ue size for x > 1, and 0
+ * otherwise. */
+static inline int bs_size_te( int x, int val )
+{
+    if( x < 1 )
+    {
+        return 0;
+    }
+    return x == 1 ? 1 : bs_size_ue( val );
+}
+
+
+
+#endif
--- a/core/cabac.c
+++ b/core/cabac.c
--- a/core/cabac.h
+++ b/core/cabac.h
@ -0,0 +1,78 @@
+/*****************************************************************************
+ * cabac.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cabac.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CABAC_H
+#define _CABAC_H 1
+
+typedef struct
+{
+    /* model: an init-model choice and its cost, 3 slots (presumably one per slice type -- confirm) */
+    struct
+    {
+        int i_model;
+        int i_cost;
+    } slice[3];
+
+    /* context: one record per context, 399 in total (presumably indexed by i_ctx_idx -- confirm) */
+    struct
+    {
+        int i_state;
+        int i_mps;      /* presumably the current most probable symbol */
+        int i_count;
+    } ctxstate[399];
+
+    /* state (presumably the arithmetic coder interval) */
+    int i_low;
+    int i_range;
+
+    int i_sym_cnt;
+
+    /* bit stream: s is the bs_t handed to x264_cabac_decode_init / x264_cabac_encode_init */
+    int b_first_bit;
+    int i_bits_outstanding;
+    bs_t *s;
+
+} x264_cabac_t;
+
+/* encoder/decoder: init the contexts given i_slice_type, the quantizer (i_qp) and the model (i_model) */
+void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
+
+/* decoder only: */
+void x264_cabac_decode_init    ( x264_cabac_t *cb, bs_t *s );
+int  x264_cabac_decode_decision( x264_cabac_t *cb, int i_ctx_idx );
+int  x264_cabac_decode_bypass  ( x264_cabac_t *cb );
+int  x264_cabac_decode_terminal( x264_cabac_t *cb );
+
+/* encoder only: adaptive model init */
+void x264_cabac_model_init( x264_cabac_t *cb );
+int  x264_cabac_model_get ( x264_cabac_t *cb, int i_slice_type );
+void x264_cabac_model_update( x264_cabac_t *cb, int i_slice_type, int i_qp );
+/* encoder only: */
+void x264_cabac_encode_init ( x264_cabac_t *cb, bs_t *s );
+void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx_idx, int b );
+void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
+void x264_cabac_encode_terminal( x264_cabac_t *cb, int b );
+void x264_cabac_encode_flush( x264_cabac_t *cb );
+
+
+#endif
--- a/core/clip1.h
+++ b/core/clip1.h
@ -0,0 +1,71 @@
+/*****************************************************************************
+ * clip1.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: clip1.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CLIP1_H
+#define _CLIP1_H 1
+
+/* Clip1 lookup table: clamps a filter output to 0..255.
+ * XXX : only valid for the tap filter output range.
+ *
+ * With tap filter (( 1, -5, 20, 20, -5, 1 ) + 16 )/ 32
+ * -> (-2*5 * 255+16)/32 <= out <= (2*1*255 + 2*20*255+16)/32
+ * -> -80 <= out <= 335
+ * So the table has 80+1+335 = 416 entries; entry [x + 80] is Clip1( x ).
+ */
+
+static const uint8_t x264_mc_clip1_table[80+1+335] =
+{
+    /* -80 -> -1 */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,
+    /* 0 -> 255 */
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+    36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+    54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+    72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+    90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,101,102,103,104,105,106,107,
+    108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
+    126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
+    144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
+    162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
+    180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
+    198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
+    216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
+    234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
+    252,253,254,255,
+    /* 256 -> 335 */
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,
+};
+
+/* Clip1( x ) by table lookup.  No range check is done: x must lie in
+ * -80..335, the span covered by x264_mc_clip1_table. */
+static inline uint8_t x264_mc_clip1( int x )
+{
+    const uint8_t *p_zero = &x264_mc_clip1_table[80];
+
+    return p_zero[x];
+}
+
+#endif
--- a/core/common.c
+++ b/core/common.c
@ -0,0 +1,300 @@
+/*****************************************************************************
+ * common.c: h264 library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: common.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "common.h"
+#include "cpu.h"
+
+/****************************************************************************
+ * x264_param_default:
+ ****************************************************************************/
+void    x264_param_default( x264_param_t *param )
+{
+    /* Start from an all-zero structure, then fill in the defaults.
+     * As a side effect the detected CPU capabilities are printed on stderr. */
+    memset( param, 0, sizeof( *param ) );
+
+    /* CPU autodetect */
+    param->cpu = x264_cpu_detect();
+    fprintf( stderr, "x264: cpu capabilities: %s%s%s%s%s%s\n",
+             ( param->cpu & X264_CPU_MMX )     ? "MMX "     : "",
+             ( param->cpu & X264_CPU_MMXEXT )  ? "MMXEXT "  : "",
+             ( param->cpu & X264_CPU_SSE )     ? "SSE "     : "",
+             ( param->cpu & X264_CPU_SSE2 )    ? "SSE2 "    : "",
+             ( param->cpu & X264_CPU_3DNOW )   ? "3DNow! "  : "",
+             ( param->cpu & X264_CPU_ALTIVEC ) ? "Altivec " : "" );
+
+    /* Video properties: I420 at 25 fps, size and sample aspect ratio unset */
+    param->i_csp            = X264_CSP_I420;
+    param->f_fps            = 25.0;
+    param->i_width          = 0;
+    param->i_height         = 0;
+    param->vui.i_sar_width  = 0;
+    param->vui.i_sar_height = 0;
+
+    /* Encoder parameters: frame structure */
+    param->i_frame_reference = 1;
+    param->i_idrframe        = 2;
+    param->i_iframe          = 60;
+    param->i_bframe          = 0;
+
+    /* Deblocking filter enabled with zero offsets */
+    param->b_deblocking_filter         = 1;
+    param->i_deblocking_filter_alphac0 = 0;
+    param->i_deblocking_filter_beta    = 0;
+
+    /* Entropy coding: CABAC off */
+    param->b_cabac          = 0;
+    param->i_cabac_init_idc = -1;
+
+    /* Rate / quantizer */
+    param->i_bitrate     = 3000;
+    param->i_qp_constant = 26;
+
+    /* Partition analysis flags */
+    param->analyse.intra = X264_ANALYSE_I4x4;
+    param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16;
+}
+
+/****************************************************************************
+ * x264_picture_alloc:
+ ****************************************************************************/
+/* Allocate the sample buffer of pic for an i_width x i_height picture in
+ * colorspace i_csp and set up img.i_plane, img.plane[] and img.i_stride[].
+ * All planes live in one x264_malloc() block owned by plane[0].
+ *
+ * On an invalid colorspace ("invalid CSP" is printed on stderr) or when the
+ * allocation fails, img.i_plane is 0 and plane[0..2] are NULL, so the
+ * picture can still be passed to x264_picture_clean(). */
+void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+{
+    const int i_pixels = i_width * i_height;
+    int i_size = 0;             /* bytes to allocate for the whole picture */
+    int i_packed = 0;           /* bytes per pixel for packed formats, else 0 */
+    int i_chroma_size = 0;      /* bytes in plane[1] (planar formats) */
+    int i_chroma_stride = 0;    /* stride of plane[1] and plane[2] */
+    int i;
+
+    pic->i_type = X264_TYPE_AUTO;
+    pic->i_qpplus1 = 0;
+    pic->img.i_csp = i_csp;
+
+    /* defined state for every exit path */
+    pic->img.i_plane = 0;
+    for( i = 0; i < 3; i++ )
+    {
+        pic->img.plane[i] = NULL;
+        pic->img.i_stride[i] = 0;
+    }
+
+    switch( i_csp & X264_CSP_MASK )
+    {
+        case X264_CSP_I420:
+        case X264_CSP_YV12:
+            i_size          = 3 * i_pixels / 2;
+            i_chroma_size   = i_pixels / 4;
+            i_chroma_stride = i_width / 2;
+            break;
+
+        case X264_CSP_I422:
+            i_size          = 2 * i_pixels;
+            i_chroma_size   = i_pixels / 2;
+            i_chroma_stride = i_width / 2;
+            break;
+
+        case X264_CSP_I444:
+            i_size          = 3 * i_pixels;
+            i_chroma_size   = i_pixels;
+            i_chroma_stride = i_width;
+            break;
+
+        case X264_CSP_YUYV:
+            i_packed = 2;
+            break;
+
+        case X264_CSP_RGB:
+        case X264_CSP_BGR:
+            i_packed = 3;
+            break;
+
+        case X264_CSP_BGRA:
+            i_packed = 4;
+            break;
+
+        default:
+            fprintf( stderr, "invalid CSP\n" );
+            return;
+    }
+    if( i_packed > 0 )
+    {
+        i_size = i_packed * i_pixels;
+    }
+
+    pic->img.plane[0] = x264_malloc( i_size );
+    if( pic->img.plane[0] == NULL )
+    {
+        /* out of memory: leave the picture empty instead of deriving
+         * plane pointers from NULL */
+        return;
+    }
+
+    if( i_packed > 0 )
+    {
+        pic->img.i_plane = 1;
+        pic->img.i_stride[0] = i_packed * i_width;
+    }
+    else
+    {
+        pic->img.i_plane = 3;
+        pic->img.plane[1] = pic->img.plane[0] + i_pixels;
+        pic->img.plane[2] = pic->img.plane[1] + i_chroma_size;
+        pic->img.i_stride[0] = i_width;
+        pic->img.i_stride[1] = i_chroma_stride;
+        pic->img.i_stride[2] = i_chroma_stride;
+    }
+}
+
+/****************************************************************************
+ * x264_picture_clean:
+ ****************************************************************************/
+void x264_picture_clean( x264_picture_t *pic )
+{
+    /* plane[0] is the pointer x264_picture_alloc() got from x264_malloc();
+     * the other planes point into that same block */
+    x264_free( pic->img.plane[0] );
+
+    /* wipe every field so no stale pointer survives */
+    memset( pic, 0, sizeof( *pic ) );
+}
+
+/****************************************************************************
+ * x264_nal_encode:
+ ****************************************************************************/
+/* Serialise nal into p_data: optional 4-byte start code (b_annexeb), the
+ * one-byte NAL header, then the payload with a 0x03 byte inserted after any
+ * two zero bytes that are followed by a byte <= 0x03.
+ * The number of bytes written is stored in *pi_data and returned. */
+int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal )
+{
+    uint8_t *p_out = p_data;
+    uint8_t *p_in  = nal->p_payload;
+    uint8_t *p_end = &nal->p_payload[nal->i_payload];
+
+    int i_zeros = 0;    /* zero bytes copied in a row */
+
+    /* FIXME this code doesn't check overflow */
+
+    if( b_annexeb )
+    {
+        /* long nal start code (we always use long ones)*/
+        static const uint8_t startcode[4] = { 0x00, 0x00, 0x00, 0x01 };
+
+        memcpy( p_out, startcode, sizeof( startcode ) );
+        p_out += sizeof( startcode );
+    }
+
+    /* nal header */
+    *p_out++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
+
+    for( ; p_in < p_end; p_in++ )
+    {
+        if( i_zeros == 2 && *p_in <= 0x03 )
+        {
+            /* escape byte, which also restarts the zero run */
+            *p_out++ = 0x03;
+            i_zeros = 0;
+        }
+        i_zeros = ( *p_in == 0 ) ? i_zeros + 1 : 0;
+        *p_out++ = *p_in;
+    }
+    *pi_data = p_out - (uint8_t*)p_data;
+
+    return *pi_data;
+}
+
+/****************************************************************************
+ * x264_nal_decode:
+ ****************************************************************************/
+/* Parse one NAL unit of i_data bytes at p_data (header byte first, no start
+ * code): fill nal->i_type and nal->i_ref_idc, copy the payload into
+ * nal->p_payload with the 0x03 escape bytes removed and store the payload
+ * length in nal->i_payload.
+ * nal->p_payload must already point to a buffer of at least i_data - 1 bytes.
+ * Returns 0, or -1 when i_data < 1 (nothing to parse). */
+int x264_nal_decode( x264_nal_t *nal, void *p_data, int i_data )
+{
+    uint8_t *src = p_data;
+    uint8_t *end;
+    uint8_t *dst = nal->p_payload;
+
+    if( i_data < 1 )
+    {
+        /* not even a NAL header byte to read */
+        nal->i_payload = 0;
+        return -1;
+    }
+    end = &src[i_data];
+
+    nal->i_type    = src[0]&0x1f;
+    nal->i_ref_idc = (src[0] >> 5)&0x03;
+
+    src++;
+
+    while( src < end )
+    {
+        /* 00 00 03 -> 00 00.  Compare the remaining length instead of
+         * forming "end - 3", so that an escape sequence made of the last
+         * three bytes is handled and short inputs stay inside the buffer. */
+        if( end - src >= 3 && src[0] == 0x00 && src[1] == 0x00  && src[2] == 0x03 )
+        {
+            *dst++ = 0x00;
+            *dst++ = 0x00;
+
+            src += 3;
+            continue;
+        }
+        *dst++ = *src++;
+    }
+
+    /* the payload length is measured in the destination buffer */
+    nal->i_payload = dst - (uint8_t*)nal->p_payload;
+    return 0;
+}
+
+
+
+/****************************************************************************
+ * x264_malloc:
+ ****************************************************************************/
+/* Allocate i_size bytes aligned on 16 bytes.  Returns NULL when i_size is
+ * negative or the underlying allocation fails.  The result must be released
+ * with x264_free(). */
+void *x264_malloc( int i_size )
+{
+#ifdef HAVE_MALLOC_H
+    if( i_size < 0 )
+    {
+        return NULL;
+    }
+    return memalign( 16, i_size );
+#else
+    /* Emulated memalign.  Layout of the malloc()ed block:
+     *   [padding][int i_size][void *buf][16-byte aligned user area]
+     * x264_free() reads the saved buf pointer back, x264_realloc() the
+     * saved size. */
+    const size_t i_header = sizeof( void ** ) + sizeof( int );
+    uint8_t * buf;
+    uint8_t * align_buf;
+
+    if( i_size < 0 )
+    {
+        return NULL;
+    }
+    buf = malloc( (size_t)i_size + 15 + i_header );
+    if( buf == NULL )
+    {
+        /* do not write the header through a pointer derived from NULL */
+        return NULL;
+    }
+    align_buf = buf + 15 + i_header;
+    align_buf -= (uintptr_t) align_buf & 15;
+    *( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
+    *( (int *) ( align_buf - i_header ) ) = i_size;
+    return align_buf;
+#endif
+}
+
+/****************************************************************************
+ * x264_free:
+ ****************************************************************************/
+/* Release a block obtained from x264_malloc(); a NULL p is ignored. */
+void x264_free( void *p )
+{
+    if( p == NULL )
+    {
+        return;
+    }
+#ifdef HAVE_MALLOC_H
+    free( p );
+#else
+    /* the pointer to hand to free() is stored in the slot just below p */
+    free( ( (void **) p )[-1] );
+#endif
+}
+
+/****************************************************************************
+ * x264_realloc:
+ ****************************************************************************/
+/* Resize a block obtained from x264_malloc() to i_size bytes, keeping the
+ * first min( old size, i_size ) bytes.  A NULL p acts like x264_malloc().
+ * Returns the new block, or NULL on failure, in which case p is left
+ * untouched and still valid. */
+void *x264_realloc( void *p, int i_size )
+{
+#ifdef HAVE_MALLOC_H
+    /* NOTE(review): realloc() is not documented to keep the 16-byte
+     * alignment requested from memalign() in x264_malloc() -- confirm that
+     * callers of this path do not depend on it. */
+    return realloc( p, i_size );
+#else
+    int       i_old_size = 0;
+    uint8_t * p_new;
+
+    if( p )
+    {
+        /* The size is stored sizeof(void **) + sizeof(int) BYTES below p
+         * (see x264_malloc), so step back on a byte pointer and only then
+         * cast to int *. */
+        i_old_size = *( (int *) ( (uint8_t *) p - sizeof( void ** ) -
+                                  sizeof( int ) ) );
+    }
+    p_new = x264_malloc( i_size );
+    if( p_new == NULL )
+    {
+        /* as with realloc(): the old block stays valid on failure */
+        return NULL;
+    }
+    if( i_old_size > 0 && i_size > 0 )
+    {
+        memcpy( p_new, p, ( i_old_size < i_size ) ? i_old_size : i_size );
+    }
+    x264_free( p );
+    return p_new;
+#endif
+}
+
--- a/core/common.h
+++ b/core/common.h
@ -0,0 +1,344 @@
+/*****************************************************************************
+ * common.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: common.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _COMMON_H
+#define _COMMON_H 1
+
+#include <stdint.h>
+
+#include "../x264.h"
+#include "bs.h"
+#include "set.h"
+#include "predict.h"
+#include "pixel.h"
+#include "mc.h"
+#include "frame.h"
+#include "dct.h"
+#include "cabac.h"
+#include "csp.h"
+
+#define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) )   /* NB: function-like macros -- the selected argument is evaluated twice */
+#define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) )
+#define X264_ABS(a)   ( (a)< 0 ? -(a) : (a) )
+
+/* x264_malloc : will do or emulate a memalign (16-byte alignment)
+ * XXX you HAVE TO use x264_free for any buffer allocated
+ * with x264_malloc
+ */
+void *x264_malloc( int );
+void *x264_realloc( void *p, int i_size );
+void  x264_free( void * );
+
+/* mdate: return the current date in microseconds */
+int64_t x264_mdate( void );
+
+/* Clamp v to [i_min, i_max].  The lower bound is tested first, so it wins
+ * if the two bounds are ever passed in the wrong order. */
+static inline int x264_clip3( int v, int i_min, int i_max )
+{
+    if( v < i_min )
+    {
+        return i_min;
+    }
+    return v > i_max ? i_max : v;
+}
+
+enum slice_type_e   /* the five slice types, numbered 0..4 */
+{
+    SLICE_TYPE_P  = 0,
+    SLICE_TYPE_B  = 1,
+    SLICE_TYPE_I  = 2,
+    SLICE_TYPE_SP = 3,
+    SLICE_TYPE_SI = 4
+};
+
+typedef struct  /* fields of one slice header, plus the SPS/PPS it refers to */
+{
+    x264_sps_t *sps;
+    x264_pps_t *pps;
+
+    int i_type;         /* presumably an enum slice_type_e value -- confirm */
+    int i_first_mb;
+
+    int i_pps_id;
+
+    int i_frame_num;
+
+    int b_field_pic;
+    int b_bottom_field;
+
+    int i_idr_pic_id;   /* -1 if nal_type != 5 */
+
+    int i_poc_lsb;
+    int i_delta_poc_bottom;
+
+    int i_delta_poc[2];
+    int i_redundant_pic_cnt;
+
+    int b_direct_spatial_mv_pred;
+
+    int b_num_ref_idx_override;
+    int i_num_ref_idx_l0_active;
+    int i_num_ref_idx_l1_active;
+
+    int i_cabac_init_idc;
+
+    int i_qp_delta;
+    int b_sp_for_swidth;    /* NOTE(review): name looks like a typo of "sp_for_switch" -- confirm before renaming */
+    int i_qs_delta;
+
+    /* deblocking filter */
+    int i_disable_deblocking_filter_idc;
+    int i_alpha_c0_offset;
+    int i_beta_offset;
+
+} x264_slice_header_t;
+
+/* From ffmpeg: x264_scan8[] maps a block index to a cell (x + y*8) of an
+ * 8-column by 6-row grid (X264_SCAN8_SIZE cells); X264_SCAN8_0 is the first luma cell. */
+#define X264_SCAN8_SIZE (6*8)
+#define X264_SCAN8_0 (4+1*8)
+
+static const int x264_scan8[16+2*4] =
+{
+    /* Luma */
+    4+1*8, 5+1*8, 4+2*8, 5+2*8,
+    6+1*8, 7+1*8, 6+2*8, 7+2*8,
+    4+3*8, 5+3*8, 4+4*8, 5+4*8,
+    6+3*8, 7+3*8, 6+4*8, 7+4*8,
+
+    /* Cb */
+    1+1*8, 2+1*8,
+    1+2*8, 2+2*8,
+
+    /* Cr */
+    1+4*8, 2+4*8,
+    1+5*8, 2+5*8,
+};
+/* Grid cells used by the table (columns across, rows down; L: luma, B: Cb, R: Cr):
+   0 1 2 3 4 5 6 7
+ 0
+ 1   B B   L L L L
+ 2   B B   L L L L
+ 3         L L L L
+ 4   R R   L L L L
+ 5   R R
+*/
+
+#define X264_BFRAME_MAX 16
+
+typedef struct x264_ratecontrol_t   x264_ratecontrol_t;
+typedef struct x264_vlc_table_t     x264_vlc_table_t;
+
+struct x264_t   /* codec handle: parameters, stream state, per-macroblock tables/cache and function tables */
+{
+    /* encoder parameters */
+    x264_param_t    param;
+
+    /* bitstream output */
+    struct
+    {
+        int         i_nal;
+        x264_nal_t  nal[3];         /* for now 3 is enough */
+        int         i_bitstream;    /* size of p_bitstream */
+        uint8_t     *p_bitstream;   /* will hold data for all nal */
+        bs_t        bs;
+    } out;
+
+    /* frame number/poc */
+    int             i_frame;
+    int             i_poc;
+
+    int             i_frame_offset; /* decoding only */
+    int             i_frame_num;    /* decoding only */
+    int             i_poc_msb;      /* decoding only */
+    int             i_poc_lsb;      /* decoding only */
+
+    /* We use only one SPS and one PPS */
+    x264_sps_t      sps_array[32];
+    x264_sps_t      *sps;
+    x264_pps_t      pps_array[256];
+    x264_pps_t      *pps;
+    int             i_idr_pic_id;
+
+    /* Slice header */
+    x264_slice_header_t sh;
+
+    /* cabac context */
+    x264_cabac_t    cabac;
+
+    struct
+    {
+        /* Frames to be encoded */
+        x264_frame_t *current[X264_BFRAME_MAX+1];
+        /* Temporary buffer (eg B frames pending until we reach the I/P) */
+        x264_frame_t *next[X264_BFRAME_MAX+1];
+        /* Unused frames */
+        x264_frame_t *unused[X264_BFRAME_MAX+1];
+
+        /* frames used for reference, +1 for decoding */
+        x264_frame_t *reference[16+1];
+
+        int i_last_idr; /* How many non-IDR I frames since the last IDR */
+        int i_last_i;   /* How many P/B frames since the last I */
+    } frames;
+
+    /* current frame being encoded */
+    x264_frame_t    *fenc;
+
+    /* frame being reconstructed */
+    x264_frame_t    *fdec;
+
+    /* reference lists */
+    int             i_ref0;
+    x264_frame_t    *fref0[16];       /* ref list 0 */
+    int             i_ref1;
+    x264_frame_t    *fref1[16];       /* ref list 1 */
+
+
+
+    /* Current MB DCT coeffs */
+    struct
+    {
+        DECLARE_ALIGNED( int, luma16x16_dc[16], 16 );
+        DECLARE_ALIGNED( int, chroma_dc[2][4], 16 );
+        struct
+        {
+            DECLARE_ALIGNED( int, residual_ac[15], 16 );
+            DECLARE_ALIGNED( int, luma4x4[16], 16 );
+        } block[16+8];
+    } dct;
+
+    /* MB table and cache for current frame/mb */
+    struct
+    {
+        /* Strides */
+        int     i_mb_stride;
+
+        /* Current index */
+        int     i_mb_x;
+        int     i_mb_y;
+        int     i_mb_xy;
+
+        unsigned int i_neighbour;
+
+        /* mb table */
+        int8_t  *type;                      /* mb type */
+        int8_t  *qp;                        /* mb qp */
+        int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
+        int8_t  (*intra4x4_pred_mode)[7];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
+        uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
+        int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
+        int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
+        int16_t (*mvd[2])[2];               /* mb mv difference with predict. set to 0 if intra. cabac only */
+        int8_t   *ref[2];                   /* mb ref. set to -1 if not used (intra or Lx only) */
+
+        /* current value */
+        int     i_type;
+        int     i_partition;
+        int     i_sub_partition[4];
+
+        int     i_cbp_luma;
+        int     i_cbp_chroma;
+
+        int     i_intra16x16_pred_mode;
+        int     i_chroma_pred_mode;
+
+        struct
+        {
+            /* pointer over mb of the frame to be compressed */
+            uint8_t *p_fenc[3];
+
+            /* pointer over mb of the frame to be reconstructed */
+            uint8_t *p_fdec[3];
+
+            /* pointer over mb of the references */
+            uint8_t *p_fref[2][16][3];
+
+            /* common stride */
+            int     i_stride[3];
+        } pic;
+
+        /* cache */
+        struct
+        {
+            /* real intra4x4_pred_mode if I_4X4, I_PRED_4x4_DC if mb available, -1 if not */
+            int     intra4x4_pred_mode[X264_SCAN8_SIZE];
+
+            /* i_non_zero_count if available else 0x80 */
+            int     non_zero_count[X264_SCAN8_SIZE];
+
+            /* -1 if unused, -2 if unavailable */
+            int8_t  ref[2][X264_SCAN8_SIZE];
+
+            /* 0 if not available */
+            int16_t mv[2][X264_SCAN8_SIZE][2];
+            int16_t mvd[2][X264_SCAN8_SIZE][2];
+        } cache;
+
+        /* qp tracking */
+        int     i_last_qp;  /* last qp */
+        int     i_last_dqp; /* last delta qp */
+
+    } mb;
+
+    /* rate control encoding only */
+    x264_ratecontrol_t *rc;
+
+    /* stats */
+    struct
+    {
+        /* per slice info */
+        int   i_slice_count[5];
+        int   i_slice_size[5];
+        float f_psnr_y[5];
+        float f_psnr_u[5];
+        float f_psnr_v[5];
+        int   i_mb_count[5][18];
+    } stat;
+
+    /* CPU-dependent functions */
+    x264_predict_t      predict_16x16[4+3];
+    x264_predict_t      predict_8x8[4+3];
+    x264_predict_t      predict_4x4[9+3];
+
+    x264_pixel_function_t pixf;
+    x264_mc_function_t    mc[2];
+    x264_dct_function_t   dctf;
+    x264_csp_function_t   csp;
+
+    /* vlc table for decoding purpose only */
+    x264_vlc_table_t *x264_coeff_token_lookup[5];
+    x264_vlc_table_t *x264_level_prefix_lookup;
+    x264_vlc_table_t *x264_total_zeros_lookup[15];
+    x264_vlc_table_t *x264_total_zeros_dc_lookup[3];
+    x264_vlc_table_t *x264_run_before_lookup[7];
+};
+
+#endif
+
--- a/core/cpu.c
+++ b/core/cpu.c
@ -0,0 +1,233 @@
+/*****************************************************************************
+ * cpu.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cpu.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "cpu.h"
+
+#ifdef ARCH_X86
+extern int  x264_cpu_cpuid_test( void );
+extern uint32_t  x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
+extern void x264_emms( void );
+
+uint32_t x264_cpu_detect( void )
+{
+    uint32_t cpu = 0;
+
+    uint32_t eax, ebx, ecx, edx;
+    int      b_amd;
+
+    /* x86 build: returns an OR of X264_CPU_* flags, or 0 when cpuid or MMX is missing */
+    if( !x264_cpu_cpuid_test() )
+    {
+        /* No cpuid */
+        return 0;
+    }
+
+    x264_cpu_cpuid( 0, &eax, &ebx, &ecx, &edx);
+    if( eax == 0 )
+    {
+        return 0;
+    }
+    b_amd   = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);   /* ASCII "Auth" / "cAMD" / "enti", i.e. "AuthenticAMD" */
+
+    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+    if( (edx&0x00800000) == 0 )
+    {
+        /* No MMX: nothing else is reported either */
+        return 0;
+    }
+    cpu = X264_CPU_MMX;
+    if( (edx&0x02000000) )
+    {
+        /* SSE - identical to AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
+    }
+    if( (edx&0x04000000) )
+    {
+        /* SSE2 -- original author's note: Is it OK ? */
+        cpu |= X264_CPU_SSE2;
+    }
+
+    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
+    if( eax < 0x80000001 )
+    {
+        /* no extended capabilities */
+        return cpu;
+    }
+
+    x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
+    if( edx&0x80000000 )
+    {
+        cpu |= X264_CPU_3DNOW;
+    }
+    if( b_amd && (edx&0x00400000) )
+    {
+        /* AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT;
+    }
+
+    return cpu;     /* NOTE(review): X264_CPU_3DNOWEXT is never set here although x264_cpu_restore tests it */
+}
+
+/* Call x264_emms() when cpu carries any of the MMX / MMXEXT / 3DNow! /
+ * 3DNow!Ext flags; otherwise do nothing. */
+void     x264_cpu_restore( uint32_t cpu )
+{
+    if( ( cpu & ( X264_CPU_MMX | X264_CPU_MMXEXT |
+                  X264_CPU_3DNOW | X264_CPU_3DNOWEXT ) ) == 0 )
+    {
+        return;
+    }
+    x264_emms();
+}
+
+
+#if 0
+/*
+ * XXX: disabled (#if 0) inline-asm variant of x264_cpu_detect, adapted from libmpeg2 */
+#if 0
+#define cpuid(op,eax,ebx,ecx,edx)   \
+    __asm__ ("push %%ebx\n\t"       \
+             "cpuid\n\t"            \
+             "movl %%ebx,%1\n\t"    \
+             "pop %%ebx"        \
+             : "=a" (eax),      \
+               "=r" (ebx),      \
+               "=c" (ecx),      \
+               "=d" (edx)       \
+             : "a" (op)         \
+             : "cc")
+#endif
+
+uint32_t x264_cpu_detect( void )
+{
+    uint32_t cpu = 0;
+
+    uint32_t eax, ebx, ecx, edx;
+    int      b_amd;
+
+
+    /* Test if cpuid is supported */
+    asm volatile(
+        "pushf\n"
+        "pushf\n"
+        "pop %0\n"
+        "movl %0,%1\n"
+        "xorl $0x200000,%0\n"
+        "push %0\n"
+        "popf\n"
+        "pushf\n"
+        "pop %0\n"
+        "popf\n"
+         : "=r" (eax), "=r" (ebx) : : "cc");
+
+    if( eax == ebx )
+    {
+        /* No cpuid */
+        return 0;
+    }
+
+    cpuid( 0, eax, ebx, ecx, edx);
+    if( eax == 0 )
+    {
+        return 0;
+    }
+    b_amd   = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
+
+    cpuid( 1, eax, ebx, ecx, edx );
+    if( (edx&0x00800000) == 0 )
+    {
+        /* No MMX */
+        return 0;
+    }
+    cpu = X264_CPU_MMX;
+    if( (edx&0x02000000) )
+    {
+        /* SSE - identical to AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
+    }
+    if( (edx&0x04000000) )
+    {
+        /* Is it OK ? */
+        cpu |= X264_CPU_SSE2;
+    }
+
+    cpuid( 0x80000000, eax, ebx, ecx, edx );
+    if( eax < 0x80000001 )
+    {
+        /* no extended capabilities */
+        return cpu;
+    }
+
+    cpuid( 0x80000001, eax, ebx, ecx, edx );
+    if( edx&0x80000000 )
+    {
+        cpu |= X264_CPU_3DNOW;
+    }
+    if( b_amd && (edx&0x00400000) )
+    {
+        /* AMD MMX extensions */
+        cpu |= X264_CPU_MMXEXT;
+    }
+
+    return cpu;
+}
+#endif
+
+#elif defined( HAVE_ALTIVEC )
+#include <sys/sysctl.h>
+
+/* Altivec build: query sysctl { CTL_HW, HW_VECTORUNIT } and return
+ * X264_CPU_ALTIVEC when the call succeeds with a non-zero value, else 0. */
+uint32_t x264_cpu_detect( void )
+{
+    /* Thx VLC */
+    int      selectors[2] = { CTL_HW, HW_VECTORUNIT };
+    int      has_altivec = 0;
+    size_t   length = sizeof( has_altivec );
+
+    if( sysctl( selectors, 2, &has_altivec, &length, NULL, 0 ) != 0 )
+    {
+        return 0;
+    }
+    if( has_altivec == 0 )
+    {
+        return 0;
+    }
+    return X264_CPU_ALTIVEC;
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{   /* HAVE_ALTIVEC build: empty body, cpu is ignored */
+}
+
+#else
+
+uint32_t x264_cpu_detect( void )
+{
+    return 0;   /* neither ARCH_X86 nor HAVE_ALTIVEC: no capability flags are reported */
+}
+
+void     x264_cpu_restore( uint32_t cpu )
+{   /* neither ARCH_X86 nor HAVE_ALTIVEC: empty body, cpu is ignored */
+}
+
+#endif
+
--- a/core/cpu.h
+++ b/core/cpu.h
@ -0,0 +1,32 @@
+/*****************************************************************************
+ * cpu.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cpu.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CPU_H
+#define _CPU_H 1
+
+uint32_t x264_cpu_detect( void );
+
+/* probably MMX(EXT) centric but .... */
+void     x264_cpu_restore( uint32_t cpu );
+
+#endif
--- a/core/csp.c
+++ b/core/csp.c
@ -0,0 +1,379 @@
+/*****************************************************************************
+ * csp.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: csp.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common.h"
+
+/* Copy h rows of w bytes from src to dst.  i_src / i_dst are the byte
+ * distances between consecutive rows of each buffer (i_src may be negative,
+ * which walks the source upwards). */
+static inline void plane_copy( uint8_t *dst, int i_dst,
+                               uint8_t *src, int i_src, int w, int h)
+{
+    int i_rows_left = h;
+
+    while( i_rows_left > 0 )
+    {
+        memcpy( dst, src, w );
+
+        src += i_src;
+        dst += i_dst;
+        i_rows_left--;
+    }
+}
+/* Same as plane_copy() but the source is read bottom-up: start on its last
+ * row and step backwards by i_src. */
+static inline void plane_copy_vflip( uint8_t *dst, int i_dst,
+                                     uint8_t *src, int i_src, int w, int h)
+{
+    uint8_t *p_last_row = src + (h -1)*i_src;
+
+    plane_copy( dst, i_dst, p_last_row, -i_src, w, h );
+}
+
+/* Vertical 2:1 subsampling: each of the h output rows (w bytes wide) is the
+ * rounded-up average of two source rows that are i_src bytes apart; the
+ * source then advances by two rows. */
+static inline void plane_subsamplev2( uint8_t *dst, int i_dst,
+                                      uint8_t *src, int i_src, int w, int h)
+{
+    for( ; h > 0; h-- )
+    {
+        int x;
+
+        for( x = 0; x < w; x++ )
+        {
+            dst[x] = ( src[x] + src[x + i_src] + 1 ) >> 1;
+        }
+
+        src += 2 * i_src;
+        dst += i_dst;
+    }
+}
+
+/* Vertically flipped plane_subsamplev2(): start on source row 2*h-1 and
+ * walk upwards by negating the stride. */
+static inline void plane_subsamplev2_vlip( uint8_t *dst, int i_dst,
+                                           uint8_t *src, int i_src, int w, int h)
+{
+    uint8_t *p_bottom = src + (2*h-1)*i_src;
+
+    plane_subsamplev2( dst, i_dst, p_bottom, -i_src, w, h );
+}
+
+/* 2:1 subsampling in both directions: every output byte is built from a
+ * 2x2 group of source bytes as ( sum + 1 ) >> 2.  Produces h rows of w
+ * bytes, consuming 2*h source rows of 2*w bytes. */
+static inline void plane_subsamplehv2( uint8_t *dst, int i_dst,
+                                       uint8_t *src, int i_src, int w, int h)
+{
+    for( ; h > 0; h-- )
+    {
+        int x;
+
+        for( x = 0; x < w; x++ )
+        {
+            const uint8_t *p = &src[2 * x];
+
+            dst[x] = ( p[0] + p[1] + p[i_src] + p[i_src+1] + 1 ) >> 2;
+        }
+
+        src += 2 * i_src;
+        dst += i_dst;
+    }
+}
+
+/* Vertically flipped plane_subsamplehv2(): start on source row 2*h-1 and
+ * walk upwards by negating the stride. */
+static inline void plane_subsamplehv2_vlip( uint8_t *dst, int i_dst,
+                                            uint8_t *src, int i_src, int w, int h)
+{
+    uint8_t *p_bottom = src + (2*h-1)*i_src;
+
+    plane_subsamplehv2( dst, i_dst, p_bottom, -i_src, w, h );
+}
+
+/* Copy a three-plane picture into the frame: plane 0 at full size, planes
+ * 1 and 2 at half width and half height.  When X264_CSP_VFLIP is set in
+ * img->i_csp every plane is copied bottom-up. */
+static void i420_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    int i;
+
+    for( i = 0; i < 3; i++ )
+    {
+        /* only plane 0 is full resolution */
+        const int w = i == 0 ? i_width  : i_width / 2;
+        const int h = i == 0 ? i_height : i_height / 2;
+
+        if( img->i_csp & X264_CSP_VFLIP )
+        {
+            plane_copy_vflip( frm->plane[i], frm->i_stride[i],
+                              img->plane[i], img->i_stride[i], w, h );
+        }
+        else
+        {
+            plane_copy( frm->plane[i], frm->i_stride[i],
+                        img->plane[i], img->i_stride[i], w, h );
+        }
+    }
+}
+
+/* Like i420_to_i420() but with the two half-size planes swapped: image
+ * plane 1 lands in frame plane 2 and image plane 2 in frame plane 1.
+ * X264_CSP_VFLIP in img->i_csp selects bottom-up copies. */
+static void yv12_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    /* frame plane that receives image plane i */
+    static const int i_dst_plane[3] = { 0, 2, 1 };
+    int i;
+
+    for( i = 0; i < 3; i++ )
+    {
+        const int j = i_dst_plane[i];
+        const int w = i == 0 ? i_width  : i_width / 2;
+        const int h = i == 0 ? i_height : i_height / 2;
+
+        if( img->i_csp & X264_CSP_VFLIP )
+        {
+            plane_copy_vflip( frm->plane[j], frm->i_stride[j],
+                              img->plane[i], img->i_stride[i], w, h );
+        }
+        else
+        {
+            plane_copy( frm->plane[j], frm->i_stride[j],
+                        img->plane[i], img->i_stride[i], w, h );
+        }
+    }
+}
+
+/* Plane 0 is copied unchanged; planes 1 and 2 are vertically subsampled
+ * by two (plane_subsamplev2) into half-width, half-height frame planes.
+ * X264_CSP_VFLIP in img->i_csp selects the bottom-up variants. */
+static void i422_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    const int i_cw = i_width / 2;
+    const int i_ch = i_height / 2;
+    int i;
+
+    if( img->i_csp & X264_CSP_VFLIP )
+    {
+        plane_copy_vflip( frm->plane[0], frm->i_stride[0],
+                          img->plane[0], img->i_stride[0],
+                          i_width, i_height );
+
+        for( i = 1; i < 3; i++ )
+        {
+            plane_subsamplev2_vlip( frm->plane[i], frm->i_stride[i],
+                                    img->plane[i], img->i_stride[i],
+                                    i_cw, i_ch );
+        }
+        return;
+    }
+
+    plane_copy( frm->plane[0], frm->i_stride[0],
+                img->plane[0], img->i_stride[0],
+                i_width, i_height );
+
+    for( i = 1; i < 3; i++ )
+    {
+        plane_subsamplev2( frm->plane[i], frm->i_stride[i],
+                           img->plane[i], img->i_stride[i],
+                           i_cw, i_ch );
+    }
+}
+
+/* Plane 0 is copied unchanged; planes 1 and 2 are subsampled by two in
+ * both directions (plane_subsamplehv2) into half-width, half-height frame
+ * planes.  X264_CSP_VFLIP in img->i_csp selects the bottom-up variants. */
+static void i444_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    const int i_cw = i_width / 2;
+    const int i_ch = i_height / 2;
+    int i;
+
+    if( img->i_csp & X264_CSP_VFLIP )
+    {
+        plane_copy_vflip( frm->plane[0], frm->i_stride[0],
+                          img->plane[0], img->i_stride[0],
+                          i_width, i_height );
+
+        for( i = 1; i < 3; i++ )
+        {
+            plane_subsamplehv2_vlip( frm->plane[i], frm->i_stride[i],
+                                     img->plane[i], img->i_stride[i],
+                                     i_cw, i_ch );
+        }
+        return;
+    }
+
+    plane_copy( frm->plane[0], frm->i_stride[0],
+                img->plane[0], img->i_stride[0],
+                i_width, i_height );
+
+    for( i = 1; i < 3; i++ )
+    {
+        plane_subsamplehv2( frm->plane[i], frm->i_stride[i],
+                            img->plane[i], img->i_stride[i],
+                            i_cw, i_ch );
+    }
+}
+/* Unpack a single-plane picture whose bytes repeat as 4-byte groups into
+ * three frame planes.  Per group, bytes 0 and 2 go to frame plane 0;
+ * bytes 1 and 3 are averaged (rounding up) with the same bytes of the next
+ * source row and go to frame planes 1 and 2 respectively.  Rows are
+ * processed in pairs; X264_CSP_VFLIP in img->i_csp makes the source be read
+ * bottom-up. */
+static void yuyv_to_i420( x264_frame_t *frm, x264_image_t *img,
+                          int i_width, int i_height )
+{
+    uint8_t *p_src  = img->plane[0];
+    int     i_pitch = img->i_stride[0];
+
+    uint8_t *p_y = frm->plane[0];
+    uint8_t *p_u = frm->plane[1];
+    uint8_t *p_v = frm->plane[2];
+
+    if( img->i_csp & X264_CSP_VFLIP )
+    {
+        /* start on the last row and step backwards */
+        p_src += ( i_height - 1 ) * i_pitch;
+        i_pitch = -i_pitch;
+    }
+
+    while( i_height > 0 )
+    {
+        uint8_t *p_in, *p_out;
+        int x;
+
+        /* first row of the pair: plane 0 plus both averaged planes */
+        p_in  = p_src;
+        p_out = p_y;
+        for( x = 0; x < i_width; x += 2 )
+        {
+            p_out[x]     = p_in[0];
+            p_out[x + 1] = p_in[2];
+
+            p_u[x / 2] = ( p_in[1] + p_in[1+i_pitch] + 1 ) >> 1;
+            p_v[x / 2] = ( p_in[3] + p_in[3+i_pitch] + 1 ) >> 1;
+
+            p_in += 4;
+        }
+        p_src += i_pitch;
+        p_y   += frm->i_stride[0];
+        p_u   += frm->i_stride[1];
+        p_v   += frm->i_stride[2];
+
+        /* second row of the pair: plane 0 only */
+        p_in  = p_src;
+        p_out = p_y;
+        for( x = 0; x < i_width; x += 2 )
+        {
+            p_out[x]     = p_in[0];
+            p_out[x + 1] = p_in[2];
+            p_in += 4;
+        }
+        p_src += i_pitch;
+        p_y   += frm->i_stride[0];
+
+        i_height -= 2;
+    }
+}
+
+/* Same values as in XviD */
+/* Fixed-point RGB -> YUV weights: FIX() scales a real coefficient by
+ * 2^BITS and rounds it to the nearest integer; users shift the weighted
+ * sum right by BITS again.
+ * NOTE(review): the coefficients look like the usual limited-range
+ * ITU-R BT.601 matrix (luma offset 16, chroma offset 128) - confirm. */
+#define BITS 8
+#define FIX(f) ((int)((f) * (1 << BITS) + 0.5))
+
+/* first output plane (luma) weights and offset */
+#define Y_R   FIX(0.257)
+#define Y_G   FIX(0.504)
+#define Y_B   FIX(0.098)
+#define Y_ADD 16
+
+/* second output plane weights and offset; used as -U_R, -U_G, +U_B */
+#define U_R   FIX(0.148)
+#define U_G   FIX(0.291)
+#define U_B   FIX(0.439)
+#define U_ADD 128
+
+/* third output plane weights and offset; used as +V_R, -V_G, -V_B */
+#define V_R   FIX(0.439)
+#define V_G   FIX(0.368)
+#define V_B   FIX(0.071)
+#define V_ADD 128
+/* RGB_TO_I420( name, POS_R, POS_G, POS_B, S_RGB ) defines a static
+ * converter `name( frm, img, i_width, i_height )` for a packed source in
+ * img->plane[0]:
+ *   - POS_R / POS_G / POS_B are the byte offsets of the three components
+ *     inside one pixel, S_RGB is the size of one pixel in bytes;
+ *   - the picture is walked in 2x2 pixel groups: four luma values are
+ *     written to frm->plane[0] (two on the current row, two on the next),
+ *     and the component sums of the four pixels (cr, cg, cb) produce one
+ *     value each for frm->plane[1] and frm->plane[2]; the extra ">> 2"
+ *     folded into (BITS+2) divides those sums by four;
+ *   - X264_CSP_VFLIP in img->i_csp makes the source be read bottom-up.
+ * The loops step by two, so each iteration touches two rows and two
+ * columns. */
+#define RGB_TO_I420( name, POS_R, POS_G, POS_B, S_RGB ) \
+static void name( x264_frame_t *frm, x264_image_t *img, \
+                  int i_width, int i_height )           \
+{                                                       \
+    uint8_t *src = img->plane[0];                       \
+    int     i_src= img->i_stride[0];                    \
+    int     i_y  = frm->i_stride[0];                    \
+    uint8_t *y   = frm->plane[0];                       \
+    uint8_t *u   = frm->plane[1];                       \
+    uint8_t *v   = frm->plane[2];                       \
+                                                        \
+    if( img->i_csp & X264_CSP_VFLIP )                   \
+    {                                                   \
+        src += ( i_height - 1 ) * i_src;                \
+        i_src = -i_src;                                 \
+    }                                                   \
+                                                        \
+    for(  ; i_height > 0; i_height -= 2 )               \
+    {                                                   \
+        uint8_t *ss = src;                              \
+        uint8_t *yy = y;                                \
+        uint8_t *uu = u;                                \
+        uint8_t *vv = v;                                \
+        int w;                                          \
+                                                        \
+        for( w = i_width; w > 0; w -= 2 )               \
+        {                                               \
+            int cr = 0,cg = 0,cb = 0;                   \
+            int r, g, b;                                \
+                                                        \
+            /* Luma */                                  \
+            cr = r = ss[POS_R];                         \
+            cg = g = ss[POS_G];                         \
+            cb = b = ss[POS_B];                         \
+                                                        \
+            yy[0] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);    \
+                                                        \
+            cr+= r = ss[POS_R+i_src];                   \
+            cg+= g = ss[POS_G+i_src];                   \
+            cb+= b = ss[POS_B+i_src];                   \
+            yy[i_y] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);  \
+            yy++;                                       \
+            ss += S_RGB;                                \
+                                                        \
+            cr+= r = ss[POS_R];                         \
+            cg+= g = ss[POS_G];                         \
+            cb+= b = ss[POS_B];                         \
+                                                        \
+            yy[0] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);    \
+                                                        \
+            cr+= r = ss[POS_R+i_src];                   \
+            cg+= g = ss[POS_G+i_src];                   \
+            cb+= b = ss[POS_B+i_src];                   \
+            yy[i_y] = Y_ADD + ((Y_R * r + Y_G * g + Y_B * b) >> BITS);  \
+            yy++;                                       \
+            ss += S_RGB;                                \
+                                                        \
+            /* Chroma */                                \
+            *uu++ = (uint8_t)(U_ADD + ((-U_R * cr - U_G * cg + U_B * cb) >> (BITS+2)) ); \
+            *vv++ = (uint8_t)(V_ADD + (( V_R * cr - V_G * cg - V_B * cb) >> (BITS+2)) ); \
+        }                                               \
+                                                        \
+        src += 2*i_src;                                   \
+        y += 2*frm->i_stride[0];                        \
+        u += frm->i_stride[1];                          \
+        v += frm->i_stride[2];                          \
+    }                                                   \
+}
+
+/* Instantiate the packed converters; arguments are
+ * ( name, offset of R, offset of G, offset of B, bytes per pixel ). */
+RGB_TO_I420( rgb_to_i420,  0, 1, 2, 3 );
+RGB_TO_I420( bgr_to_i420,  2, 1, 0, 3 );
+RGB_TO_I420( bgra_to_i420, 2, 1, 0, 4 );
+
+/* Fill *pf with the converters that produce the colourspace i_csp.
+ * Only X264_CSP_I420 is handled; any other value prints a message on
+ * stderr and terminates the process.  `cpu` is not used here. */
+void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf )
+{
+    if( i_csp != X264_CSP_I420 )
+    {
+        /* For now, can't happen */
+        fprintf( stderr, "arg in x264_csp_init\n" );
+        exit( -1 );
+    }
+
+    /* planar sources */
+    pf->i420 = i420_to_i420;
+    pf->i422 = i422_to_i420;
+    pf->i444 = i444_to_i420;
+    pf->yv12 = yv12_to_i420;
+
+    /* packed sources */
+    pf->yuyv = yuyv_to_i420;
+    pf->rgb  = rgb_to_i420;
+    pf->bgr  = bgr_to_i420;
+    pf->bgra = bgra_to_i420;
+}
+
--- a/core/csp.h
+++ b/core/csp.h
@ -0,0 +1,43 @@
+/*****************************************************************************
+ * csp.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: csp.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _CSP_H
+#define _CSP_H 1
+
+/* Table of input converters, one entry per supported source layout.
+ * Every entry has the same signature: ( destination frame, source image,
+ * width, height ).  x264_csp_init() fills the table. */
+typedef struct
+{
+    void (*i420)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*i422)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*i444)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*yv12)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*yuyv)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*rgb )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*bgr )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+    void (*bgra)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
+} x264_csp_function_t;
+
+
+/* Fill *pf with the converters targeting colourspace i_csp (the csp.c
+ * implementation only accepts X264_CSP_I420 and exits otherwise). */
+void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf );
+
+#endif
+
--- a/core/dct.c
+++ b/core/dct.c
@ -0,0 +1,288 @@
+/*****************************************************************************
+ * dct.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+
+#include "dct.h"
+#ifdef HAVE_MMXEXT
+#   include "i386/dct.h"
+#endif
+
+
+/* Clamp a to the range [0, 255].
+ *
+ * Values below 0 yield 0, values above 255 yield 255, everything else is
+ * returned unchanged.  The clamping is written out explicitly so that the
+ * result is already in range as an int (no reliance on the caller
+ * truncating to uint8_t, on right-shifting a negative value, or on
+ * negating INT_MIN). */
+static inline int clip_uint8( int a )
+{
+    if( a < 0 )
+        return 0;
+    if( a > 255 )
+        return 255;
+    return a;
+}
+
+/*
+ * XXX All DC transforms below work in place: the input array is also the
+ * output, so intermediate results are staged in a temporary first.
+ */
+
+/* In-place 2x2 transform made of sums and differences only: first along
+ * each row, then across the two rows. */
+static void dct2x2dc( int16_t d[2][2] )
+{
+    /* row pass */
+    const int sum_top  = d[0][0] + d[0][1];
+    const int diff_top = d[0][0] - d[0][1];
+    const int sum_bot  = d[1][0] + d[1][1];
+    const int diff_bot = d[1][0] - d[1][1];
+
+    /* combine the two rows */
+    d[0][0] = sum_top  + sum_bot;
+    d[0][1] = diff_top + diff_bot;
+    d[1][0] = sum_top  - sum_bot;
+    d[1][1] = diff_top - diff_bot;
+}
+
+/* In-place 4x4 DC transform built from add/subtract butterflies.
+ * Pass 1 transforms every row of d and stores it as a column of the
+ * temporary; pass 2 transforms every row of the temporary, halves each
+ * result with rounding ( (x + 1) >> 1 ) and stores it as a column of d. */
+static void dct4x4dc( int16_t d[4][4] )
+{
+    int16_t t[4][4];
+    int k;
+
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = d[k][0] + d[k][1];
+        const int diff_lo = d[k][0] - d[k][1];
+        const int sum_hi  = d[k][2] + d[k][3];
+        const int diff_hi = d[k][2] - d[k][3];
+
+        t[0][k] = sum_lo  + sum_hi;
+        t[1][k] = sum_lo  - sum_hi;
+        t[2][k] = diff_lo - diff_hi;
+        t[3][k] = diff_lo + diff_hi;
+    }
+
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = t[k][0] + t[k][1];
+        const int diff_lo = t[k][0] - t[k][1];
+        const int sum_hi  = t[k][2] + t[k][3];
+        const int diff_hi = t[k][2] - t[k][3];
+
+        d[0][k] = ( sum_lo  + sum_hi  + 1 ) >> 1;
+        d[1][k] = ( sum_lo  - sum_hi  + 1 ) >> 1;
+        d[2][k] = ( diff_lo - diff_hi + 1 ) >> 1;
+        d[3][k] = ( diff_lo + diff_hi + 1 ) >> 1;
+    }
+}
+
+/* In-place inverse 4x4 DC transform built from add/subtract butterflies.
+ * Pass 1 works down every column of d into the temporary; pass 2 works
+ * along every row of the temporary back into d.  No scaling or rounding
+ * is applied. */
+static void idct4x4dc( int16_t d[4][4] )
+{
+    int16_t t[4][4];
+    int k;
+
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = d[0][k] + d[1][k];
+        const int diff_lo = d[0][k] - d[1][k];
+        const int sum_hi  = d[2][k] + d[3][k];
+        const int diff_hi = d[2][k] - d[3][k];
+
+        t[0][k] = sum_lo  + sum_hi;
+        t[1][k] = sum_lo  - sum_hi;
+        t[2][k] = diff_lo - diff_hi;
+        t[3][k] = diff_lo + diff_hi;
+    }
+
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_lo  = t[k][0] + t[k][1];
+        const int diff_lo = t[k][0] - t[k][1];
+        const int sum_hi  = t[k][2] + t[k][3];
+        const int diff_hi = t[k][2] - t[k][3];
+
+        d[k][0] = sum_lo  + sum_hi;
+        d[k][1] = sum_lo  - sum_hi;
+        d[k][2] = diff_lo - diff_hi;
+        d[k][3] = diff_lo + diff_hi;
+    }
+}
+
+/* Compute the 4x4 residual pix1 - pix2 (row strides i_pix1 / i_pix2) and
+ * transform it into dct.  The 1-D kernel, applied twice, combines outer
+ * and inner sample pairs with weights 1 and 2; each pass writes its
+ * output transposed. */
+static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int16_t diff[4][4];
+    int16_t t[4][4];
+    int row, col, k;
+
+    /* residual */
+    for( row = 0; row < 4; row++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            diff[row][col] = pix1[col] - pix2[col];
+        }
+    }
+
+    /* first pass: rows of diff -> columns of t */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_outer  = diff[k][0] + diff[k][3];
+        const int sum_inner  = diff[k][1] + diff[k][2];
+        const int diff_outer = diff[k][0] - diff[k][3];
+        const int diff_inner = diff[k][1] - diff[k][2];
+
+        t[0][k] =   sum_outer  +   sum_inner;
+        t[1][k] = 2*diff_outer +   diff_inner;
+        t[2][k] =   sum_outer  -   sum_inner;
+        t[3][k] =   diff_outer - 2*diff_inner;
+    }
+
+    /* second pass: rows of t -> columns of dct */
+    for( k = 0; k < 4; k++ )
+    {
+        const int sum_outer  = t[k][0] + t[k][3];
+        const int sum_inner  = t[k][1] + t[k][2];
+        const int diff_outer = t[k][0] - t[k][3];
+        const int diff_inner = t[k][1] - t[k][2];
+
+        dct[0][k] =   sum_outer  +   sum_inner;
+        dct[1][k] = 2*diff_outer +   diff_inner;
+        dct[2][k] =   sum_outer  -   sum_inner;
+        dct[3][k] =   diff_outer - 2*diff_inner;
+    }
+}
+
+/* Residual transform of an 8x8 area as four 4x4 blocks, in the order
+ * top-left, top-right, bottom-left, bottom-right (dct[0..3]). */
+static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i_blk;
+
+    for( i_blk = 0; i_blk < 4; i_blk++ )
+    {
+        const int x = 4 * ( i_blk & 1 );
+        const int y = 4 * ( i_blk >> 1 );
+
+        sub4x4_dct( dct[i_blk], &pix1[y*i_pix1+x], i_pix1, &pix2[y*i_pix2+x], i_pix2 );
+    }
+}
+
+/* Residual transform of a 16x16 area as four 8x8 quadrants, in the order
+ * top-left, top-right, bottom-left, bottom-right; quadrant q fills
+ * dct[4*q .. 4*q+3]. */
+static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i_quad;
+
+    for( i_quad = 0; i_quad < 4; i_quad++ )
+    {
+        const int x = 8 * ( i_quad & 1 );
+        const int y = 8 * ( i_quad >> 1 );
+
+        sub8x8_dct( &dct[4*i_quad], &pix1[y*i_pix1+x], i_pix1, &pix2[y*i_pix2+x], i_pix2 );
+    }
+}
+
+
+/* Inverse-transform the 4x4 coefficient block dct and add the result to
+ * the 4x4 pixel block at p_dst (row stride i_dst), clipping each pixel
+ * through clip_uint8().  The first pass runs down the columns of dct, the
+ * second along the rows of the intermediate and rounds with
+ * ( x + 32 ) >> 6. */
+static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+{
+    int16_t t[4][4];      /* after the column pass */
+    int16_t res[4][4];    /* rounded residual */
+    int row, col, k;
+
+    for( k = 0; k < 4; k++ )
+    {
+        const int even_sum  = dct[0][k]      + dct[2][k];
+        const int even_diff = dct[0][k]      - dct[2][k];
+        const int odd_sum   = dct[1][k]      + (dct[3][k]>>1);
+        const int odd_diff  = (dct[1][k]>>1) -  dct[3][k];
+
+        t[0][k] = even_sum  + odd_sum;
+        t[1][k] = even_diff + odd_diff;
+        t[2][k] = even_diff - odd_diff;
+        t[3][k] = even_sum  - odd_sum;
+    }
+
+    for( k = 0; k < 4; k++ )
+    {
+        const int even_sum  =  t[k][0]     +  t[k][2];
+        const int even_diff =  t[k][0]     -  t[k][2];
+        const int odd_sum   =  t[k][1]     + (t[k][3]>>1);
+        const int odd_diff  = (t[k][1]>>1) -  t[k][3];
+
+        res[k][0] = ( even_sum  + odd_sum  + 32 ) >> 6;
+        res[k][1] = ( even_diff + odd_diff + 32 ) >> 6;
+        res[k][2] = ( even_diff - odd_diff + 32 ) >> 6;
+        res[k][3] = ( even_sum  - odd_sum  + 32 ) >> 6;
+    }
+
+    /* add the residual to the destination pixels */
+    for( row = 0; row < 4; row++, p_dst += i_dst )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            p_dst[col] = clip_uint8( p_dst[col] + res[row][col] );
+        }
+    }
+}
+
+/* Inverse transform + add for an 8x8 area: four 4x4 blocks in the order
+ * top-left, top-right, bottom-left, bottom-right (dct[0..3]). */
+static void add8x8_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+{
+    int i_blk;
+
+    for( i_blk = 0; i_blk < 4; i_blk++ )
+    {
+        const int x = 4 * ( i_blk & 1 );
+        const int y = 4 * ( i_blk >> 1 );
+
+        add4x4_idct( &p_dst[y*i_dst+x], i_dst, dct[i_blk] );
+    }
+}
+
+/* Inverse transform + add for a 16x16 area: four 8x8 quadrants in the
+ * order top-left, top-right, bottom-left, bottom-right; quadrant q reads
+ * dct[4*q .. 4*q+3]. */
+static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+{
+    int i_quad;
+
+    for( i_quad = 0; i_quad < 4; i_quad++ )
+    {
+        const int x = 8 * ( i_quad & 1 );
+        const int y = 8 * ( i_quad >> 1 );
+
+        add8x8_idct( &p_dst[y*i_dst+x], i_dst, &dct[4*i_quad] );
+    }
+}
+
+
+
+/****************************************************************************
+ * x264_dct_init:
+ ****************************************************************************/
+/* Fill *dctf with the transform implementations.  The portable C
+ * versions are installed first; when built with HAVE_MMXEXT and `cpu`
+ * has X264_CPU_MMXEXT set, every entry except the two 2x2 DC ones is
+ * replaced by its MMXEXT counterpart. */
+void x264_dct_init( int cpu, x264_dct_function_t *dctf )
+{
+    /* forward residual transforms */
+    dctf->sub4x4_dct    = sub4x4_dct;
+    dctf->sub8x8_dct    = sub8x8_dct;
+    dctf->sub16x16_dct  = sub16x16_dct;
+
+    /* inverse transforms with reconstruction */
+    dctf->add4x4_idct   = add4x4_idct;
+    dctf->add8x8_idct   = add8x8_idct;
+    dctf->add16x16_idct = add16x16_idct;
+
+    /* DC transforms; the same 2x2 routine serves both directions */
+    dctf->dct4x4dc  = dct4x4dc;
+    dctf->idct4x4dc = idct4x4dc;
+    dctf->dct2x2dc  = dct2x2dc;
+    dctf->idct2x2dc = dct2x2dc;
+
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        dctf->sub4x4_dct    = x264_sub4x4_dct_mmxext;
+        dctf->add4x4_idct   = x264_add4x4_idct_mmxext;
+
+        dctf->sub8x8_dct    = x264_sub8x8_dct_mmxext;
+        dctf->add8x8_idct   = x264_add8x8_idct_mmxext;
+
+        dctf->sub16x16_dct  = x264_sub16x16_dct_mmxext;
+        dctf->add16x16_idct = x264_add16x16_idct_mmxext;
+
+        dctf->dct4x4dc  = x264_dct4x4dc_mmxext;
+        dctf->idct4x4dc = x264_idct4x4dc_mmxext;
+    }
+#endif
+}
+
--- a/core/dct.h
+++ b/core/dct.h
@ -0,0 +1,49 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DCT_H
+#define _DCT_H 1
+
+/* Table of transform entry points, filled by x264_dct_init(). */
+typedef struct
+{
+    /* subNxN_dct: transform of the residual pix1 - pix2 (row strides
+     * i_pix1 / i_pix2) into dct.
+     * addNxN_idct: inverse transform of dct, added to the pixels at p_dst
+     * (row stride i_dst). */
+    void (*sub4x4_dct)   ( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add4x4_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+
+    void (*sub8x8_dct)   ( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add8x8_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+
+    void (*sub16x16_dct)   ( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add16x16_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+
+
+    /* DC transforms; all of them operate in place on d */
+    void (*dct4x4dc) ( int16_t d[4][4] );
+    void (*idct4x4dc)( int16_t d[4][4] );
+
+    void (*dct2x2dc) ( int16_t d[2][2] );
+    void (*idct2x2dc)( int16_t d[2][2] );
+
+} x264_dct_function_t;
+
+/* Fill *dctf with transform implementations; `cpu` is a mask of
+ * X264_CPU_* flags used to select optimised versions where available. */
+void x264_dct_init( int cpu, x264_dct_function_t *dctf );
+
+#endif
--- a/core/frame.c
+++ b/core/frame.c
@ -0,0 +1,701 @@
+/*****************************************************************************
+ * frame.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: frame.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "common.h"
+#include "macroblock.h"
+
+/*
+ * Allocate a frame made of three planes sized from h->param.
+ *
+ * The width and height are rounded up to a multiple of 16.  Every plane is
+ * allocated with extra room around the picture (64 extra bytes per luma row
+ * and 64 extra luma rows, scaled down for subsampled chroma); plane[i] points
+ * at the first picture pixel inside buffer[i], 32 rows and 32 columns in for
+ * luma.  The fourth plane slot is left empty.
+ *
+ * i_poc, i_type and i_qpplus1 get default values; i_pts is left unset.
+ *
+ * Returns the new frame, or NULL if an allocation fails (in which case
+ * everything allocated so far has been released).
+ */
+x264_frame_t *x264_frame_new( x264_t *h )
+{
+    x264_frame_t   *frame = x264_malloc( sizeof( x264_frame_t ) );
+    int i;
+
+    int i_stride;
+    int i_lines;
+
+    if( frame == NULL )
+        return NULL;
+
+    /* allocate frame data (+64 for extra data for me) */
+    i_stride = ( ( h->param.i_width  + 15 )&0xfffff0 )+ 64;
+    i_lines  = ( ( h->param.i_height + 15 )&0xfffff0 );
+
+    frame->i_plane = 3;
+    for( i = 0; i < 3; i++ )
+    {
+        /* chroma subsampling factors (1 = same resolution as luma) */
+        int i_divh = 1;
+        int i_divw = 1;
+        if( i > 0 )
+        {
+            if( h->param.i_csp == X264_CSP_I420 )
+                i_divh = i_divw = 2;
+            else if( h->param.i_csp == X264_CSP_I422 )
+                i_divw = 2;
+        }
+        frame->i_stride[i] = i_stride / i_divw;
+        frame->i_lines[i] = i_lines / i_divh;
+        frame->buffer[i] = x264_malloc( frame->i_stride[i] *
+                                        ( frame->i_lines[i] + 64 / i_divh ) );
+        if( frame->buffer[i] == NULL )
+        {
+            /* release the planes that were already allocated */
+            while( i-- > 0 )
+                x264_free( frame->buffer[i] );
+            x264_free( frame );
+            return NULL;
+        }
+
+        /* skip the top border rows and the left border columns */
+        frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
+                          frame->i_stride[i] * 32 / i_divh + 32 / i_divw;
+    }
+    frame->i_stride[3] = 0;
+    frame->i_lines[3] = 0;
+    frame->buffer[3] = NULL;
+    frame->plane[3] = NULL;
+
+    frame->i_poc = -1;
+    frame->i_type = X264_TYPE_AUTO;
+    frame->i_qpplus1 = 0;
+
+    return frame;
+}
+
+/* Release the plane buffers of a frame (the first frame->i_plane of them),
+ * then the frame structure itself. */
+void x264_frame_delete( x264_frame_t *frame )
+{
+    int i_plane = 0;
+
+    while( i_plane < frame->i_plane )
+    {
+        x264_free( frame->buffer[i_plane] );
+        i_plane++;
+    }
+
+    x264_free( frame );
+}
+
+/*
+ * Import a user picture into an internal frame.
+ *
+ * The frame type, qp (plus one) and pts are copied as is; the pixels go
+ * through the h->csp converter selected by the picture colorspace
+ * (src->img.i_csp masked with X264_CSP_MASK).  An unknown colorspace only
+ * prints a message on stderr and leaves the planes untouched.
+ */
+void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
+{
+    const int i_width  = h->param.i_width;
+    const int i_height = h->param.i_height;
+    const int i_csp    = src->img.i_csp & X264_CSP_MASK;
+
+    dst->i_type     = src->i_type;
+    dst->i_qpplus1  = src->i_qpplus1;
+    dst->i_pts      = src->i_pts;
+
+    switch( i_csp )
+    {
+        case X264_CSP_I420: h->csp.i420( dst, &src->img, i_width, i_height ); break;
+        case X264_CSP_YV12: h->csp.yv12( dst, &src->img, i_width, i_height ); break;
+        case X264_CSP_I422: h->csp.i422( dst, &src->img, i_width, i_height ); break;
+        case X264_CSP_I444: h->csp.i444( dst, &src->img, i_width, i_height ); break;
+        case X264_CSP_YUYV: h->csp.yuyv( dst, &src->img, i_width, i_height ); break;
+        case X264_CSP_RGB:  h->csp.rgb ( dst, &src->img, i_width, i_height ); break;
+        case X264_CSP_BGR:  h->csp.bgr ( dst, &src->img, i_width, i_height ); break;
+        case X264_CSP_BGRA: h->csp.bgra( dst, &src->img, i_width, i_height ); break;
+
+        default:
+            fprintf( stderr, "Arg invalid CSP\n" );
+            break;
+    }
+}
+
+
+
+/*
+ * Replicate the picture edges into the border that surrounds every plane.
+ *
+ * The border is 32 pixels wide for plane 0 and 16 for the others; the picture
+ * width is taken as the stride minus both side borders.  Rows above and below
+ * the picture are copies of the first/last picture row, the four corners are
+ * filled with the nearest corner pixel, and each picture row is extended left
+ * and right with its first/last pixel.
+ */
+void x264_frame_expand_border( x264_frame_t *frame )
+{
+    int i_plane;
+
+    for( i_plane = 0; i_plane < frame->i_plane; i_plane++ )
+    {
+        const int i_stride = frame->i_stride[i_plane];
+        const int i_lines  = frame->i_lines[i_plane];
+        const int i_pad    = i_plane == 0 ? 32 : 16;
+        const int i_width  = i_stride - 2 * i_pad;
+
+        uint8_t *p_first = frame->plane[i_plane];
+        uint8_t *p_last  = p_first + ( i_lines - 1 ) * i_stride;
+        int y;
+
+        /* rows above and below the picture, corners included */
+        for( y = 1; y <= i_pad; y++ )
+        {
+            uint8_t *p_up   = p_first - y * i_stride;
+            uint8_t *p_down = p_last  + y * i_stride;
+
+            memcpy( p_up, p_first, i_width );
+            memset( p_up - i_pad,   p_first[0],           i_pad );
+            memset( p_up + i_width, p_first[i_width - 1], i_pad );
+
+            memcpy( p_down, p_last, i_width );
+            memset( p_down - i_pad,   p_last[0],           i_pad );
+            memset( p_down + i_width, p_last[i_width - 1], i_pad );
+        }
+
+        /* left and right of every picture row */
+        for( y = 0; y < i_lines; y++ )
+        {
+            uint8_t *p_row = p_first + y * i_stride;
+
+            memset( p_row - i_pad,   p_row[0],           i_pad );
+            memset( p_row + i_width, p_row[i_width - 1], i_pad );
+        }
+    }
+}
+
+/* FIXME these tables are duplicated with the ones in macroblock.c */
+/* Index of a 4x4 block inside a macroblock, looked up as block_idx_xy[x][y]
+ * with x the block column and y the block row (both 0..3).  The result is
+ * used to index h->mb.non_zero_count[mb][]. */
+static const uint8_t block_idx_xy[4][4] =
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+/* Chroma qp, indexed by the luma qp plus the pps chroma offset, clipped to 0..51. */
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/* Deblocking filter (p153) */
+/* alpha threshold, compared against |p0 - q0|; indexed by qp plus the slice
+ * alpha offset, clipped to 0..51 */
+static const int i_alpha_table[52] =
+{
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+    255, 255
+};
+/* beta threshold, compared against the differences between pixels on the same
+ * side of the edge; indexed by qp plus the slice beta offset, clipped to 0..51 */
+static const int i_beta_table[52] =
+{
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18
+};
+/* clipping bound tc0 for filtering strengths 1..3; looked up as
+ * i_tc0_table[index][bS - 1] with the same index as i_alpha_table */
+static const int i_tc0_table[52][3] =
+{
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
+    { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
+    { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
+    { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
+    { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
+    { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
+    { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
+};
+
+/* From ffmpeg */
+/*
+ * Clamp a to the range of an unsigned 8 bit value.
+ *
+ * Returns 0 for negative input, 255 for input above 255 and a itself
+ * otherwise.  The saturation is written out explicitly rather than derived
+ * from a right shift of -a: that shift is implementation-defined for negative
+ * operands, overflows for INT_MIN, and yielded -1 (not 255) as an int.
+ */
+static inline int clip_uint8( int a )
+{
+    /* any bit outside the low 8 means a is out of range */
+    if( a & ~255 )
+        return a < 0 ? 0 : 255;
+
+    return a;
+}
+
+/*
+ * Filter one vertical luma edge, 16 rows high.  The edge lies between pix[-1]
+ * (p side, to the left) and pix[0] (q side, to the right).
+ *
+ *  h            : only h->sh.i_alpha_c0_offset and h->sh.i_beta_offset are read
+ *  pix          : q0 pixel of the first row
+ *  i_pix_stride : distance between two consecutive rows
+ *  bS           : filtering strength of each group of 4 rows
+ *                 (0: untouched, 1..3: clipped filter, otherwise: strong filter)
+ *  i_QP         : qp from which the alpha/beta/tc0 thresholds are derived
+ *
+ * Up to three pixels on each side of the edge are rewritten in place.
+ */
+static inline void deblocking_filter_edgev( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            /* nothing to filter: jump over the 4 rows of this segment */
+            pix += 4 * i_pix_stride;
+            continue;
+        }
+
+        if( bS[i] < 4 )
+        {
+            const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
+
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                /* all taps are read before anything is written back */
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    /* tc starts at tc0 and grows by one for each side whose
+                     * second pixel (p1 / q1) is filtered as well */
+                    int tc = tc0;
+                    int i_delta;
+
+                    if( abs( p2 - p0 ) < beta )
+                    {
+                        pix[-2] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    if( abs( q2 - q0 ) < beta )
+                    {
+                        pix[1] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+
+                    i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix += i_pix_stride;
+            }
+        }
+        else
+        {
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    /* the 3-pixel-per-side filter is only allowed when the
+                     * step across the edge is small enough */
+                    if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
+                    {
+                        if( abs( p2 - p0 ) < beta )
+                        {
+                            const int p3 = pix[-4];
+                            /* p0', p1', p2' */
+                            pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* p0' */
+                            pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( abs( q2 - q0 ) < beta )
+                        {
+                            const int q3 = pix[3];
+                            /* q0', q1', q2' */
+                            pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* q0' */
+                            pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }
+                    else
+                    {
+                        /* p0', q0' */
+                        pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix += i_pix_stride;
+            }
+        }
+    }
+}
+
+/*
+ * Filter one vertical chroma edge, 8 rows high.  The edge lies between
+ * pix[-1] (p side) and pix[0] (q side).
+ *
+ * bS[] is the strength array computed for luma, so each entry covers only
+ * 2 chroma rows.  Only p0 and q0 are ever rewritten.
+ */
+static inline void deblocking_filter_edgecv( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    const int i_idx_alpha = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int i_idx_beta  = x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 );
+    const int i_alpha     = i_alpha_table[i_idx_alpha];
+    const int i_beta      = i_beta_table[i_idx_beta];
+    int i_seg;
+
+    for( i_seg = 0; i_seg < 4; i_seg++, pix += 2 * i_pix_stride )
+    {
+        const int i_strength = bS[i_seg];
+        uint8_t *p_row = pix;
+        int i_tc;
+        int i_row;
+
+        if( i_strength == 0 )
+            continue;
+
+        /* clipping bound, only meaningful for the strengths below 4 */
+        i_tc = i_strength < 4 ? i_tc0_table[i_idx_alpha][i_strength - 1] + 1 : 0;
+
+        for( i_row = 0; i_row < 2; i_row++, p_row += i_pix_stride )
+        {
+            const int p1 = p_row[-2];
+            const int p0 = p_row[-1];
+            const int q0 = p_row[0];
+            const int q1 = p_row[1];
+
+            /* leave the row alone as soon as one threshold is reached */
+            if( abs( p0 - q0 ) >= i_alpha ||
+                abs( p1 - p0 ) >= i_beta ||
+                abs( q1 - q0 ) >= i_beta )
+            {
+                continue;
+            }
+
+            if( i_strength < 4 )
+            {
+                const int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -i_tc, i_tc );
+
+                p_row[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                p_row[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+            }
+            else
+            {
+                p_row[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                p_row[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+            }
+        }
+    }
+}
+
+/*
+ * Filter one horizontal luma edge, 16 columns wide.  The edge lies between
+ * the row above pix (p side) and the row of pix (q side).
+ *
+ *  h            : only h->sh.i_alpha_c0_offset and h->sh.i_beta_offset are read
+ *  pix          : q0 pixel of the first column
+ *  i_pix_stride : distance between two consecutive rows
+ *  bS           : filtering strength of each group of 4 columns
+ *                 (0: untouched, 1..3: clipped filter, otherwise: strong filter)
+ *  i_QP         : qp from which the alpha/beta/tc0 thresholds are derived
+ *
+ * Same arithmetic as deblocking_filter_edgev, with the taps taken along a
+ * column (step i_pix_next) and the walk going along the row (pix++).
+ */
+static inline void deblocking_filter_edgeh( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    int i, d;
+    const int i_index_a = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int alpha = i_alpha_table[i_index_a];
+    const int beta  = i_beta_table[x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 )];
+
+    /* distance between two taps of the filter */
+    int i_pix_next  = i_pix_stride;
+
+    for( i = 0; i < 4; i++ )
+    {
+        if( bS[i] == 0 )
+        {
+            /* nothing to filter: jump over the 4 columns of this segment */
+            pix += 4;
+            continue;
+        }
+
+        if( bS[i] < 4 )
+        {
+            const int tc0 = i_tc0_table[i_index_a][bS[i] - 1];
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                /* all taps are read before anything is written back */
+                const int p0 = pix[-i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int p2 = pix[-3*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                const int q2 = pix[2*i_pix_next];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    /* tc starts at tc0 and grows by one for each side whose
+                     * second pixel (p1 / q1) is filtered as well */
+                    int tc = tc0;
+                    int i_delta;
+
+                    if( abs( p2 - p0 ) < beta )
+                    {
+                        pix[-2*i_pix_next] = p1 + x264_clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+                    if( abs( q2 - q0 ) < beta )
+                    {
+                        pix[i_pix_next] = q1 + x264_clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+                        tc++;
+                    }
+
+                    i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                    pix[-i_pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]           = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix++;
+            }
+        }
+        else
+        {
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ )
+            {
+                const int p0 = pix[-i_pix_next];
+                const int p1 = pix[-2*i_pix_next];
+                const int p2 = pix[-3*i_pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*i_pix_next];
+                const int q2 = pix[2*i_pix_next];
+
+                if( abs( p0 - q0 ) < alpha &&
+                    abs( p1 - p0 ) < beta &&
+                    abs( q1 - q0 ) < beta )
+                {
+                    /* p3/q3 are fetched up front here, even when the branch
+                     * taken below does not use them */
+                    const int p3 = pix[-4*i_pix_next];
+                    const int q3 = pix[ 3*i_pix_next];
+
+                    /* the 3-pixel-per-side filter is only allowed when the
+                     * step across the edge is small enough */
+                    if( abs( p0 - q0 ) < (( alpha >> 2 ) + 2 ) )
+                    {
+                        if( abs( p2 - p0 ) < beta )
+                        {
+                            /* p0', p1', p2' */
+                            pix[-1*i_pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2*i_pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3*i_pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* p0' */
+                            pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( abs( q2 - q0 ) < beta )
+                        {
+                            /* q0', q1', q2' */
+                            pix[0*i_pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1*i_pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2*i_pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        }
+                        else
+                        {
+                            /* q0' */
+                            pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }
+                    else
+                    {
+                        /* p0' */
+                        pix[-1*i_pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        /* q0' */
+                        pix[0*i_pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix++;
+            }
+
+        }
+    }
+}
+
+/*
+ * Filter one horizontal chroma edge, 8 columns wide.  The edge lies between
+ * the row above pix (p side) and the row of pix (q side).
+ *
+ * bS[] is the strength array computed for luma, so each entry covers only
+ * 2 chroma columns.  Only p0 and q0 are ever rewritten.
+ */
+static inline void deblocking_filter_edgech( x264_t *h, uint8_t *pix, int i_pix_stride, int bS[4], int i_QP )
+{
+    const int i_idx_alpha = x264_clip3( i_QP + h->sh.i_alpha_c0_offset, 0, 51 );
+    const int i_idx_beta  = x264_clip3( i_QP + h->sh.i_beta_offset, 0, 51 );
+    const int i_alpha     = i_alpha_table[i_idx_alpha];
+    const int i_beta      = i_beta_table[i_idx_beta];
+    int i_seg;
+
+    for( i_seg = 0; i_seg < 4; i_seg++, pix += 2 )
+    {
+        const int i_strength = bS[i_seg];
+        uint8_t *p_col = pix;
+        int i_tc;
+        int i_col;
+
+        if( i_strength == 0 )
+            continue;
+
+        /* clipping bound, only meaningful for the strengths below 4 */
+        i_tc = i_strength < 4 ? i_tc0_table[i_idx_alpha][i_strength - 1] + 1 : 0;
+
+        for( i_col = 0; i_col < 2; i_col++, p_col++ )
+        {
+            const int p1 = p_col[-2 * i_pix_stride];
+            const int p0 = p_col[-1 * i_pix_stride];
+            const int q0 = p_col[0];
+            const int q1 = p_col[i_pix_stride];
+
+            /* leave the column alone as soon as one threshold is reached */
+            if( abs( p0 - q0 ) >= i_alpha ||
+                abs( p1 - p0 ) >= i_beta ||
+                abs( q1 - q0 ) >= i_beta )
+            {
+                continue;
+            }
+
+            if( i_strength < 4 )
+            {
+                const int i_delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -i_tc, i_tc );
+
+                p_col[-i_pix_stride] = clip_uint8( p0 + i_delta );    /* p0' */
+                p_col[0]             = clip_uint8( q0 - i_delta );    /* q0' */
+            }
+            else
+            {
+                p_col[-i_pix_stride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                p_col[0]             = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+            }
+        }
+    }
+}
+
+/*
+ * Run the deblocking filter over the whole reconstructed frame (h->fdec).
+ *
+ * Macroblocks are visited in raster order.  For each one the four vertical
+ * edges are filtered first, then the four horizontal ones; edge 0 (shared with
+ * the left / top neighbour) is skipped on the first column / first row.
+ * Chroma is filtered on edges 0 and 2 only, reusing the luma strengths.
+ *
+ *  i_slice_type : only SLICE_TYPE_P is handled for non intra macroblocks;
+ *                 any other type reaching that path prints a message on
+ *                 stderr and aborts the filtering where it stands.
+ */
+void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
+{
+    /* row strides of the per-8x8 (ref) and per-4x4 (mv) arrays */
+    const int s8x8 = 2 * h->mb.i_mb_stride;
+    const int s4x4 = 4 * h->mb.i_mb_stride;
+
+    int mb_y, mb_x;
+
+    for( mb_y = 0, mb_x = 0; mb_y < h->sps->i_mb_height; )
+    {
+        /* index of the macroblock, and of its top-left 8x8 / 4x4 block */
+        const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
+        const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
+        const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
+        int i_edge;
+        int i_dir;
+
+        /* i_dir == 0 -> vertical edge
+         * i_dir == 1 -> horizontal edge */
+        for( i_dir = 0; i_dir < 2; i_dir++ )
+        {
+            int i_start;
+            int i_qp, i_qpn;
+
+            /* start at edge 1 when there is no neighbour on that side */
+            i_start = (( i_dir == 0 && mb_x != 0 ) || ( i_dir == 1 && mb_y != 0 ) ) ? 0 : 1;
+
+            for( i_edge = i_start; i_edge < 4; i_edge++ )
+            {
+                /* block on the other side of the edge: the current macroblock
+                 * for inner edges, the left/top neighbour for edge 0 */
+                int mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );
+                int mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );
+                int mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );
+
+                int bS[4];  /* filtering strength */
+
+                /* *** Get bS for each 4px for the current edge *** */
+                if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )
+                {
+                    /* intra: strongest filter on the macroblock boundary */
+                    bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 ? 4 : 3 );
+                }
+                else
+                {
+                    int i;
+                    for( i = 0; i < 4; i++ )
+                    {
+                        /* (x,y): 4x4 block on this side of the edge,
+                         * (xn,yn): the one across it, wrapped into the
+                         * neighbour macroblock when i_edge == 0 */
+                        int x  = i_dir == 0 ? i_edge : i;
+                        int y  = i_dir == 0 ? i      : i_edge;
+                        int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;
+                        int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;
+
+                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||
+                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )
+                        {
+                            bS[i] = 2;
+                        }
+                        else if( i_slice_type == SLICE_TYPE_P )
+                        {
+                            /* different reference, or a motion vector
+                             * component differing by 4 or more */
+                            if( h->mb.ref[0][mb_8x8+(x/2)+(y/2)*s8x8] != h->mb.ref[0][mbn_8x8+(xn/2)+(yn/2)*s8x8] ||
+                                abs( h->mb.mv[0][mb_4x4+x+y*s4x4][0] - h->mb.mv[0][mbn_4x4+xn+yn*s4x4][0] ) >= 4 ||
+                                abs( h->mb.mv[0][mb_4x4+x+y*s4x4][1] - h->mb.mv[0][mbn_4x4+xn+yn*s4x4][1] ) >= 4 )
+                            {
+                                bS[i] = 1;
+                            }
+                            else
+                            {
+                                bS[i] = 0;
+                            }
+                        }
+                        else
+                        {
+                            /* FIXME */
+                            fprintf( stderr, "deblocking filter doesn't work yet with B slice\n" );
+                            return;
+                        }
+                    }
+                }
+
+                /* *** filter *** */
+                /* Y plane */
+                i_qp = h->mb.qp[mb_xy];
+                i_qpn= h->mb.qp[mbn_xy];
+
+                if( i_dir == 0 )
+                {
+                    /* vertical edge */
+                    deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge],
+                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1);
+                    if( (i_edge % 2) == 0  )
+                    {
+                        /* U/V planes */
+                        /* rounded average of the chroma qp of both sides */
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
+                        deblocking_filter_edgecv( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2],
+                                                      h->fdec->i_stride[1], bS, i_qpc );
+                        deblocking_filter_edgecv( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2],
+                                                  h->fdec->i_stride[2], bS, i_qpc );
+                    }
+                }
+                else
+                {
+                    /* horizontal edge */
+                    deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x],
+                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 );
+                    /* U/V planes */
+                    if( ( i_edge % 2  ) == 0 )
+                    {
+                        /* rounded average of the chroma qp of both sides */
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
+                        deblocking_filter_edgech( h, &h->fdec->plane[1][8*(mb_y*h->fdec->i_stride[1]+mb_x)+i_edge*2*h->fdec->i_stride[1]],
+                                                 h->fdec->i_stride[1], bS, i_qpc );
+                        deblocking_filter_edgech( h, &h->fdec->plane[2][8*(mb_y*h->fdec->i_stride[2]+mb_x)+i_edge*2*h->fdec->i_stride[2]],
+                                                 h->fdec->i_stride[2], bS, i_qpc );
+                    }
+                }
+            }
+        }
+
+        /* next mb */
+        mb_x++;
+        if( mb_x >= h->sps->i_mb_width )
+        {
+            mb_x = 0;
+            mb_y++;
+        }
+    }
+}
+
+
+
+
--- a/core/frame.h
+++ b/core/frame.h
@ -0,0 +1,56 @@
+/*****************************************************************************
+ * frame.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: frame.h,v 1.1 2004/06/03 19:27:06 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _FRAME_H
+#define _FRAME_H 1
+
+typedef struct
+{
+    /* */
+    int     i_poc;
+    int     i_type;
+    int     i_qpplus1;
+    int64_t i_pts;
+
+    /* YUV buffer */
+    int     i_plane;
+    int     i_stride[4];
+    int     i_lines[4];
+    uint8_t *plane[4];
+
+    /* for unrestricted mv we allocate more data than needed
+     * allocated data are stored in buffer */
+    void    *buffer[4];
+
+} x264_frame_t;
+
+x264_frame_t *x264_frame_new( x264_t *h );
+void          x264_frame_delete( x264_frame_t *frame );
+
+void          x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
+
+void          x264_frame_expand_border( x264_frame_t *frame );
+
+void          x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
+
+#endif
--- a/core/i386/cpu.asm
+++ b/core/i386/cpu.asm
@ -0,0 +1,111 @@
+;*****************************************************************************
+;* cpu.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+; cglobal <name>: declare <name> as a global symbol.
+; When PREFIX is defined, the symbol is exported as _<name> and <name> is
+; redefined to expand to _<name>, so that the labels written with the plain
+; name pick up the leading underscore as well.
+%macro cglobal 1
+	%ifdef PREFIX
+		global _%1
+		%define %1 _%1
+	%else
+		global %1
+	%endif
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_cpu_cpuid_test
+cglobal x264_cpu_cpuid
+cglobal x264_emms
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported
+;-----------------------------------------------------------------------------
+; Tries to flip bit 21 (0x200000) of EFLAGS and reads EFLAGS back.
+; eax returns the XOR of the value read back with the original one, so it is
+; non zero only if the bit could be changed.  The caller's EFLAGS and ebx are
+; saved on entry and restored before returning.
+x264_cpu_cpuid_test:
+    pushfd                      ; save the caller's EFLAGS
+    push    ebx
+
+    pushfd
+    pop     eax                 ; eax = current EFLAGS
+    mov     ebx, eax            ; ebx = reference copy
+    xor     eax, 0x200000       ; flip bit 21
+    push    eax
+    popfd                       ; try to write it back
+    pushfd
+    pop     eax                 ; eax = EFLAGS as actually stored
+    xor     eax, ebx            ; bits that differ from the reference
+    
+    pop     ebx
+    popfd                       ; restore the caller's EFLAGS
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+;-----------------------------------------------------------------------------
+; Executes cpuid with eax = op and stores the resulting eax, ebx, ecx and edx
+; through the four pointer arguments.
+; Stack arguments: [ebp+8] = op, [ebp+12] = eax*, [ebp+16] = ebx*,
+; [ebp+20] = ecx*, [ebp+24] = edx*.
+; ebx, esi and edi are saved and restored (edi is not otherwise used).
+; NOTE(review): the prototype comment announces an int return value, but eax
+; is not set after cpuid; it still holds the eax produced by cpuid.
+x264_cpu_cpuid:
+
+    push    ebp
+    mov     ebp,    esp
+    push    ebx
+    push    esi
+    push    edi
+    
+    mov     eax,    [ebp +  8]  ; op
+    cpuid
+
+    mov     esi,    [ebp + 12]
+    mov     [esi],  eax         ; *eax
+
+    mov     esi,    [ebp + 16]
+    mov     [esi],  ebx         ; *ebx
+
+    mov     esi,    [ebp + 20]
+    mov     [esi],  ecx         ; *ecx
+
+    mov     esi,    [ebp + 24]
+    mov     [esi],  edx         ; *edx
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    pop     ebp
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_emms( void )
+;-----------------------------------------------------------------------------
+; Callable wrapper around the emms instruction: executes it and returns.
+x264_emms:
+    emms
+    ret
+
--- a/core/i386/dct-c.c
+++ b/core/i386/dct-c.c
@ -0,0 +1,294 @@
+/*****************************************************************************
+ * dct-c.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+
+#include "../dct.h"
+#include "dct.h"
+
+
+#if 0
+/* Clear the MMX register MMZ by xor-ing it with itself. */
+#define MMX_ZERO( MMZ ) \
+    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
+
+/* MMP : diff,  MMT: temp
+ * Load 4 bytes from pix1 into MMP and 4 bytes from pix2 into MMT, widen both
+ * to 16-bit words by interleaving with MMZ (the caller in this file passes a
+ * register cleared with MMX_ZERO), then MMP = MMP - MMT. */
+#define MMX_LOAD_DIFF_4P( MMP, MMT, MMZ, pix1, pix2 ) \
+    asm volatile( "movd (%0), " #MMP "\n" \
+                  "punpcklbw  " #MMZ ", " #MMP "\n" \
+                  "movd (%1), " #MMT "\n" \
+                  "punpcklbw  " #MMZ ", " #MMT "\n" \
+                  "psubw      " #MMT ", " #MMP "\n" : : "r"(pix1), "r"(pix2) )
+
+/* in: out: mma=mma+mmb, mmb=mmb-mma
+ * Both results are expressed with the input values; the difference is
+ * obtained as 2*mmb - (mma+mmb), so no temporary register is needed. */
+#define MMX_SUMSUB_BA( MMA, MMB ) \
+    asm volatile( "paddw " #MMB ", " #MMA "\n"\
+                  "paddw " #MMB ", " #MMB "\n"\
+                  "psubw " #MMA ", " #MMB "\n" :: )
+
+/* Two MMX_SUMSUB_BA butterflies with their instructions interleaved:
+ * mma=mma+mmb, mmb=mmb-mma and mmc=mmc+mmd, mmd=mmd-mmc (input values). */
+#define MMX_SUMSUB_BADC( MMA, MMB, MMC, MMD ) \
+    asm volatile( "paddw " #MMB ", " #MMA "\n"\
+                  "paddw " #MMD ", " #MMC "\n"\
+                  "paddw " #MMB ", " #MMB "\n"\
+                  "paddw " #MMD ", " #MMD "\n"\
+                  "psubw " #MMA ", " #MMB "\n"\
+                  "psubw " #MMC ", " #MMD "\n" :: )
+
+/* inputs MMA, MMB output MMA MMT
+ * MMA = 2*MMA + MMB and MMT = MMA - 2*MMB (input values); MMB is unchanged. */
+#define MMX_SUMSUB2_AB( MMA, MMB, MMT ) \
+    asm volatile( "movq  " #MMA ", " #MMT "\n" \
+                  "paddw " #MMA ", " #MMA "\n" \
+                  "paddw " #MMB ", " #MMA "\n" \
+                  "psubw " #MMB ", " #MMT "\n" \
+                  "psubw " #MMB ", " #MMT "\n" :: )
+
+/* inputs MMA, MMB output MMA MMS
+ * MMA = MMA + (MMB>>1) and MMS = (MMA>>1) - MMB (input values, arithmetic
+ * shifts on 16-bit words). MMT receives a copy of the input MMB, and MMB is
+ * left holding MMB>>1. */
+#define MMX_SUMSUBD2_AB( MMA, MMB, MMT, MMS ) \
+    asm volatile( "movq  " #MMA ", " #MMS "\n" \
+                  "movq  " #MMB ", " #MMT "\n" \
+                  "psraw   $1    , " #MMB "\n"       \
+                  "psraw   $1    , " #MMS "\n"       \
+                  "paddw " #MMB ", " #MMA "\n" \
+                  "psubw " #MMT ", " #MMS "\n" :: )
+
+/* Interleave the 16-bit words of a and b: t is first loaded with a copy of
+ * a, then a receives the interleaved low halves and t the interleaved high
+ * halves. b is left unchanged. */
+#define SBUTTERFLYwd(a,b,t )\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpcklwd " #b ", " #a "   \n\t" \
+                  "punpckhwd " #b ", " #t "   \n\t" :: )
+
+/* Same as SBUTTERFLYwd but on 32-bit dwords: a receives the interleaved low
+ * dwords of a and b, t the interleaved high dwords. b is left unchanged. */
+#define SBUTTERFLYdq(a,b,t )\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpckldq " #b ", " #a "   \n\t" \
+                  "punpckhdq " #b ", " #t "   \n\t" :: )
+
+/* input ABCD output ADTC
+ * Transposes a 4x4 block of 16-bit words held one row per register, using
+ * two word interleaves followed by two dword interleaves. The transposed
+ * rows end up, in order, in MMA, MMD, MMT and MMC; MMB is used as scratch. */
+#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
+        SBUTTERFLYwd( MMA, MMB, MMT ); \
+        SBUTTERFLYwd( MMC, MMD, MMB ); \
+        SBUTTERFLYdq( MMA, MMC, MMD ); \
+        SBUTTERFLYdq( MMT, MMB, MMC )
+
+/* MMP = (MMP + MM32) >> 6 on 16-bit words (arithmetic shift), then the 4
+ * bytes at dst are widened with MMZ into MMT and added to MMP with signed
+ * saturation; the result is packed to unsigned bytes with saturation and the
+ * low 4 bytes are written back to dst. MMT is scratch. */
+#define MMX_STORE_DIFF_4P( MMP, MMT, MM32, MMZ, dst ) \
+    asm volatile( "paddw     " #MM32 "," #MMP "\n" \
+                  "psraw       $6,     " #MMP "\n" \
+                  "movd        (%0),   " #MMT "\n" \
+                  "punpcklbw " #MMZ ", " #MMT "\n" \
+                  "paddsw    " #MMT ", " #MMP "\n" \
+                  "packuswb  " #MMZ ", " #MMP "\n" \
+                  "movd      " #MMP ",   (%0)\n" :: "r"(dst) )
+
+/* Declare a static, 16-byte aligned 64-bit constant whose assembler-level
+ * name is exactly `foo` (set with the __asm__ label), so that the inline asm
+ * strings in this file can refer to it by name. It is marked unused because
+ * the C code itself never reads it. */
+#define UNUSED_LONGLONG( foo ) \
+    static const unsigned long long foo __asm__ (#foo)  __attribute__((unused)) __attribute__((aligned(16)))
+
+/* Four 16-bit words of 32 and of 1, used as rounding terms before the >>6
+ * and >>1 shifts below. */
+UNUSED_LONGLONG( x264_mmx_32 ) = 0x0020002000200020ULL;
+UNUSED_LONGLONG( x264_mmx_1 ) = 0x0001000100010001ULL;
+
+
+/*
+ * XXX For all dct dc : input could be equal to output so ...
+ */
+/* In-place transform of the 4x4 block of 16-bit values at d.
+ * The four rows are loaded into mm0..mm3, then two passes are applied, each
+ * made of two butterfly stages followed by a transpose. Finally every value
+ * is replaced by (x + 1) >> 1 and the rows are written back to d.
+ * NOTE(review): the MMX registers carry values from one asm statement to the
+ * next and no statement declares clobbers; this depends on the compiler not
+ * touching mm0..mm7 (or reordering the memory accesses) in between. */
+void x264_dct4x4dc_mmxext( int16_t d[4][4] )
+{
+    /* load DCT */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n" :: "r"(d) );
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 );  /* mm1=s01  mm0=d01  mm3=s23  mm2=d23 */
+    MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 );  /* mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23 */
+
+    /* in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 */
+    MMX_TRANSPOSE  ( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 );  /* mm2=s01  mm3=d01  mm0=s23  mm4=d23 */
+    MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 );  /* mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23 */
+
+    /* in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3 */
+    MMX_TRANSPOSE  ( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
+
+
+    asm volatile( "movq x264_mmx_1, %%mm6" :: );
+
+    /* Store back */
+    asm volatile(
+        "paddw %%mm6, %%mm0\n"
+        "paddw %%mm6, %%mm4\n"
+
+        "psraw $1,    %%mm0\n"
+        "movq  %%mm0,   (%0)\n"
+        "psraw $1,    %%mm4\n"
+        "movq  %%mm4,  8(%0)\n"
+
+        "paddw %%mm6, %%mm1\n"
+        "paddw %%mm6, %%mm3\n"
+
+        "psraw $1,    %%mm1\n"
+        "movq  %%mm1, 16(%0)\n"
+        "psraw $1,    %%mm3\n"
+        "movq  %%mm3, 24(%0)\n" :: "r"(d) );
+}
+
+/* In-place transform of the 4x4 block of 16-bit values at d.
+ * Same sequence as x264_dct4x4dc_mmxext (two passes of two butterfly stages
+ * plus a transpose) but the results are stored back without the final
+ * (x + 1) >> 1 step.
+ * NOTE(review): the MMX registers carry values from one asm statement to the
+ * next and no statement declares clobbers; this depends on the compiler not
+ * touching mm0..mm7 (or reordering the memory accesses) in between. */
+void x264_idct4x4dc_mmxext( int16_t d[4][4] )
+{
+    /* load DCT */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n" 
+        "movq 24(%0), %%mm3\n" :: "r"(d) );
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 );  /* mm1=s01  mm0=d01  mm3=s23  mm2=d23 */
+    MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 );  /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */
+
+    /* in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 */
+    MMX_TRANSPOSE( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 );  /* mm2=s01  mm3=d01  mm0=s23  mm4=d23 */
+    MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 );  /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */
+
+    /* in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3 */
+    MMX_TRANSPOSE( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
+
+    /* Store back */
+    asm volatile(
+        "movq %%mm0,   (%0)\n"
+        "movq %%mm4,  8(%0)\n"
+        "movq %%mm1, 16(%0)\n" 
+        "movq %%mm3, 24(%0)\n" :: "r"(d) );
+}
+
+/****************************************************************************
+ * subXxX_dct:
+ ****************************************************************************/
+/* Computes the 4x4 difference pix1 - pix2 (4 bytes per row, rows i_pix1 and
+ * i_pix2 bytes apart) as 16-bit words, applies two passes of butterflies
+ * followed by a transpose, and stores the 16 resulting words to dct.
+ * mm7 is cleared first and used as the zero register for byte widening.
+ * NOTE(review): the MMX registers carry values from one asm statement to the
+ * next and no statement declares clobbers; this depends on the compiler not
+ * touching mm0..mm7 (or reordering the memory accesses) in between. */
+inline void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    /* Reset mm7 */
+    MMX_ZERO( %%mm7 );
+
+    /* Load 4 lines */
+    MMX_LOAD_DIFF_4P( %%mm0, %%mm6, %%mm7, &pix1[0*i_pix1], &pix2[0*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm1, %%mm6, %%mm7, &pix1[1*i_pix1], &pix2[1*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm2, %%mm6, %%mm7, &pix1[2*i_pix1], &pix2[2*i_pix2] );
+    MMX_LOAD_DIFF_4P( %%mm3, %%mm6, %%mm7, &pix1[3*i_pix1], &pix2[3*i_pix2] );
+
+    MMX_SUMSUB_BADC( %%mm3, %%mm0, %%mm2, %%mm1 );  /* mm3=s03  mm0=d03  mm2=s12  mm1=d12 */
+
+    MMX_SUMSUB_BA(  %%mm2, %%mm3 );                 /* mm2=s03+s12      mm3=s03-s12 */
+    MMX_SUMSUB2_AB( %%mm0, %%mm1, %%mm4 );          /* mm0=2.d03+d12    mm4=d03-2.d12 */
+
+    /* transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 */
+    MMX_TRANSPOSE( %%mm2, %%mm0, %%mm3, %%mm4, %%mm1 );
+
+    MMX_SUMSUB_BADC( %%mm3, %%mm2, %%mm1, %%mm4 );  /* mm3=s03  mm2=d03  mm1=s12  mm4=d12 */
+
+    MMX_SUMSUB_BA(  %%mm1, %%mm3 );                 /* mm1=s03+s12      mm3=s03-s12 */
+    MMX_SUMSUB2_AB( %%mm2, %%mm4, %%mm0 );          /* mm2=2.d03+d12    mm0=d03-2.d12 */
+
+    /* transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 */
+    MMX_TRANSPOSE( %%mm1, %%mm2, %%mm3, %%mm0, %%mm4 );
+
+    /* Store back */
+    asm volatile(
+        "movq %%mm1, (%0)\n"
+        "movq %%mm0, 8(%0)\n"
+        "movq %%mm4, 16(%0)\n"
+        "movq %%mm3, 24(%0)\n" :: "r"(dct) );
+}
+#endif
+
+/* Transform an 8x8 difference block as four 4x4 blocks, visited in the order
+ * top-left, top-right, bottom-left, bottom-right; block n goes to dct[n]. */
+void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i_blk;
+
+    for( i_blk = 0; i_blk < 4; i_blk++ )
+    {
+        /* bit 0 selects the column, bit 1 the row of the 4x4 sub-block */
+        const int x = 4 * ( i_blk & 1 );
+        const int y = 4 * ( i_blk >> 1 );
+
+        x264_sub4x4_dct_mmxext( dct[i_blk], &pix1[y*i_pix1 + x], i_pix1, &pix2[y*i_pix2 + x], i_pix2 );
+    }
+}
+
+/* Transform a 16x16 difference block as four 8x8 blocks, visited in the
+ * order top-left, top-right, bottom-left, bottom-right; 8x8 block n fills
+ * dct[4*n] .. dct[4*n+3]. */
+void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i_blk;
+
+    for( i_blk = 0; i_blk < 4; i_blk++ )
+    {
+        /* bit 0 selects the column, bit 1 the row of the 8x8 sub-block */
+        const int x = 8 * ( i_blk & 1 );
+        const int y = 8 * ( i_blk >> 1 );
+
+        x264_sub8x8_dct_mmxext( &dct[4*i_blk], &pix1[y*i_pix1 + x], i_pix1, &pix2[y*i_pix2 + x], i_pix2 );
+    }
+}
+
+
+
+/****************************************************************************
+ * addXxX_idct:
+ ****************************************************************************/
+#if 0
+/* Loads the 4x4 block of 16-bit coefficients at dct, applies two passes of
+ * butterflies followed by a transpose, then for each of the 4 rows computes
+ * (x + 32) >> 6, adds the 4 bytes already present at p_dst (rows i_dst bytes
+ * apart) with saturation and writes the 4 resulting bytes back.
+ * NOTE(review): the MMX registers carry values from one asm statement to the
+ * next and no statement declares clobbers; this depends on the compiler not
+ * touching mm0..mm7 (or reordering the memory accesses) in between. */
+inline void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+{
+    /* Load dct coeffs */
+    asm volatile(
+        "movq   (%0), %%mm0\n"
+        "movq  8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n" :: "r"(dct) );
+
+    MMX_SUMSUB_BA  ( %%mm2, %%mm0 );                /* mm2=s02  mm0=d02 */
+    MMX_SUMSUBD2_AB( %%mm1, %%mm3, %%mm5, %%mm4 );  /* mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
+
+    MMX_SUMSUB_BADC( %%mm1, %%mm2, %%mm4, %%mm0 );  /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
+
+    /* in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0 */
+    MMX_TRANSPOSE  ( %%mm1, %%mm4, %%mm0, %%mm2, %%mm3 );
+
+    MMX_SUMSUB_BA  ( %%mm3, %%mm1 );                /* mm3=s02  mm1=d02 */
+    MMX_SUMSUBD2_AB( %%mm2, %%mm0, %%mm5, %%mm4 );  /* mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
+
+    MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm4, %%mm1 );  /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
+
+    /* in: mm2, mm4, mm1, mm3  out: mm2, mm3, mm0, mm1 */
+    MMX_TRANSPOSE  ( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 );
+
+    MMX_ZERO( %%mm7 );
+    asm volatile( "movq x264_mmx_32, %%mm6\n" :: );
+
+    MMX_STORE_DIFF_4P( %%mm2, %%mm4, %%mm6, %%mm7, &p_dst[0*i_dst] );
+    MMX_STORE_DIFF_4P( %%mm3, %%mm4, %%mm6, %%mm7, &p_dst[1*i_dst] );
+    MMX_STORE_DIFF_4P( %%mm0, %%mm4, %%mm6, %%mm7, &p_dst[2*i_dst] );
+    MMX_STORE_DIFF_4P( %%mm1, %%mm4, %%mm6, %%mm7, &p_dst[3*i_dst] );
+}
+#endif
+
+/* Inverse-transform four 4x4 coefficient blocks and add them to the 8x8
+ * pixel area at p_dst, in the order top-left, top-right, bottom-left,
+ * bottom-right; dct[n] is used for block n. */
+void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+{
+    int i_blk;
+
+    for( i_blk = 0; i_blk < 4; i_blk++ )
+    {
+        /* bit 0 selects the column, bit 1 the row of the 4x4 sub-block */
+        const int x = 4 * ( i_blk & 1 );
+        const int y = 4 * ( i_blk >> 1 );
+
+        x264_add4x4_idct_mmxext( &p_dst[y*i_dst + x], i_dst, dct[i_blk] );
+    }
+}
+
+/* Inverse-transform sixteen 4x4 coefficient blocks and add them to the 16x16
+ * pixel area at p_dst, handled as four 8x8 blocks in the order top-left,
+ * top-right, bottom-left, bottom-right; 8x8 block n uses dct[4*n] onwards. */
+void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+{
+    int i_blk;
+
+    for( i_blk = 0; i_blk < 4; i_blk++ )
+    {
+        /* bit 0 selects the column, bit 1 the row of the 8x8 sub-block */
+        const int x = 8 * ( i_blk & 1 );
+        const int y = 8 * ( i_blk >> 1 );
+
+        x264_add8x8_idct_mmxext( &p_dst[y*i_dst + x], i_dst, &dct[4*i_blk] );
+    }
+}
--- a/core/i386/dct.asm
+++ b/core/i386/dct.asm
@ -0,0 +1,313 @@
+;*****************************************************************************
+;* dct.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
+;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2004.04.28  ported all 4x4 functions to nasm (CM)                        *
+;*                                                                           *
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+; cglobal <symbol>
+;   Exports <symbol> with the `global` directive. When PREFIX is defined the
+;   exported name is _<symbol>, and a %define makes every later use of
+;   <symbol> in this file expand to the underscored name.
+%macro cglobal 1
+	%ifdef PREFIX
+		global _%1
+		%define %1 _%1
+	%else
+		global %1
+	%endif
+%endmacro
+
+; MMX_ZERO reg
+;   Clears <reg> by xor-ing it with itself.
+%macro MMX_ZERO 1
+    pxor    %1, %1
+%endmacro
+
+; MMX_LOAD_DIFF_4P diff, tmp, zero, src1, src2
+;   Loads 4 bytes from <src1> into <diff> and 4 bytes from <src2> into <tmp>,
+;   widens both to 16-bit words by interleaving with <zero> (cleared by the
+;   caller in this file), then diff = diff - tmp.
+%macro MMX_LOAD_DIFF_4P 5
+    movd        %1, %4
+    punpcklbw   %1, %3
+    movd        %2, %5
+    punpcklbw   %2, %3
+    psubw       %1, %2
+%endmacro
+
+; MMX_SUMSUB_BA a, b
+;   a = a + b,  b = b - a  (both expressed with the input values).
+;   The difference is obtained as 2*b - (a+b), so no temporary is needed.
+%macro MMX_SUMSUB_BA 2
+    paddw   %1, %2
+    paddw   %2, %2
+    psubw   %2, %1
+%endmacro
+
+; MMX_SUMSUB_BADC a, b, c, d
+;   Two MMX_SUMSUB_BA butterflies with their instructions interleaved:
+;   a = a + b,  b = b - a  and  c = c + d,  d = d - c  (input values).
+%macro MMX_SUMSUB_BADC 4
+    paddw   %1, %2
+    paddw   %3, %4
+    paddw   %2, %2
+    paddw   %4, %4
+    psubw   %2, %1
+    psubw   %4, %3
+%endmacro
+
+; MMX_SUMSUB2_AB a, b, t
+;   a = 2*a + b,  t = a - 2*b  (input values); b is left unchanged.
+%macro MMX_SUMSUB2_AB 3
+    movq    %3, %1
+    paddw   %1, %1
+    paddw   %1, %2
+    psubw   %3, %2
+    psubw   %3, %2
+%endmacro
+
+; MMX_SUMSUBD2_AB a, b, t, s
+;   a = a + (b>>1),  s = (a>>1) - b  (input values, arithmetic shifts on
+;   16-bit words). <t> receives a copy of the input b, and <b> is left
+;   holding b>>1.
+;   The shift counts are written as plain decimal immediates, like the other
+;   shifts in this file; a `$`-prefixed number is a hexadecimal literal in NASM.
+%macro MMX_SUMSUBD2_AB 4
+    movq    %4, %1
+    movq    %3, %2
+    psraw   %2, 1
+    psraw   %4, 1
+    paddw   %1, %2
+    psubw   %4, %3
+%endmacro
+
+; SBUTTERFLYwd a, b, t
+;   Interleaves the 16-bit words of a and b: <t> is first loaded with a copy
+;   of a, then <a> receives the interleaved low halves and <t> the
+;   interleaved high halves. <b> is left unchanged.
+%macro SBUTTERFLYwd 3
+    movq        %3, %1
+    punpcklwd   %1, %2
+    punpckhwd   %3, %2
+%endmacro
+
+; SBUTTERFLYdq a, b, t
+;   Same as SBUTTERFLYwd but on 32-bit dwords: <a> receives the interleaved
+;   low dwords of a and b, <t> the interleaved high dwords.
+%macro SBUTTERFLYdq 3
+    movq        %3, %1
+    punpckldq   %1, %2
+    punpckhdq   %3, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; input ABCD output ADTC
+;
+; MMX_TRANSPOSE a, b, c, d, t
+;   Transposes a 4x4 block of 16-bit words held one row per register, using
+;   two word interleaves followed by two dword interleaves. The transposed
+;   rows end up, in order, in <a>, <d>, <t> and <c>; <b> is used as scratch.
+;-----------------------------------------------------------------------------
+%macro MMX_TRANSPOSE 5
+    SBUTTERFLYwd %1, %2, %5
+    SBUTTERFLYwd %3, %4, %2
+    SBUTTERFLYdq %1, %3, %4
+    SBUTTERFLYdq %5, %2, %3
+%endmacro
+
+; MMX_STORE_DIFF_4P val, tmp, round, zero, dst
+;   val = (val + round) >> 6 on 16-bit words (arithmetic shift); the 4 bytes
+;   at <dst> are then widened with <zero> into <tmp> and added to val with
+;   signed saturation. The result is packed to unsigned bytes with saturation
+;   and its low 4 bytes are written back to <dst>.
+;   The shift count is written as a plain decimal immediate, like the other
+;   shifts in this file; a `$`-prefixed number is a hexadecimal literal in NASM.
+%macro MMX_STORE_DIFF_4P 5
+    paddw       %1, %3
+    psraw       %1, 6
+    movd        %2, %5
+    punpcklbw   %2, %4
+    paddsw      %1, %2
+    packuswb    %1, %1
+    movd        %5, %1
+%endmacro
+
+;%macro 
+;%endmacro
+
+;=============================================================================
+; Local Data (Read Only)
+;=============================================================================
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata data align=16
+%endif
+
+;-----------------------------------------------------------------------------
+; Various memory constants (rounding values)
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+; Four 16-bit words of 1: rounding term added before the >>1 in
+; x264_dct4x4dc_mmxext.
+x264_mmx_1:
+  dw 1, 1, 1, 1
+
+; Four 16-bit words of 32: rounding term added before the >>6 in
+; x264_add4x4_idct_mmxext (through MMX_STORE_DIFF_4P).
+x264_mmx_32:
+  dw 32, 32, 32, 32
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_dct4x4dc_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_dct4x4dc_mmxext( int16_t d[4][4] )
+;
+;   In-place transform of the 4x4 block of 16-bit words at d.
+;   The four rows are loaded into mm0..mm3, then two passes are applied, each
+;   made of two butterfly stages followed by a transpose. Every word is then
+;   replaced by (x + 1) >> 1 and the rows are written back to d.
+;   Uses eax and mm0..mm4, mm6; no EMMS is executed here.
+;-----------------------------------------------------------------------------
+x264_dct4x4dc_mmxext:
+    mov     eax,        [esp+ 4]
+    movq    mm0,        [eax+ 0]
+    movq    mm1,        [eax+ 8]
+    movq    mm2,        [eax+16]
+    movq    mm3,        [eax+24]
+
+    MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
+    MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23
+
+    MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 
+
+    MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
+    MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
+
+    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
+
+    ; round: (x + 1) >> 1 on every word, then store the four rows back
+    movq    mm6,        [x264_mmx_1]
+    paddw   mm0,        mm6
+    paddw   mm4,        mm6
+    psraw   mm0,        1
+    movq    [eax+ 0],   mm0
+    psraw   mm4,        1
+    movq    [eax+ 8],   mm4
+    paddw   mm1,        mm6
+    paddw   mm3,        mm6
+    psraw   mm1,        1
+    movq    [eax+16],   mm1
+    psraw   mm3,        1
+    movq    [eax+24],   mm3
+    ret
+
+cglobal x264_idct4x4dc_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] )
+;
+;   In-place transform of the 4x4 block of 16-bit words at d.
+;   Same sequence as x264_dct4x4dc_mmxext (two passes of two butterfly stages
+;   plus a transpose), but the rows are stored back without any rounding or
+;   shift. Uses eax and mm0..mm4; no EMMS is executed here.
+;-----------------------------------------------------------------------------
+x264_idct4x4dc_mmxext:
+    mov     eax, [esp+ 4]
+    movq    mm0, [eax+ 0]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+    MMX_SUMSUB_BADC     mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
+    MMX_SUMSUB_BADC     mm3, mm1, mm2, mm0          ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
+
+    MMX_TRANSPOSE       mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0 
+
+    MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
+    MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
+
+    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
+
+    movq    [eax+ 0],   mm0
+    movq    [eax+ 8],   mm4
+    movq    [eax+16],   mm1
+    movq    [eax+24],   mm3
+    ret
+
+cglobal x264_sub4x4_dct_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+;
+;   Computes the 4x4 difference pix1 - pix2 (4 bytes per row, rows i_pix1 and
+;   i_pix2 bytes apart) as 16-bit words, applies two passes of butterflies
+;   followed by a transpose, and stores the 16 resulting words to dct.
+;   ebx is saved and restored; eax, ecx, edx and mm0..mm4, mm6, mm7 are
+;   overwritten. No EMMS is executed here.
+;   Stack offsets below are 4 bytes larger than the cdecl ones because of the
+;   push of ebx.
+;-----------------------------------------------------------------------------
+x264_sub4x4_dct_mmxext:
+    push    ebx
+    mov     eax, [esp+12]   ; pix1
+    mov     ebx, [esp+16]   ; i_pix1
+    mov     ecx, [esp+20]   ; pix2
+    mov     edx, [esp+24]   ; i_pix2
+
+    MMX_ZERO    mm7
+
+    ; Load 4 lines
+    MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [eax      ], [ecx]
+    MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [eax+ebx  ], [ecx+edx]
+    MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+    ; advance both pointers by one row so that base+stride*2 addresses row 3
+    add     eax, ebx
+    add     ecx, edx
+    MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+
+    MMX_SUMSUB_BADC     mm3, mm0, mm2, mm1          ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
+
+    MMX_SUMSUB_BA       mm2, mm3                    ; mm2=s03+s12      mm3=s03-s12
+    MMX_SUMSUB2_AB      mm0, mm1, mm4               ; mm0=2.d03+d12    mm4=d03-2.d12
+
+    ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
+    MMX_TRANSPOSE       mm2, mm0, mm3, mm4, mm1
+
+    MMX_SUMSUB_BADC     mm3, mm2, mm1, mm4          ; mm3=s03  mm2=d03  mm1=s12  mm4=d12
+
+    MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
+    MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
+
+    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
+    MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
+
+    mov     eax, [esp+ 8]   ; dct
+    movq    [eax+ 0],   mm1
+    movq    [eax+ 8],   mm0
+    movq    [eax+16],   mm4
+    movq    [eax+24],   mm3
+
+    pop     ebx
+    ret
+
+cglobal x264_add4x4_idct_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+;
+;   Loads the 4x4 block of 16-bit coefficients at dct, applies two passes of
+;   butterflies followed by a transpose, then for each of the 4 rows computes
+;   (x + 32) >> 6, adds the 4 bytes already present at p_dst (rows i_dst
+;   bytes apart) with saturation and writes the 4 resulting bytes back.
+;   Overwrites eax, ecx, edx and mm0..mm7; no EMMS is executed here.
+;-----------------------------------------------------------------------------
+x264_add4x4_idct_mmxext:
+
+    ; Load dct coeffs
+    mov     eax, [esp+12]   ; dct
+    movq    mm0, [eax+ 0]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+    
+    mov     eax, [esp+ 4]   ; p_dst
+    mov     ecx, [esp+ 8]   ; i_dst
+    lea     edx, [ecx+ecx*2]    ; edx = 3 * i_dst, offset of the last row
+
+    MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
+    MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
+
+    MMX_SUMSUB_BADC     mm1, mm2, mm4, mm0              ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13
+
+    ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
+    MMX_TRANSPOSE       mm1, mm4, mm0, mm2, mm3
+
+    MMX_SUMSUB_BA       mm3, mm1                        ; mm3=s02  mm1=d02
+    MMX_SUMSUBD2_AB     mm2, mm0, mm5, mm4              ; mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
+
+    MMX_SUMSUB_BADC     mm2, mm3, mm4, mm1              ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13
+
+    ; in: mm2, mm4, mm1, mm3  out: mm2, mm3, mm0, mm1
+    MMX_TRANSPOSE       mm2, mm4, mm1, mm3, mm0
+
+    ; mm7 = zero register for byte widening, mm6 = rounding term (32)
+    MMX_ZERO            mm7
+    movq                mm6, [x264_mmx_32]
+    
+    MMX_STORE_DIFF_4P   mm2, mm4, mm6, mm7, [eax]
+    MMX_STORE_DIFF_4P   mm3, mm4, mm6, mm7, [eax+ecx]
+    MMX_STORE_DIFF_4P   mm0, mm4, mm6, mm7, [eax+ecx*2]
+    MMX_STORE_DIFF_4P   mm1, mm4, mm6, mm7, [eax+edx]
+
+    ret
+
--- a/core/i386/dct.h
+++ b/core/i386/dct.h
@ -0,0 +1,38 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: dct.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_DCT_H
+#define _I386_DCT_H 1
+
+/* The prototypes below use int16_t and uint8_t; include <stdint.h> here so
+ * that this header can be included on its own. */
+#include <stdint.h>
+
+/* Transform the difference pix1 - pix2 and store the coefficients to dct.
+ * i_pix1 / i_pix2 are the distances in bytes between two rows of pix1 / pix2.
+ * The 8x8 and 16x16 variants process their 4x4 sub-blocks in the order
+ * top-left, top-right, bottom-left, bottom-right at each level. */
+void x264_sub4x4_dct_mmxext( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+
+/* Inverse-transform the coefficients in dct and add the result to the pixels
+ * at p_dst (rows i_dst bytes apart), saturating to the unsigned byte range.
+ * Sub-blocks are processed in the same order as for the sub* functions. */
+void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+
+/* In-place 4x4 transforms of the block d. */
+void x264_dct4x4dc_mmxext( int16_t d[4][4] );
+void x264_idct4x4dc_mmxext( int16_t d[4][4] );
+
+#endif
--- a/core/i386/mc-c.c
+++ b/core/i386/mc-c.c
@ -0,0 +1,940 @@
+/*****************************************************************************
+ * mc-c.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../mc.h"
+#include "../clip1.h"
+#include "mc.h"
+
+/* Define a file-local 64-bit constant that is referenced only by name from
+ * inline-asm text (see MMX_INIT).
+ *  - __asm__(#foo) makes the assembler-level symbol equal to the C identifier
+ *    so the asm string can name it.
+ *  - __attribute__((used)) forces the object to be emitted even though no C
+ *    expression refers to it; with only "unused" an optimizing compiler is
+ *    free to discard it, and the asm reference then fails at link time.
+ * The macro keeps its historical name for its existing users. */
+#define UNUSED_UINT64( foo ) \
+    static const uint64_t foo __asm__ (#foo)  __attribute__((used))
+
+/* 0x0010 (= 16) in each of the four 16-bit lanes: the rounding term that
+ * MMX_FILTERTAP_P3 adds before its arithmetic shift right by 5. */
+UNUSED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;
+
+
+/* Clear an MMX register (xor with itself).  The argument is the register
+ * spelled as it must appear in an asm template with operands, e.g. %%mm7. */
+#define MMX_ZERO( MMZ ) \
+    asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
+
+/* Load the 64-bit object whose assembler symbol is NAME into register MMV.
+ * NAME is pasted into the asm text, so it must be a symbol the assembler can
+ * resolve (see UNUSED_UINT64). */
+#define MMX_INIT( MMV, NAME ) \
+    asm volatile( "movq " #NAME ", " #MMV "\n" :: )
+
+/* The asm statements below access memory through a pointer that is passed in
+ * a register, so the compiler cannot see the access.  The "memory" clobber
+ * tells it that memory is read/written, so that C-level loads and stores to
+ * the same buffers are neither cached across nor moved around the asm. */
+
+/* Pack the four 16-bit lanes of MMP to unsigned bytes with saturation (MMZ
+ * supplies the upper four lanes) and store the low 4 bytes at dst.
+ * MMP is modified. */
+#define MMX_SAVE_4P( MMP, MMZ, dst ) \
+    asm volatile( "packuswb " #MMZ  "," #MMP "\n" \
+                  "movd " #MMP ", (%0)" :: "r"(dst) : "memory" )
+
+/* Load 4 bytes from pix into MMP and widen them to four 16-bit lanes by
+ * interleaving with MMZ (expected to hold zero). */
+#define MMX_LOAD_4P( MMP, MMZ, pix ) \
+    asm volatile( "movd (%0), " #MMP "\n" \
+                  "punpcklbw  " #MMZ ", " #MMP "\n" : : "r"(pix) : "memory" )
+
+/* Load four consecutive rows of 4 pixels (row stride i_pix) into MMP1..MMP4. */
+#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
+    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
+    MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
+    MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )
+
+/* Load two consecutive rows of 4 pixels (row stride i_pix) into MMP1, MMP2. */
+#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\
+    MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )
+
+/* As with the 4-pixel helpers, memory is reached through a register operand,
+ * so each asm carries a "memory" clobber to keep the compiler from caching or
+ * reordering C-level accesses to the same buffers around it. */
+
+/* Pack MMP1 (low four pixels) and MMP2 (high four pixels) from 16-bit lanes to
+ * unsigned bytes with saturation and store the 8 bytes at dst.  MMP1 is
+ * modified; MMZ is accepted for symmetry but not used. */
+#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \
+    asm volatile( "packuswb " #MMP2  "," #MMP1 "\n" \
+                  "movq " #MMP1 ", (%0)\n" :: "r"(dst) : "memory" )
+
+
+/* Load 8 bytes from pix and widen them to 16-bit lanes: MMP1 receives pixels
+ * 0..3, MMP2 pixels 4..7.  MMZ is expected to hold zero. */
+#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \
+    asm volatile( "movq         (%0)   , " #MMP1 "\n" \
+                  "movq       " #MMP1 ", " #MMP2 "\n" \
+                  "punpcklbw  " #MMZ  ", " #MMP1 "\n" \
+                  "punpckhbw  " #MMZ  ", " #MMP2 "\n" : : "r"(pix) : "memory" )
+
+/* Load two consecutive rows of 8 pixels (row stride i_pix):
+ * row 0 -> MMP1 (low) / MMP2 (high), row 1 -> MMP3 (low) / MMP4 (high). */
+#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
+    MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
+    MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )
+
+/* Interleave the 16-bit lanes of a and b:
+ *   a <- low halves interleaved  (a0 b0 a1 b1)
+ *   t <- high halves interleaved (a2 b2 a3 b3)
+ * b is left untouched, t is overwritten. */
+#define SBUTTERFLYwd(a,b,t )\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpcklwd " #b ", " #a "   \n\t" \
+                  "punpckhwd " #b ", " #t "   \n\t" :: )
+
+/* Same as SBUTTERFLYwd but interleaving 32-bit halves. */
+#define SBUTTERFLYdq(a,b,t )\
+    asm volatile( "movq " #a ", " #t "        \n\t" \
+                  "punpckldq " #b ", " #a "   \n\t" \
+                  "punpckhdq " #b ", " #t "   \n\t" :: )
+
+/* Transpose a 4x4 matrix of 16-bit values held one row per register.
+ * input ABCD output ADTC  ( or 0?31-2->0123 )
+ * i.e. rows are passed in MMA, MMB, MMC, MMD and the transposed rows come
+ * back in MMA, MMD, MMT, MMC (in that order); MMB is clobbered. */
+#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
+        SBUTTERFLYwd( MMA, MMB, MMT ); \
+        SBUTTERFLYwd( MMC, MMD, MMB ); \
+        SBUTTERFLYdq( MMA, MMC, MMD ); \
+        SBUTTERFLYdq( MMT, MMB, MMC )
+
+/* The 6-tap filter (1, -5, 20, 20, -5, 1) is accumulated in three passes on
+ * 16-bit lanes.  Multiplications are done with shifts: x*5 = x + (x<<2),
+ * x*20 = (x<<2) + (x<<4).  The source registers being scaled are destroyed. */
+
+/* first pass MM0 = MM0 -5*MM1   (MMP1 is left holding 4*MM1) */
+#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" :: )
+                                                   \
+/* second pass MM0 = MM0 + 20*(MM2+MM3)   (MMP2 is destroyed) */
+#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
+    asm volatile( "paddw    " #MMP3 "," #MMP2 "\n" \
+                                                 \
+                  "psllw      $2,     " #MMP2 "\n" \
+                  "paddw    " #MMP2 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP2 "\n" \
+                  "paddw    " #MMP2 "," #MMP0 "\n" :: )
+
+/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5
+ * MMV is the rounding term; the shift is arithmetic, so negative sums stay
+ * negative.  MMZ is accepted but not used. */
+#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" \
+                                                   \
+                  "paddw    " #MMP2 "," #MMP0 "\n" \
+                  "paddw    " #MMV  "," #MMP0 "\n" \
+                  "psraw      $5,     " #MMP0 "\n" :: )
+
+/* Two first passes interleaved: MMP0 -= 5*MMP1 and MMP2 -= 5*MMP3. */
+#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \
+    asm volatile( "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psubw    " #MMP3 "," #MMP2 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP3 "\n" \
+                  "psubw    " #MMP1 "," #MMP0 "\n" \
+                  "psubw    " #MMP3 "," #MMP2 "\n" :: )
+
+/* second pass MM0 = MM0 + 20*(MM1+MM2)
+ * Two second passes interleaved: MMP0 += 20*(MMP1+MMP2) and
+ * MMP3 += 20*(MMP4+MMP5).  MMP1 and MMP4 are destroyed. */
+#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
+    asm volatile( "paddw    " #MMP2 "," #MMP1 "\n" \
+                  "paddw    " #MMP5 "," #MMP4 "\n" \
+                                                 \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP4 "\n" \
+                  "paddw    " #MMP1 "," #MMP0 "\n" \
+                  "paddw    " #MMP4 "," #MMP3 "\n" \
+                  "psllw      $2,     " #MMP1 "\n" \
+                  "psllw      $2,     " #MMP4 "\n" \
+                  "paddw    " #MMP1 "," #MMP0 "\n" \
+                  "paddw    " #MMP4 "," #MMP3 "\n" :: )
+
+/* Raw 8-byte moves between MMX registers and memory ("r" = register row).
+ * In the multi-row forms, row k lives at (uint8_t*)dst + k*i_dst.
+ * Memory is reached through a register operand, so every asm carries a
+ * "memory" clobber: without it the compiler may assume a buffer written only
+ * by these macros is never written, or keep C-level stores pending across a
+ * load.  The macros expand to plain statements and expect a trailing ';'. */
+#define MMX_LOAD_1r( m1, dst ) \
+    asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) : "memory" )
+
+#define MMX_SAVE_1r( m1, dst ) \
+    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) : "memory" )
+
+#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \
+    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) : "memory" )
+
+#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \
+    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) : "memory" )
+
+#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \
+    asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) : "memory" ); \
+    asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) : "memory" )
+
+#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \
+    asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) : "memory" ); \
+    asm volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) : "memory" )
+
+
+/* Unnormalised 6-tap filter (1, -5, 20, 20, -5, 1) over the samples at
+ * pix[-2*i_pix_next] .. pix[3*i_pix_next]; i_pix_next is the distance between
+ * neighbouring taps.  No rounding, shift or clipping is applied. */
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{
+    const int i_outer  = pix[-2*i_pix_next] + pix[ 3*i_pix_next];
+    const int i_middle = pix[-1*i_pix_next] + pix[ 2*i_pix_next];
+    const int i_inner  = pix[0]             + pix[ 1*i_pix_next];
+
+    return i_outer - 5 * i_middle + 20 * i_inner;
+}
+/* Unnormalised 6-tap filter (1, -5, 20, 20, -5, 1) over the six adjacent
+ * samples pix[-2] .. pix[3]. */
+static inline int x264_tapfilter1( uint8_t *pix )
+{
+    const int i_outer  = pix[-2] + pix[ 3];
+    const int i_middle = pix[-1] + pix[ 2];
+    const int i_inner  = pix[ 0] + pix[ 1];
+
+    return i_outer - 5 * i_middle + 20 * i_inner;
+}
+
+/* dst = rounded-up average of src1 and src2, 4 pixels per row, i_height rows.
+ * Each of the three planes advances by its own stride after every row. */
+static inline void pixel_avg_w4( uint8_t *dst,  int i_dst_stride,
+                                 uint8_t *src1, int i_src1_stride,
+                                 uint8_t *src2, int i_src2_stride,
+                                 int i_height )
+{
+    int i_rows_left = i_height;
+
+    while( i_rows_left-- > 0 )
+    {
+        int i_col;
+
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int i_sum = src1[i_col] + src2[i_col];
+            dst[i_col] = ( i_sum + 1 ) >> 1;
+        }
+
+        src2 += i_src2_stride;
+        src1 += i_src1_stride;
+        dst  += i_dst_stride;
+    }
+}
+/* dst = rounded-up average of src1 and src2, 8 pixels per row, i_height rows,
+ * using one pavgb per row.  mm0 and mm1 are overwritten and no emms is
+ * issued here.
+ * The asm stores to dst and loads from src1/src2 through register operands,
+ * so it carries a "memory" clobber; otherwise the compiler may treat buffers
+ * touched only by the asm as never read or written. */
+static inline void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
+                                 uint8_t *src1, int i_src1_stride,
+                                 uint8_t *src2, int i_src2_stride,
+                                 int i_height )
+{
+    int y;
+    for( y = 0; y < i_height; y++ )
+    {
+        asm volatile(
+            "movq (%1), %%mm0\n"
+            "movq (%2), %%mm1\n"
+            "pavgb %%mm1, %%mm0\n"
+            "movq %%mm0, (%0)\n"
+            : : "r"(dst), "r"(src1), "r"(src2)
+            : "memory"
+            );
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+/* dst = rounded-up average of src1 and src2, 16 pixels per row, i_height
+ * rows, as two 8-byte pavgb halves per row.  mm0..mm3 are overwritten and no
+ * emms is issued here.
+ * The asm accesses all three buffers through register operands, hence the
+ * "memory" clobber: without it the compiler is not told that dst is written
+ * or that src1/src2 must be up to date in memory. */
+static inline void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
+                                  uint8_t *src1, int i_src1_stride,
+                                  uint8_t *src2, int i_src2_stride,
+                                  int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        asm volatile(
+            "movq (%1), %%mm0\n"
+            "movq 8(%1), %%mm2\n"
+            "movq (%2), %%mm1\n"
+            "movq 8(%2), %%mm3\n"
+
+            "pavgb %%mm1, %%mm0\n"
+            "movq %%mm0, (%0)\n"
+            "pavgb %%mm3, %%mm2\n"
+            "movq %%mm2, 8(%0)\n"
+            : : "r"(dst), "r"(src1), "r"(src2)
+            : "memory"
+            );
+        dst  += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+
+/* Common signature of every fixed-width interpolation routine below; it is
+ * the element type of the dispatch table in motion_compensation_luma. */
+typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+
+/*****************************************************************************
+ * MC with width == 4 (height <= 8)
+ *****************************************************************************/
+/* The C copy in the first branch is compiled out and only documents the
+ * expected behaviour (copy 4 bytes per row for i_height rows).  The function
+ * really used is the external one declared in the #else branch
+ * (NOTE(review): presumably the mc.asm implementation -- confirm). */
+#if 0
+static void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 4 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else
+extern void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+
+/* Horizontal half-pel interpolation of a 4-pixel-wide block.
+ * Rows are processed in groups of four (i_height / 4 groups; leftover rows are
+ * not processed).  Each group is transposed so that one register holds one
+ * source column for four rows, the 6-tap filter is run across columns with the
+ * lane-parallel MMX_FILTERTAP_* macros, and the result is transposed back,
+ * saturated to bytes and stored.  Uses mm0..mm7 and issues no emms. */
+static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    const int h4 = i_height / 4;
+    uint8_t  srct[4*8*3];   /* transposed source: 8 bytes (4 x 16 bit) per source column */
+    uint64_t tmp[4];        /* filtered values, one entry per output column */
+    int y;
+
+    /* the filter window of output pixel x covers columns x-2 .. x+3 */
+    src -= 2;
+
+    MMX_ZERO( %%mm7 );                  /* mm7 = 0, used for byte<->word conversion */
+    MMX_INIT( %%mm6, x264_w0x10 );      /* mm6 = rounding term for the last pass */
+
+    for( y = 0; y < h4; y++ )
+    {
+        int i;
+
+        /* Preload data and transpose them */
+        /* columns 0..3 -> srct rows 0..3 */
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );
+
+        /* columns 4..7 -> srct rows 4..7 */
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );
+
+        /* Only column 8 is needed by the filter, but a full 4-byte group
+         * (columns 8..11) is read from every row; two columns are kept. */
+        MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );
+
+        /* tap filter */
+        for( i = 0; i < 4; i++ )
+        {
+            /* columns i .. i+3: taps 1, -5, 20, 20 */
+            MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );
+            MMX_FILTERTAP_P1( %%mm0, %%mm1 );
+            MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
+
+            /* columns i+4, i+5: taps -5, 1, then round and shift */
+            MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );
+            MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
+
+            MMX_SAVE_1r( %%mm0, &tmp[i] );
+        }
+
+        /* back to one register per row, then pack to bytes and store */
+        MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
+        MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
+        MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
+        MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
+        MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
+        MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );
+
+        src += 4 * i_src;
+        dst += 4 * i_dst;
+    }
+}
+/* Vertical half-pel interpolation of a 4-pixel-wide block: for every output
+ * row the 6-tap filter is applied down the six source rows starting two rows
+ * above it, four pixels in parallel.  Uses mm0..mm7 and issues no emms. */
+static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int y;
+
+    /* the filter window of output row y covers rows y-2 .. y+3 */
+    src -= 2 * i_src;
+
+    MMX_ZERO( %%mm7 );                  /* mm7 = 0, used for byte<->word conversion */
+    MMX_INIT( %%mm6, x264_w0x10 );      /* mm6 = rounding term for the last pass */
+
+    for( y = 0; y < i_height; y++ )
+    {
+        /* rows 0..3 of the window: taps 1, -5, 20, 20 */
+        MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src );
+        MMX_FILTERTAP_P1( %%mm0, %%mm1 );
+        MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
+
+        /* rows 4..5 of the window: taps -5, 1, then round, shift and store */
+        MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src );
+        MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 );
+        MMX_SAVE_4P( %%mm0, %%mm7, dst );
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+/* Centre (half-pel in both directions) interpolation of a 4-pixel-wide block.
+ * Per output row: nine unnormalised vertical 6-tap sums are computed for
+ * columns -2 .. 6, then the same 6-tap kernel is run horizontally over them
+ * and the result is rounded (+512), shifted by 10 and clipped. */
+static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int16_t tap[5+4];
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src_stride, dst += i_dst_stride )
+    {
+        /* vertical pass */
+        for( i_col = 0; i_col < 5+4; i_col++ )
+        {
+            tap[i_col] = x264_tapfilter( &src[i_col - 2], i_src_stride );
+        }
+
+        /* horizontal pass over the intermediate sums */
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int16_t *t = &tap[i_col];
+            const int i_sum = t[0] - 5*t[1] + 20 * t[2] + 20 * t[3] - 5*t[4] + t[5];
+
+            dst[i_col] = x264_mc_clip1( ( i_sum + 512 ) >> 10 );
+        }
+    }
+}
+
+/* mc I+H */
+/* x = 1/4, y = 0: average of the full-pel block and the horizontal half-pel
+ * block. */
+static void mc_xy10_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[4*8];    /* 4 wide, up to 8 rows */
+
+    mc_hh_w4( src, i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, half_h, 4, i_height );
+}
+/* x = 3/4, y = 0: average of the horizontal half-pel block and the full-pel
+ * block one pixel to the right. */
+static void mc_xy30_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[4*8];    /* 4 wide, up to 8 rows */
+
+    mc_hh_w4( src, i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, half_h, 4, i_height );
+}
+/* mc I+V */
+/* x = 0, y = 1/4: average of the full-pel block and the vertical half-pel
+ * block. */
+static void mc_xy01_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8];    /* 4 wide, up to 8 rows */
+
+    mc_hv_w4( src, i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src, i_src_stride, half_v, 4, i_height );
+}
+/* x = 0, y = 3/4: average of the vertical half-pel block and the full-pel
+ * block one row below. */
+static void mc_xy03_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8];    /* 4 wide, up to 8 rows */
+
+    mc_hv_w4( src, i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, half_v, 4, i_height );
+}
+/* H+V */
+/* x = 1/4, y = 1/4: average of the vertical and horizontal half-pel blocks,
+ * both taken at src. */
+static void mc_xy11_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8];
+    uint8_t half_h[4*8];
+
+    mc_hv_w4( src, i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( src, i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+/* x = 3/4, y = 1/4: average of the vertical half-pel block one pixel to the
+ * right and the horizontal half-pel block at src. */
+static void mc_xy31_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8];
+    uint8_t half_h[4*8];
+
+    mc_hv_w4( src+1, i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( src,   i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+/* x = 1/4, y = 3/4: average of the vertical half-pel block at src and the
+ * horizontal half-pel block one row below. */
+static void mc_xy13_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8];
+    uint8_t half_h[4*8];
+
+    mc_hv_w4( src,              i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( src+i_src_stride, i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+/* x = 3/4, y = 3/4: average of the vertical half-pel block one pixel to the
+ * right and the horizontal half-pel block one row below. */
+static void mc_xy33_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[4*8];
+    uint8_t half_h[4*8];
+
+    mc_hv_w4( src+1,            i_src_stride, half_v, 4, i_height );
+    mc_hh_w4( src+i_src_stride, i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_v, 4, half_h, 4, i_height );
+}
+/* x = 1/2, y = 1/4: average of the centre half-pel block and the horizontal
+ * half-pel block at src. */
+static void mc_xy21_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8];
+    uint8_t half_h[4*8];
+
+    mc_hc_w4( src, i_src_stride, half_c, 4, i_height );
+    mc_hh_w4( src, i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_h, 4, i_height );
+}
+/* x = 1/4, y = 1/2: average of the centre half-pel block and the vertical
+ * half-pel block at src. */
+static void mc_xy12_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8];
+    uint8_t half_v[4*8];
+
+    mc_hc_w4( src, i_src_stride, half_c, 4, i_height );
+    mc_hv_w4( src, i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_v, 4, i_height );
+}
+/* x = 3/4, y = 1/2: average of the centre half-pel block and the vertical
+ * half-pel block one pixel to the right. */
+static void mc_xy32_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8];
+    uint8_t half_v[4*8];
+
+    mc_hc_w4( src,   i_src_stride, half_c, 4, i_height );
+    mc_hv_w4( src+1, i_src_stride, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_v, 4, i_height );
+}
+/* x = 1/2, y = 3/4: average of the centre half-pel block and the horizontal
+ * half-pel block one row below. */
+static void mc_xy23_w4( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[4*8];
+    uint8_t half_h[4*8];
+
+    mc_hc_w4( src,              i_src_stride, half_c, 4, i_height );
+    mc_hh_w4( src+i_src_stride, i_src_stride, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst_stride, half_c, 4, half_h, 4, i_height );
+}
+
+
+/*****************************************************************************
+ * MC with width == 8 (height <= 16)
+ *****************************************************************************/
+/* The C copy in the first branch is compiled out and only documents the
+ * expected behaviour (copy 8 bytes per row for i_height rows).  The function
+ * really used is the external one declared in the #else branch
+ * (NOTE(review): presumably the mc.asm implementation -- confirm). */
+#if 0
+static void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 8 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else
+extern void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+
+/* Horizontal half-pel interpolation of an 8-pixel-wide block, done as two
+ * side-by-side 4-pixel-wide passes (left half first). */
+static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int i_x;
+
+    for( i_x = 0; i_x < 8; i_x += 4 )
+    {
+        mc_hh_w4( src + i_x, i_src, dst + i_x, i_dst, i_height );
+    }
+}
+/* Vertical half-pel interpolation of an 8-pixel-wide block.  Every source row
+ * is split into a low half (pixels 0..3) and a high half (pixels 4..7), each
+ * widened to 16-bit lanes; the accumulators are mm0 (low) and mm5 (high).
+ * Uses mm0..mm7 and issues no emms. */
+static inline void mc_hv_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int y;
+
+    /* the filter window of output row y covers rows y-2 .. y+3 */
+    src -= 2 * i_src;
+
+    MMX_ZERO( %%mm7 );                  /* mm7 = 0, used for byte->word conversion */
+    MMX_INIT( %%mm6, x264_w0x10 );      /* mm6 = rounding term for the last pass */
+
+    for( y = 0; y < i_height; y++ )
+    {
+        /* window rows 0,1: acc = row0 - 5*row1 */
+        MMX_LOAD_2x8( %%mm0, %%mm5, %%mm1, %%mm2, %%mm7,  &src[0*i_src], i_src );
+        MMX_FILTERTAP2_P1( %%mm0, %%mm1, %%mm5, %%mm2 );
+
+
+        /* window rows 2,3: acc += 20*(row2 + row3) */
+        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[2*i_src], i_src );
+        MMX_FILTERTAP2_P2( %%mm0, %%mm1, %%mm2, %%mm5, %%mm3, %%mm4 );
+
+        /* window rows 4,5: acc = (acc - 5*row4 + row5 + round) >> 5, per half */
+        MMX_LOAD_2x8( %%mm1, %%mm3, %%mm2, %%mm4, %%mm7,  &src[4*i_src], i_src );
+        MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
+        MMX_FILTERTAP_P3( %%mm5, %%mm3, %%mm4, %%mm6, %%mm7 );
+
+        /* saturate both halves to bytes and store 8 pixels */
+        MMX_SAVEPACK_8P( %%mm0, %%mm5, %%mm7, dst );
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+/* Centre (half-pel in both directions) interpolation of an 8-pixel-wide block.
+ * Per output row, thirteen unnormalised vertical 6-tap sums are needed for
+ * columns -2 .. 10: the asm produces the first twelve (8 + 4 columns) into
+ * tap[0..11], the last one is computed in C.  The same kernel is then run
+ * horizontally in C, followed by rounding (+512), >>10 and clipping.
+ *
+ * Inline-asm contract:
+ *  - %0 (top-left of the window) and %1 (stride) are input-only and are not
+ *    modified; the second group of columns is addressed with an 8-byte
+ *    displacement instead of advancing the pointers.
+ *  - %eax is used as scratch (window row 1) and is declared clobbered.
+ *  - tap[] is written through %2, hence the "memory" clobber so that the C
+ *    code below reads the values the asm stored.
+ * mm0..mm3 and mm7 are overwritten and no emms is issued here. */
+static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int x, y;
+
+    asm volatile( "pxor %%mm7,        %%mm7\n" : : );
+
+    for( y = 0; y < i_height; y++ )
+    {
+        int16_t tap[5+8];
+
+        /* first 8 */
+        asm volatile(
+            "leal   (%0, %1),   %%eax\n"
+
+            "movq       (%0),   %%mm0\n"    /* load pix-2 */
+            "movq       %%mm0,  %%mm2\n"
+            "punpcklbw  %%mm7,  %%mm0\n"
+            "punpckhbw  %%mm7,  %%mm2\n"
+
+            "movq       (%%eax),%%mm1\n"    /* load pix-1 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psubw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "psubw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1),%%mm1\n"  /* load pix */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1,2),%%mm1\n"  /* load pix+1 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       (%0,%1,4),%%mm1\n"  /* load pix+2 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psubw      %%mm3,  %%mm2\n"
+            "psllw      $2,     %%mm3\n"
+            "psubw      %%mm3,  %%mm2\n"
+
+            "movq       (%%eax,%1,4),%%mm1\n"  /* load pix+3 */
+            "movq       %%mm1,  %%mm3\n"
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "punpckhbw  %%mm7,  %%mm3\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "paddw      %%mm3,  %%mm2\n"
+
+            "movq       %%mm0,   (%2)\n"
+            "movq       %%mm2,  8(%2)\n"
+
+
+            /* next 4 columns: same rows, 8 bytes further right */
+            "movd       8(%0),  %%mm0\n"    /* load pix-2 */
+            "punpcklbw  %%mm7,  %%mm0\n"
+
+            "movd       8(%%eax),%%mm1\n"    /* load pix-1 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+
+            "movd       8(%%eax,%1),%%mm1\n"  /* load pix */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movd       8(%%eax,%1,2),%%mm1\n"  /* load pix+1 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movd       8(%0,%1,4),%%mm1\n"  /* load pix+2 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+            "psllw      $2,     %%mm1\n"
+            "psubw      %%mm1,  %%mm0\n"
+
+            "movd       8(%%eax,%1,4),%%mm1\n"  /* load pix+3 */
+            "punpcklbw  %%mm7,  %%mm1\n"
+            "paddw      %%mm1,  %%mm0\n"
+
+            "movq       %%mm0,  16(%2)\n"
+            : : "r"(src-2*i_src_stride-2), "r"(i_src_stride), "r"(&tap[0]) : "%eax", "memory" );
+
+        /* last one */
+        tap[8+4] = x264_tapfilter( &src[-2+8+4], i_src_stride );
+
+        for( x = 0; x < 8; x++ )
+        {
+            dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
+        }
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+
+/* mc I+H */
+/* x = 1/4, y = 0: average of the full-pel block and the horizontal half-pel
+ * block. */
+static void mc_xy10_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[8*16];   /* 8 wide, up to 16 rows */
+
+    mc_hh_w8( src, i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, half_h, 8, i_height );
+}
+/* x = 3/4, y = 0: average of the horizontal half-pel block and the full-pel
+ * block one pixel to the right. */
+static void mc_xy30_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[8*16];   /* 8 wide, up to 16 rows */
+
+    mc_hh_w8( src, i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src+1, i_src_stride, half_h, 8, i_height );
+}
+/* mc I+V */
+/* x = 0, y = 1/4: average of the full-pel block and the vertical half-pel
+ * block. */
+static void mc_xy01_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16];   /* 8 wide, up to 16 rows */
+
+    mc_hv_w8( src, i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src, i_src_stride, half_v, 8, i_height );
+}
+/* x = 0, y = 3/4: average of the vertical half-pel block and the full-pel
+ * block one row below. */
+static void mc_xy03_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16];   /* 8 wide, up to 16 rows */
+
+    mc_hv_w8( src, i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, src+i_src_stride, i_src_stride, half_v, 8, i_height );
+}
+/* H+V */
+/* x = 1/4, y = 1/4: average of the vertical and horizontal half-pel blocks,
+ * both taken at src. */
+static void mc_xy11_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16];
+    uint8_t half_h[8*16];
+
+    mc_hv_w8( src, i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( src, i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+/* x = 3/4, y = 1/4: average of the vertical half-pel block one pixel to the
+ * right and the horizontal half-pel block at src. */
+static void mc_xy31_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16];
+    uint8_t half_h[8*16];
+
+    mc_hv_w8( src+1, i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( src,   i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+/* x = 1/4, y = 3/4: average of the vertical half-pel block at src and the
+ * horizontal half-pel block one row below. */
+static void mc_xy13_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16];
+    uint8_t half_h[8*16];
+
+    mc_hv_w8( src,              i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( src+i_src_stride, i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+/* x = 3/4, y = 3/4: average of the vertical half-pel block one pixel to the
+ * right and the horizontal half-pel block one row below. */
+static void mc_xy33_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[8*16];
+    uint8_t half_h[8*16];
+
+    mc_hv_w8( src+1,            i_src_stride, half_v, 8, i_height );
+    mc_hh_w8( src+i_src_stride, i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_v, 8, half_h, 8, i_height );
+}
+/* x = 1/2, y = 1/4: average of the centre half-pel block and the horizontal
+ * half-pel block at src. */
+static void mc_xy21_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16];
+    uint8_t half_h[8*16];
+
+    mc_hc_w8( src, i_src_stride, half_c, 8, i_height );
+    mc_hh_w8( src, i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_h, 8, i_height );
+}
+/* x = 1/4, y = 1/2: average of the centre half-pel block and the vertical
+ * half-pel block at src. */
+static void mc_xy12_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16];
+    uint8_t half_v[8*16];
+
+    mc_hc_w8( src, i_src_stride, half_c, 8, i_height );
+    mc_hv_w8( src, i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_v, 8, i_height );
+}
+/* x = 3/4, y = 1/2: average of the centre half-pel block and the vertical
+ * half-pel block one pixel to the right. */
+static void mc_xy32_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16];
+    uint8_t half_v[8*16];
+
+    mc_hc_w8( src,   i_src_stride, half_c, 8, i_height );
+    mc_hv_w8( src+1, i_src_stride, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_v, 8, i_height );
+}
+/* x = 1/2, y = 3/4: average of the centre half-pel block and the horizontal
+ * half-pel block one row below. */
+static void mc_xy23_w8( uint8_t *src, int i_src_stride,
+                        uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[8*16];
+    uint8_t half_h[8*16];
+
+    mc_hc_w8( src,              i_src_stride, half_c, 8, i_height );
+    mc_hh_w8( src+i_src_stride, i_src_stride, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst_stride, half_c, 8, half_h, 8, i_height );
+}
+
+
+/*****************************************************************************
+ * MC with width == 16 (height <= 16)
+ *****************************************************************************/
+/* The C copy in the first branch is compiled out and only documents the
+ * expected behaviour (copy 16 bytes per row for i_height rows).  The function
+ * really used is the external one declared in the #else branch
+ * (NOTE(review): presumably the mc.asm implementation -- confirm). */
+#if 0
+static void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int y;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        memcpy( dst, src, 16 );
+
+        src += i_src_stride;
+        dst += i_dst_stride;
+    }
+}
+#else
+extern void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
+#endif
+/* Horizontal half-pel interpolation of a 16-pixel-wide block, done as four
+ * side-by-side 4-pixel-wide passes, left to right. */
+static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
+{
+    int i_x;
+
+    for( i_x = 0; i_x < 16; i_x += 4 )
+    {
+        mc_hh_w4( src + i_x, i_src, dst + i_x, i_dst, i_height );
+    }
+}
+/* Vertical half-pel interpolation of a 16-pixel-wide block, done as two
+ * side-by-side 8-pixel-wide passes (left half first). */
+static inline void mc_hv_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int i_x;
+
+    for( i_x = 0; i_x < 16; i_x += 8 )
+    {
+        mc_hv_w8( src + i_x, i_src_stride, dst + i_x, i_dst_stride, i_height );
+    }
+}
+
+/* Centre half-pel interpolation of a 16-pixel-wide block, done as two
+ * side-by-side 8-pixel-wide passes (left half first). */
+static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+{
+    int i_x;
+
+    for( i_x = 0; i_x < 16; i_x += 8 )
+    {
+        mc_hc_w8( src + i_x, i_src_stride, dst + i_x, i_dst_stride, i_height );
+    }
+}
+
+/* mc I+H */
+/* x = 1/4, y = 0: average of the full-pel block and the horizontal half-pel
+ * block. */
+static void mc_xy10_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[16*16];  /* 16 wide, up to 16 rows */
+
+    mc_hh_w16( src, i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, half_h, 16, i_height );
+}
+/* x = 3/4, y = 0: average of the horizontal half-pel block and the full-pel
+ * block one pixel to the right. */
+static void mc_xy30_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_h[16*16];  /* 16 wide, up to 16 rows */
+
+    mc_hh_w16( src, i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, half_h, 16, i_height );
+}
+/* mc I+V */
+/* x = 0, y = 1/4: average of the full-pel block and the vertical half-pel
+ * block. */
+static void mc_xy01_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16];  /* 16 wide, up to 16 rows */
+
+    mc_hv_w16( src, i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, half_v, 16, i_height );
+}
+/* x = 0, y = 3/4: average of the vertical half-pel block and the full-pel
+ * block one row below. */
+static void mc_xy03_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16];  /* 16 wide, up to 16 rows */
+
+    mc_hv_w16( src, i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, half_v, 16, i_height );
+}
+/* H+V */
+/* x = 1/4, y = 1/4: average of the vertical and horizontal half-pel blocks,
+ * both taken at src. */
+static void mc_xy11_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src, i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( src, i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+/* x = 3/4, y = 1/4: average of the vertical half-pel block one pixel to the
+ * right and the horizontal half-pel block at src. */
+static void mc_xy31_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src+1, i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( src,   i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+/* x = 1/4, y = 3/4: average of the vertical half-pel block at src and the
+ * horizontal half-pel block one row below. */
+static void mc_xy13_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src,              i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( src+i_src_stride, i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+/* x = 3/4, y = 3/4: average of the vertical half-pel block one pixel to the
+ * right and the horizontal half-pel block one row below. */
+static void mc_xy33_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src+1,            i_src_stride, half_v, 16, i_height );
+    mc_hh_w16( src+i_src_stride, i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_v, 16, half_h, 16, i_height );
+}
+/* x = 1/2, y = 1/4: average of the centre half-pel block and the horizontal
+ * half-pel block at src. */
+static void mc_xy21_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hc_w16( src, i_src_stride, half_c, 16, i_height );
+    mc_hh_w16( src, i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_h, 16, i_height );
+}
+/* x = 1/4, y = 1/2: average of the centre half-pel block and the vertical
+ * half-pel block at src. */
+static void mc_xy12_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_v[16*16];
+
+    mc_hc_w16( src, i_src_stride, half_c, 16, i_height );
+    mc_hv_w16( src, i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_v, 16, i_height );
+}
+/* x = 3/4, y = 1/2: average of the centre half-pel block and the vertical
+ * half-pel block one pixel to the right. */
+static void mc_xy32_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_v[16*16];
+
+    mc_hc_w16( src,   i_src_stride, half_c, 16, i_height );
+    mc_hv_w16( src+1, i_src_stride, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_v, 16, i_height );
+}
+/* x = 1/2, y = 3/4: average of the centre half-pel block and the horizontal
+ * half-pel block one row below. */
+static void mc_xy23_w16( uint8_t *src, int i_src_stride,
+                         uint8_t *dst, int i_dst_stride, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hc_w16( src,              i_src_stride, half_c, 16, i_height );
+    mc_hh_w16( src+i_src_stride, i_src_stride, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst_stride, half_c, 16, half_h, 16, i_height );
+}
+
+/* Luma motion compensation entry point.
+ *  src / i_src_stride : reference plane, positioned at the block origin
+ *  dst / i_dst_stride : destination block
+ *  mvx, mvy           : motion vector in quarter-pixel units; the integer part
+ *                       (mv >> 2) offsets src, the fractional part (mv & 3)
+ *                       selects the interpolation routine
+ *  i_width            : 4, 8 or 16; any other value is reported on stderr and
+ *                       nothing is written
+ *  i_height           : number of rows, passed through to the routine
+ */
+static void motion_compensation_luma( uint8_t *src, int i_src_stride,
+                                      uint8_t *dst, int i_dst_stride,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    /* first index: 0 -> width 4, 1 -> width 8, 2 -> width 16 */
+    static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
+    {
+        {
+            { mc_copy_w4,  mc_xy10_w4,    mc_hh_w4,      mc_xy30_w4 },
+            { mc_xy01_w4,  mc_xy11_w4,    mc_xy21_w4,    mc_xy31_w4 },
+            { mc_hv_w4,    mc_xy12_w4,    mc_hc_w4,      mc_xy32_w4 },
+            { mc_xy03_w4,  mc_xy13_w4,    mc_xy23_w4,    mc_xy33_w4 },
+        },
+        {
+            { mc_copy_w8,  mc_xy10_w8,    mc_hh_w8,      mc_xy30_w8 },
+            { mc_xy01_w8,  mc_xy11_w8,    mc_xy21_w8,    mc_xy31_w8 },
+            { mc_hv_w8,    mc_xy12_w8,    mc_hc_w8,      mc_xy32_w8 },
+            { mc_xy03_w8,  mc_xy13_w8,    mc_xy23_w8,    mc_xy33_w8 },
+        },
+        {
+            { mc_copy_w16,  mc_xy10_w16,    mc_hh_w16,      mc_xy30_w16 },
+            { mc_xy01_w16,  mc_xy11_w16,    mc_xy21_w16,    mc_xy31_w16 },
+            { mc_hv_w16,    mc_xy12_w16,    mc_hc_w16,      mc_xy32_w16 },
+            { mc_xy03_w16,  mc_xy13_w16,    mc_xy23_w16,    mc_xy33_w16 },
+        }
+    };
+    int i_size;
+
+    switch( i_width )
+    {
+        case 4:
+            i_size = 0;
+            break;
+        case 8:
+            i_size = 1;
+            break;
+        case 16:
+            i_size = 2;
+            break;
+        default:
+            /* newline-terminated so the message does not run into whatever
+             * is printed to stderr next */
+            fprintf( stderr, "Error: motion_compensation_luma called with invalid width\n" );
+            return;
+    }
+
+    /* integer part of the vector moves the source pointer ... */
+    src += (mvy >> 2) * i_src_stride + (mvx >> 2);
+
+    /* ... and the quarter-pel fraction picks the routine */
+    pf_mc[i_size][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height );
+}
+
+/* Install the routines of this file into the caller's function table.
+ * Only the MC_LUMA slot is assigned; the other entry of pf[] is left as the
+ * caller set it. */
+void x264_mc_mmxext_init( x264_mc_function_t pf[2] )
+{
+    pf[MC_LUMA]   = motion_compensation_luma;
+}
+
--- a/core/i386/mc.asm
+++ b/core/i386/mc.asm
@ -0,0 +1,187 @@
+;*****************************************************************************
+;* mc.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: mc.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
+;*          Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2004.05.17 portab mc_copy_w4/8/16 (CM)                                   *
+;*                                                                           *
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+; cglobal NAME: export NAME as a global symbol.
+; When PREFIX is defined, the exported symbol is _NAME and NAME itself is
+; redefined to _NAME, so later uses of the plain name refer to the
+; underscore-prefixed symbol.
+%macro cglobal 1
+	%ifdef PREFIX
+		global _%1
+		%define %1 _%1
+	%else
+		global %1
+	%endif
+%endmacro
+
+;=============================================================================
+; Local Data (Read Only)
+;=============================================================================
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata data align=16
+%endif
+
+;-----------------------------------------------------------------------------
+; Various memory constants (trigonometric values or rounding values)
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal mc_copy_w4
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w4:
+    ; Copies a 4-byte-wide block from src to dst, two rows per iteration.
+    ; i_height must be a non-zero multiple of 2: the counter is decreased
+    ; by 2 per pass and the loop only exits when it reaches exactly 0.
+    ; eax, ecx and edx are clobbered; ebx, esi and edi are preserved.
+    push    ebx
+    push    esi
+    push    edi
+
+    ; three registers were pushed, so the first argument sits at esp+16
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop:
+    mov     eax, [esi]          ; row 0
+    mov     [edi], eax
+    mov     eax, [esi+ebx]      ; row 1
+    mov     [edi+edx], eax
+    lea     esi, [esi+ebx*2]    ; advance both pointers by two rows
+    lea     edi, [edi+edx*2]
+    dec     ecx
+    dec     ecx
+    jne     .height_loop
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret
+
+cglobal mc_copy_w8
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w8:
+    ; Copies an 8-byte-wide block from src to dst, four rows per iteration.
+    ; i_height must be a non-zero multiple of 4: the counter is decreased
+    ; by 4 per pass and the loop only exits when it reaches exactly 0.
+    ; Uses mm0-mm3 and does not execute emms itself.
+    push    ebx
+    push    esi
+    push    edi
+
+    ; three registers were pushed, so the first argument sits at esp+16
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop:
+    movq    mm0, [esi]          ; row 0
+    movq    [edi], mm0
+    movq    mm1, [esi+ebx]      ; row 1
+    movq    [edi+edx], mm1
+    movq    mm2, [esi+ebx*2]    ; row 2
+    movq    [edi+edx*2], mm2
+    lea     esi, [esi+ebx*2]    ; pointers now at row 2
+    lea     edi, [edi+edx*2]
+    movq    mm3, [esi+ebx]      ; row 3
+    movq    [edi+edx], mm3
+    lea     esi, [esi+ebx*2]    ; pointers now at row 4 (next iteration)
+    lea     edi, [edi+edx*2]
+
+    sub     ecx, byte 4
+    jnz     .height_loop
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret
+
+cglobal mc_copy_w16
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
+;-----------------------------------------------------------------------------
+mc_copy_w16:
+    ; Copies a 16-byte-wide block from src to dst, four rows per iteration,
+    ; each row as two 8-byte halves.
+    ; i_height must be a non-zero multiple of 4: the counter is decreased
+    ; by 4 per pass and the loop only exits when it reaches exactly 0.
+    ; Uses mm0-mm7 and does not execute emms itself.
+    push    ebx
+    push    esi
+    push    edi
+
+    ; three registers were pushed, so the first argument sits at esp+16
+    mov     esi, [esp+16]       ; src
+    mov     edi, [esp+24]       ; dst
+    mov     ebx, [esp+20]       ; i_src_stride
+    mov     edx, [esp+28]       ; i_dst_stride
+    mov     ecx, [esp+32]       ; i_height
+ALIGN 4
+.height_loop:
+    movq    mm0, [esi]          ; row 0
+    movq    mm1, [esi+8]
+    movq    [edi], mm0
+    movq    [edi+8], mm1
+    movq    mm2, [esi+ebx]      ; row 1
+    movq    mm3, [esi+ebx+8]
+    movq    [edi+edx], mm2
+    movq    [edi+edx+8], mm3
+    movq    mm4, [esi+ebx*2]    ; row 2
+    movq    mm5, [esi+ebx*2+8]
+    movq    [edi+edx*2], mm4
+    movq    [edi+edx*2+8], mm5
+    lea     esi, [esi+ebx*2]    ; pointers now at row 2
+    lea     edi, [edi+edx*2]
+    movq    mm6, [esi+ebx]      ; row 3
+    movq    mm7, [esi+ebx+8]
+    movq    [edi+edx], mm6
+    movq    [edi+edx+8], mm7
+    lea     esi, [esi+ebx*2]    ; pointers now at row 4 (next iteration)
+    lea     edi, [edi+edx*2]
+
+    sub     ecx, byte 4
+    jnz     .height_loop
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    ret
--- a/core/i386/mc.h
+++ b/core/i386/mc.h
@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_MC_H
+#define _I386_MC_H 1
+
+/* Store the i386 luma motion compensation routine in pf (MC_LUMA entry). */
+void x264_mc_mmxext_init( x264_mc_function_t pf[2] );
+
+#endif
--- a/core/i386/pixel.asm
+++ b/core/i386/pixel.asm
@ -0,0 +1,705 @@
+;*****************************************************************************
+;* pixel.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003 x264 project
+;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+; cglobal NAME: export NAME as a global symbol.
+; When PREFIX is defined, the exported symbol is _NAME and NAME itself is
+; redefined to _NAME, so later uses of the plain name refer to the
+; underscore-prefixed symbol.
+%macro cglobal 1
+	%ifdef PREFIX
+		global _%1
+		%define %1 _%1
+	%else
+		global %1
+	%endif
+%endmacro
+
+; SAD_INC_2x16P: add to mm0 the sum of absolute differences of two rows of
+; 16 bytes, read from [eax] (stride ebx) and [ecx] (stride edx), then
+; advance eax and ecx by two rows. Clobbers mm1-mm4.
+%macro SAD_INC_2x16P 0
+    movq    mm1,    [eax]
+    movq    mm2,    [ecx]
+    movq    mm3,    [eax+8]
+    movq    mm4,    [ecx+8]
+
+    psadbw  mm1,    mm2
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1
+    paddw   mm0,    mm3
+
+    movq    mm1,    [eax+ebx]
+    movq    mm2,    [ecx+edx]
+    movq    mm3,    [eax+ebx+8]
+    movq    mm4,    [ecx+edx+8]
+
+    psadbw  mm1,    mm2
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx]
+    lea     ecx,    [ecx+2*edx]
+%endmacro
+
+; SAD_INC_2x8P: add to mm0 the sum of absolute differences of two rows of
+; 8 bytes, read from [eax] (stride ebx) and [ecx] (stride edx), then
+; advance eax and ecx by two rows. Clobbers mm1-mm4.
+%macro SAD_INC_2x8P 0
+    movq    mm1,    [eax]
+    movq    mm2,    [ecx]
+    movq    mm3,    [eax+ebx]
+    movq    mm4,    [ecx+edx]
+
+    psadbw  mm1,    mm2
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx]
+    lea     ecx,    [ecx+2*edx]
+%endmacro
+
+; SAD_INC_2x4P: add to mm0 the sum of absolute differences of two rows of
+; 4 bytes, read from [eax] (stride ebx) and [ecx] (stride edx), then
+; advance eax and ecx by two rows. Clobbers mm1-mm4.
+; movd loads only 4 bytes and clears the upper half of the register, so
+; the upper bytes contribute nothing to psadbw.
+%macro SAD_INC_2x4P 0
+    movd    mm1,    [eax]
+    movd    mm2,    [ecx]
+    movd    mm3,    [eax+ebx]
+    movd    mm4,    [ecx+edx]
+
+    psadbw  mm1,    mm2
+    psadbw  mm3,    mm4
+    paddw   mm0,    mm1
+    paddw   mm0,    mm3
+
+    lea     eax,    [eax+2*ebx]
+    lea     ecx,    [ecx+2*edx]
+%endmacro
+
+; LOAD_DIFF_4P: %1 = four 16-bit differences pix1[i] - pix2[i], i = 0..3.
+; %2 is scratch. Bytes are widened to words by interleaving with %3.
+; The same %3 bytes end up in the high byte of both operands, so they
+; cancel in psubw (modulo 2^16): %3 does not actually have to be zero.
+%macro LOAD_DIFF_4P 5  ; MMP, MMT, MMZ, [pix1], [pix2]
+    movd        %1, %4
+    punpcklbw   %1, %3
+    movd        %2, %5
+    punpcklbw   %2, %3
+    psubw       %1, %2
+%endmacro
+
+; LOAD_DIFF_INC_4x4: load the pixel differences of four consecutive rows
+; (4 pixels each, at byte offset %11 within the row) into %1..%4, one row
+; per register. %5 is scratch, %6 is the interleave register passed on to
+; LOAD_DIFF_4P. The pointers %7 and %9 are advanced by four rows in total
+; (two lea steps of two rows each).
+%macro LOAD_DIFF_INC_4x4 11 ; p1,p2,p3,p4, t, z, pix1, i_pix1, pix2, i_pix2, offset
+    LOAD_DIFF_4P %1, %5, %6, [%7+%11],    [%9+%11]
+    LOAD_DIFF_4P %2, %5, %6, [%7+%8+%11], [%9+%10+%11]
+    lea %7, [%7+2*%8]
+    lea %9, [%9+2*%10]
+    LOAD_DIFF_4P %3, %5, %6, [%7+%11],    [%9+%11]
+    LOAD_DIFF_4P %4, %5, %6, [%7+%8+%11], [%9+%10+%11]
+    lea %7, [%7+2*%8]
+    lea %9, [%9+2*%10]
+%endmacro
+
+; HADAMARD4_SUB_BADC a, b, c, d: two in-place sum/difference butterflies
+; on packed words. On exit: a = a+b, b = b-a, c = c+d, d = d-c (in terms
+; of the input values). The difference is obtained as 2*b - (a+b), which
+; avoids a temporary register.
+%macro HADAMARD4_SUB_BADC 4
+    paddw %1,   %2
+    paddw %3,   %4
+    paddw %2,   %2
+    paddw %4,   %4
+    psubw %2,   %1
+    psubw %4,   %3
+%endmacro
+
+; HADAMARD4x4 a, b, c, d: two butterfly stages across the four registers,
+; applied independently to each word lane. In terms of the inputs:
+;   %1 = a+b+c+d        %2 = (b-a)+(d-c)
+;   %3 = (c+d)-(a+b)    %4 = (d-c)-(b-a)
+%macro HADAMARD4x4 4
+    HADAMARD4_SUB_BADC %1, %2, %3, %4
+    HADAMARD4_SUB_BADC %1, %3, %2, %4
+%endmacro
+
+; SBUTTERFLYwd a, b, t: word interleave of a and b.
+; On exit a holds the interleaved low halves and t the interleaved high
+; halves; b is unchanged.
+%macro SBUTTERFLYwd 3
+    movq        %3, %1
+    punpcklwd   %1, %2
+    punpckhwd   %3, %2
+%endmacro
+
+; SBUTTERFLYdq a, b, t: doubleword interleave of a and b.
+; On exit a holds the interleaved low halves and t the interleaved high
+; halves; b is unchanged.
+%macro SBUTTERFLYdq 3
+    movq        %3, %1
+    punpckldq   %1, %2
+    punpckhdq   %3, %2
+%endmacro
+
+; TRANSPOSE4x4 a, b, c, d, t: transpose the 4x4 matrix of words whose rows
+; are in %1..%4, using %5 as the fifth register. The transposed rows come
+; out in %1, %4, %5, %3 (in that order); %2 holds an intermediate value.
+%macro TRANSPOSE4x4 5   ; abcd-t -> adtc
+    SBUTTERFLYwd %1, %2, %5
+    SBUTTERFLYwd %3, %4, %2
+    SBUTTERFLYdq %1, %3, %4
+    SBUTTERFLYdq %5, %2, %3
+%endmacro
+
+; MMX_ABS a, t: a = max(a, -a) on each signed word lane.
+; t is overwritten with the negated input (it is NOT left at zero).
+%macro MMX_ABS 2        ; mma, mmt
+    pxor    %2, %2
+    psubw   %2, %1
+    pmaxsw  %1, %2
+%endmacro
+
+; MMX_ABS_SUM a, t, s: same as MMX_ABS a, t, then s += a per word lane
+; with unsigned saturation. t is overwritten with the negated input.
+%macro MMX_ABS_SUM 3    ; mma, mmt, mms
+    pxor    %2, %2
+    psubw   %2, %1
+    pmaxsw  %1, %2
+    paddusw %3, %1
+%endmacro
+
+
+; MMX_SUM_MM v, t: horizontal sum of the four words of v.
+; The two folding steps use unsigned saturating adds; the low word of the
+; result is moved to eax, masked to 16 bits and halved.
+; On exit eax = (w0+w1+w2+w3) >> 1; v and t are clobbered.
+%macro MMX_SUM_MM 2     ; mmv, mmt
+    movq    %2, %1
+    psrlq   %1, 32
+    paddusw %1, %2
+    movq    %2, %1
+    psrlq   %1, 16
+    paddusw %1, %2
+    movd    eax,%1
+    and     eax,0xffff
+    shr     eax,1
+%endmacro
+
+; HADAMARD4x4_FIRST: transform the 4x4 block of differences held in
+; mm0..mm3 (Hadamard on the registers, transpose, Hadamard again) and
+; start the accumulator: on exit mm0 holds, per word lane, the sum of the
+; absolute values of the transformed rows.
+; The transpose leaves its rows in mm0, mm3, mm4, mm2, hence the register
+; order of the second HADAMARD4x4. Clobbers mm1-mm4 and mm7.
+%macro HADAMARD4x4_FIRST 0
+    HADAMARD4x4 mm0, mm1, mm2, mm3
+    TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
+    HADAMARD4x4 mm0, mm3, mm4, mm2
+    MMX_ABS     mm0, mm7
+    MMX_ABS_SUM mm3, mm7, mm0
+    MMX_ABS_SUM mm4, mm7, mm0
+    MMX_ABS_SUM mm2, mm7, mm0
+%endmacro
+
+; HADAMARD4x4_NEXT: same transform as HADAMARD4x4_FIRST, but for a block
+; of differences held in mm1..mm4, and the absolute values are ADDED to
+; the running accumulator mm0 (saturating) instead of initialising it.
+; The transpose leaves its rows in mm1, mm4, mm5, mm3.
+; Clobbers mm1-mm5 and mm7.
+%macro HADAMARD4x4_NEXT 0
+    HADAMARD4x4 mm1, mm2, mm3, mm4
+    TRANSPOSE4x4 mm1, mm2, mm3, mm4, mm5
+    HADAMARD4x4 mm1, mm4, mm5, mm3
+    MMX_ABS_SUM mm1, mm7, mm0
+    MMX_ABS_SUM mm4, mm7, mm0
+    MMX_ABS_SUM mm5, mm7, mm0
+    MMX_ABS_SUM mm3, mm7, mm0
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal x264_pixel_sad_16x16_mmxext
+cglobal x264_pixel_sad_16x8_mmxext
+cglobal x264_pixel_sad_8x16_mmxext
+cglobal x264_pixel_sad_8x8_mmxext
+cglobal x264_pixel_sad_8x4_mmxext
+cglobal x264_pixel_sad_4x8_mmxext
+cglobal x264_pixel_sad_4x4_mmxext
+
+cglobal x264_pixel_satd_4x4_mmxext
+cglobal x264_pixel_satd_4x8_mmxext
+cglobal x264_pixel_satd_8x4_mmxext
+cglobal x264_pixel_satd_8x8_mmxext
+cglobal x264_pixel_satd_16x8_mmxext
+cglobal x264_pixel_satd_8x16_mmxext
+cglobal x264_pixel_satd_16x16_mmxext
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x16_mmxext:
+    ; Returns in eax the sum of absolute differences between the 16x16
+    ; byte blocks at pix1 and pix2. ebx is saved and restored.
+    ; Uses mm0-mm4; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    ; mm0 is the running sum
+    pxor    mm0,    mm0
+
+    ; 8 invocations x 2 rows = 16 rows
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+
+    movd eax,    mm0
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x8_mmxext:
+    ; Returns in eax the sum of absolute differences between the 16-wide,
+    ; 8-row byte blocks at pix1 and pix2. ebx is saved and restored.
+    ; Uses mm0-mm4; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    ; mm0 is the running sum
+    pxor    mm0,    mm0
+
+    ; 4 invocations x 2 rows = 8 rows
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+
+    movd eax,    mm0
+
+    pop ebx
+    ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x16_mmxext:
+    ; Returns in eax the sum of absolute differences between the 8-wide,
+    ; 16-row byte blocks at pix1 and pix2. ebx is saved and restored.
+    ; Uses mm0-mm4; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    ; mm0 is the running sum
+    pxor    mm0,    mm0
+
+    ; 8 invocations x 2 rows = 16 rows
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+
+    movd eax,    mm0
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x8_mmxext:
+    ; Returns in eax the sum of absolute differences between the 8x8
+    ; byte blocks at pix1 and pix2. ebx is saved and restored.
+    ; Uses mm0-mm4; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    ; mm0 is the running sum
+    pxor    mm0,    mm0
+
+    ; 4 invocations x 2 rows = 8 rows
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+
+    movd eax,    mm0
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_8x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_8x4_mmxext:
+    ; Returns in eax the sum of absolute differences between the 8-wide,
+    ; 4-row byte blocks at pix1 and pix2. ebx is saved and restored.
+    ; Uses mm0-mm4; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    ; mm0 is the running sum
+    pxor    mm0,    mm0
+
+    ; 2 invocations x 2 rows = 4 rows
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+
+    movd eax,    mm0
+
+    pop ebx
+    ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_4x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_4x8_mmxext:
+    ; Returns in eax the sum of absolute differences between the 4-wide,
+    ; 8-row byte blocks at pix1 and pix2. ebx is saved and restored.
+    ; Uses mm0-mm4; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    ; mm0 is the running sum
+    pxor    mm0,    mm0
+
+    ; 4 invocations x 2 rows = 8 rows
+    SAD_INC_2x4P
+    SAD_INC_2x4P
+
+    SAD_INC_2x4P
+    SAD_INC_2x4P
+
+    movd eax,    mm0
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sad_4x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_4x4_mmxext:
+    ; Returns in eax the sum of absolute differences between the 4x4
+    ; byte blocks at pix1 and pix2. ebx is saved and restored.
+    ; Uses mm0-mm4; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    ; mm0 is the running sum
+    pxor    mm0,    mm0
+
+    ; 2 invocations x 2 rows = 4 rows
+    SAD_INC_2x4P
+    SAD_INC_2x4P
+
+    movd eax,    mm0
+
+    pop ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_4x4_mmxext:
+    ; Returns in eax half the sum of the absolute values of the 4x4
+    ; Hadamard-transformed differences pix1 - pix2 (the halving is done
+    ; by MMX_SUM_MM). ebx is saved and restored.
+    ; Uses mm0-mm4, mm6, mm7; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7
+
+
+    ; rows 0..2 are addressed directly; x86 addressing has no 3*stride
+    ; form, so the pointers are moved down one row to reach row 3
+    LOAD_DIFF_4P mm0, mm6, mm7, [eax],       [ecx]
+    LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx],   [ecx+edx]
+    LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*ebx], [ecx+2*edx]
+    add eax, ebx
+    add ecx, edx
+    LOAD_DIFF_4P mm3, mm6, mm7, [eax+2*ebx], [ecx+2*edx]
+
+    HADAMARD4x4_FIRST
+
+    ; horizontal sum of mm0 into eax (mm7 is scratch here)
+    MMX_SUM_MM  mm0, mm7
+    pop     ebx
+    ret
+
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_4x8_mmxext:
+    ; Returns in eax the SATD of a 4-wide, 8-row block: two vertically
+    ; stacked 4x4 sub-blocks are transformed and accumulated in mm0, then
+    ; summed and halved by MMX_SUM_MM. ebx is saved and restored.
+    ; Uses mm0-mm7; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7
+
+    ; rows 0-3
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_FIRST
+
+    ; rows 4-7 (pointers were advanced by the previous load).
+    ; mm7 is no longer zero after the transform macros; LOAD_DIFF_4P
+    ; interleaves the same mm7 into both operands, so it cancels in psubw.
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x4_mmxext:
+    ; Returns in eax the SATD of an 8-wide, 4-row block: two side-by-side
+    ; 4x4 sub-blocks are transformed and accumulated in mm0, then summed
+    ; and halved by MMX_SUM_MM. ebx is saved and restored.
+    ; Uses mm0-mm7; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7
+
+    ; columns 0-3
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_FIRST
+
+    ; the load advanced the pointers: rewind to the top row
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ecx,    [esp+16]    ; pix2
+
+    ; columns 4-7.
+    ; mm7 is no longer zero after the transform macros; LOAD_DIFF_4P
+    ; interleaves the same mm7 into both operands, so it cancels in psubw.
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x8_mmxext:
+    ; Returns in eax the SATD of an 8x8 block: four 4x4 sub-blocks are
+    ; transformed and accumulated in mm0, then summed and halved by
+    ; MMX_SUM_MM. ebx is saved and restored.
+    ; Uses mm0-mm7; no emms is executed here.
+    push    ebx
+
+    ; one register was pushed, so the first argument sits at esp+8
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ebx,    [esp+12]    ; stride1
+    mov     ecx,    [esp+16]    ; pix2
+    mov     edx,    [esp+20]    ; stride2
+
+    pxor    mm7,    mm7
+
+    ; columns 0-3, rows 0-3 then rows 4-7.
+    ; mm7 is no longer zero after the first transform; LOAD_DIFF_4P
+    ; interleaves the same mm7 into both operands, so it cancels in psubw.
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    ; rewind to the top row for the second column strip
+    mov     eax,    [esp+ 8]    ; pix1
+    mov     ecx,    [esp+16]    ; pix2
+
+    ; columns 4-7, rows 0-3 then rows 4-7
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    MMX_SUM_MM  mm0, mm7
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x8_mmxext:
+    ; Returns in eax the SATD of a 16-wide, 8-row block, computed as two
+    ; 8x8 halves. Each half accumulates four 4x4 sub-blocks in mm0 and is
+    ; reduced with MMX_SUM_MM; the first partial result is kept in ebp.
+    ; NOTE(review): the split into two reductions presumably keeps the
+    ; 16-bit saturating accumulators in range -- confirm.
+    ; ebx and ebp are saved and restored. Uses mm0-mm7; no emms here.
+    push    ebx
+    push    ebp
+
+    ; two registers were pushed, so the first argument sits at esp+12
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7
+    ; ebp is overwritten by "mov ebp, eax" below before it is ever read
+    xor     ebp,    ebp
+
+    ; left half: columns 0-3, rows 0-3 then rows 4-7.
+    ; mm7 is no longer zero after the first transform; LOAD_DIFF_4P
+    ; interleaves the same mm7 into both operands, so it cancels in psubw.
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    ; rewind to the top row
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    ; columns 4-7
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    ; partial result of the left half
+    MMX_SUM_MM  mm0, mm7
+    mov     ebp, eax
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    ; right half: columns 8-11 (HADAMARD4x4_FIRST restarts mm0)
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
+    HADAMARD4x4_NEXT
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    ; columns 12-15
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
+    HADAMARD4x4_NEXT
+
+    ; right-half result plus the saved left-half result
+    MMX_SUM_MM  mm0, mm7
+    add         eax, ebp
+
+    pop     ebp
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x16_mmxext:
+    ; Returns in eax the SATD of an 8-wide, 16-row block, computed as two
+    ; 4-wide column strips of four 4x4 sub-blocks each. Each strip is
+    ; accumulated in mm0 and reduced with MMX_SUM_MM; the first partial
+    ; result is kept in ebp.
+    ; NOTE(review): the split into two reductions presumably keeps the
+    ; 16-bit saturating accumulators in range -- confirm.
+    ; ebx and ebp are saved and restored. Uses mm0-mm7; no emms here.
+    push    ebx
+    push    ebp
+
+    ; two registers were pushed, so the first argument sits at esp+12
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7
+    ; ebp is overwritten by "mov ebp, eax" below before it is ever read
+    xor     ebp,    ebp
+
+    ; columns 0-3, rows 0-15 in four steps of four rows.
+    ; mm7 is no longer zero after the first transform; LOAD_DIFF_4P
+    ; interleaves the same mm7 into both operands, so it cancels in psubw.
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    ; partial result of the first strip
+    MMX_SUM_MM  mm0, mm7
+    mov     ebp, eax
+
+    ; rewind to the top row
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    ; columns 4-7, rows 0-15 (HADAMARD4x4_FIRST restarts mm0)
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    ; second-strip result plus the saved first-strip result
+    MMX_SUM_MM  mm0, mm7
+    add     eax,    ebp
+
+    pop     ebp
+    pop     ebx
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x16_mmxext:
+    ; Returns in eax the SATD of a 16x16 block, computed as four 4-wide
+    ; column strips of four 4x4 sub-blocks each. Each strip is accumulated
+    ; in mm0 and reduced with MMX_SUM_MM; the partial results are summed
+    ; in ebp.
+    ; NOTE(review): the per-strip reduction presumably keeps the 16-bit
+    ; saturating accumulators in range -- confirm.
+    ; ebx and ebp are saved and restored. Uses mm0-mm7; no emms here.
+    push    ebx
+    push    ebp
+
+    ; two registers were pushed, so the first argument sits at esp+12
+    mov     eax,    [esp+12]    ; pix1
+    mov     ebx,    [esp+16]    ; stride1
+    mov     ecx,    [esp+20]    ; pix2
+    mov     edx,    [esp+24]    ; stride2
+
+    pxor    mm7,    mm7
+    ; ebp is overwritten by "mov ebp, eax" below before it is ever read
+    xor     ebp,    ebp
+
+    ; strip 0: columns 0-3, rows 0-15 in four steps of four rows.
+    ; mm7 is no longer zero after the first transform; LOAD_DIFF_4P
+    ; interleaves the same mm7 into both operands, so it cancels in psubw.
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
+    HADAMARD4x4_NEXT
+
+    ; ebp = result of strip 0
+    MMX_SUM_MM  mm0, mm7
+    mov     ebp, eax
+
+    ; rewind to the top row
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    ; strip 1: columns 4-7 (HADAMARD4x4_FIRST restarts mm0)
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
+    HADAMARD4x4_NEXT
+
+    ; ebp += result of strip 1
+    MMX_SUM_MM  mm0, mm7
+    add     ebp,    eax
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    ; strip 2: columns 8-11
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
+    HADAMARD4x4_NEXT
+
+    ; ebp += result of strip 2
+    MMX_SUM_MM  mm0, mm7
+    add     ebp,    eax
+
+    mov     eax,    [esp+12]    ; pix1
+    mov     ecx,    [esp+20]    ; pix2
+
+    ; strip 3: columns 12-15
+    LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 12
+    HADAMARD4x4_FIRST
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
+    HADAMARD4x4_NEXT
+
+    LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
+    HADAMARD4x4_NEXT
+
+    ; strip 3 result plus the three saved strip results
+    MMX_SUM_MM  mm0, mm7
+    add     eax,    ebp
+
+    pop     ebp
+    pop     ebx
+    ret
+
--- a/core/i386/pixel.h
+++ b/core/i386/pixel.h
@ -0,0 +1,43 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_PIXEL_H
+#define _I386_PIXEL_H 1
+
+/* Assembly routines implemented in pixel.asm.
+ * Arguments, in order: pix1, stride1, pix2, stride2.
+ * The WxH suffix is the block size (width x height). */
+
+/* Sum of absolute differences between the two blocks. */
+int x264_pixel_sad_16x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_16x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_8x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_8x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+
+/* Sum of absolute values of the 4x4 Hadamard-transformed differences,
+ * halved, accumulated over all 4x4 sub-blocks of the block. */
+int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+
+#endif
--- a/core/i386/predict.c
+++ b/core/i386/predict.c
@ -0,0 +1,429 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* XXX the predict_4x4 functions are inspired by the ffmpeg h264 decoder
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"   /* for keyword inline */
+#include "../predict.h"
+#include "predict.h"
+
+/* Clamp an integer to the range of an unsigned 8-bit sample.
+ *
+ * Returns 0 for negative inputs, 255 for inputs above 255 and the input
+ * itself otherwise.  Plain comparisons are used so the result does not
+ * depend on the width of int, on how negative values are right-shifted,
+ * or on negating the argument (which overflows for INT_MIN). */
+static inline int clip_uint8( int a )
+{
+    if( a < 0 )
+        return 0;
+    if( a > 255 )
+        return 255;
+    return a;
+}
+
+/****************************************************************************
+ * 16x16 prediction for intra block DC, H, V, P
+ ****************************************************************************/
+/* DC prediction for a 16x16 block using both neighbours.
+ *
+ * Sums the 16 bytes of the column left of the block (src[-1 + i*i_stride])
+ * and the 16 bytes of the row above it (src[i - i_stride]), takes the
+ * rounded mean of those 32 samples and fills the 16 rows of 16 bytes at
+ * src with it.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_16x16_dc( uint8_t *src, int i_stride )
+{
+    uint32_t sum = 0;
+    uint32_t fill;
+    int row, k;
+
+    /* accumulate the left column and the top row */
+    for( k = 0; k < 16; k++ )
+    {
+        sum += src[-1 + k * i_stride];
+        sum += src[k - i_stride];
+    }
+    /* rounded mean of 32 samples, replicated into the four bytes of a word */
+    fill = ( ( sum + 16 ) >> 5 ) * 0x01010101;
+
+    for( row = 0; row < 16; row++ )
+    {
+        uint32_t *line = (uint32_t*)src;
+
+        for( k = 0; k < 4; k++ )
+        {
+            line[k] = fill;
+        }
+        src += i_stride;
+    }
+}
+/* DC prediction for a 16x16 block using only the left neighbours.
+ *
+ * Takes the rounded mean of the 16 bytes in the column left of the block
+ * (src[-1 + i*i_stride]) and fills the 16 rows of 16 bytes at src with it.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_16x16_dc_left( uint8_t *src, int i_stride )
+{
+    uint32_t sum = 0;
+    uint32_t fill;
+    int row, k;
+
+    for( k = 0; k < 16; k++ )
+    {
+        sum += src[-1 + k * i_stride];
+    }
+    /* rounded mean of 16 samples, replicated into the four bytes of a word */
+    fill = ( ( sum + 8 ) >> 4 ) * 0x01010101;
+
+    for( row = 0; row < 16; row++ )
+    {
+        uint32_t *line = (uint32_t*)src;
+
+        for( k = 0; k < 4; k++ )
+        {
+            line[k] = fill;
+        }
+        src += i_stride;
+    }
+}
+/* DC prediction for a 16x16 block using only the top neighbours.
+ *
+ * Takes the rounded mean of the 16 bytes in the row above the block
+ * (src[i - i_stride]) and fills the 16 rows of 16 bytes at src with it.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_16x16_dc_top( uint8_t *src, int i_stride )
+{
+    uint32_t sum = 0;
+    uint32_t fill;
+    int row, k;
+
+    for( k = 0; k < 16; k++ )
+    {
+        sum += src[k - i_stride];
+    }
+    /* rounded mean of 16 samples, replicated into the four bytes of a word */
+    fill = ( ( sum + 8 ) >> 4 ) * 0x01010101;
+
+    for( row = 0; row < 16; row++ )
+    {
+        uint32_t *line = (uint32_t*)src;
+
+        for( k = 0; k < 4; k++ )
+        {
+            line[k] = fill;
+        }
+        src += i_stride;
+    }
+}
+/* DC prediction for a 16x16 block without any neighbour: fills the 16 rows
+ * of 16 bytes at src with the constant 0x80 (128).
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_16x16_dc_128( uint8_t *src, int i_stride )
+{
+    int row, k;
+
+    for( row = 0; row < 16; row++ )
+    {
+        uint32_t *line = (uint32_t*)src;
+
+        for( k = 0; k < 4; k++ )
+        {
+            line[k] = 0x80808080;
+        }
+        src += i_stride;
+    }
+}
+/* Horizontal prediction for a 16x16 block: each of the 16 rows is filled
+ * with the byte found immediately left of that row (src[-1]).
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_16x16_h( uint8_t *src, int i_stride )
+{
+    int i;
+
+    for( i = 0; i < 16; i++ )
+    {
+        /* Replicate the left byte into the four bytes of a word.  The
+         * constant is unsigned so the product is computed modulo 2^32
+         * instead of overflowing a signed int when the byte is >= 128. */
+        const uint32_t v = 0x01010101U * src[-1];
+        uint32_t *p = (uint32_t*)src;
+
+        *p++ = v;
+        *p++ = v;
+        *p++ = v;
+        *p++ = v;
+
+        src += i_stride;
+    }
+}
+/* Vertical prediction for a 16x16 block: the 16 bytes of the row above the
+ * block (src[-i_stride] ..) are copied into each of the 16 rows at src.
+ *
+ * src      : top-left byte of the block
+ * i_stride : distance in bytes between two rows
+ *
+ * The row is held in the MMX registers mm0/mm1 across the loop, so nothing
+ * between the asm statements may touch those registers.  No emms is issued
+ * here -- NOTE(review): callers presumably restore the FPU state before
+ * using x87 code; confirm.
+ */
+static void predict_16x16_v( uint8_t *src, int i_stride )
+{
+    int i;
+
+    /* "memory" tells the compiler that the asm accesses memory it cannot
+     * see through the register operand, so pending stores are completed
+     * before the load and cached values are discarded after the stores. */
+    asm volatile(
+        "movq  (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n" :: "r"(&src[-i_stride]) : "memory" );
+
+    for( i = 0; i < 16; i++ )
+    {
+        asm volatile(
+            "movq %%mm0,  (%0)\n"
+            "movq %%mm1, 8(%0)\n" :: "r"(src) : "memory" );
+        src += i_stride;
+    }
+}
+
+/****************************************************************************
+ * 8x8 prediction for intra chroma block DC, H, V, P
+ ****************************************************************************/
+/* DC prediction for an 8x8 block without any neighbour: fills the 8 rows
+ * of 8 bytes at src with the constant 0x80 (128).
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+{
+    int row;
+
+    for( row = 0; row < 8; row++ )
+    {
+        uint32_t *line = (uint32_t*)src;
+
+        line[0] = 0x80808080;
+        line[1] = 0x80808080;
+
+        src += i_stride;
+    }
+}
+/* DC prediction for an 8x8 block using only the left neighbours.
+ *
+ * The left column is split in two groups of four bytes.  Rows 0-3 are
+ * filled with the rounded mean of the upper group, rows 4-7 with the
+ * rounded mean of the lower group.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+{
+    uint32_t upper = 0, lower = 0;
+    int row;
+
+    for( row = 0; row < 4; row++ )
+    {
+        upper += src[row * i_stride     - 1];
+        lower += src[(row+4) * i_stride - 1];
+    }
+    /* rounded means of 4 samples, replicated into the four bytes of a word */
+    upper = ( ( upper + 2 ) >> 2 ) * 0x01010101;
+    lower = ( ( lower + 2 ) >> 2 ) * 0x01010101;
+
+    for( row = 0; row < 8; row++ )
+    {
+        const uint32_t fill = row < 4 ? upper : lower;
+        uint32_t *line = (uint32_t*)src;
+
+        line[0] = fill;
+        line[1] = fill;
+
+        src += i_stride;
+    }
+}
+/* DC prediction for an 8x8 block using only the top neighbours.
+ *
+ * The row above the block is split in two groups of four bytes.  Columns
+ * 0-3 of every row are filled with the rounded mean of the left group,
+ * columns 4-7 with the rounded mean of the right group.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+{
+    uint32_t left_half = 0, right_half = 0;
+    int row, col;
+
+    for( col = 0; col < 4; col++ )
+    {
+        left_half  += src[col     - i_stride];
+        right_half += src[col + 4 - i_stride];
+    }
+    /* rounded means of 4 samples, replicated into the four bytes of a word */
+    left_half  = ( ( left_half  + 2 ) >> 2 ) * 0x01010101;
+    right_half = ( ( right_half + 2 ) >> 2 ) * 0x01010101;
+
+    for( row = 0; row < 8; row++ )
+    {
+        uint32_t *line = (uint32_t*)src;
+
+        line[0] = left_half;
+        line[1] = right_half;
+
+        src += i_stride;
+    }
+}
+/* DC prediction for an 8x8 block using both neighbours.
+ *
+ * The block is handled as four 4x4 quadrants, each filled with its own DC
+ * value computed from the four-byte groups of the top row (s0, s1) and of
+ * the left column (s2, s3):
+ *   top-left     : mean of s0 and s2
+ *   top-right    : mean of s1 only
+ *   bottom-left  : mean of s3 only
+ *   bottom-right : mean of s1 and s3
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_8x8_dc( uint8_t *src, int i_stride )
+{
+    int y;
+    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+    uint32_t dc0, dc1, dc2, dc3;
+    int i;
+
+    /* First do :
+          s0 s1
+       s2
+       s3
+    */
+    for( i = 0; i < 4; i++ )
+    {
+        s0 += src[i - i_stride];
+        s1 += src[i + 4 - i_stride];
+        s2 += src[-1 + i * i_stride];
+        s3 += src[-1 + (i+4)*i_stride];
+    }
+    /* now calculate
+       dc0 dc1
+       dc2 dc3
+       The sums are ints, so the replication constant is unsigned: the
+       product is then computed modulo 2^32 instead of overflowing a signed
+       int when a mean is >= 128.
+     */
+    dc0 = (( s0 + s2 + 4 ) >> 3)*0x01010101U;
+    dc1 = (( s1 + 2 ) >> 2)*0x01010101U;
+    dc2 = (( s3 + 2 ) >> 2)*0x01010101U;
+    dc3 = (( s1 + s3 + 4 ) >> 3)*0x01010101U;
+
+    /* upper half: quadrants dc0 | dc1 */
+    for( y = 0; y < 4; y++ )
+    {
+        uint32_t *p = (uint32_t*)src;
+        *p++ = dc0;
+        *p++ = dc1;
+
+        src += i_stride;
+    }
+
+    /* lower half: quadrants dc2 | dc3 */
+    for( y = 0; y < 4; y++ )
+    {
+        uint32_t *p = (uint32_t*)src;
+        *p++ = dc2;
+        *p++ = dc3;
+
+        src += i_stride;
+    }
+}
+
+/* Horizontal prediction for an 8x8 block: each of the 8 rows is filled
+ * with the byte found immediately left of that row (src[-1]).
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_8x8_h( uint8_t *src, int i_stride )
+{
+    int i;
+
+    for( i = 0; i < 8; i++ )
+    {
+        /* Unsigned constant: the product is computed modulo 2^32 instead
+         * of overflowing a signed int when the left byte is >= 128. */
+        uint32_t v = 0x01010101U * src[-1];
+        uint32_t *p = (uint32_t*)src;
+
+        *p++ = v;
+        *p++ = v;
+
+        src += i_stride;
+    }
+}
+/* Vertical prediction for an 8x8 block: the 8 bytes of the row above the
+ * block (src[-i_stride] ..) are copied into each of the 8 rows at src.
+ *
+ * src      : top-left byte of the block
+ * i_stride : distance in bytes between two rows
+ *
+ * The row is held in the MMX register mm0 across the loop, so nothing
+ * between the asm statements may touch that register.  No emms is issued
+ * here -- NOTE(review): callers presumably restore the FPU state before
+ * using x87 code; confirm.
+ */
+static void predict_8x8_v( uint8_t *src, int i_stride )
+{
+    int i;
+
+    /* "memory" tells the compiler that the asm accesses memory it cannot
+     * see through the register operand. */
+    asm volatile( "movq  (%0), %%mm0\n" :: "r"(&src[-i_stride]) : "memory" );
+
+    for( i = 0; i < 8; i++ )
+    {
+        asm volatile( "movq %%mm0,  (%0)\n" :: "r"(src) : "memory" );
+        src += i_stride;
+    }
+}
+
+
+/****************************************************************************
+ * 4x4 prediction for intra luma block DC, H, V, P
+ ****************************************************************************/
+/* DC prediction for a 4x4 block without any neighbour: fills the 4 rows of
+ * 4 bytes at src with the constant 0x80 (128).
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_4x4_dc_128( uint8_t *src, int i_stride )
+{
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        *(uint32_t*)src = 0x80808080;
+    }
+}
+/* DC prediction for a 4x4 block using only the left neighbours: fills the
+ * block with the rounded mean of the 4 bytes in the column left of it.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_4x4_dc_left( uint8_t *src, int i_stride )
+{
+    int y;
+    /* The sum is an int, so the replication constant is unsigned: the
+     * product is then computed modulo 2^32 instead of overflowing a signed
+     * int when the mean is >= 128. */
+    uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+
+                     src[-1+2*i_stride] + src[-1+3*i_stride] + 2 ) >> 2)*0x01010101U;
+
+    for( y = 0; y < 4; y++ )
+    {
+        uint32_t *p = (uint32_t*)src;
+        *p = dc;
+
+        src += i_stride;
+    }
+}
+/* DC prediction for a 4x4 block using only the top neighbours: fills the
+ * block with the rounded mean of the 4 bytes in the row above it.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_4x4_dc_top( uint8_t *src, int i_stride )
+{
+    int y;
+    /* The sum is an int, so the replication constant is unsigned: the
+     * product is then computed modulo 2^32 instead of overflowing a signed
+     * int when the mean is >= 128. */
+    uint32_t dc = (( src[0 - i_stride] + src[1 - i_stride] +
+                     src[2 - i_stride] + src[3 - i_stride] + 2 ) >> 2)*0x01010101U;
+
+    for( y = 0; y < 4; y++ )
+    {
+        uint32_t *p = (uint32_t*)src;
+        *p = dc;
+        src += i_stride;
+    }
+}
+/* DC prediction for a 4x4 block using both neighbours: fills the block
+ * with the rounded mean of the 4 bytes left of it and the 4 bytes above it.
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_4x4_dc( uint8_t *src, int i_stride )
+{
+    int y;
+    /* The sum is an int, so the replication constant is unsigned: the
+     * product is then computed modulo 2^32 instead of overflowing a signed
+     * int when the mean is >= 128. */
+    uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+
+                     src[-1+2*i_stride] + src[-1+3*i_stride] +
+                     src[0 - i_stride]  + src[1 - i_stride] +
+                     src[2 - i_stride]  + src[3 - i_stride] + 4 ) >> 3)*0x01010101U;
+
+    for( y = 0; y < 4; y++ )
+    {
+        uint32_t *p = (uint32_t*)src;
+        *p = dc;
+
+        src += i_stride;
+    }
+}
+/* Horizontal prediction for a 4x4 block: each of the 4 rows is filled with
+ * the byte found immediately left of that row (src[-1]).
+ *
+ * src      : top-left byte of the block; written through uint32_t stores
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_4x4_h( uint8_t *src, int i_stride )
+{
+    int i;
+
+    for( i = 0; i < 4; i++ )
+    {
+        uint32_t *p = (uint32_t*)src;
+        /* Unsigned constant: the product is computed modulo 2^32 instead
+         * of overflowing a signed int when the left byte is >= 128. */
+        *p = 0x01010101U*src[-1];
+
+        src += i_stride;
+    }
+}
+/* Vertical prediction for a 4x4 block: the 4 bytes of the row above the
+ * block are copied into each of the 4 rows at src.
+ *
+ * src      : top-left byte of the block; read and written through
+ *            uint32_t accesses
+ * i_stride : distance in bytes between two rows
+ */
+static void predict_4x4_v( uint8_t *src, int i_stride )
+{
+    /* the whole top row is read once, as a single word */
+    const uint32_t above = *((uint32_t*)&src[-i_stride]);
+    int row;
+
+    for( row = 0; row < 4; row++, src += i_stride )
+    {
+        *(uint32_t*)src = above;
+    }
+}
+
+/****************************************************************************
+ * Exported functions:
+ ****************************************************************************/
+/* Install the 16x16 intra predictors defined in this file into pf, indexed
+ * by the I_PRED_16x16_* constants.
+ * Six entries are assigned (V, H, DC, DC_LEFT, DC_TOP, DC_128); any other
+ * slot of pf is left untouched.  Of these, only predict_16x16_v contains
+ * MMX code, the others are plain C. */
+void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
+{
+    pf[I_PRED_16x16_V ]     = predict_16x16_v;
+    pf[I_PRED_16x16_H ]     = predict_16x16_h;
+    pf[I_PRED_16x16_DC]     = predict_16x16_dc;
+    pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left;
+    pf[I_PRED_16x16_DC_TOP ]= predict_16x16_dc_top;
+    pf[I_PRED_16x16_DC_128 ]= predict_16x16_dc_128;
+}
+
+/* Install the 8x8 (chroma) intra predictors defined in this file into pf,
+ * indexed by the I_PRED_CHROMA_* constants.
+ * Six entries are assigned (V, H, DC, DC_LEFT, DC_TOP, DC_128); any other
+ * slot of pf is left untouched.  Of these, only predict_8x8_v contains MMX
+ * code, the others are plain C. */
+void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] )
+{
+    pf[I_PRED_CHROMA_V ]     = predict_8x8_v;
+    pf[I_PRED_CHROMA_H ]     = predict_8x8_h;
+    pf[I_PRED_CHROMA_DC]     = predict_8x8_dc;
+    pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left;
+    pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top;
+    pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128;
+}
+
+/* Install the 4x4 intra predictors defined in this file into pf, indexed
+ * by the I_PRED_4x4_* constants.
+ * Only six of the twelve declared entries are assigned (V, H, DC, DC_LEFT,
+ * DC_TOP, DC_128); the remaining slots of pf are left untouched.  All six
+ * functions are plain C. */
+void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] )
+{
+    pf[I_PRED_4x4_V]      = predict_4x4_v;
+    pf[I_PRED_4x4_H]      = predict_4x4_h;
+    pf[I_PRED_4x4_DC]     = predict_4x4_dc;
+    pf[I_PRED_4x4_DC_LEFT]= predict_4x4_dc_left;
+    pf[I_PRED_4x4_DC_TOP] = predict_4x4_dc_top;
+    pf[I_PRED_4x4_DC_128] = predict_4x4_dc_128;
+}
+
--- a/core/i386/predict.h
+++ b/core/i386/predict.h
@ -0,0 +1,31 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_PREDICT_H
+#define _I386_PREDICT_H 1
+
+/* Fill the given predictor table with the i386 implementations of the
+ * 16x16, 8x8 and 4x4 intra prediction functions.  The x264_predict_t type
+ * is declared elsewhere and must be visible before this header is
+ * included. */
+void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] );
+void x264_predict_8x8_init_mmxext   ( x264_predict_t pf[7] );
+void x264_predict_4x4_init_mmxext   ( x264_predict_t pf[12] );
+
+#endif
--- a/core/macroblock.c
+++ b/core/macroblock.c
--- a/core/macroblock.h
+++ b/core/macroblock.h
@ -0,0 +1,204 @@
+/*****************************************************************************
+ * macroblock.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _MACROBLOCK_H
+#define _MACROBLOCK_H 1
+
+/* Neighbouring macroblock positions.  Every value is a distinct bit, so
+ * the constants are presumably meant to be OR-ed into an availability
+ * mask -- TODO confirm against the users of this enum. */
+enum macroblock_position_e
+{
+    MB_LEFT     = 0x01,
+    MB_TOP      = 0x02,
+    MB_TOPRIGHT = 0x04,
+
+    /* NOTE(review): meaning not visible in this header -- confirm */
+    MB_PRIVATE  = 0x10,
+};
+
+
+/* XXX mb_type isn't the one written in the bitstream -> only internal usage */
+/* IS_INTRA is true for I_4x4 and I_16x16 only (I_PCM is not included --
+ * NOTE(review): confirm this is intended); IS_SKIP is true for P_SKIP and
+ * B_SKIP. */
+#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_16x16 )
+#define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )
+/* Internal macroblock types.  The numeric values are used as the first
+ * index of x264_mb_type_list0_table / x264_mb_type_list1_table below, so
+ * the order and the total count (18) must stay in sync with those tables. */
+enum mb_class_e
+{
+    I_4x4           = 0,
+    I_16x16         = 1,
+    I_PCM           = 2,
+
+    P_L0            = 3,
+    P_8x8           = 4,
+    P_SKIP          = 5,
+
+    B_DIRECT        = 6,
+    B_L0_L0         = 7,
+    B_L0_L1         = 8,
+    B_L0_BI         = 9,
+    B_L1_L0         = 10,
+    B_L1_L1         = 11,
+    B_L1_BI         = 12,
+    B_BI_L0         = 13,
+    B_BI_L1         = 14,
+    B_BI_BI         = 15,
+    B_8x8           = 16,
+    B_SKIP          = 17,
+};
+/* Indexed by [enum mb_class_e value][0 or 1], rows in enum order.
+ * Judging from the B_xx_yy row labels, the second index selects the first
+ * or second partition of the macroblock and the entry is 1 when that
+ * partition uses reference list 0 (L0 or BI) -- TODO confirm with the
+ * users of this table. */
+static const int x264_mb_type_list0_table[18][2] =
+{
+    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {1,1},                  /* P_L0 */
+    {0,0},                  /* P_8x8 */
+    {1,1},                  /* P_SKIP */
+    {0,0},                  /* B_DIRECT */
+    {1,1}, {1,0}, {1,1},    /* B_L0_* */
+    {0,1}, {0,0}, {0,1},    /* B_L1_* */
+    {1,1}, {1,0}, {1,1},    /* B_BI_* */
+    {0,0},                  /* B_8x8 */
+    {0,0}                   /* B_SKIP */
+};
+/* Indexed by [enum mb_class_e value][0 or 1], rows in enum order.
+ * Judging from the B_xx_yy row labels, the second index selects the first
+ * or second partition of the macroblock and the entry is 1 when that
+ * partition uses reference list 1 (L1 or BI) -- TODO confirm with the
+ * users of this table.  All intra and P rows are zero. */
+static const int x264_mb_type_list1_table[18][2] =
+{
+    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {0,0},                  /* P_L0 */
+    {0,0},                  /* P_8x8 */
+    {0,0},                  /* P_SKIP */
+    {0,0},                  /* B_DIRECT */
+    {0,0}, {0,1}, {0,1},    /* B_L0_* */
+    {1,0}, {1,1}, {1,1},    /* B_L1_* */
+    {1,0}, {1,1}, {1,1},    /* B_BI_* */
+    {0,0},                  /* B_8x8 */
+    {0,0}                   /* B_SKIP */
+};
+
+/* Sub-partition shape tests: true when type is the L0, L1 or BI variant of
+ * the given shape (IS_SUB8x8 also accepts D_DIRECT_8x8).
+ * The argument is parenthesized so that expressions such as
+ * IS_SUB4x4( t & 0x0f ) compare the whole expression; note that it is
+ * evaluated several times, so it must not have side effects. */
+#define IS_SUB4x4(type) ( ((type) == D_L0_4x4)||((type) == D_L1_4x4)||((type) == D_BI_4x4))
+#define IS_SUB4x8(type) ( ((type) == D_L0_4x8)||((type) == D_L1_4x8)||((type) == D_BI_4x8))
+#define IS_SUB8x4(type) ( ((type) == D_L0_8x4)||((type) == D_L1_8x4)||((type) == D_BI_8x4))
+#define IS_SUB8x8(type) ( ((type) == D_L0_8x8)||((type) == D_L1_8x8)||((type) == D_BI_8x8)||((type) == D_DIRECT_8x8))
+/* Partition and sub-partition types.  The numeric values are used as the
+ * index of x264_mb_partition_count_table, so the order and the total count
+ * (17) must stay in sync with that table. */
+enum mb_partition_e
+{
+    /* sub partition type for P_8x8 and B_8x8 */
+    D_L0_4x4        = 0,
+    D_L0_8x4        = 1,
+    D_L0_4x8        = 2,
+    D_L0_8x8        = 3,
+
+    /* sub partition type for B_8x8 only */
+    D_L1_4x4        = 4,
+    D_L1_8x4        = 5,
+    D_L1_4x8        = 6,
+    D_L1_8x8        = 7,
+
+    D_BI_4x4        = 8,
+    D_BI_8x4        = 9,
+    D_BI_4x8        = 10,
+    D_BI_8x8        = 11,
+    D_DIRECT_8x8    = 12,
+
+    /* partition */
+    D_8x8           = 13,
+    D_16x8          = 14,
+    D_8x16          = 15,
+    D_16x16         = 16,
+};
+
+/* Indexed by enum mb_partition_e value, entries in enum order.
+ * Presumably the number of blocks a (sub-)partition of that type is split
+ * into (e.g. 4 for the 4x4 sub-partitions, 1 for 8x8 and for D_16x16) --
+ * TODO confirm with the users of this table. */
+static const int x264_mb_partition_count_table[17] =
+{
+    /* sub L0 */
+    4, 2, 2, 1,
+    /* sub L1 */
+    4, 2, 2, 1,
+    /* sub BI */
+    4, 2, 2, 1,
+    /* Direct */
+    1,
+    /* Partition */
+    4, 2, 2, 1
+};
+
+/* Macroblock cache management.
+ * NOTE(review): the two unnamed int parameters of
+ * x264_macroblock_cache_load are presumably the macroblock coordinates --
+ * confirm against macroblock.c. */
+void x264_macroblock_cache_init( x264_t *h );
+void x264_macroblock_cache_load( x264_t *h, int, int );
+void x264_macroblock_cache_save( x264_t *h );
+void x264_macroblock_cache_end( x264_t *h );
+
+/* Dequantization of a 4x4 DC block, a 2x2 DC block and a regular 4x4
+ * block for the quantizer scale i_qscale.  The dct array is presumably
+ * modified in place -- TODO confirm. */
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale );
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale );
+void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale );
+
+/* x264_mb_predict_mv_16x16:
+ *      set mvp to the predicted mv for a D_16x16 block.
+ *      h->mb only needs valid values from the other blocks. */
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] );
+/* x264_mb_predict_mv_pskip:
+ *      set mv to the predicted mv for P_SKIP.
+ *      h->mb only needs valid values from the other blocks. */
+void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] );
+/* x264_mb_predict_mv:
+ *      set mvp to the predicted mv for all blocks except P_SKIP.
+ *      h->mb needs valid ref/partition/sub values for the current block
+ *      and valid mv/ref values from the other blocks. */
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] );
+
+
+/* Predictions for block idx, returned as an int.  NOTE(review): judging
+ * from the names these are the predicted intra 4x4 mode and the predicted
+ * non-zero coefficient count -- confirm against macroblock.c. */
+int  x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
+int  x264_mb_predict_non_zero_code( x264_t *h, int idx );
+
+/* Encode the intra 4x4 block idx with quantizer scale i_qscale. */
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
+
+/* Motion compensation for the current macroblock (presumably; the name is
+ * the only evidence visible here -- confirm). */
+void x264_mb_mc( x264_t *h );
+
+
+/* Store ref in h->mb.cache.ref[i_list] for every entry of the
+ * width x height rectangle whose top-left corner is (x, y).
+ * Cache entries are addressed as X264_SCAN8_0 + column + 8*row, i.e. one
+ * cache row is 8 entries apart from the next. */
+static inline void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int ref )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < height; i_row++ )
+    {
+        for( i_col = 0; i_col < width; i_col++ )
+        {
+            const int i_entry = X264_SCAN8_0+x+i_col+8*(y+i_row);
+
+            h->mb.cache.ref[i_list][i_entry] = ref;
+        }
+    }
+}
+/* Store the vector (mvx, mvy) in h->mb.cache.mv[i_list] for every entry of
+ * the width x height rectangle whose top-left corner is (x, y).
+ * Cache entries are addressed as X264_SCAN8_0 + column + 8*row; component
+ * [0] receives mvx and component [1] receives mvy. */
+static inline void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, int mvx, int mvy )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < height; i_row++ )
+    {
+        for( i_col = 0; i_col < width; i_col++ )
+        {
+            const int i_entry = X264_SCAN8_0+x+i_col+8*(y+i_row);
+
+            h->mb.cache.mv[i_list][i_entry][0] = mvx;
+            h->mb.cache.mv[i_list][i_entry][1] = mvy;
+        }
+    }
+}
+/* Store the pair (mdx, mdy) in h->mb.cache.mvd[i_list] for every entry of
+ * the width x height rectangle whose top-left corner is (x, y).
+ * Cache entries are addressed as X264_SCAN8_0 + column + 8*row; component
+ * [0] receives mdx and component [1] receives mdy. */
+static inline void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, int mdx, int mdy )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < height; i_row++ )
+    {
+        for( i_col = 0; i_col < width; i_col++ )
+        {
+            const int i_entry = X264_SCAN8_0+x+i_col+8*(y+i_row);
+
+            h->mb.cache.mvd[i_list][i_entry][0] = mdx;
+            h->mb.cache.mvd[i_list][i_entry][1] = mdy;
+        }
+    }
+}
+
+#endif
+
--- a/core/mc.c
+++ b/core/mc.c
@ -0,0 +1,320 @@
+/*****************************************************************************
+ * mc.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+
+#include "mc.h"
+#include "clip1.h"
+
+#ifdef _MSC_VER
+#undef HAVE_MMXEXT  /* not finished now */
+#endif
+#ifdef HAVE_MMXEXT
+#   include "i386/mc.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#   include "ppc/mc.h"
+#endif
+
+
+/* 6-tap filter (1, -5, 20, 20, -5, 1) over the samples
+ * pix[-2*i_pix_next] .. pix[3*i_pix_next], i.e. along a line whose
+ * consecutive samples are i_pix_next bytes apart.
+ * Returns the unscaled, unrounded sum. */
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{
+    const int i_outer  = pix[-2*i_pix_next] + pix[ 3*i_pix_next];
+    const int i_middle = pix[-1*i_pix_next] + pix[ 2*i_pix_next];
+    const int i_centre = pix[0] + pix[1*i_pix_next];
+
+    return i_outer - 5*i_middle + 20*i_centre;
+}
+/* 6-tap filter (1, -5, 20, 20, -5, 1) over the six consecutive bytes
+ * pix[-2] .. pix[3].  Returns the unscaled, unrounded sum. */
+static inline int x264_tapfilter1( uint8_t *pix )
+{
+    const int i_outer  = pix[-2] + pix[ 3];
+    const int i_middle = pix[-1] + pix[ 2];
+    const int i_centre = pix[0] + pix[1];
+
+    return i_outer - 5*i_middle + 20*i_centre;
+}
+
+/* Write to dst the rounded-up average ( a + b + 1 ) >> 1 of the
+ * corresponding samples of src1 and src2, over an i_width x i_height
+ * area.  Each of the three buffers has its own line stride. */
+static inline void pixel_avg( uint8_t *dst,  int i_dst_stride,
+                              uint8_t *src1, int i_src1_stride,
+                              uint8_t *src2, int i_src2_stride,
+                              int i_width, int i_height )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < i_height; i_row++ )
+    {
+        for( i_col = 0; i_col < i_width; i_col++ )
+        {
+            const int i_sum = src1[i_col] + src2[i_col];
+
+            dst[i_col] = ( i_sum + 1 ) >> 1;
+        }
+        /* step every buffer to its next line */
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+        dst  += i_dst_stride;
+    }
+}
+
+/* Signature shared by all the luma interpolation functions of this file
+ * (mc_copy, mc_hh, mc_hv, mc_hc and the mc_xyNN variants): read an
+ * i_width x i_height area from src and write the result to dst. */
+typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height );
+
+/* Copy an i_width x i_height area from src to dst, one memcpy of i_width
+ * bytes per line; src and dst each have their own line stride. */
+static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    int i_lines_left = i_height;
+
+    while( i_lines_left-- > 0 )
+    {
+        memcpy( dst, src, i_width );
+
+        dst += i_dst_stride;
+        src += i_src_stride;
+    }
+}
+/* Horizontal half-sample interpolation: each output sample is the 6-tap
+ * filter applied along the line (x264_tapfilter1), rounded with +16,
+ * shifted right by 5 and passed through x264_mc_clip1.
+ * Reads src[x-2] .. src[x+3] for every output column x. */
+static inline void mc_hh( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src_stride, dst += i_dst_stride )
+    {
+        for( i_col = 0; i_col < i_width; i_col++ )
+        {
+            dst[i_col] = x264_mc_clip1( ( x264_tapfilter1( &src[i_col] ) + 16 ) >> 5 );
+        }
+    }
+}
+/* Vertical half-sample interpolation: each output sample is the 6-tap
+ * filter applied down the column (x264_tapfilter with the source stride),
+ * rounded with +16, shifted right by 5 and passed through x264_mc_clip1.
+ * Reads the source lines -2 .. +3 relative to every output line. */
+static inline void mc_hv( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src_stride, dst += i_dst_stride )
+    {
+        for( i_col = 0; i_col < i_width; i_col++ )
+        {
+            dst[i_col] = x264_mc_clip1( ( x264_tapfilter( &src[i_col], i_src_stride ) + 16 ) >> 5 );
+        }
+    }
+}
+/* Centre half-sample interpolation (half a sample in both directions).
+ * The block is processed column by column.  For each column a window of
+ * six horizontally filtered line values (x264_tapfilter1, unscaled) is
+ * kept; the 6-tap filter is applied vertically to that window, rounded
+ * with +512, shifted right by 10 and passed through x264_mc_clip1.  Moving
+ * down one line shifts the window and computes a single new value. */
+static inline void mc_hc( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    int x, y, k;
+
+    for( x = 0; x < i_width; x++ )
+    {
+        uint8_t *pix = &src[x];
+        uint8_t *out = &dst[x];
+        int tap[6];
+
+        /* prime the window with the lines -2 .. +2 */
+        for( k = 0; k < 5; k++ )
+        {
+            tap[k] = x264_tapfilter1( &pix[(k-2)*i_src_stride] );
+        }
+
+        for( y = 0; y < i_height; y++ )
+        {
+            /* newest line of the window: +3 */
+            tap[5] = x264_tapfilter1( &pix[ 3*i_src_stride] );
+
+            *out = x264_mc_clip1( ( tap[0] + tap[5] - 5*( tap[1] + tap[4] ) + 20*( tap[2] + tap[3] ) + 512 ) >> 10 );
+
+            /* Next line: slide the window down by one */
+            pix += i_src_stride;
+            out += i_dst_stride;
+            for( k = 0; k < 5; k++ )
+            {
+                tap[k] = tap[k+1];
+            }
+        }
+    }
+}
+
+/* mc I+H */
+/* Quarter-sample position x=1, y=0 (see the pf_mc table in
+ * motion_compensation_luma): average of the full samples at src and the
+ * horizontal half samples (mc_hh).
+ * tmp is packed with a stride of i_width and holds at most 16x16 samples. */
+static void mc_xy10( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp[16*16];
+    mc_hh( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src, i_src_stride, tmp, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=3, y=0: average of the full samples one
+ * column to the right (src+1) and the horizontal half samples (mc_hh).
+ * tmp is packed with a stride of i_width and holds at most 16x16 samples. */
+static void mc_xy30( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp[16*16];
+    mc_hh( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src+1, i_src_stride, tmp, i_width, i_width, i_height );
+}
+/* mc I+V */
+/* Quarter-sample position x=0, y=1: average of the full samples at src
+ * and the vertical half samples (mc_hv).
+ * tmp is packed with a stride of i_width and holds at most 16x16 samples. */
+static void mc_xy01( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp[16*16];
+    mc_hv( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src, i_src_stride, tmp, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=0, y=3: average of the full samples one line
+ * below (src+i_src_stride) and the vertical half samples (mc_hv).
+ * tmp is packed with a stride of i_width and holds at most 16x16 samples. */
+static void mc_xy03( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp[16*16];
+    mc_hv( src, i_src_stride, tmp, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, i_width, i_width, i_height );
+}
+/* H+V */
+/* Quarter-sample position x=1, y=1: average of the vertical half samples
+ * (mc_hv) and the horizontal half samples (mc_hh), both taken at src.
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy11( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hv( src, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=3, y=1: average of the vertical half samples
+ * one column to the right (mc_hv at src+1) and the horizontal half samples
+ * (mc_hh at src).
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy31( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hv( src+1, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src,   i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=1, y=3: average of the vertical half samples
+ * (mc_hv at src) and the horizontal half samples one line below (mc_hh at
+ * src+i_src_stride).
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy13( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hv( src,              i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src+i_src_stride, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=3, y=3: average of the vertical half samples
+ * one column to the right (mc_hv at src+1) and the horizontal half samples
+ * one line below (mc_hh at src+i_src_stride).
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy33( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hv( src+1,            i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src+i_src_stride, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=2, y=1: average of the centre half samples
+ * (mc_hc) and the horizontal half samples (mc_hh), both taken at src.
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy21( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hc( src, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=1, y=2: average of the centre half samples
+ * (mc_hc) and the vertical half samples (mc_hv), both taken at src.
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy12( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hc( src, i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hv( src, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=3, y=2: average of the centre half samples
+ * (mc_hc at src) and the vertical half samples one column to the right
+ * (mc_hv at src+1).
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy32( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hc( src,   i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hv( src+1, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+/* Quarter-sample position x=2, y=3: average of the centre half samples
+ * (mc_hc at src) and the horizontal half samples one line below (mc_hh at
+ * src+i_src_stride).
+ * The temporaries are packed with a stride of i_width and hold at most
+ * 16x16 samples each. */
+static void mc_xy23( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+{
+    uint8_t tmp1[16*16];
+    uint8_t tmp2[16*16];
+
+    mc_hc( src,              i_src_stride, tmp1, i_width, i_width, i_height );
+    mc_hh( src+i_src_stride, i_src_stride, tmp2, i_width, i_width, i_height );
+    pixel_avg( dst, i_dst_stride, tmp1, i_width, tmp2, i_width, i_width, i_height );
+}
+
+/* Luma motion compensation.
+ * (mvx, mvy) is in quarter-sample units: the part above the two low bits
+ * moves src by whole samples, the two low bits of each component select
+ * one of the sixteen interpolation functions, which then writes the
+ * i_width x i_height prediction to dst. */
+static void motion_compensation_luma( uint8_t *src, int i_src_stride,
+                                      uint8_t *dst, int i_dst_stride,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    /* interpolators indexed by [quarter y][quarter x] */
+    static pf_mc_t pf_mc[4][4] =
+    {
+        { mc_copy,  mc_xy10,    mc_hh,      mc_xy30 },
+        { mc_xy01,  mc_xy11,    mc_xy21,    mc_xy31 },
+        { mc_hv,    mc_xy12,    mc_hc,      mc_xy32 },
+        { mc_xy03,  mc_xy13,    mc_xy23,    mc_xy33 },
+    };
+    const int i_quarter_x = mvx&0x03;
+    const int i_quarter_y = mvy&0x03;
+    uint8_t *ref = src + ( (mvy >> 2) * i_src_stride + (mvx >> 2) );
+
+    pf_mc[i_quarter_y][i_quarter_x]( ref, i_src_stride, dst, i_dst_stride, i_width, i_height );
+}
+
+/* Full chroma motion compensation (down to 1/8 sample).
+ * The part of (mvx, mvy) above the three low bits moves src by whole
+ * samples; the three low bits of each component give the weights of a
+ * bilinear blend of the four surrounding samples.  The weights sum to 64,
+ * hence the +32 rounding and the shift by 6.
+ * Column x+1 and the line below are always read, even when their weight
+ * is zero. */
+static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
+                                        uint8_t *dst, int i_dst_stride,
+                                        int mvx, int mvy,
+                                        int i_width, int i_height )
+{
+    const int i_frac_x = mvx&0x07;
+    const int i_frac_y = mvy&0x07;
+
+    /* weights of the top-left, top-right, bottom-left, bottom-right samples */
+    const int w_tl = (8-i_frac_x)*(8-i_frac_y);
+    const int w_tr = i_frac_x    *(8-i_frac_y);
+    const int w_bl = (8-i_frac_x)*i_frac_y;
+    const int w_br = i_frac_x    *i_frac_y;
+
+    uint8_t *upper = src + ( (mvy >> 3) * i_src_stride + (mvx >> 3) );
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < i_height; i_row++ )
+    {
+        uint8_t *lower = &upper[i_src_stride];
+
+        for( i_col = 0; i_col < i_width; i_col++ )
+        {
+            dst[i_col] = ( w_tl*upper[i_col] + w_tr*upper[i_col+1] +
+                           w_bl*lower[i_col] + w_br*lower[i_col+1] + 32 ) >> 6;
+        }
+        dst  += i_dst_stride;
+        upper = lower;
+    }
+}
+
+/* Fill pf with the motion compensation functions to use.
+ * The C implementations of this file are installed first; when the build
+ * has HAVE_MMXEXT / HAVE_ALTIVEC and the matching X264_CPU_* bit is set in
+ * cpu, the corresponding platform init function is then called with pf and
+ * may replace them.
+ *
+ * cpu : bitmask of X264_CPU_* flags
+ * pf  : table indexed by MC_LUMA / MC_CHROMA
+ */
+void x264_mc_init( int cpu, x264_mc_function_t pf[2] )
+{
+    /* portable defaults */
+    pf[MC_LUMA]   = motion_compensation_luma;
+    pf[MC_CHROMA] = motion_compensation_chroma;
+
+#ifdef HAVE_MMXEXT
+    if( cpu&X264_CPU_MMXEXT )
+    {
+        x264_mc_mmxext_init( pf );
+    }
+#endif
+#ifdef HAVE_ALTIVEC
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        x264_mc_altivec_init( pf );
+    }
+#endif
+}
+
--- a/core/mc.h
+++ b/core/mc.h
@ -0,0 +1,45 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _MC_H
+#define _MC_H 1
+
+/* Motion compensation function.
+ * The first four parameters are the source pointer and stride followed by
+ * the destination pointer and stride (this is the order used by the C
+ * implementations in mc.c); mvx/mvy is the motion vector.
+ * XXX: Only width = 4, 8 or 16 are valid
+ * width == 4 -> height == 4 or 8
+ * width == 8 -> height == 4 or 8 or 16
+ * width == 16-> height == 8 or 16
+ * */
+
+typedef void (*x264_mc_function_t)(uint8_t *, int, uint8_t *, int,
+                          int mvx, int mvy,
+                          int i_width, int i_height );
+/* Indices into the function table filled by x264_mc_init */
+enum
+{
+    MC_LUMA   = 0,
+    MC_CHROMA = 1,
+};
+
+/* Fill pf[MC_LUMA] and pf[MC_CHROMA]; cpu is a bitmask of X264_CPU_* flags
+ * used to select optimized versions where available. */
+void x264_mc_init( int cpu, x264_mc_function_t pf[2] );
+
+#endif
--- a/core/mdate.c
+++ b/core/mdate.c
@ -0,0 +1,48 @@
+/*****************************************************************************
+ * mdate.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mdate.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#include <sys/time.h>
+#else
+#include <sys/types.h>
+#include <sys/timeb.h>
+#endif
+#include <time.h>
+
+/* Return the current wall-clock time in microseconds.
+ * The MSVC/MinGW path is built on _ftime(), so its value only advances in
+ * steps of 1000 (millisecond resolution). */
+int64_t x264_mdate( void )
+{
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    struct _timeb now;
+    int64_t i_msec;
+
+    _ftime( &now );
+    i_msec = (int64_t)now.time * (1000) + (int64_t)now.millitm;
+    return i_msec * (1000);
+#else
+    struct timeval now;
+    int64_t i_usec;
+
+    gettimeofday( &now, NULL );
+    i_usec  = (int64_t)now.tv_sec * 1000000;
+    i_usec += (int64_t)now.tv_usec;
+    return i_usec;
+#endif
+}
+
--- a/core/pixel.c
+++ b/core/pixel.c
@ -0,0 +1,228 @@
+/*****************************************************************************
+ * pixel.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "pixel.h"
+
+#ifdef HAVE_MMXEXT
+#   include "i386/pixel.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#   include "ppc/pixel.h"
+#endif
+
+
+/****************************************************************************
+ * pixel_sad_WxH: sum of absolute differences between two WxH blocks
+ ****************************************************************************/
+/* Expands to `static int name( pix1, stride1, pix2, stride2 )`, which walks
+ * ly rows of lx pixels and accumulates |pix1 - pix2|. Each source advances
+ * by its own stride after every row. */
+#define PIXEL_SAD_C( name, lx, ly ) \
+static int name( uint8_t *pix1, int i_stride_pix1,  \
+                 uint8_t *pix2, int i_stride_pix2 ) \
+{                                                   \
+    int i_sad = 0;                                  \
+    int i_rows_left = ly;                           \
+    while( i_rows_left-- > 0 )                      \
+    {                                               \
+        int i_col;                                  \
+        for( i_col = 0; i_col < lx; i_col++ )       \
+        {                                           \
+            const int i_delta = pix1[i_col] - pix2[i_col];  \
+            i_sad += i_delta < 0 ? -i_delta : i_delta;      \
+        }                                           \
+        pix1 += i_stride_pix1;                      \
+        pix2 += i_stride_pix2;                      \
+    }                                               \
+    return i_sad;                                   \
+}
+
+
+PIXEL_SAD_C( pixel_sad_16x16, 16, 16 )
+PIXEL_SAD_C( pixel_sad_16x8,  16,  8 )
+PIXEL_SAD_C( pixel_sad_8x16,   8, 16 )
+PIXEL_SAD_C( pixel_sad_8x8,    8,  8 )
+PIXEL_SAD_C( pixel_sad_8x4,    8,  4 )
+PIXEL_SAD_C( pixel_sad_4x8,    4,  8 )
+PIXEL_SAD_C( pixel_sad_4x4,    4,  4 )
+
+/* Write the 4x4 residual pix1 - pix2 into diff[row][col].
+ * i_pix1 / i_pix2 are the row strides of the two sources. */
+static void pixel_sub_4x4( int16_t diff[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    for( i = 0; i < 16; i++ )
+    {
+        const int i_row = i >> 2;
+        const int i_col = i & 3;
+
+        diff[i_row][i_col] = pix1[i_row * i_pix1 + i_col] - pix2[i_row * i_pix2 + i_col];
+    }
+}
+
+/* Sum of absolute transformed differences over an i_width x i_height block.
+ * The block is processed in 4x4 tiles: each tile's residual goes through a
+ * 4-point butterfly along rows, then along columns, and the absolute values
+ * of the result are accumulated. The grand total is halved before returning. */
+static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
+{
+    int16_t hrow[4][4];
+    int16_t res[4][4];
+    int i_total = 0;
+    int bx, by;
+
+    for( by = 0; by < i_height; by += 4, pix1 += 4 * i_pix1, pix2 += 4 * i_pix2 )
+    {
+        for( bx = 0; bx < i_width; bx += 4 )
+        {
+            int i;
+
+            pixel_sub_4x4( res, &pix1[bx], i_pix1, &pix2[bx], i_pix2 );
+
+            /* horizontal pass: one butterfly per residual row */
+            for( i = 0; i < 4; i++ )
+            {
+                const int i_sum_lo = res[i][0] + res[i][1];
+                const int i_dif_lo = res[i][0] - res[i][1];
+                const int i_sum_hi = res[i][2] + res[i][3];
+                const int i_dif_hi = res[i][2] - res[i][3];
+
+                hrow[i][0] = i_sum_lo + i_sum_hi;
+                hrow[i][1] = i_sum_lo - i_sum_hi;
+                hrow[i][2] = i_dif_lo - i_dif_hi;
+                hrow[i][3] = i_dif_lo + i_dif_hi;
+            }
+
+            /* vertical pass: one butterfly per column, accumulated directly */
+            for( i = 0; i < 4; i++ )
+            {
+                const int i_sum_lo = hrow[0][i] + hrow[1][i];
+                const int i_dif_lo = hrow[0][i] - hrow[1][i];
+                const int i_sum_hi = hrow[2][i] + hrow[3][i];
+                const int i_dif_hi = hrow[2][i] - hrow[3][i];
+
+                i_total += abs( i_sum_lo + i_sum_hi ) + abs( i_sum_lo - i_sum_hi )
+                         + abs( i_dif_lo - i_dif_hi ) + abs( i_dif_lo + i_dif_hi );
+            }
+        }
+    }
+
+    return i_total / 2;
+}
+/* Expands to a fixed-size `static int name( pix1, stride1, pix2, stride2 )`
+ * that forwards to pixel_satd_wxh() with the block size w x h. */
+#define PIXEL_SATD_C( name, w, h )                     \
+static int name( uint8_t *pix1, int i_stride_pix1,     \
+                 uint8_t *pix2, int i_stride_pix2 )    \
+{                                                      \
+    return pixel_satd_wxh( pix1, i_stride_pix1,        \
+                           pix2, i_stride_pix2,        \
+                           w, h );                     \
+}
+PIXEL_SATD_C( pixel_satd_16x16, 16, 16 )
+PIXEL_SATD_C( pixel_satd_16x8,  16,  8 )
+PIXEL_SATD_C( pixel_satd_8x16,   8, 16 )
+PIXEL_SATD_C( pixel_satd_8x8,    8,  8 )
+PIXEL_SATD_C( pixel_satd_8x4,    8,  4 )
+PIXEL_SATD_C( pixel_satd_4x8,    4,  8 )
+PIXEL_SATD_C( pixel_satd_4x4,    4,  4 )
+
+
+/* In-place rounded average: every pixel of the width x height block at dst
+ * becomes ( dst + src + 1 ) >> 1. i_dst / i_src are the row strides. */
+static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height )
+{
+    while( height-- > 0 )
+    {
+        int i_col;
+        for( i_col = 0; i_col < width; i_col++ )
+        {
+            const int i_pair = dst[i_col] + src[i_col] + 1;
+            dst[i_col] = i_pair >> 1;
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+
+/* Expands to a fixed-size `static void name( pix1, stride1, pix2, stride2 )`
+ * that averages pix2 into pix1 (pix1 is both input and output) over a
+ * w x h block via pixel_avg_wxh(). */
+#define PIXEL_AVG_C( name, w, h )                      \
+static void name( uint8_t *pix1, int i_stride_pix1,    \
+                  uint8_t *pix2, int i_stride_pix2 )   \
+{                                                      \
+    pixel_avg_wxh( pix1, i_stride_pix1,                \
+                   pix2, i_stride_pix2,                \
+                   w, h );                             \
+}
+PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
+PIXEL_AVG_C( pixel_avg_16x8,  16,  8 )
+PIXEL_AVG_C( pixel_avg_8x16,   8, 16 )
+PIXEL_AVG_C( pixel_avg_8x8,    8,  8 )
+PIXEL_AVG_C( pixel_avg_8x4,    8,  4 )
+PIXEL_AVG_C( pixel_avg_4x8,    4,  8 )
+PIXEL_AVG_C( pixel_avg_4x4,    4,  4 )
+
+/****************************************************************************
+ * x264_pixel_init: fill the pixel function table
+ *
+ * The portable C routines are installed first; optimised versions then
+ * overwrite individual entries when they were compiled in and the matching
+ * bit is set in cpu.
+ ****************************************************************************/
+void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
+{
+    /* C reference implementations, grouped by block size */
+    pixf->sad [PIXEL_16x16] = pixel_sad_16x16;
+    pixf->satd[PIXEL_16x16] = pixel_satd_16x16;
+    pixf->avg [PIXEL_16x16] = pixel_avg_16x16;
+
+    pixf->sad [PIXEL_16x8]  = pixel_sad_16x8;
+    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8;
+    pixf->avg [PIXEL_16x8]  = pixel_avg_16x8;
+
+    pixf->sad [PIXEL_8x16]  = pixel_sad_8x16;
+    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16;
+    pixf->avg [PIXEL_8x16]  = pixel_avg_8x16;
+
+    pixf->sad [PIXEL_8x8]   = pixel_sad_8x8;
+    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8;
+    pixf->avg [PIXEL_8x8]   = pixel_avg_8x8;
+
+    pixf->sad [PIXEL_8x4]   = pixel_sad_8x4;
+    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4;
+    pixf->avg [PIXEL_8x4]   = pixel_avg_8x4;
+
+    pixf->sad [PIXEL_4x8]   = pixel_sad_4x8;
+    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8;
+    pixf->avg [PIXEL_4x8]   = pixel_avg_4x8;
+
+    pixf->sad [PIXEL_4x4]   = pixel_sad_4x4;
+    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4;
+    pixf->avg [PIXEL_4x4]   = pixel_avg_4x4;
+
+#ifdef HAVE_MMXEXT
+    /* MMXEXT replaces sad and satd only; avg keeps the C version */
+    if( cpu&X264_CPU_MMXEXT )
+    {
+        pixf->sad [PIXEL_16x16] = x264_pixel_sad_16x16_mmxext;
+        pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_mmxext;
+
+        pixf->sad [PIXEL_16x8]  = x264_pixel_sad_16x8_mmxext;
+        pixf->satd[PIXEL_16x8]  = x264_pixel_satd_16x8_mmxext;
+
+        pixf->sad [PIXEL_8x16]  = x264_pixel_sad_8x16_mmxext;
+        pixf->satd[PIXEL_8x16]  = x264_pixel_satd_8x16_mmxext;
+
+        pixf->sad [PIXEL_8x8]   = x264_pixel_sad_8x8_mmxext;
+        pixf->satd[PIXEL_8x8]   = x264_pixel_satd_8x8_mmxext;
+
+        pixf->sad [PIXEL_8x4]   = x264_pixel_sad_8x4_mmxext;
+        pixf->satd[PIXEL_8x4]   = x264_pixel_satd_8x4_mmxext;
+
+        pixf->sad [PIXEL_4x8]   = x264_pixel_sad_4x8_mmxext;
+        pixf->satd[PIXEL_4x8]   = x264_pixel_satd_4x8_mmxext;
+
+        pixf->sad [PIXEL_4x4]   = x264_pixel_sad_4x4_mmxext;
+        pixf->satd[PIXEL_4x4]   = x264_pixel_satd_4x4_mmxext;
+    }
+#endif
+#ifdef HAVE_ALTIVEC
+    /* the AltiVec backend patches the table itself */
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        x264_pixel_altivec_init( pixf );
+    }
+#endif
+}
+
--- a/core/pixel.h
+++ b/core/pixel.h
@ -0,0 +1,62 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PIXEL_H
+#define _PIXEL_H 1
+
+#include <stdint.h> /* uint8_t is used in the prototypes below */
+
+/* All three primitives take ( pix1, stride1, pix2, stride2 ).
+ *  - sad : returns the sum of absolute differences between the two blocks
+ *  - satd: returns the sum of absolute transformed differences
+ *  - avg : overwrites pix1 with the rounded average of pix1 and pix2
+ * The block size is implied by the table slot the function is stored in. */
+typedef int  (*x264_pixel_sad_t) ( uint8_t *, int, uint8_t *, int );
+typedef int  (*x264_pixel_satd_t)( uint8_t *, int, uint8_t *, int );
+typedef void (*x264_pixel_avg_t) ( uint8_t *, int, uint8_t *, int );
+
+/* Block-size identifiers; they index x264_pixel_size[] and the function
+ * arrays in x264_pixel_function_t, so the order must stay in sync. */
+enum
+{
+    PIXEL_16x16 = 0,
+    PIXEL_16x8  = 1,
+    PIXEL_8x16  = 2,
+    PIXEL_8x8   = 3,
+    PIXEL_8x4   = 4,
+    PIXEL_4x8   = 5,
+    PIXEL_4x4   = 6,
+};
+
+/* Width and height in pixels for each PIXEL_WxH identifier. */
+static const struct {
+    int w;
+    int h;
+} x264_pixel_size[7] = {
+    { 16, 16 },
+    { 16,  8 }, {  8, 16 },
+    {  8,  8 },
+    {  8,  4 }, {  4,  8 },
+    {  4,  4 }
+};
+
+/* Per-block-size dispatch table, filled by x264_pixel_init(). */
+typedef struct
+{
+    x264_pixel_sad_t  sad[7];
+    x264_pixel_satd_t satd[7];
+    x264_pixel_avg_t  avg[7];
+} x264_pixel_function_t;
+
+/* Populate pixf; cpu is a mask of X264_CPU_* flags selecting optimised
+ * versions when they were compiled in. */
+void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
+
+#endif
--- a/core/ppc/mc.c
+++ b/core/ppc/mc.c
@ -0,0 +1,681 @@
+/*****************************************************************************
+ * mc.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "../mc.h"
+#include "../clip1.h"
+#include "mc.h"
+#include "ppccommon.h"
+
+/* Signature shared by the fixed-width luma helpers stored in the pf_mc
+ * dispatch table of motion_compensation_luma(): the block width is implied
+ * by the function, only the height is passed. */
+typedef void (*pf_mc_t)( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height );
+
+/* 6-tap ( 1, -5, 20, 20, -5, 1 ) filter along an arbitrary step: samples
+ * are read at pix[-2*step] .. pix[3*step]. The result is the raw weighted
+ * sum; rounding and the final shift are left to the caller. */
+static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
+{
+    const int i_outer = pix[-2*i_pix_next] + pix[ 3*i_pix_next];
+    const int i_mid   = pix[-1*i_pix_next] + pix[ 2*i_pix_next];
+    const int i_inner = pix[0]             + pix[ 1*i_pix_next];
+
+    return i_outer - 5 * i_mid + 20 * i_inner;
+}
+/* Horizontal special case of the 6-tap ( 1, -5, 20, 20, -5, 1 ) filter:
+ * reads pix[-2] .. pix[3] and returns the raw weighted sum. */
+static inline int x264_tapfilter1( uint8_t *pix )
+{
+    const int i_outer = pix[-2] + pix[ 3];
+    const int i_mid   = pix[-1] + pix[ 2];
+    const int i_inner = pix[0]  + pix[ 1];
+
+    return i_outer - 5 * i_mid + 20 * i_inner;
+}
+
+/* pixel_avg: dst = rounded average of two sources, ( a + b + 1 ) >> 1 */
+static inline void pixel_avg_w4( uint8_t *dst,  int i_dst,
+                                 uint8_t *src1, int i_src1,
+                                 uint8_t *src2, int i_src2,
+                                 int i_height )
+{
+    int i_row;
+    for( i_row = 0; i_row < i_height; i_row++ )
+    {
+        int i_col;
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int i_pair = src1[i_col] + src2[i_col] + 1;
+            dst[i_col] = i_pair >> 1;
+        }
+        src1 += i_src1;
+        src2 += i_src2;
+        dst  += i_dst;
+    }
+}
+/* 8-wide average, done as two adjacent 4-wide columns (left, then right). */
+static inline void pixel_avg_w8( uint8_t *dst,  int i_dst,
+                                 uint8_t *src1, int i_src1,
+                                 uint8_t *src2, int i_src2,
+                                 int i_height )
+{
+    int i_off;
+
+    /* TODO - optimize */
+    for( i_off = 0; i_off < 8; i_off += 4 )
+    {
+        pixel_avg_w4( dst + i_off, i_dst, src1 + i_off, i_src1,
+                      src2 + i_off, i_src2, i_height );
+    }
+}
+/* 16-wide average using AltiVec: one vector per row.
+ * LOAD_16 / STORE_16 come from ppccommon.h -- presumably 16-byte loads and
+ * stores that tolerate unaligned addresses; confirm there. */
+static inline void pixel_avg_w16( uint8_t *dst,  int i_dst,
+                                  uint8_t *src1, int i_src1,
+                                  uint8_t *src2, int i_src2,
+                                  int i_height )
+{
+    int y;
+    vector_u8_t src1v, src2v;
+    for( y = 0; y < i_height; y++ )
+    {
+        LOAD_16( src1, src1v );
+        LOAD_16( src2, src2v );
+        /* vec_avg rounds up, i.e. ( a + b + 1 ) >> 1 per byte, matching
+           the scalar pixel_avg_w4 */
+        src1v = vec_avg( src1v, src2v );
+        STORE_16( src1v, dst );
+
+        dst  += i_dst;
+        src1 += i_src1;
+        src2 += i_src2;
+    }
+}
+
+/* mc_copy: plain C. Expands to `static void name( src, i_src, dst, i_dst,
+ * i_height )`, which copies i_height rows of `a` bytes each. */
+#define MC_COPY( name, a )                                \
+static void name( uint8_t *src, int i_src,                \
+                  uint8_t *dst, int i_dst, int i_height ) \
+{                                                         \
+    int i_left;                                           \
+    for( i_left = i_height; i_left > 0; i_left-- )        \
+    {                                                     \
+        memcpy( dst, src, a );                            \
+        dst += i_dst;                                     \
+        src += i_src;                                     \
+    }                                                     \
+}
+MC_COPY( mc_copy_w4,  4  )
+MC_COPY( mc_copy_w8,  8  )
+MC_COPY( mc_copy_w16, 16 )
+
+/* TAP_FILTER:
+   a is the source (vector_s16_t [6])
+   b is a temporary vector_s16_t
+   c is the result
+
+   c   = a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
+   c  += 16;
+   c >>= 5;     (arithmetic shift: the sum can be negative)
+   c  += 80;
+
+   The x5 and x20 multiplies are built from shifts: 5*s = s + (s << 2) and
+   20*s = (s << 2) + (s << 4). The rounding term 16 is added as 8 + 8 and
+   the bias 80 is formed as 5 << 4 because vec_splat_s16 only accepts
+   5-bit immediates. The +80 bias turns the result into a non-negative
+   index for x264_mc_clip1_table.
+
+   The right shift must be vec_sra: with 8-bit inputs the sum can go as low
+   as -2534, and a logical shift (vec_sr) would turn that into a large
+   positive index far outside the clip table. */
+#define TAP_FILTER( a, b, c )                       \
+do {                                                \
+    c = vec_add( (a)[0], (a)[5] );                  \
+    b = vec_add( (a)[1], (a)[4] );                  \
+    c = vec_sub( c, b );                            \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_sub( c, b );                            \
+    b = vec_add( (a)[2], (a)[3] );                  \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_add( c, b );                            \
+    b = vec_sl( b, vec_splat_u16( 2 ) );            \
+    c = vec_add( c, b );                            \
+    c = vec_add( c, vec_splat_s16( 8 ) );           \
+    c = vec_add( c, vec_splat_s16( 8 ) );           \
+    c = vec_sra( c, vec_splat_u16( 5 ) );           \
+    c = vec_add( c, vec_sl( vec_splat_s16( 5 ),     \
+                            vec_splat_u16( 4 ) ) ); \
+} while( 0 )
+
+/* mc_hh: horizontal half-sample plane (6-tap filter along each row,
+ * rounded with +16, shifted by 5 and clipped) */
+static inline void mc_hh_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int i_row;
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src, dst += i_dst )
+    {
+        int i_col;
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int i_filt = x264_tapfilter1( src + i_col );
+            dst[i_col] = x264_mc_clip1( ( i_filt + 16 ) >> 5 );
+        }
+    }
+}
+/* AltiVec horizontal half-sample filter for an 8-wide block.
+ * Per row: load 16 bytes starting two pixels left of src, derive six copies
+ * shifted by 0..5 bytes (the six filter taps for all 8 outputs at once),
+ * widen them to 16 bit, run TAP_FILTER, then clip through the lookup table.
+ * LOAD_ZERO, LOAD_16, CONVERT_U8_TO_S16 and DECLARE_ALIGNED come from
+ * ppccommon.h; their exact behaviour is assumed from their names. */
+static inline void mc_hh_w8( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    /* 16-byte aligned scratch so vec_st can write the filter result */
+    DECLARE_ALIGNED( int16_t, tmp[8], 16 );
+
+    LOAD_ZERO;
+    vector_u8_t    loadv;
+    vector_s16_t   srcv[6];
+    /* byte view of srcv, used while the taps are still 8-bit */
+    vector_u8_t  * _srcv = (vector_u8_t*) srcv;
+    vector_s16_t   dstv;
+    vector_s16_t   tmpv;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        LOAD_16( &src[-2], loadv );
+
+        for( x = 0; x < 6; x++ )
+        {
+            /* vec_lvsl on the fake address x yields the permute pattern
+               for "shift left by x bytes"; the pointer is never
+               dereferenced.
+               NOTE(review): the (int*) cast of an int warns on LP64. */
+            _srcv[x] = vec_perm( loadv, zero_u8,
+                                 vec_lvsl( 0, (int*) x ) );
+            CONVERT_U8_TO_S16( srcv[x] );
+        }
+
+        TAP_FILTER( srcv, tmpv, dstv );
+        vec_st( dstv, 0, tmp );
+
+        for( x = 0; x < 8; x++ )
+        {
+            /* tmp[] already carries the +80 bias added by TAP_FILTER, so it
+               indexes the clip table directly */
+            dst[x] = x264_mc_clip1_table[tmp[x]];
+        }
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+/* 16-wide horizontal half-sample filter: left 8 columns, then right 8. */
+static inline void mc_hh_w16( uint8_t *src, int i_src,
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    int i_off;
+    for( i_off = 0; i_off < 16; i_off += 8 )
+    {
+        mc_hh_w8( src + i_off, i_src, dst + i_off, i_dst, i_height );
+    }
+}
+
+/* mc_hv: vertical half-sample plane (6-tap filter down each column,
+ * rounded with +16, shifted by 5 and clipped) */
+static inline void mc_hv_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int i_row;
+    for( i_row = 0; i_row < i_height; i_row++, src += i_src, dst += i_dst )
+    {
+        int i_col;
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            const int i_filt = x264_tapfilter( src + i_col, i_src );
+            dst[i_col] = x264_mc_clip1( ( i_filt + 16 ) >> 5 );
+        }
+    }
+}
+/* AltiVec vertical half-sample filter for an 8-wide block.
+ * srcv[0..5] holds the six source rows y-2 .. y+3 widened to 16 bit. The
+ * first output row loads all six; every later row slides the window down
+ * by one and loads only the new bottom row.
+ * LOAD_ZERO, LOAD_8, CONVERT_U8_TO_S16 and DECLARE_ALIGNED come from
+ * ppccommon.h; their exact behaviour is assumed from their names. */
+static inline void mc_hv_w8( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int x, y;
+    /* 16-byte aligned scratch so vec_st can write the filter result */
+    DECLARE_ALIGNED( int16_t, tmp[8], 16 );
+
+    LOAD_ZERO;
+    vector_s16_t   srcv[6];
+    /* byte view of srcv, used as the LOAD_8 target before widening */
+    vector_u8_t  * _srcv = (vector_u8_t*) srcv;
+    vector_s16_t   dstv;
+    vector_s16_t   tmpv;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        if( y )
+        {
+            /* slide the 6-row window down and fetch the new last row */
+            for( x = 0; x < 5; x++ )
+            {
+                srcv[x] = srcv[x+1];
+            }
+            LOAD_8( &src[3*i_src], _srcv[5] );
+            CONVERT_U8_TO_S16( srcv[5] );
+        }
+        else
+        {
+            /* first row: fill the whole window, rows -2 .. +3 */
+            for( x = 0; x < 6; x++ )
+            {
+                LOAD_8( &src[(x-2)*i_src], _srcv[x] );
+                CONVERT_U8_TO_S16( srcv[x] );
+            }
+        }
+
+        TAP_FILTER( srcv, tmpv, dstv );
+        vec_st( dstv, 0, tmp );
+
+        for( x = 0; x < 8; x++ )
+        {
+            /* tmp[] already carries the +80 bias added by TAP_FILTER, so it
+               indexes the clip table directly */
+            dst[x] = x264_mc_clip1_table[tmp[x]];
+        }
+        src += i_src;
+        dst += i_dst;
+    }
+}
+/* 16-wide vertical half-sample filter: left 8 columns, then right 8. */
+static inline void mc_hv_w16( uint8_t *src, int i_src,
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    int i_off;
+    for( i_off = 0; i_off < 16; i_off += 8 )
+    {
+        mc_hv_w8( src + i_off, i_src, dst + i_off, i_dst, i_height );
+    }
+}
+
+/* mc_hc: centre half-sample plane. Each column keeps a sliding window of
+ * six horizontally filtered rows (unnormalised); the vertical 6-tap filter
+ * is applied to that window and the result is rounded with +512, shifted
+ * by 10 and clipped. */
+static inline void mc_hc_w4( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int i_col;
+
+    for( i_col = 0; i_col < 4; i_col++ )
+    {
+        uint8_t *p_in  = src + i_col;
+        uint8_t *p_out = dst + i_col;
+        int win[6];
+        int i, i_row;
+
+        /* prime the window with rows -2 .. +2 */
+        for( i = 0; i < 5; i++ )
+        {
+            win[i] = x264_tapfilter1( &p_in[(i - 2) * i_src] );
+        }
+
+        for( i_row = 0; i_row < i_height; i_row++ )
+        {
+            int i_acc;
+
+            win[5] = x264_tapfilter1( &p_in[3 * i_src] );
+
+            i_acc  = win[0] + win[5];
+            i_acc -= 5 * ( win[1] + win[4] );
+            i_acc += 20 * ( win[2] + win[3] );
+            *p_out = x264_mc_clip1( ( i_acc + 512 ) >> 10 );
+
+            /* next line: drop the oldest row from the window */
+            for( i = 0; i < 5; i++ )
+            {
+                win[i] = win[i + 1];
+            }
+            p_in  += i_src;
+            p_out += i_dst;
+        }
+    }
+}
+/* 8-wide centre half-sample filter: two adjacent 4-wide columns. */
+static inline void mc_hc_w8( uint8_t *src, int i_src,
+                             uint8_t *dst, int i_dst, int i_height )
+{
+    int i_off;
+
+    /* TODO: optimize */
+    for( i_off = 0; i_off < 8; i_off += 4 )
+    {
+        mc_hc_w4( src + i_off, i_src, dst + i_off, i_dst, i_height );
+    }
+}
+/* 16-wide centre half-sample filter: left 8 columns, then right 8. */
+static inline void mc_hc_w16( uint8_t *src, int i_src,
+                              uint8_t *dst, int i_dst, int i_height )
+{
+    int i_off;
+    for( i_off = 0; i_off < 16; i_off += 8 )
+    {
+        mc_hc_w8( src + i_off, i_src, dst + i_off, i_dst, i_height );
+    }
+}
+
+/* mc I+H: quarter-sample positions on the integer row.
+ * xy10 = average of the integer sample and the horizontal half-sample. */
+static void mc_xy10_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*4];   /* horizontal half-sample plane, stride 4 */
+
+    mc_hh_w4( src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src, i_src, half_h, 4, i_height );
+}
+static void mc_xy10_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*8];   /* horizontal half-sample plane, stride 8 */
+
+    mc_hh_w8( src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src, i_src, half_h, 8, i_height );
+}
+static void mc_xy10_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*16];  /* horizontal half-sample plane, stride 16 */
+
+    mc_hh_w16( src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src, i_src, half_h, 16, i_height );
+}
+
+/* xy30 = average of the horizontal half-sample and the integer sample one
+ * pixel to the right. */
+static void mc_xy30_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*4];   /* horizontal half-sample plane, stride 4 */
+
+    mc_hh_w4( src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src + 1, i_src, half_h, 4, i_height );
+}
+static void mc_xy30_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*8];   /* horizontal half-sample plane, stride 8 */
+
+    mc_hh_w8( src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src + 1, i_src, half_h, 8, i_height );
+}
+static void mc_xy30_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_h[16*16];  /* horizontal half-sample plane, stride 16 */
+
+    mc_hh_w16( src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src + 1, i_src, half_h, 16, i_height );
+}
+
+/* mc I+V: quarter-sample positions on the integer column.
+ * xy01 = average of the integer sample and the vertical half-sample. */
+static void mc_xy01_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];   /* vertical half-sample plane, stride 4 */
+
+    mc_hv_w4( src, i_src, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src, i_src, half_v, 4, i_height );
+}
+static void mc_xy01_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];   /* vertical half-sample plane, stride 8 */
+
+    mc_hv_w8( src, i_src, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src, i_src, half_v, 8, i_height );
+}
+static void mc_xy01_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];  /* vertical half-sample plane, stride 16 */
+
+    mc_hv_w16( src, i_src, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src, i_src, half_v, 16, i_height );
+}
+
+/* xy03 = average of the vertical half-sample and the integer sample one
+ * row below. */
+static void mc_xy03_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];   /* vertical half-sample plane, stride 4 */
+
+    mc_hv_w4( src, i_src, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst, src + i_src, i_src, half_v, 4, i_height );
+}
+static void mc_xy03_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];   /* vertical half-sample plane, stride 8 */
+
+    mc_hv_w8( src, i_src, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst, src + i_src, i_src, half_v, 8, i_height );
+}
+static void mc_xy03_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];  /* vertical half-sample plane, stride 16 */
+
+    mc_hv_w16( src, i_src, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst, src + i_src, i_src, half_v, 16, i_height );
+}
+
+/* H+V: diagonal quarter-sample positions, each the average of one vertical
+ * and one horizontal half-sample plane.
+ * xy11 = average of vertical(src) and horizontal(src). */
+static void mc_xy11_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];
+    uint8_t half_h[16*4];
+
+    mc_hv_w4( src, i_src, half_v, 4, i_height );
+    mc_hh_w4( src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy11_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];
+    uint8_t half_h[16*8];
+
+    mc_hv_w8( src, i_src, half_v, 8, i_height );
+    mc_hh_w8( src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy11_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src, i_src, half_v, 16, i_height );
+    mc_hh_w16( src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_v, 16, half_h, 16, i_height );
+}
+
+/* xy31 = average of vertical(src + 1) and horizontal(src): the vertical
+ * plane is taken one pixel to the right. */
+static void mc_xy31_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];
+    uint8_t half_h[16*4];
+
+    mc_hv_w4( src + 1, i_src, half_v, 4, i_height );
+    mc_hh_w4( src,     i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy31_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];
+    uint8_t half_h[16*8];
+
+    mc_hv_w8( src + 1, i_src, half_v, 8, i_height );
+    mc_hh_w8( src,     i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy31_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src + 1, i_src, half_v, 16, i_height );
+    mc_hh_w16( src,     i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_v, 16, half_h, 16, i_height );
+}
+
+/* xy13 = average of vertical(src) and horizontal(src + i_src): the
+ * horizontal plane is taken one row below. */
+static void mc_xy13_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];
+    uint8_t half_h[16*4];
+
+    mc_hv_w4( src,         i_src, half_v, 4, i_height );
+    mc_hh_w4( src + i_src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy13_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];
+    uint8_t half_h[16*8];
+
+    mc_hv_w8( src,         i_src, half_v, 8, i_height );
+    mc_hh_w8( src + i_src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy13_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src,         i_src, half_v, 16, i_height );
+    mc_hh_w16( src + i_src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_v, 16, half_h, 16, i_height );
+}
+
+/* xy33 = average of vertical(src + 1) and horizontal(src + i_src). */
+static void mc_xy33_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*4];
+    uint8_t half_h[16*4];
+
+    mc_hv_w4( src + 1,     i_src, half_v, 4, i_height );
+    mc_hh_w4( src + i_src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_v, 4, half_h, 4, i_height );
+}
+static void mc_xy33_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*8];
+    uint8_t half_h[16*8];
+
+    mc_hv_w8( src + 1,     i_src, half_v, 8, i_height );
+    mc_hh_w8( src + i_src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_v, 8, half_h, 8, i_height );
+}
+static void mc_xy33_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_v[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hv_w16( src + 1,     i_src, half_v, 16, i_height );
+    mc_hh_w16( src + i_src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_v, 16, half_h, 16, i_height );
+}
+
+/* xy21 = average of the centre half-sample plane and horizontal(src). */
+static void mc_xy21_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*4];
+    uint8_t half_h[16*4];
+
+    mc_hc_w4( src, i_src, half_c, 4, i_height );
+    mc_hh_w4( src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_c, 4, half_h, 4, i_height );
+}
+static void mc_xy21_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*8];
+    uint8_t half_h[16*8];
+
+    mc_hc_w8( src, i_src, half_c, 8, i_height );
+    mc_hh_w8( src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_c, 8, half_h, 8, i_height );
+}
+static void mc_xy21_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hc_w16( src, i_src, half_c, 16, i_height );
+    mc_hh_w16( src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_c, 16, half_h, 16, i_height );
+}
+
+/* xy12 = average of the centre half-sample plane and vertical(src). */
+static void mc_xy12_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*4];
+    uint8_t half_v[16*4];
+
+    mc_hc_w4( src, i_src, half_c, 4, i_height );
+    mc_hv_w4( src, i_src, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_c, 4, half_v, 4, i_height );
+}
+static void mc_xy12_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*8];
+    uint8_t half_v[16*8];
+
+    mc_hc_w8( src, i_src, half_c, 8, i_height );
+    mc_hv_w8( src, i_src, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_c, 8, half_v, 8, i_height );
+}
+static void mc_xy12_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_v[16*16];
+
+    mc_hc_w16( src, i_src, half_c, 16, i_height );
+    mc_hv_w16( src, i_src, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_c, 16, half_v, 16, i_height );
+}
+
+/* xy32 = average of the centre half-sample plane and vertical(src + 1). */
+static void mc_xy32_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*4];
+    uint8_t half_v[16*4];
+
+    mc_hc_w4( src,     i_src, half_c, 4, i_height );
+    mc_hv_w4( src + 1, i_src, half_v, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_c, 4, half_v, 4, i_height );
+}
+static void mc_xy32_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*8];
+    uint8_t half_v[16*8];
+
+    mc_hc_w8( src,     i_src, half_c, 8, i_height );
+    mc_hv_w8( src + 1, i_src, half_v, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_c, 8, half_v, 8, i_height );
+}
+static void mc_xy32_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_v[16*16];
+
+    mc_hc_w16( src,     i_src, half_c, 16, i_height );
+    mc_hv_w16( src + 1, i_src, half_v, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_c, 16, half_v, 16, i_height );
+}
+
+/* xy23 = average of the centre half-sample plane and
+ * horizontal(src + i_src). */
+static void mc_xy23_w4( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*4];
+    uint8_t half_h[16*4];
+
+    mc_hc_w4( src,         i_src, half_c, 4, i_height );
+    mc_hh_w4( src + i_src, i_src, half_h, 4, i_height );
+    pixel_avg_w4( dst, i_dst, half_c, 4, half_h, 4, i_height );
+}
+static void mc_xy23_w8( uint8_t *src, int i_src,
+                        uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*8];
+    uint8_t half_h[16*8];
+
+    mc_hc_w8( src,         i_src, half_c, 8, i_height );
+    mc_hh_w8( src + i_src, i_src, half_h, 8, i_height );
+    pixel_avg_w8( dst, i_dst, half_c, 8, half_h, 8, i_height );
+}
+static void mc_xy23_w16( uint8_t *src, int i_src,
+                         uint8_t *dst, int i_dst, int i_height )
+{
+    uint8_t half_c[16*16];
+    uint8_t half_h[16*16];
+
+    mc_hc_w16( src,         i_src, half_c, 16, i_height );
+    mc_hh_w16( src + i_src, i_src, half_h, 16, i_height );
+    pixel_avg_w16( dst, i_dst, half_c, 16, half_h, 16, i_height );
+}
+
+/* Luma motion compensation entry point (x264_mc_function_t).
+ * The motion vector is split into an integer part (mv >> 2), which moves
+ * src, and a fractional part (mv & 3), which selects the interpolation
+ * routine from the table. Widths other than 4, 8 and 16 do nothing. */
+static void motion_compensation_luma( uint8_t *src, int i_src,
+                                      uint8_t *dst, int i_dst,
+                                      int mvx,int mvy,
+                                      int i_width, int i_height )
+{
+    static const pf_mc_t pf_mc[3][4][4] =    /*XXX [dqy][dqx] */
+    {
+        {   /* width 4 */
+            { mc_copy_w4,  mc_xy10_w4,    mc_hh_w4,      mc_xy30_w4 },
+            { mc_xy01_w4,  mc_xy11_w4,    mc_xy21_w4,    mc_xy31_w4 },
+            { mc_hv_w4,    mc_xy12_w4,    mc_hc_w4,      mc_xy32_w4 },
+            { mc_xy03_w4,  mc_xy13_w4,    mc_xy23_w4,    mc_xy33_w4 },
+        },
+        {   /* width 8 */
+            { mc_copy_w8,  mc_xy10_w8,    mc_hh_w8,      mc_xy30_w8 },
+            { mc_xy01_w8,  mc_xy11_w8,    mc_xy21_w8,    mc_xy31_w8 },
+            { mc_hv_w8,    mc_xy12_w8,    mc_hc_w8,      mc_xy32_w8 },
+            { mc_xy03_w8,  mc_xy13_w8,    mc_xy23_w8,    mc_xy33_w8 },
+        },
+        {   /* width 16 */
+            { mc_copy_w16,  mc_xy10_w16,    mc_hh_w16,      mc_xy30_w16 },
+            { mc_xy01_w16,  mc_xy11_w16,    mc_xy21_w16,    mc_xy31_w16 },
+            { mc_hv_w16,    mc_xy12_w16,    mc_hc_w16,      mc_xy32_w16 },
+            { mc_xy03_w16,  mc_xy13_w16,    mc_xy23_w16,    mc_xy33_w16 },
+        }
+    };
+    const int i_frac_x = mvx&0x03;
+    const int i_frac_y = mvy&0x03;
+    int i_size;
+
+    switch( i_width )
+    {
+        case 4:  i_size = 0; break;
+        case 8:  i_size = 1; break;
+        case 16: i_size = 2; break;
+        default: return;    /* unsupported width: nothing to do */
+    }
+
+    src += (mvy >> 2) * i_src + (mvx >> 2);
+    pf_mc[i_size][i_frac_y][i_frac_x]( src, i_src, dst, i_dst, i_height );
+}
+
+/* Install the AltiVec motion compensation routines into pf.
+ * Only the luma entry is replaced; pf[MC_CHROMA] is left as the caller
+ * set it. */
+void x264_mc_altivec_init( x264_mc_function_t pf[2] )
+{
+    pf[MC_LUMA] = motion_compensation_luma;
+}
--- a/core/ppc/mc.h
+++ b/core/ppc/mc.h
@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: mc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PPC_MC_H
+#define _PPC_MC_H 1
+
+/* Registers the AltiVec motion compensation routines into pf.
+ * x264_mc_function_t must already be declared by the including file. */
+void x264_mc_altivec_init( x264_mc_function_t pf[2] );
+
+#endif
--- a/core/ppc/pixel.c
+++ b/core/ppc/pixel.c
@ -0,0 +1,215 @@
+/*****************************************************************************
+ * pixel.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "../pixel.h"
+#include "pixel.h"
+#include "ppccommon.h"
+
+/* sad routines */
+/* PIXEL_SAD_ALTIVEC generates a static function
+ *     int name( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+ * returning the sum of absolute differences between two blocks.
+ *   name : name of the generated function
+ *   lx   : block width; selects the LOAD_<lx> macro used for each row
+ *   ly   : number of rows
+ *   a    : suffix of the final reduction intrinsic (vec_sum##a)
+ *   b    : index of the 32-bit element holding the result after that reduction
+ * Each row's |pix1 - pix2| is computed as max - min on unsigned bytes and
+ * accumulated by vec_sum4s into four 32-bit partial sums.
+ * The instantiations below pair vec_sums with element 3 (16 wide) and
+ * vec_sum2s with element 1 (8 wide), so that for 8-wide blocks only the
+ * partial sums of the first 8 bytes of each loaded row are counted. */
+#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
+static int name( uint8_t *pix1, int i_pix1,            \
+                 uint8_t *pix2, int i_pix2 )           \
+{                                                      \
+    int y;                                             \
+    DECLARE_ALIGNED( int, sum, 16 );                   \
+                                                       \
+    LOAD_ZERO;                                         \
+    vector_u8_t  pix1v, pix2v;                         \
+    vector_s32_t sumv = zero_s32;                      \
+    for( y = 0; y < ly; y++ )                          \
+    {                                                  \
+        LOAD_##lx( pix1, pix1v );                      \
+        LOAD_##lx( pix2, pix2v );                      \
+        sumv = (vector_s32_t) vec_sum4s(               \
+                   vec_sub( vec_max( pix1v, pix2v ),   \
+                            vec_min( pix1v, pix2v ) ), \
+                   (vector_u32_t) sumv );              \
+        pix1 += i_pix1;                                \
+        pix2 += i_pix2;                                \
+    }                                                  \
+    sumv = vec_sum##a( sumv, zero_s32 );               \
+    vec_ste( vec_splat( sumv, b ), 0, &sum );          \
+    return sum;                                        \
+}
+
+/* SAD functions for the four block sizes registered in x264_pixel_altivec_init */
+PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
+PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
+PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
+PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
+
+/* satd routines */
+/* Hadamard-transformed difference cost of an 8x8 block.
+ *   pix1, i_pix1 : first block and its stride in bytes
+ *   pix2, i_pix2 : second block and its stride in bytes
+ * Returns half the sum of the absolute transformed differences. */
+static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
+                                          uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    LOAD_ZERO;
+    vector_s32_t satdv = zero_s32;
+    vector_u8_t  pix1u8v, pix2u8v;
+    vector_s16_t pix1s16v, pix2s16v;
+    vector_s16_t diffv[8];
+    vector_s16_t tmpv[8];
+    /* scratch vectors required by the HADAMAR macro */
+    vector_s16_t s01v, s23v, d01v, d23v;
+
+    /* Diff 8x8: one vector of eight 16-bit differences per row */
+    for( i = 0; i < 8; i++ )
+    {
+        LOAD_8( pix1, pix1u8v );
+        LOAD_8( pix2, pix2u8v );
+
+        /* u8 -> s16 conversion */
+        pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
+        pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+
+        diffv[i] = vec_sub( pix1s16v, pix2s16v );
+
+        pix1 += i_pix1;
+        pix2 += i_pix2;
+    }
+
+    /* First Hadamard pass (labelled "H" by the author): 4-point butterflies
+       across vectors 0-3 and across vectors 4-7 */
+    HADAMAR( &diffv[0], &tmpv[0] );
+    HADAMAR( &diffv[4], &tmpv[4] );
+
+    /* Transpose: result lands in diffv, tmpv is overwritten as scratch */
+    TRANSPOSE8x8( tmpv, diffv );
+
+    /* Second Hadamard pass (labelled "V" by the author), on the transposed data */
+    HADAMAR( &diffv[0], &tmpv[0] );
+    HADAMAR( &diffv[4], &tmpv[4] );
+
+    /* Sum of absolute values */
+    for( i = 0; i < 8; i++ )
+    {
+        satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
+    }
+    /* reduce the four partial sums; the total is then read from element 3 */
+    satdv = vec_sums( satdv, zero_s32 );
+
+    /* Done */
+    vec_ste( vec_splat( satdv, 3 ), 0, &i_satd );
+    return i_satd / 2;
+}
+
+/* SATD of a 16x8 block: the 8x8 cost of the left half plus that of the
+ * right half (8 pixels further along each row). */
+static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
+                                    uint8_t *pix2, int i_pix2 )
+{
+    int i_cost;
+
+    i_cost  = pixel_satd_8x8_altivec( pix1,     i_pix1, pix2,     i_pix2 );
+    i_cost += pixel_satd_8x8_altivec( pix1 + 8, i_pix1, pix2 + 8, i_pix2 );
+
+    return i_cost;
+}
+/* SATD of an 8x16 block: the 8x8 cost of the upper half plus that of the
+ * lower half (8 rows further down in each picture). */
+static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
+                                    uint8_t *pix2, int i_pix2 )
+{
+    int i_cost;
+
+    i_cost  = pixel_satd_8x8_altivec( pix1, i_pix1, pix2, i_pix2 );
+    i_cost += pixel_satd_8x8_altivec( pix1 + 8 * i_pix1, i_pix1,
+                                      pix2 + 8 * i_pix2, i_pix2 );
+
+    return i_cost;
+}
+/* SATD of a 16x16 block: sum of the 8x8 costs of its four quadrants. */
+static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
+                                     uint8_t *pix2, int i_pix2 )
+{
+    uint8_t *p_low1 = pix1 + 8 * i_pix1;   /* lower half of block 1 */
+    uint8_t *p_low2 = pix2 + 8 * i_pix2;   /* lower half of block 2 */
+    int i_cost;
+
+    i_cost  = pixel_satd_8x8_altivec( pix1,       i_pix1, pix2,       i_pix2 );
+    i_cost += pixel_satd_8x8_altivec( pix1 + 8,   i_pix1, pix2 + 8,   i_pix2 );
+    i_cost += pixel_satd_8x8_altivec( p_low1,     i_pix1, p_low2,     i_pix2 );
+    i_cost += pixel_satd_8x8_altivec( p_low1 + 8, i_pix1, p_low2 + 8, i_pix2 );
+
+    return i_cost;
+}
+
+/* Hadamard-transformed difference cost of a 4x4 block.
+ *   pix1, i_pix1 : first block and its stride in bytes
+ *   pix2, i_pix2 : second block and its stride in bytes
+ * Returns half the sum of the absolute transformed differences. */
+static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
+                                          uint8_t *pix2, int i_pix2 )
+{
+    int i;
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    LOAD_ZERO;
+    vector_s32_t satdv = zero_s32;
+    vector_u8_t  pix1u8v, pix2u8v;
+    vector_s16_t pix1s16v, pix2s16v;
+    vector_s16_t diffv[4];
+    vector_s16_t tmpv[4];
+    /* scratch vectors required by the HADAMAR macro */
+    vector_s16_t s01v, s23v, d01v, d23v;
+
+    /* Diff 4x4: four rows are loaded, one vector of differences per row */
+    for( i = 0; i < 4; i++ )
+    {
+        LOAD_4( pix1, pix1u8v );
+        LOAD_4( pix2, pix2u8v );
+
+        /* u8 -> s16 conversion */
+        pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
+        pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+
+        diffv[i] = vec_sub( pix1s16v, pix2s16v );
+
+        pix1 += i_pix1;
+        pix2 += i_pix2;
+    }
+
+    /* First Hadamard pass (labelled "H" by the author): butterflies across
+       the four row vectors */
+    HADAMAR( diffv, tmpv );
+
+    /* Transpose: result lands in diffv, tmpv is overwritten as scratch */
+    TRANSPOSE4x4( tmpv, diffv );
+
+    /* Second Hadamard pass (labelled "V" by the author), on the transposed data */
+    HADAMAR( diffv, tmpv );
+
+    /* Sum of absolute values */
+    for( i = 0; i < 4; i++ )
+    {
+        satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
+    }
+    /* only the first two partial sums are reduced; the result is then read
+       from element 1 */
+    satdv = vec_sum2s( satdv, zero_s32 );
+
+    /* Done */
+    vec_ste( vec_splat( satdv, 1 ), 0, &i_satd );
+    return i_satd / 2;
+}
+
+/****************************************************************************
+ * x264_pixel_altivec_init: install the AltiVec SAD/SATD routines
+ ****************************************************************************/
+void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
+{
+    /* 16x16 */
+    pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
+    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
+
+    /* 8x16 */
+    pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
+    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
+
+    /* 16x8 */
+    pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
+    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
+
+    /* 8x8 */
+    pixf->sad[PIXEL_8x8]    = pixel_sad_8x8_altivec;
+    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
+
+    /* 4x4: only a SATD routine is provided here */
+    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;
+}
--- a/core/ppc/pixel.h
+++ b/core/ppc/pixel.h
@ -0,0 +1,29 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: pixel.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PPC_PIXEL_H
+#define _PPC_PIXEL_H 1
+
+/* Registers the AltiVec SAD/SATD routines into *pixf.
+ * x264_pixel_function_t must already be declared by the including file. */
+void x264_pixel_altivec_init( x264_pixel_function_t *pixf );
+
+#endif
--- a/core/ppc/ppccommon.h
+++ b/core/ppc/ppccommon.h
@ -0,0 +1,158 @@
+/*****************************************************************************
+ * ppccommon.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ppccommon.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Eric Petit <titer@m0k.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* Handy */
+/* Short names for the AltiVec vector types used by the ppc code */
+#define vector_u8_t  vector unsigned char
+#define vector_s16_t vector signed short
+#define vector_u32_t vector unsigned int
+#define vector_s32_t vector signed int
+
+/* LOAD_ZERO declares a local all-zero vector named "zero"; zero_u8, zero_s16
+   and zero_s32 are casts of that variable, so they (and every macro in this
+   file that uses them) require LOAD_ZERO earlier in the same scope. */
+#define LOAD_ZERO    vector_s32_t zero = vec_splat_s32( 0 )
+#define zero_u8      (vector_u8_t)  zero
+#define zero_s16     (vector_s16_t) zero
+#define zero_s32     (vector_s32_t) zero
+
+/* Replaces a, in place, by the interleave (vec_mergeh) of zero bytes with
+   the bytes of a, reinterpreted as a vector of 16-bit values.
+   Needs LOAD_ZERO in scope. */
+#define CONVERT_U8_TO_S16( a ) \
+    a = (vector_s16_t) vec_mergeh( zero_u8, (vector_u8_t) a )
+
+/* Macros to load aligned or unaligned data without risking buffer
+   overflows. */
+/* LOAD_16( p, v ): loads the 16 bytes starting at p into v.
+   When p is not 16-byte aligned, the two aligned quadwords around it are
+   loaded and combined with the vec_lvsl permutation; otherwise a single
+   vec_ld is used.
+   Note: p is evaluated several times, and the macro expands to a bare
+   if/else statement (it is not wrapped in do { } while( 0 )). */
+#define LOAD_16( p, v )                                \
+    if( (int) p & 0xF )                                \
+    {                                                  \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
+                      vec_lvsl( 0, p ) );              \
+    }                                                  \
+    else                                               \
+    {                                                  \
+        v = vec_ld( 0, p );                            \
+    }
+
+/* LOAD_8( p, v ): loads the 8 bytes starting at p into the first 8 bytes of
+   v; the remaining bytes of v should not be relied upon.
+   - p aligned: a single vec_ld.
+   - offset within the quadword below 9: the 8 bytes lie entirely in the
+     first quadword, so the zero vector is used as second permute operand
+     instead of reading the following quadword.
+   - otherwise: both quadwords are loaded and combined.
+   Needs LOAD_ZERO in scope (uses "zero"); p is evaluated several times. */
+#define LOAD_8( p, v )                                             \
+    if( !( (int) p & 0xF ) )                                       \
+    {                                                              \
+        v = vec_ld( 0, p );                                        \
+    }                                                              \
+    else if( ( (int) p & 0xF ) < 9 )                               \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
+                      vec_lvsl( 0, p ) );                          \
+    }                                                              \
+    else                                                           \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ),             \
+                      vec_lvsl( 0, p ) );                          \
+    }
+
+/* LOAD_4( p, v ): loads the 4 bytes starting at p into the first 4 bytes of
+   v; the remaining bytes of v should not be relied upon.
+   Same scheme as LOAD_8, except that the following quadword is only read
+   when the offset of p within its quadword is 13 or more.
+   Needs LOAD_ZERO in scope (uses "zero"); p is evaluated several times. */
+#define LOAD_4( p, v )                                             \
+    if( !( (int) p & 0xF ) )                                       \
+    {                                                              \
+        v = vec_ld( 0, p );                                        \
+    }                                                              \
+    else if( ( (int) p & 0xF ) < 13 )                              \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
+                      vec_lvsl( 0, p ) );                          \
+    }                                                              \
+    else                                                           \
+    {                                                              \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ),             \
+                      vec_lvsl( 0, p ) );                          \
+    }
+
+/* Store aligned or unaligned data */
+/* STORE_16( v, p ): stores the 16 bytes of v at p.
+   - p aligned: a single vec_st.
+   - p unaligned: the two aligned quadwords covering the destination are
+     read, the bytes of v are merged into them with a select mask built from
+     vec_lvsr, and both quadwords are written back (so the bytes around the
+     destination are rewritten with the values just read).
+   Side effect: in the unaligned case v itself is overwritten with its
+   rotated copy, so it must be an lvalue and must not be reused afterwards.
+   p is evaluated several times. */
+#define STORE_16( v, p )                              \
+    if( (int) p & 0xF )                               \
+    {                                                 \
+        vector unsigned char tmp1, tmp2;              \
+        vector unsigned char align, mask;             \
+        tmp1 = vec_ld( 0, p );                        \
+        tmp2 = vec_ld( 16, p );                       \
+        align = vec_lvsr( 0, p );                     \
+        mask = vec_perm( (vector unsigned char) (0),  \
+                         (vector unsigned char) (-1), \
+                         align);                      \
+        v = vec_perm( v, v, align);                   \
+        tmp1 = vec_sel( tmp1, v, mask );              \
+        tmp2 = vec_sel( v, tmp2, mask );              \
+        vec_st( tmp1, 0, p );                         \
+        vec_st( tmp2, 16, p );                        \
+    }                                                 \
+    else                                              \
+    {                                                 \
+        vec_st( v, 0, p );                            \
+    }
+
+/* Transpose 8x8 (vector_s16_t [8]) */
+/* a is the input, b receives the result. The transpose is built from three
+   rounds of vec_mergeh/vec_mergel interleaving (a -> b -> a -> b), so a is
+   used as scratch and its original contents are destroyed.
+   a and b must be distinct arrays; the arguments are not parenthesised, so
+   pass plain array names. */
+#define TRANSPOSE8x8( a, b )           \
+    b[0] = vec_mergeh( a[0], a[4] ); \
+    b[1] = vec_mergel( a[0], a[4] ); \
+    b[2] = vec_mergeh( a[1], a[5] ); \
+    b[3] = vec_mergel( a[1], a[5] ); \
+    b[4] = vec_mergeh( a[2], a[6] ); \
+    b[5] = vec_mergel( a[2], a[6] ); \
+    b[6] = vec_mergeh( a[3], a[7] ); \
+    b[7] = vec_mergel( a[3], a[7] ); \
+    a[0] = vec_mergeh( b[0], b[4] ); \
+    a[1] = vec_mergel( b[0], b[4] ); \
+    a[2] = vec_mergeh( b[1], b[5] ); \
+    a[3] = vec_mergel( b[1], b[5] ); \
+    a[4] = vec_mergeh( b[2], b[6] ); \
+    a[5] = vec_mergel( b[2], b[6] ); \
+    a[6] = vec_mergeh( b[3], b[7] ); \
+    a[7] = vec_mergel( b[3], b[7] ); \
+    b[0] = vec_mergeh( a[0], a[4] ); \
+    b[1] = vec_mergel( a[0], a[4] ); \
+    b[2] = vec_mergeh( a[1], a[5] ); \
+    b[3] = vec_mergel( a[1], a[5] ); \
+    b[4] = vec_mergeh( a[2], a[6] ); \
+    b[5] = vec_mergel( a[2], a[6] ); \
+    b[6] = vec_mergeh( a[3], a[7] ); \
+    b[7] = vec_mergel( a[3], a[7] );
+
+/* Transpose 4x4 (vector_s16_t [4]) */
+/* a is the input, b receives the result; a is overwritten as scratch.
+   The first round interleaves each input vector with zero_s16, so LOAD_ZERO
+   must be in scope; two further rounds of vec_mergeh/vec_mergel follow
+   (b -> a -> b). a and b must be distinct arrays. */
+#define TRANSPOSE4x4( a, b ) \
+    (b)[0] = vec_mergeh( (a)[0], zero_s16 ); \
+    (b)[1] = vec_mergeh( (a)[1], zero_s16 ); \
+    (b)[2] = vec_mergeh( (a)[2], zero_s16 ); \
+    (b)[3] = vec_mergeh( (a)[3], zero_s16 ); \
+    (a)[0] = vec_mergeh( (b)[0], (b)[2] );   \
+    (a)[1] = vec_mergel( (b)[0], (b)[2] );   \
+    (a)[2] = vec_mergeh( (b)[1], (b)[3] );   \
+    (a)[3] = vec_mergel( (b)[1], (b)[3] );   \
+    (b)[0] = vec_mergeh( (a)[0], (a)[2] );   \
+    (b)[1] = vec_mergel( (a)[0], (a)[2] );   \
+    (b)[2] = vec_mergeh( (a)[1], (a)[3] );   \
+    (b)[3] = vec_mergel( (a)[1], (a)[3] );
+
+/* Hadamard (vector_s16_t [4]) */
+/* 4-point butterfly applied element-wise across the four vectors a[0..3]:
+     b[0] = a0 + a1 + a2 + a3      b[1] = a0 + a1 - a2 - a3
+     b[2] = a0 - a1 - a2 + a3      b[3] = a0 - a1 + a2 - a3
+   The caller must declare the scratch vectors s01v, s23v, d01v and d23v
+   (vector_s16_t) before using this macro. a is read before b is written. */
+#define HADAMAR( a, b ) \
+    s01v   = vec_add( (a)[0], (a)[1] ); \
+    s23v   = vec_add( (a)[2], (a)[3] ); \
+    d01v   = vec_sub( (a)[0], (a)[1] ); \
+    d23v   = vec_sub( (a)[2], (a)[3] ); \
+    (b)[0] = vec_add( s01v, s23v );     \
+    (b)[1] = vec_sub( s01v, s23v );     \
+    (b)[2] = vec_sub( d01v, d23v );     \
+    (b)[3] = vec_add( d01v, d23v );
+
--- a/core/predict.c
+++ b/core/predict.c
@ -0,0 +1,697 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* XXX predict4x4 are inspired from ffmpeg h264 decoder
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "x264.h"
+#include "predict.h"
+
+#ifdef _MSC_VER
+#undef HAVE_MMXEXT  /* not finished now */
+#endif
+#ifdef HAVE_MMXEXT
+#   include "i386/predict.h"
+#endif
+
+/* Clamps a to the 8-bit pixel range: returns 0 for negative values, 255 for
+ * values above 255 and a itself otherwise.
+ * Written with plain comparisons so that it does not depend on the width of
+ * int nor on how the compiler shifts negative values. */
+static inline int clip_uint8( int a )
+{
+    if( a < 0 )
+        return 0;
+    if( a > 255 )
+        return 255;
+    return a;
+}
+
+/****************************************************************************
+ * 16x16 prediction for intra block DC, H, V, P
+ ****************************************************************************/
+/* 16x16 DC prediction: fills the block at src with the rounded mean of the
+ * 16 pixels in the row above it and the 16 pixels in the column left of it. */
+static void predict_16x16_dc( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top  = src - i_stride;
+    const uint8_t *p_left = src - 1;
+    int i_mean = 16;    /* rounding term for the division by 32 */
+    int x, y;
+
+    for( x = 0; x < 16; x++ )
+    {
+        i_mean += p_top[x] + p_left[x * i_stride];
+    }
+    i_mean >>= 5;
+
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = i_mean;
+        }
+    }
+}
+/* 16x16 DC prediction from the left neighbours only: fills the block with
+ * the rounded mean of the 16 pixels in the column left of it. */
+static void predict_16x16_dc_left( uint8_t *src, int i_stride )
+{
+    int i_mean = 8;     /* rounding term for the division by 16 */
+    int x, y;
+
+    for( y = 0; y < 16; y++ )
+    {
+        i_mean += src[y * i_stride - 1];
+    }
+    i_mean >>= 4;
+
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = i_mean;
+        }
+    }
+}
+/* 16x16 DC prediction from the top neighbours only: fills the block with
+ * the rounded mean of the 16 pixels in the row above it. */
+static void predict_16x16_dc_top( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    int i_mean = 8;     /* rounding term for the division by 16 */
+    int x, y;
+
+    for( x = 0; x < 16; x++ )
+    {
+        i_mean += p_top[x];
+    }
+    i_mean >>= 4;
+
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = i_mean;
+        }
+    }
+}
+/* 16x16 DC prediction without neighbours: fills the block with 128. */
+static void predict_16x16_dc_128( uint8_t *src, int i_stride )
+{
+    int x, y;
+
+    for( y = 0; y < 16; y++ )
+    {
+        uint8_t *p_row = &src[y * i_stride];
+
+        for( x = 0; x < 16; x++ )
+        {
+            p_row[x] = 128;
+        }
+    }
+}
+/* 16x16 horizontal prediction: every row is filled with the pixel
+ * immediately left of that row. */
+static void predict_16x16_h( uint8_t *src, int i_stride )
+{
+    int x, y;
+
+    for( y = 0; y < 16; y++ )
+    {
+        uint8_t *p_row = &src[y * i_stride];
+        const uint8_t i_left = p_row[-1];
+
+        for( x = 0; x < 16; x++ )
+        {
+            p_row[x] = i_left;
+        }
+    }
+}
+/* 16x16 vertical prediction: every column is filled with the pixel
+ * immediately above that column. */
+static void predict_16x16_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    int x, y;
+
+    for( y = 0; y < 16; y++, src += i_stride )
+    {
+        for( x = 0; x < 16; x++ )
+        {
+            src[x] = p_top[x];
+        }
+    }
+}
+/* 16x16 plane prediction: fits a linear gradient to the top row and left
+ * column (including the top-left corner pixel) and fills the block with it,
+ * clipping each sample with clip_uint8. */
+static void predict_16x16_p( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top  = src - i_stride;
+    const uint8_t *p_left = src - 1;
+    int i_h = 0;        /* weighted horizontal gradient of the top row */
+    int i_v = 0;        /* weighted vertical gradient of the left column */
+    int a, b, c;
+    int i_row;          /* value (scaled by 32) at x = 0 of the current row */
+    int k, x, y;
+
+    /* weights 1..8 applied to pairs symmetric around position 7 */
+    for( k = 1; k <= 8; k++ )
+    {
+        i_h += k * ( p_top[7 + k] - p_top[7 - k] );
+        i_v += k * ( p_left[(7 + k) * i_stride] - p_left[(7 - k) * i_stride] );
+    }
+
+    a = 16 * ( p_left[15 * i_stride] + p_top[15] );
+    b = ( 5 * i_h + 32 ) >> 6;
+    c = ( 5 * i_v + 32 ) >> 6;
+
+    i_row = a - b * 7 - c * 7 + 16;
+
+    for( y = 0; y < 16; y++, src += i_stride, i_row += c )
+    {
+        int i_val = i_row;
+
+        for( x = 0; x < 16; x++, i_val += b )
+        {
+            src[x] = clip_uint8( i_val >> 5 );
+        }
+    }
+}
+
+
+/****************************************************************************
+ * 8x8 prediction for intra chroma block DC, H, V, P
+ ****************************************************************************/
+/* 8x8 DC prediction without neighbours: fills the block with 128. */
+static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < 8; i_row++ )
+    {
+        uint8_t *p_row = &src[i_row * i_stride];
+
+        for( i_col = 0; i_col < 8; i_col++ )
+        {
+            p_row[i_col] = 128;
+        }
+    }
+}
+/* 8x8 DC prediction from the left neighbours only: rows 0-3 are filled with
+ * the rounded mean of left pixels 0-3, rows 4-7 with that of left pixels 4-7. */
+static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+{
+    int i_up = 2, i_down = 2;   /* sums, preloaded with the rounding term */
+    int k, x, y;
+
+    for( k = 0; k < 4; k++ )
+    {
+        i_up   += src[k * i_stride - 1];
+        i_down += src[(k + 4) * i_stride - 1];
+    }
+    i_up   >>= 2;
+    i_down >>= 2;
+
+    for( y = 0; y < 8; y++, src += i_stride )
+    {
+        const int i_dc = y < 4 ? i_up : i_down;
+
+        for( x = 0; x < 8; x++ )
+        {
+            src[x] = i_dc;
+        }
+    }
+}
+/* 8x8 DC prediction from the top neighbours only: columns 0-3 are filled
+ * with the rounded mean of top pixels 0-3, columns 4-7 with that of top
+ * pixels 4-7. */
+static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    int i_lft = 2, i_rgt = 2;   /* sums, preloaded with the rounding term */
+    int k, x, y;
+
+    for( k = 0; k < 4; k++ )
+    {
+        i_lft += p_top[k];
+        i_rgt += p_top[k + 4];
+    }
+    i_lft >>= 2;
+    i_rgt >>= 2;
+
+    for( y = 0; y < 8; y++, src += i_stride )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            src[x] = x < 4 ? i_lft : i_rgt;
+        }
+    }
+}
+/* 8x8 DC prediction, one DC value per 4x4 quadrant:
+ *   top-left     : mean of top pixels 0-3 and left pixels 0-3
+ *   top-right    : mean of top pixels 4-7
+ *   bottom-left  : mean of left pixels 4-7
+ *   bottom-right : mean of top pixels 4-7 and left pixels 4-7 */
+static void predict_8x8_dc( uint8_t *src, int i_stride )
+{
+    int i_top_l = 0, i_top_r = 0;       /* sums of top pixels 0-3 / 4-7 */
+    int i_left_u = 0, i_left_d = 0;     /* sums of left pixels 0-3 / 4-7 */
+    int dc[2][2];                       /* [lower half][right half] */
+    int k, x, y;
+
+    for( k = 0; k < 4; k++ )
+    {
+        i_top_l  += src[k - i_stride];
+        i_top_r  += src[k + 4 - i_stride];
+        i_left_u += src[k * i_stride - 1];
+        i_left_d += src[(k + 4) * i_stride - 1];
+    }
+
+    dc[0][0] = ( i_top_l + i_left_u + 4 ) >> 3;
+    dc[0][1] = ( i_top_r + 2 ) >> 2;
+    dc[1][0] = ( i_left_d + 2 ) >> 2;
+    dc[1][1] = ( i_top_r + i_left_d + 4 ) >> 3;
+
+    for( y = 0; y < 8; y++, src += i_stride )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            src[x] = dc[y >> 2][x >> 2];
+        }
+    }
+}
+
+/* 8x8 horizontal prediction: every row is filled with the pixel immediately
+ * left of that row. */
+static void predict_8x8_h( uint8_t *src, int i_stride )
+{
+    int x, y;
+
+    for( y = 0; y < 8; y++ )
+    {
+        uint8_t *p_row = &src[y * i_stride];
+        const uint8_t i_left = p_row[-1];
+
+        for( x = 0; x < 8; x++ )
+        {
+            p_row[x] = i_left;
+        }
+    }
+}
+/* 8x8 vertical prediction: every column is filled with the pixel immediately
+ * above that column. */
+static void predict_8x8_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    int x, y;
+
+    for( y = 0; y < 8; y++, src += i_stride )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            src[x] = p_top[x];
+        }
+    }
+}
+
+/* 8x8 plane prediction: fits a linear gradient to the top row and left
+ * column (including the top-left corner pixel) and fills the block with it,
+ * clipping each sample with clip_uint8. */
+static void predict_8x8_p( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top  = src - i_stride;
+    const uint8_t *p_left = src - 1;
+    int i_h = 0;        /* weighted horizontal gradient of the top row */
+    int i_v = 0;        /* weighted vertical gradient of the left column */
+    int a, b, c;
+    int i_row;          /* value (scaled by 32) at x = 0 of the current row */
+    int k, x, y;
+
+    /* weights 1..4 applied to pairs symmetric around position 3 */
+    for( k = 1; k <= 4; k++ )
+    {
+        i_h += k * ( p_top[3 + k] - p_top[3 - k] );
+        i_v += k * ( p_left[(3 + k) * i_stride] - p_left[(3 - k) * i_stride] );
+    }
+
+    a = 16 * ( p_left[7 * i_stride] + p_top[7] );
+    b = ( 17 * i_h + 16 ) >> 5;
+    c = ( 17 * i_v + 16 ) >> 5;
+    i_row = a -3*b -3*c + 16;
+
+    for( y = 0; y < 8; y++, src += i_stride, i_row += c )
+    {
+        int i_val = i_row;
+
+        for( x = 0; x < 8; x++, i_val += b )
+        {
+            src[x] = clip_uint8( i_val >> 5 );
+        }
+    }
+}
+
+/****************************************************************************
+ * 4x4 prediction for intra luma block DC, H, V, P
+ ****************************************************************************/
+/* 4x4 DC prediction without neighbours: fills the block with 128. */
+static void predict_4x4_dc_128( uint8_t *src, int i_stride )
+{
+    int i_row, i_col;
+
+    for( i_row = 0; i_row < 4; i_row++ )
+    {
+        uint8_t *p_row = &src[i_row * i_stride];
+
+        for( i_col = 0; i_col < 4; i_col++ )
+        {
+            p_row[i_col] = 128;
+        }
+    }
+}
+/* 4x4 DC prediction from the left neighbours only: fills the block with the
+ * rounded mean of the 4 pixels in the column left of it. */
+static void predict_4x4_dc_left( uint8_t *src, int i_stride )
+{
+    int i_mean = 2;     /* rounding term for the division by 4 */
+    int x, y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        i_mean += src[y * i_stride - 1];
+    }
+    i_mean >>= 2;
+
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[x] = i_mean;
+        }
+    }
+}
+/* 4x4 DC prediction from the top neighbours only: fills the block with the
+ * rounded mean of the 4 pixels in the row above it. */
+static void predict_4x4_dc_top( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    int i_mean = 2;     /* rounding term for the division by 4 */
+    int x, y;
+
+    for( x = 0; x < 4; x++ )
+    {
+        i_mean += p_top[x];
+    }
+    i_mean >>= 2;
+
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[x] = i_mean;
+        }
+    }
+}
+/* 4x4 DC prediction: fills the block with the rounded mean of the 4 pixels
+ * above it and the 4 pixels left of it. */
+static void predict_4x4_dc( uint8_t *src, int i_stride )
+{
+    int i_mean = 4;     /* rounding term for the division by 8 */
+    int x, y;
+
+    for( x = 0; x < 4; x++ )
+    {
+        i_mean += src[x - i_stride] + src[x * i_stride - 1];
+    }
+    i_mean >>= 3;
+
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[x] = i_mean;
+        }
+    }
+}
+/* 4x4 horizontal prediction: every row is filled with the pixel immediately
+ * left of that row. */
+static void predict_4x4_h( uint8_t *src, int i_stride )
+{
+    int x, y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        uint8_t *p_row = &src[y * i_stride];
+        const uint8_t i_left = p_row[-1];
+
+        for( x = 0; x < 4; x++ )
+        {
+            p_row[x] = i_left;
+        }
+    }
+}
+/* 4x4 vertical prediction: every column is filled with the pixel immediately
+ * above that column. */
+static void predict_4x4_v( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    int x, y;
+
+    for( y = 0; y < 4; y++, src += i_stride )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[x] = p_top[x];
+        }
+    }
+}
+
+/* Helper macros for the directional 4x4 predictors below. Each one declares
+ * four const int locals read from the neighbours of the block; they expect
+ * "src" and "i_stride" to be in scope and must appear among the declarations
+ * of the function, before src is written. */
+
+/* l0..l3: the four pixels in the column left of the block, top to bottom */
+#define PREDICT_4x4_LOAD_LEFT \
+    const int l0 = src[-1+0*i_stride];   \
+    const int l1 = src[-1+1*i_stride];   \
+    const int l2 = src[-1+2*i_stride];   \
+    const int l3 = src[-1+3*i_stride];
+
+/* t0..t3: the four pixels in the row above the block, left to right */
+#define PREDICT_4x4_LOAD_TOP \
+    const int t0 = src[0-1*i_stride];   \
+    const int t1 = src[1-1*i_stride];   \
+    const int t2 = src[2-1*i_stride];   \
+    const int t3 = src[3-1*i_stride];
+
+/* t4..t7: the next four pixels of that same row, right of the block */
+#define PREDICT_4x4_LOAD_TOP_RIGHT \
+    const int t4 = src[4-1*i_stride];   \
+    const int t5 = src[5-1*i_stride];   \
+    const int t6 = src[6-1*i_stride];   \
+    const int t7 = src[7-1*i_stride];
+
+
+/* 4x4 diagonal down-left prediction: every pixel on the anti-diagonal
+ * x + y = d receives the same 3-tap filtered value of the top / top-right
+ * neighbours t0..t7. */
+static void predict_4x4_ddl( uint8_t *src, int i_stride )
+{
+    PREDICT_4x4_LOAD_TOP
+    PREDICT_4x4_LOAD_TOP_RIGHT
+    int i_diag[7];      /* value for each x + y in 0..6 */
+    int x, y;
+
+    i_diag[0] = ( t0 + 2*t1 + t2 + 2 ) >> 2;
+    i_diag[1] = ( t1 + 2*t2 + t3 + 2 ) >> 2;
+    i_diag[2] = ( t2 + 2*t3 + t4 + 2 ) >> 2;
+    i_diag[3] = ( t3 + 2*t4 + t5 + 2 ) >> 2;
+    i_diag[4] = ( t4 + 2*t5 + t6 + 2 ) >> 2;
+    i_diag[5] = ( t5 + 2*t6 + t7 + 2 ) >> 2;
+    i_diag[6] = ( t6 + 3*t7 + 2 ) >> 2;     /* last tap repeats t7 */
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[y * i_stride + x] = i_diag[x + y];
+        }
+    }
+}
+/* 4x4 diagonal down-right prediction: every pixel on the diagonal
+ * x - y = d receives the same 3-tap filtered value taken from the left
+ * column, the top-left corner and the top row. */
+static void predict_4x4_ddr( uint8_t *src, int i_stride )
+{
+    const int lt = src[-1-i_stride];
+    PREDICT_4x4_LOAD_LEFT
+    PREDICT_4x4_LOAD_TOP
+    int i_diag[7];      /* value for each x - y in -3..3, stored at index + 3 */
+    int x, y;
+
+    i_diag[0] = ( l1 + 2 * l2 + l3 + 2 ) >> 2;
+    i_diag[1] = ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    i_diag[2] = ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    i_diag[3] = ( t0 + 2 * lt + l0 + 2 ) >> 2;
+    i_diag[4] = ( lt + 2 * t0 + t1 + 2 ) >> 2;
+    i_diag[5] = ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    i_diag[6] = ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[y * i_stride + x] = i_diag[x - y + 3];
+        }
+    }
+}
+
+/* 4x4 vertical-right prediction.
+ * Reads the top-left corner (lt), the four pixels above the block (t0..t3)
+ * and the first three pixels left of it (l0..l2), then writes the 16
+ * predicted pixels. The fourth left pixel is not needed by this mode, so it
+ * is no longer loaded (it used to trigger an unused-variable warning). */
+static void predict_4x4_vr( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    const int lt = p_top[-1];
+    const int t0 = p_top[0];
+    const int t1 = p_top[1];
+    const int t2 = p_top[2];
+    const int t3 = p_top[3];
+    const int l0 = src[-1+0*i_stride];
+    const int l1 = src[-1+1*i_stride];
+    const int l2 = src[-1+2*i_stride];
+
+    /* rows 0 and 2: 2-tap averages, row 2 shifted right by one pixel */
+    src[0*i_stride+0]=
+    src[2*i_stride+1]= ( lt + t0 + 1 ) >> 1;
+
+    src[0*i_stride+1]=
+    src[2*i_stride+2]= ( t0 + t1 + 1 ) >> 1;
+
+    src[0*i_stride+2]=
+    src[2*i_stride+3]= ( t1 + t2 + 1 ) >> 1;
+
+    src[0*i_stride+3]= ( t2 + t3 + 1 ) >> 1;
+
+    /* rows 1 and 3: 3-tap filtered values, row 3 shifted right by one pixel */
+    src[1*i_stride+0]=
+    src[3*i_stride+1]= ( l0 + 2 * lt + t0 + 2 ) >> 2;
+
+    src[1*i_stride+1]=
+    src[3*i_stride+2]= ( lt + 2 * t0 + t1 + 2 ) >> 2;
+
+    src[1*i_stride+2]=
+    src[3*i_stride+3]= ( t0 + 2 * t1 + t2 + 2) >> 2;
+
+    src[1*i_stride+3]= ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+
+    /* first column of rows 2 and 3 comes from the left neighbours */
+    src[2*i_stride+0]= ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    src[3*i_stride+0]= ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+}
+
+/* 4x4 horizontal-down prediction.
+ * Reads the top-left corner (lt), the four pixels left of the block (l0..l3)
+ * and the first three pixels above it (t0..t2), then writes the 16 predicted
+ * pixels. The fourth top pixel is not needed by this mode, so it is no
+ * longer loaded (it used to trigger an unused-variable warning). */
+static void predict_4x4_hd( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    const int lt = p_top[-1];
+    const int t0 = p_top[0];
+    const int t1 = p_top[1];
+    const int t2 = p_top[2];
+    const int l0 = src[-1+0*i_stride];
+    const int l1 = src[-1+1*i_stride];
+    const int l2 = src[-1+2*i_stride];
+    const int l3 = src[-1+3*i_stride];
+
+    /* each row repeats the row above it, shifted right by two pixels */
+    src[0*i_stride+0]=
+    src[1*i_stride+2]= ( lt + l0 + 1 ) >> 1;
+    src[0*i_stride+1]=
+    src[1*i_stride+3]= ( l0 + 2 * lt + t0 + 2 ) >> 2;
+    src[0*i_stride+2]= ( lt + 2 * t0 + t1 + 2 ) >> 2;
+    src[0*i_stride+3]= ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    src[1*i_stride+0]=
+    src[2*i_stride+2]= ( l0 + l1 + 1 ) >> 1;
+    src[1*i_stride+1]=
+    src[2*i_stride+3]= ( lt + 2 * l0 + l1 + 2 ) >> 2;
+    src[2*i_stride+0]=
+    src[3*i_stride+2]= ( l1 + l2+ 1 ) >> 1;
+    src[2*i_stride+1]=
+    src[3*i_stride+3]= ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    src[3*i_stride+0]= ( l2 + l3 + 1 ) >> 1;
+    src[3*i_stride+1]= ( l1 + 2 * l2 + l3 + 2 ) >> 2;
+}
+
+/* 4x4 vertical-left prediction.
+ * Reads the seven pixels t0..t6 of the row above the block (four above it,
+ * three above-right) and writes the 16 predicted pixels. The eighth pixel of
+ * that row is not needed by this mode, so it is no longer loaded (it used to
+ * trigger an unused-variable warning). */
+static void predict_4x4_vl( uint8_t *src, int i_stride )
+{
+    const uint8_t *p_top = src - i_stride;
+    const int t0 = p_top[0];
+    const int t1 = p_top[1];
+    const int t2 = p_top[2];
+    const int t3 = p_top[3];
+    const int t4 = p_top[4];
+    const int t5 = p_top[5];
+    const int t6 = p_top[6];
+
+    /* rows 0 and 2: 2-tap averages, row 2 shifted left by one pixel */
+    src[0*i_stride+0]= ( t0 + t1 + 1 ) >> 1;
+    src[0*i_stride+1]=
+    src[2*i_stride+0]= ( t1 + t2 + 1 ) >> 1;
+    src[0*i_stride+2]=
+    src[2*i_stride+1]= ( t2 + t3 + 1 ) >> 1;
+    src[0*i_stride+3]=
+    src[2*i_stride+2]= ( t3 + t4+ 1 ) >> 1;
+    src[2*i_stride+3]= ( t4 + t5+ 1 ) >> 1;
+
+    /* rows 1 and 3: 3-tap filtered values, row 3 shifted left by one pixel */
+    src[1*i_stride+0]= ( t0 + 2 * t1 + t2 + 2 ) >> 2;
+    src[1*i_stride+1]=
+    src[3*i_stride+0]= ( t1 + 2 * t2 + t3 + 2 ) >> 2;
+    src[1*i_stride+2]=
+    src[3*i_stride+1]= ( t2 + 2 * t3 + t4 + 2 ) >> 2;
+    src[1*i_stride+3]=
+    src[3*i_stride+2]= ( t3 + 2 * t4 + t5 + 2 ) >> 2;
+    src[3*i_stride+3]= ( t4 + 2 * t5 + t6 + 2 ) >> 2;
+}
+
+/* 4x4 horizontal-up prediction: the value of a pixel depends only on
+ * x + 2*y; it is interpolated from the left neighbours l0..l3, and every
+ * position past the end of the left column simply repeats l3. */
+static void predict_4x4_hu( uint8_t *src, int i_stride )
+{
+    PREDICT_4x4_LOAD_LEFT
+    int i_val[10];      /* value for each x + 2*y in 0..9 */
+    int x, y;
+
+    i_val[0] = ( l0 + l1 + 1 ) >> 1;
+    i_val[1] = ( l0 + 2 * l1 + l2 + 2 ) >> 2;
+    i_val[2] = ( l1 + l2 + 1 ) >> 1;
+    i_val[3] = ( l1 + 2*l2 + l3 + 2 ) >> 2;
+    i_val[4] = ( l2 + l3 + 1 ) >> 1;
+    i_val[5] = ( l2 + 2 * l3 + l3 + 2 ) >> 2;
+    i_val[6] = i_val[7] = i_val[8] = i_val[9] = l3;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+            src[y * i_stride + x] = i_val[x + 2 * y];
+        }
+    }
+}
+
+/****************************************************************************
+ * Exported functions:
+ ****************************************************************************/
+/* Fills pf[] with the C 16x16 intra predictors, indexed by the
+ * I_PRED_16x16_* modes. When built with HAVE_MMXEXT and the cpu flags
+ * contain X264_CPU_MMXEXT, the table is then handed to
+ * x264_predict_16x16_init_mmxext() (presumably to override some entries). */
+void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
+{
+    /* regular modes */
+    pf[I_PRED_16x16_V ]     = predict_16x16_v;
+    pf[I_PRED_16x16_H ]     = predict_16x16_h;
+    pf[I_PRED_16x16_DC]     = predict_16x16_dc;
+    pf[I_PRED_16x16_P ]     = predict_16x16_p;
+
+    /* DC variants */
+    pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left;
+    pf[I_PRED_16x16_DC_TOP ]= predict_16x16_dc_top;
+    pf[I_PRED_16x16_DC_128 ]= predict_16x16_dc_128;
+
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        x264_predict_16x16_init_mmxext( pf );
+    }
+#endif
+}
+
+/* Fills pf[] with the C 8x8 chroma intra predictors, indexed by the
+ * I_PRED_CHROMA_* modes. When built with HAVE_MMXEXT and the cpu flags
+ * contain X264_CPU_MMXEXT, the table is then handed to
+ * x264_predict_8x8_init_mmxext() (presumably to override some entries). */
+void x264_predict_8x8_init( int cpu, x264_predict_t pf[7] )
+{
+    /* regular modes */
+    pf[I_PRED_CHROMA_V ]     = predict_8x8_v;
+    pf[I_PRED_CHROMA_H ]     = predict_8x8_h;
+    pf[I_PRED_CHROMA_DC]     = predict_8x8_dc;
+    pf[I_PRED_CHROMA_P ]     = predict_8x8_p;
+
+    /* DC variants */
+    pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left;
+    pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top;
+    pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128;
+
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        x264_predict_8x8_init_mmxext( pf );
+    }
+#endif
+}
+
+/* Fills pf[] with the C 4x4 intra predictors, indexed by the I_PRED_4x4_*
+ * modes. When built with HAVE_MMXEXT and the cpu flags contain
+ * X264_CPU_MMXEXT, the table is then handed to
+ * x264_predict_4x4_init_mmxext() (presumably to override some entries). */
+void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
+{
+    /* vertical / horizontal / DC */
+    pf[I_PRED_4x4_V]      = predict_4x4_v;
+    pf[I_PRED_4x4_H]      = predict_4x4_h;
+    pf[I_PRED_4x4_DC]     = predict_4x4_dc;
+
+    /* directional modes */
+    pf[I_PRED_4x4_DDL]    = predict_4x4_ddl;
+    pf[I_PRED_4x4_DDR]    = predict_4x4_ddr;
+    pf[I_PRED_4x4_VR]     = predict_4x4_vr;
+    pf[I_PRED_4x4_HD]     = predict_4x4_hd;
+    pf[I_PRED_4x4_VL]     = predict_4x4_vl;
+    pf[I_PRED_4x4_HU]     = predict_4x4_hu;
+
+    /* DC variants */
+    pf[I_PRED_4x4_DC_LEFT]= predict_4x4_dc_left;
+    pf[I_PRED_4x4_DC_TOP] = predict_4x4_dc_top;
+    pf[I_PRED_4x4_DC_128] = predict_4x4_dc_128;
+
+#ifdef HAVE_MMXEXT
+    if( ( cpu & X264_CPU_MMXEXT ) != 0 )
+    {
+        x264_predict_4x4_init_mmxext( pf );
+    }
+#endif
+}
+
--- a/core/predict.h
+++ b/core/predict.h
@ -0,0 +1,92 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: predict.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _PREDICT_H
+#define _PREDICT_H 1
+
+typedef void (*x264_predict_t)( uint8_t *src, int i_stride ); /* intra predictor
+    entry point, the element type of the pf[] tables filled by the
+    x264_predict_*_init() functions.
+    NOTE(review): presumably predicts in place into src, whose rows are
+    i_stride bytes apart -- confirm against predict.c. */
+
+enum intra_chroma_pred_e /* slots of the pf[7] table filled by x264_predict_8x8_init() */
+{
+    I_PRED_CHROMA_DC = 0, /* note: DC is 0 here but 2 in intra16x16_pred_e / intra4x4_pred_e */
+    I_PRED_CHROMA_H  = 1,
+    I_PRED_CHROMA_V  = 2,
+    I_PRED_CHROMA_P  = 3,
+
+    I_PRED_CHROMA_DC_LEFT = 4, /* 4..6: extra DC variants; x264_mb_pred_mode8x8_fix maps them to I_PRED_CHROMA_DC */
+    I_PRED_CHROMA_DC_TOP  = 5,
+    I_PRED_CHROMA_DC_128  = 6
+};
+static const int x264_mb_pred_mode8x8_fix[7] =
+{   /* Indexed by intra_chroma_pred_e: values 0..3 map to themselves, the
+     * DC_LEFT / DC_TOP / DC_128 variants (4..6) map to I_PRED_CHROMA_DC.
+     * NOTE(review): presumably yields the mode number that is signalled
+     * in the stream -- confirm at the call sites. */
+    I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
+    I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC
+};
+
+enum intra16x16_pred_e /* slots of the pf[7] table filled by x264_predict_16x16_init() */
+{
+    I_PRED_16x16_V  = 0,
+    I_PRED_16x16_H  = 1,
+    I_PRED_16x16_DC = 2,
+    I_PRED_16x16_P  = 3,
+
+    I_PRED_16x16_DC_LEFT = 4, /* 4..6: extra DC variants; x264_mb_pred_mode16x16_fix maps them to I_PRED_16x16_DC */
+    I_PRED_16x16_DC_TOP  = 5,
+    I_PRED_16x16_DC_128  = 6,
+};
+static const int x264_mb_pred_mode16x16_fix[7] =
+{   /* Indexed by intra16x16_pred_e: values 0..3 map to themselves, the
+     * DC_LEFT / DC_TOP / DC_128 variants (4..6) map to I_PRED_16x16_DC.
+     * NOTE(review): presumably yields the mode number that is signalled
+     * in the stream -- confirm at the call sites. */
+    I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P,
+    I_PRED_16x16_DC,I_PRED_16x16_DC,I_PRED_16x16_DC
+};
+
+enum intra4x4_pred_e /* slots of the pf[12] table filled by x264_predict_4x4_init() */
+{
+    I_PRED_4x4_V  = 0,
+    I_PRED_4x4_H  = 1,
+    I_PRED_4x4_DC = 2,
+    I_PRED_4x4_DDL= 3,
+    I_PRED_4x4_DDR= 4,
+    I_PRED_4x4_VR = 5,
+    I_PRED_4x4_HD = 6,
+    I_PRED_4x4_VL = 7,
+    I_PRED_4x4_HU = 8,
+
+    I_PRED_4x4_DC_LEFT = 9, /* 9..11: extra DC variants; x264_mb_pred_mode4x4_fix maps them to I_PRED_4x4_DC */
+    I_PRED_4x4_DC_TOP  = 10,
+    I_PRED_4x4_DC_128  = 11,
+};
+static const int x264_mb_pred_mode4x4_fix[12] =
+{   /* Indexed by intra4x4_pred_e: values 0..8 map to themselves, the
+     * DC_LEFT / DC_TOP / DC_128 variants (9..11) map to I_PRED_4x4_DC.
+     * NOTE(review): presumably yields the mode number that is signalled
+     * in the stream -- confirm at the call sites. */
+    I_PRED_4x4_V,   I_PRED_4x4_H,   I_PRED_4x4_DC,
+    I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR,
+    I_PRED_4x4_HD,  I_PRED_4x4_VL,  I_PRED_4x4_HU,
+    I_PRED_4x4_DC,  I_PRED_4x4_DC,  I_PRED_4x4_DC
+};
+
+void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );  /* cpu: X264_CPU_* flag mask; pf indexed by intra16x16_pred_e */
+void x264_predict_8x8_init   ( int cpu, x264_predict_t pf[7] );  /* cpu: X264_CPU_* flag mask; pf indexed by intra_chroma_pred_e */
+void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] ); /* cpu: X264_CPU_* flag mask; pf indexed by intra4x4_pred_e */
+
+
+#endif
--- a/core/set.h
+++ b/core/set.h
@ -0,0 +1,123 @@
+/*****************************************************************************
+ * set.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _SET_H
+#define _SET_H 1
+
+enum profile_e /* NOTE(review): values look like H.264 profile_idc codes -- confirm where i_profile_idc is set */
+{
+    PROFILE_BASELINE = 66,
+    PROFILE_MAIN = 77,
+    PROFILE_EXTENTED = 88 /* sic: spelled "EXTENTED"; the spelling is part of the identifier, so renaming would affect users */
+};
+
+typedef struct /* x264_sps_t: sequence parameter set (SPS) fields.
+    NOTE(review): the member names mirror H.264 SPS syntax elements; whether
+    the stored values carry the spec's "minus N" offsets is not visible in
+    this header -- check the code that fills and writes the struct. */
+{
+    int i_id;
+
+    int i_profile_idc; /* presumably one of enum profile_e -- confirm */
+    int i_level_idc;
+
+    int b_constraint_set0;
+    int b_constraint_set1;
+    int b_constraint_set2;
+
+    int i_log2_max_frame_num;
+
+    int i_poc_type;
+    /* poc 0 */
+    int i_log2_max_poc_lsb;
+    /* poc 1 */
+    int b_delta_pic_order_always_zero;
+    int i_offset_for_non_ref_pic;
+    int i_offset_for_top_to_bottom_field;
+    int i_num_ref_frames_in_poc_cycle;
+    int i_offset_for_ref_frame[256]; /* fixed capacity; presumably only the first i_num_ref_frames_in_poc_cycle entries are used -- confirm */
+
+    int i_num_ref_frames;
+    int b_gaps_in_frame_num_value_allowed;
+    int i_mb_width;
+    int i_mb_height;
+    int b_frame_mbs_only;
+    int b_mb_adaptive_frame_field;
+    int b_direct8x8_inference;
+
+    int b_crop; /* presumably tells whether the crop members below are meaningful -- confirm */
+    struct
+    {
+        int i_left;
+        int i_right;
+        int i_top;
+        int i_bottom;
+    } crop;
+
+    int b_vui; /* presumably tells whether the vui members below are meaningful -- confirm */
+    struct
+    {
+        int i_sar_width;
+        int i_sar_height;
+        /* FIXME to complete */
+    } vui;
+
+} x264_sps_t;
+
+typedef struct /* x264_pps_t: picture parameter set (PPS) fields.
+    NOTE(review): the member names mirror H.264 PPS syntax elements; whether
+    the stored values carry the spec's "minus N" offsets is not visible in
+    this header -- check the code that fills and writes the struct. */
+{
+    int i_id;
+    int i_sps_id; /* presumably the i_id of the x264_sps_t this set refers to -- confirm */
+
+    int b_cabac;
+
+    int b_pic_order;
+    int i_num_slice_groups;
+
+    int i_slice_group_map_type; /* selects which of the member groups below applies, per the comments */
+    /* i_slice_group_map_type == 0 */
+    int i_run_length[256];      /* FIXME */
+    /* i_slice_group_map_type == 2 */
+    int i_top_left[256];        /* FIXME */
+    int i_bottom_right[256];    /* FIXME */
+    /* i_slice_group_map_type == 3, 4, 5 */
+    int b_slice_group_change_direction;
+    int i_slice_group_change_rate;
+    /* i_slice_group_map_type == 6 */
+    int i_pic_size_in_map_units;
+    int i_slice_group_id[256];  /* FIXME */
+
+    int i_num_ref_idx_l0_active;
+    int i_num_ref_idx_l1_active;
+
+    int b_weighted_pred;
+    int b_weighted_bipred;
+
+    int i_pic_init_qp;
+    int i_pic_init_qs;
+
+    int i_chroma_qp_index_offset;
+
+    int b_deblocking_filter_control;
+    int b_constrained_intra_pred;
+    int b_redundant_pic_cnt;
+} x264_pps_t;
+
+#endif
--- a/core/vlc.h
+++ b/core/vlc.h
@ -0,0 +1,914 @@
+/*****************************************************************************
+ * vlc.h : vlc table
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+typedef struct /* one variable-length code word, as built by MKVLC( bits, size ) */
+{
+    int i_bits; /* code word value, right-aligned (e.g. 0x5 for the string "000101") */
+    int i_size; /* code length in bits (6 in that example); 0 in table slots that hold no code */
+} vlc_t;
+
+/* XXX: MKVLC( a, b ) expands to a positional initializer, so a lands in
+ * vlc_t.i_bits and b in vlc_t.i_size; don't forget to change it if you
+ * change vlc_t */
+#define MKVLC( a, b ) { a, b }
+static const vlc_t x264_coeff_token[5][17*4] =
+{   /* Five code tables, each laid out as 17 groups of 4 entries.  The
+     * "str=" comment on every entry is the bit string that the (value,
+     * length) pair encodes; entries with length 0 and an empty str hold no
+     * code (note that table 3 has a real all-zero code, MKVLC( 0x0, 6 )).
+     * NOTE(review): the layout suggests index = 4 * total_coeff +
+     * trailing_ones, with tables 0-3 chosen from the nC context and
+     * table 4 (only 5 groups populated) used for chroma DC -- confirm
+     * against the CAVLC writer and H.264 Table 9-5. */
+    /* table 0 */
+    {
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 8 ), /* str=00000111 */
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x6, 8 ), /* str=00000110 */
+        MKVLC( 0x5, 7 ), /* str=0000101 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+
+        MKVLC( 0x7, 10 ), /* str=0000000111 */
+        MKVLC( 0x6, 9 ), /* str=000000110 */
+        MKVLC( 0x5, 8 ), /* str=00000101 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+
+        MKVLC( 0x7, 11 ), /* str=00000000111 */
+        MKVLC( 0x6, 10 ), /* str=0000000110 */
+        MKVLC( 0x5, 9 ), /* str=000000101 */
+        MKVLC( 0x4, 7 ), /* str=0000100 */
+
+        MKVLC( 0xf, 13 ), /* str=0000000001111 */
+        MKVLC( 0x6, 11 ), /* str=00000000110 */
+        MKVLC( 0x5, 10 ), /* str=0000000101 */
+        MKVLC( 0x4, 8 ), /* str=00000100 */
+
+        MKVLC( 0xb, 13 ), /* str=0000000001011 */
+        MKVLC( 0xe, 13 ), /* str=0000000001110 */
+        MKVLC( 0x5, 11 ), /* str=00000000101 */
+        MKVLC( 0x4, 9 ), /* str=000000100 */
+
+        MKVLC( 0x8, 13 ), /* str=0000000001000 */
+        MKVLC( 0xa, 13 ), /* str=0000000001010 */
+        MKVLC( 0xd, 13 ), /* str=0000000001101 */
+        MKVLC( 0x4, 10 ), /* str=0000000100 */
+
+        MKVLC( 0xf, 14 ), /* str=00000000001111 */
+        MKVLC( 0xe, 14 ), /* str=00000000001110 */
+        MKVLC( 0x9, 13 ), /* str=0000000001001 */
+        MKVLC( 0x4, 11 ), /* str=00000000100 */
+
+        MKVLC( 0xb, 14 ), /* str=00000000001011 */
+        MKVLC( 0xa, 14 ), /* str=00000000001010 */
+        MKVLC( 0xd, 14 ), /* str=00000000001101 */
+        MKVLC( 0xc, 13 ), /* str=0000000001100 */
+
+        MKVLC( 0xf, 15 ), /* str=000000000001111 */
+        MKVLC( 0xe, 15 ), /* str=000000000001110 */
+        MKVLC( 0x9, 14 ), /* str=00000000001001 */
+        MKVLC( 0xc, 14 ), /* str=00000000001100 */
+
+        MKVLC( 0xb, 15 ), /* str=000000000001011 */
+        MKVLC( 0xa, 15 ), /* str=000000000001010 */
+        MKVLC( 0xd, 15 ), /* str=000000000001101 */
+        MKVLC( 0x8, 14 ), /* str=00000000001000 */
+
+        MKVLC( 0xf, 16 ), /* str=0000000000001111 */
+        MKVLC( 0x1, 15 ), /* str=000000000000001 */
+        MKVLC( 0x9, 15 ), /* str=000000000001001 */
+        MKVLC( 0xc, 15 ), /* str=000000000001100 */
+
+        MKVLC( 0xb, 16 ), /* str=0000000000001011 */
+        MKVLC( 0xe, 16 ), /* str=0000000000001110 */
+        MKVLC( 0xd, 16 ), /* str=0000000000001101 */
+        MKVLC( 0x8, 15 ), /* str=000000000001000 */
+
+        MKVLC( 0x7, 16 ), /* str=0000000000000111 */
+        MKVLC( 0xa, 16 ), /* str=0000000000001010 */
+        MKVLC( 0x9, 16 ), /* str=0000000000001001 */
+        MKVLC( 0xc, 16 ), /* str=0000000000001100 */
+
+        MKVLC( 0x4, 16 ), /* str=0000000000000100 */
+        MKVLC( 0x6, 16 ), /* str=0000000000000110 */
+        MKVLC( 0x5, 16 ), /* str=0000000000000101 */
+        MKVLC( 0x8, 16 ), /* str=0000000000001000 */
+    },
+
+    /* table 1 */
+    {
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xb, 6 ), /* str=001011 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 6 ), /* str=000111 */
+        MKVLC( 0x7, 5 ), /* str=00111 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 7 ), /* str=0000111 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+
+        MKVLC( 0x7, 8 ), /* str=00000111 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+
+        MKVLC( 0x4, 8 ), /* str=00000100 */
+        MKVLC( 0x6, 7 ), /* str=0000110 */
+        MKVLC( 0x5, 7 ), /* str=0000101 */
+        MKVLC( 0x6, 5 ), /* str=00110 */
+
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x6, 8 ), /* str=00000110 */
+        MKVLC( 0x5, 8 ), /* str=00000101 */
+        MKVLC( 0x8, 6 ), /* str=001000 */
+
+        MKVLC( 0xf, 11 ), /* str=00000001111 */
+        MKVLC( 0x6, 9 ), /* str=000000110 */
+        MKVLC( 0x5, 9 ), /* str=000000101 */
+        MKVLC( 0x4, 6 ), /* str=000100 */
+
+        MKVLC( 0xb, 11 ), /* str=00000001011 */
+        MKVLC( 0xe, 11 ), /* str=00000001110 */
+        MKVLC( 0xd, 11 ), /* str=00000001101 */
+        MKVLC( 0x4, 7 ), /* str=0000100 */
+
+        MKVLC( 0xf, 12 ), /* str=000000001111 */
+        MKVLC( 0xa, 11 ), /* str=00000001010 */
+        MKVLC( 0x9, 11 ), /* str=00000001001 */
+        MKVLC( 0x4, 9 ), /* str=000000100 */
+
+        MKVLC( 0xb, 12 ), /* str=000000001011 */
+        MKVLC( 0xe, 12 ), /* str=000000001110 */
+        MKVLC( 0xd, 12 ), /* str=000000001101 */
+        MKVLC( 0xc, 11 ), /* str=00000001100 */
+
+        MKVLC( 0x8, 12 ), /* str=000000001000 */
+        MKVLC( 0xa, 12 ), /* str=000000001010 */
+        MKVLC( 0x9, 12 ), /* str=000000001001 */
+        MKVLC( 0x8, 11 ), /* str=00000001000 */
+
+        MKVLC( 0xf, 13 ), /* str=0000000001111 */
+        MKVLC( 0xe, 13 ), /* str=0000000001110 */
+        MKVLC( 0xd, 13 ), /* str=0000000001101 */
+        MKVLC( 0xc, 12 ), /* str=000000001100 */
+
+        MKVLC( 0xb, 13 ), /* str=0000000001011 */
+        MKVLC( 0xa, 13 ), /* str=0000000001010 */
+        MKVLC( 0x9, 13 ), /* str=0000000001001 */
+        MKVLC( 0xc, 13 ), /* str=0000000001100 */
+
+        MKVLC( 0x7, 13 ), /* str=0000000000111 */
+        MKVLC( 0xb, 14 ), /* str=00000000001011 */
+        MKVLC( 0x6, 13 ), /* str=0000000000110 */
+        MKVLC( 0x8, 13 ), /* str=0000000001000 */
+
+        MKVLC( 0x9, 14 ), /* str=00000000001001 */
+        MKVLC( 0x8, 14 ), /* str=00000000001000 */
+        MKVLC( 0xa, 14 ), /* str=00000000001010 */
+        MKVLC( 0x1, 13 ), /* str=0000000000001 */
+
+        MKVLC( 0x7, 14 ), /* str=00000000000111 */
+        MKVLC( 0x6, 14 ), /* str=00000000000110 */
+        MKVLC( 0x5, 14 ), /* str=00000000000101 */
+        MKVLC( 0x4, 14 ), /* str=00000000000100 */
+    },
+    /* table 2 */
+    {
+        MKVLC( 0xf, 4 ), /* str=1111 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xf, 6 ), /* str=001111 */
+        MKVLC( 0xe, 4 ), /* str=1110 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0xb, 6 ), /* str=001011 */
+        MKVLC( 0xf, 5 ), /* str=01111 */
+        MKVLC( 0xd, 4 ), /* str=1101 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x8, 6 ), /* str=001000 */
+        MKVLC( 0xc, 5 ), /* str=01100 */
+        MKVLC( 0xe, 5 ), /* str=01110 */
+        MKVLC( 0xc, 4 ), /* str=1100 */
+
+        MKVLC( 0xf, 7 ), /* str=0001111 */
+        MKVLC( 0xa, 5 ), /* str=01010 */
+        MKVLC( 0xb, 5 ), /* str=01011 */
+        MKVLC( 0xb, 4 ), /* str=1011 */
+
+        MKVLC( 0xb, 7 ), /* str=0001011 */
+        MKVLC( 0x8, 5 ), /* str=01000 */
+        MKVLC( 0x9, 5 ), /* str=01001 */
+        MKVLC( 0xa, 4 ), /* str=1010 */
+
+        MKVLC( 0x9, 7 ), /* str=0001001 */
+        MKVLC( 0xe, 6 ), /* str=001110 */
+        MKVLC( 0xd, 6 ), /* str=001101 */
+        MKVLC( 0x9, 4 ), /* str=1001 */
+
+        MKVLC( 0x8, 7 ), /* str=0001000 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0x8, 4 ), /* str=1000 */
+
+        MKVLC( 0xf, 8 ), /* str=00001111 */
+        MKVLC( 0xe, 7 ), /* str=0001110 */
+        MKVLC( 0xd, 7 ), /* str=0001101 */
+        MKVLC( 0xd, 5 ), /* str=01101 */
+
+        MKVLC( 0xb, 8 ), /* str=00001011 */
+        MKVLC( 0xe, 8 ), /* str=00001110 */
+        MKVLC( 0xa, 7 ), /* str=0001010 */
+        MKVLC( 0xc, 6 ), /* str=001100 */
+
+        MKVLC( 0xf, 9 ), /* str=000001111 */
+        MKVLC( 0xa, 8 ), /* str=00001010 */
+        MKVLC( 0xd, 8 ), /* str=00001101 */
+        MKVLC( 0xc, 7 ), /* str=0001100 */
+
+        MKVLC( 0xb, 9 ), /* str=000001011 */
+        MKVLC( 0xe, 9 ), /* str=000001110 */
+        MKVLC( 0x9, 8 ), /* str=00001001 */
+        MKVLC( 0xc, 8 ), /* str=00001100 */
+
+        MKVLC( 0x8, 9 ), /* str=000001000 */
+        MKVLC( 0xa, 9 ), /* str=000001010 */
+        MKVLC( 0xd, 9 ), /* str=000001101 */
+        MKVLC( 0x8, 8 ), /* str=00001000 */
+
+        MKVLC( 0xd, 10 ), /* str=0000001101 */
+        MKVLC( 0x7, 9 ), /* str=000000111 */
+        MKVLC( 0x9, 9 ), /* str=000001001 */
+        MKVLC( 0xc, 9 ), /* str=000001100 */
+
+        MKVLC( 0x9, 10 ), /* str=0000001001 */
+        MKVLC( 0xc, 10 ), /* str=0000001100 */
+        MKVLC( 0xb, 10 ), /* str=0000001011 */
+        MKVLC( 0xa, 10 ), /* str=0000001010 */
+
+        MKVLC( 0x5, 10 ), /* str=0000000101 */
+        MKVLC( 0x8, 10 ), /* str=0000001000 */
+        MKVLC( 0x7, 10 ), /* str=0000000111 */
+        MKVLC( 0x6, 10 ), /* str=0000000110 */
+
+        MKVLC( 0x1, 10 ), /* str=0000000001 */
+        MKVLC( 0x4, 10 ), /* str=0000000100 */
+        MKVLC( 0x3, 10 ), /* str=0000000011 */
+        MKVLC( 0x2, 10 ), /* str=0000000010 */
+    },
+
+    /* table 3 */
+    {
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x8, 6 ), /* str=001000 */
+        MKVLC( 0x9, 6 ), /* str=001001 */
+        MKVLC( 0xa, 6 ), /* str=001010 */
+        MKVLC( 0xb, 6 ), /* str=001011 */
+
+        MKVLC( 0xc, 6 ), /* str=001100 */
+        MKVLC( 0xd, 6 ), /* str=001101 */
+        MKVLC( 0xe, 6 ), /* str=001110 */
+        MKVLC( 0xf, 6 ), /* str=001111 */
+
+        MKVLC( 0x10, 6 ), /* str=010000 */
+        MKVLC( 0x11, 6 ), /* str=010001 */
+        MKVLC( 0x12, 6 ), /* str=010010 */
+        MKVLC( 0x13, 6 ), /* str=010011 */
+
+        MKVLC( 0x14, 6 ), /* str=010100 */
+        MKVLC( 0x15, 6 ), /* str=010101 */
+        MKVLC( 0x16, 6 ), /* str=010110 */
+        MKVLC( 0x17, 6 ), /* str=010111 */
+
+        MKVLC( 0x18, 6 ), /* str=011000 */
+        MKVLC( 0x19, 6 ), /* str=011001 */
+        MKVLC( 0x1a, 6 ), /* str=011010 */
+        MKVLC( 0x1b, 6 ), /* str=011011 */
+
+        MKVLC( 0x1c, 6 ), /* str=011100 */
+        MKVLC( 0x1d, 6 ), /* str=011101 */
+        MKVLC( 0x1e, 6 ), /* str=011110 */
+        MKVLC( 0x1f, 6 ), /* str=011111 */
+
+        MKVLC( 0x20, 6 ), /* str=100000 */
+        MKVLC( 0x21, 6 ), /* str=100001 */
+        MKVLC( 0x22, 6 ), /* str=100010 */
+        MKVLC( 0x23, 6 ), /* str=100011 */
+
+        MKVLC( 0x24, 6 ), /* str=100100 */
+        MKVLC( 0x25, 6 ), /* str=100101 */
+        MKVLC( 0x26, 6 ), /* str=100110 */
+        MKVLC( 0x27, 6 ), /* str=100111 */
+
+        MKVLC( 0x28, 6 ), /* str=101000 */
+        MKVLC( 0x29, 6 ), /* str=101001 */
+        MKVLC( 0x2a, 6 ), /* str=101010 */
+        MKVLC( 0x2b, 6 ), /* str=101011 */
+
+        MKVLC( 0x2c, 6 ), /* str=101100 */
+        MKVLC( 0x2d, 6 ), /* str=101101 */
+        MKVLC( 0x2e, 6 ), /* str=101110 */
+        MKVLC( 0x2f, 6 ), /* str=101111 */
+
+        MKVLC( 0x30, 6 ), /* str=110000 */
+        MKVLC( 0x31, 6 ), /* str=110001 */
+        MKVLC( 0x32, 6 ), /* str=110010 */
+        MKVLC( 0x33, 6 ), /* str=110011 */
+
+        MKVLC( 0x34, 6 ), /* str=110100 */
+        MKVLC( 0x35, 6 ), /* str=110101 */
+        MKVLC( 0x36, 6 ), /* str=110110 */
+        MKVLC( 0x37, 6 ), /* str=110111 */
+
+        MKVLC( 0x38, 6 ), /* str=111000 */
+        MKVLC( 0x39, 6 ), /* str=111001 */
+        MKVLC( 0x3a, 6 ), /* str=111010 */
+        MKVLC( 0x3b, 6 ), /* str=111011 */
+
+        MKVLC( 0x3c, 6 ), /* str=111100 */
+        MKVLC( 0x3d, 6 ), /* str=111101 */
+        MKVLC( 0x3e, 6 ), /* str=111110 */
+        MKVLC( 0x3f, 6 ), /* str=111111 */
+    },
+
+    /* table 4 */
+    {
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x7, 6 ), /* str=000111 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x4, 6 ), /* str=000100 */
+        MKVLC( 0x6, 6 ), /* str=000110 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x3, 7 ), /* str=0000011 */
+        MKVLC( 0x2, 7 ), /* str=0000010 */
+        MKVLC( 0x5, 6 ), /* str=000101 */
+
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x3, 8 ), /* str=00000011 */
+        MKVLC( 0x2, 8 ), /* str=00000010 */
+        MKVLC( 0x0, 7 ), /* str=0000000 */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    }
+};
+
+static const vlc_t x264_level_prefix[16] =
+{   /* Entry k is the value 1 written in k+1 bits, i.e. k zero bits
+     * followed by a single 1 bit.
+     * NOTE(review): presumably indexed by the CAVLC level_prefix value --
+     * confirm at the call sites. */
+    MKVLC( 0x01,  1 ),
+    MKVLC( 0x01,  2 ),
+    MKVLC( 0x01,  3 ),
+    MKVLC( 0x01,  4 ),
+    MKVLC( 0x01,  5 ),
+    MKVLC( 0x01,  6 ),
+    MKVLC( 0x01,  7 ),
+    MKVLC( 0x01,  8 ),
+    MKVLC( 0x01,  9 ),
+    MKVLC( 0x01, 10 ),
+    MKVLC( 0x01, 11 ),
+    MKVLC( 0x01, 12 ),
+    MKVLC( 0x01, 13 ),
+    MKVLC( 0x01, 14 ),
+    MKVLC( 0x01, 15 ),
+    MKVLC( 0x01, 16 )
+};
+
+/* [i_total_coeff-1][i_total_zeros]
+ * The row labelled "i_total t" holds 17 - t codes (i_total_zeros 0 .. 16 - t);
+ * the rest of each 16-entry row is { 0, 0 } padding (length 0, empty str).
+ * The "str=" comment on every entry is the bit string the pair encodes. */
+static const vlc_t x264_total_zeros[15][16] =
+{
+    { /* i_total 1 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x3, 7 ), /* str=0000011 */
+        MKVLC( 0x2, 7 ), /* str=0000010 */
+        MKVLC( 0x3, 8 ), /* str=00000011 */
+        MKVLC( 0x2, 8 ), /* str=00000010 */
+        MKVLC( 0x3, 9 ), /* str=000000011 */
+        MKVLC( 0x2, 9 ), /* str=000000010 */
+        MKVLC( 0x1, 9 ), /* str=000000001 */
+    },
+    { /* i_total 2 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x3, 6 ), /* str=000011 */
+        MKVLC( 0x2, 6 ), /* str=000010 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 3 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 4 */
+        MKVLC( 0x3, 5 ), /* str=00011 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x2, 5 ), /* str=00010 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 5 */
+        MKVLC( 0x5, 4 ), /* str=0101 */
+        MKVLC( 0x4, 4 ), /* str=0100 */
+        MKVLC( 0x3, 4 ), /* str=0011 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 4 ), /* str=0010 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 6 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 7 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 8 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 9 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x0, 6 ), /* str=000000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 10 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x0, 5 ), /* str=00000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 11 */
+        MKVLC( 0x0, 4 ), /* str=0000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 12 */
+        MKVLC( 0x0, 4 ), /* str=0000 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 13 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 14 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_total 15 */
+        MKVLC( 0x0, 1 ), /* str=0 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+};
+
+/* VLC codewords for total_zeros in a DC block of at most 4 coefficients
+ * (presumably the 2x2 chroma DC block -- confirm against the caller).
+ * Indexed as [i_total_coeff-1][i_total_zeros].
+ * Each entry is MKVLC( code bits, code length in bits ); the trailing comment
+ * spells the codeword out as a bit string. Entries of length 0 are padding
+ * for combinations that have no codeword. */
+static const vlc_t x264_total_zeros_dc[3][4] =
+{
+    { /* i_total_coeff 1 */
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x01, 2 ), /* 01 */
+        MKVLC( 0x01, 3 ), /* 001*/
+        MKVLC( 0x00, 3 )  /* 000*/
+    },
+    { /* i_total_coeff 2 */
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x01, 2 ), /* 01 */
+        MKVLC( 0x00, 2 ), /* 00 */
+        MKVLC( 0x00, 0 )  /* unused */
+    },
+    { /* i_total_coeff 3 */
+        MKVLC( 0x01, 1 ), /* 1  */
+        MKVLC( 0x00, 1 ), /* 0  */
+        MKVLC( 0x00, 0 ), /* unused */
+        MKVLC( 0x00, 0 )  /* unused */
+    }
+};
+
+/* VLC codewords for run_before, indexed as
+ * x264_run_before[__MIN( i_zero_left -1, 6 )][run_before].
+ * Rows 0..5 are used for i_zero_left 1..6; the last row is shared by every
+ * i_zero_left >= 7 (that is what the __MIN clamp in the index expresses).
+ * Each entry is MKVLC( code bits, code length in bits ); "str=" gives the
+ * codeword as a bit string and an empty string marks an unused entry. */
+static const vlc_t x264_run_before[7][15] =
+{
+    { /* i_zero_left 1 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x0, 1 ), /* str=0 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 2 */
+        MKVLC( 0x1, 1 ), /* str=1 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 3 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x0, 2 ), /* str=00 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 4 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x1, 2 ), /* str=01 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 5 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x2, 2 ), /* str=10 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left 6 */
+        MKVLC( 0x3, 2 ), /* str=11 */
+        MKVLC( 0x0, 3 ), /* str=000 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+        MKVLC( 0x0, 0 ), /* str= */
+    },
+    { /* i_zero_left >= 7 */
+        MKVLC( 0x7, 3 ), /* str=111 */
+        MKVLC( 0x6, 3 ), /* str=110 */
+        MKVLC( 0x5, 3 ), /* str=101 */
+        MKVLC( 0x4, 3 ), /* str=100 */
+        MKVLC( 0x3, 3 ), /* str=011 */
+        MKVLC( 0x2, 3 ), /* str=010 */
+        MKVLC( 0x1, 3 ), /* str=001 */
+        MKVLC( 0x1, 4 ), /* str=0001 */
+        MKVLC( 0x1, 5 ), /* str=00001 */
+        MKVLC( 0x1, 6 ), /* str=000001 */
+        MKVLC( 0x1, 7 ), /* str=0000001 */
+        MKVLC( 0x1, 8 ), /* str=00000001 */
+        MKVLC( 0x1, 9 ), /* str=000000001 */
+        MKVLC( 0x1, 10 ), /* str=0000000001 */
+        MKVLC( 0x1, 11 ), /* str=00000000001 */
+    },
+};
--- a/decoder/decoder.c
+++ b/decoder/decoder.c
@ -0,0 +1,772 @@
+/*****************************************************************************
+ * x264: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: decoder.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/cpu.h"
+#include "../core/vlc.h"
+
+#include "macroblock.h"
+#include "set.h"
+#include "vlc.h"
+
+
+/* Reset the decoder state on an IDR slice: the POC / frame_num tracking is
+ * zeroed and, when a SPS is active, every reference slot is marked unused
+ * (i_poc = -1), slot 0 becomes the decoding target and both reference lists
+ * are emptied. */
+static void x264_slice_idr( x264_t *h )
+{
+    int i_slot;
+
+    h->i_frame_num    = 0;
+    h->i_frame_offset = 0;
+    h->i_poc_lsb      = 0;
+    h->i_poc_msb      = 0;
+
+    /* no active SPS: there is no reference slot to reset */
+    if( h->sps == NULL )
+    {
+        return;
+    }
+
+    /* slots 0 .. i_num_ref_frames (inclusive) */
+    for( i_slot = h->sps->i_num_ref_frames; i_slot >= 0; i_slot-- )
+    {
+        h->freference[i_slot]->i_poc = -1;
+    }
+
+    h->i_ref1 = 0;
+    h->i_ref0 = 0;
+    h->fdec   = h->freference[0];
+}
+
+/* The slice header reading is split in two parts:
+ *      - before ref_pic_list_reordering( )
+ *      - after  dec_ref_pic_marking( )
+ *
+ * x264_slice_header_part1_read: parse the first part into sh.
+ *  s          bitstream positioned on the first bit of the slice header
+ *  sh         slice header to fill; sh->pps and sh->sps are set to the
+ *             parameter sets the slice refers to
+ *  sps_array  SPS table (an entry with i_id == -1 is invalid)
+ *  pps_array  PPS table (an entry with i_id == -1 is invalid)
+ *  b_idr      non zero for an IDR slice (idr_pic_id is then present)
+ * Returns 0 on success, -1 when the parameter set ids are invalid or the
+ * bitstream is exhausted.
+ *
+ * Every conditionally present field first receives its "absent" value so
+ * that nothing is inherited from a previous slice or from uninitialised
+ * memory (i_redundant_pic_cnt and b_field_pic are tested later even when
+ * the stream does not carry them).
+ */
+static int x264_slice_header_part1_read( bs_t *s,
+                                         x264_slice_header_t *sh, x264_sps_t sps_array[32], x264_pps_t pps_array[256], int b_idr )
+{
+    /* defaults of the optional syntax elements */
+    sh->b_field_pic = 0;
+    sh->b_bottom_field = 0;
+    sh->i_idr_pic_id = 0;
+    sh->i_poc_lsb = 0;
+    sh->i_delta_poc_bottom = 0;
+    sh->i_delta_poc[0] = 0;
+    sh->i_delta_poc[1] = 0;
+    sh->i_redundant_pic_cnt = 0;
+    sh->b_direct_spatial_mv_pred = 0;
+    sh->b_num_ref_idx_override = 0;
+
+    sh->i_first_mb = bs_read_ue( s );
+    sh->i_type = bs_read_ue( s );
+    if( sh->i_type >= 5 )
+    {
+        /* types 5..9 are aliases of 0..4 */
+        sh->i_type -= 5;
+    }
+    sh->i_pps_id = bs_read_ue( s );
+    if( bs_eof( s ) || sh->i_pps_id < 0 || sh->i_pps_id >= 256 || pps_array[sh->i_pps_id].i_id == -1 )
+    {
+        fprintf( stderr, "invalid pps_id in slice header\n" );
+        return -1;
+    }
+
+    sh->pps = &pps_array[sh->i_pps_id];
+
+    /* do not trust the PPS blindly: the SPS it names must exist as well */
+    if( sh->pps->i_sps_id < 0 || sh->pps->i_sps_id >= 32 || sps_array[sh->pps->i_sps_id].i_id == -1 )
+    {
+        fprintf( stderr, "invalid sps_id in pps\n" );
+        return -1;
+    }
+    sh->sps = &sps_array[sh->pps->i_sps_id];
+
+    /* PPS defaults, possibly overridden below for P/SP/B slices */
+    sh->i_num_ref_idx_l0_active = sh->pps->i_num_ref_idx_l0_active;
+    sh->i_num_ref_idx_l1_active = sh->pps->i_num_ref_idx_l1_active;
+
+    sh->i_frame_num = bs_read( s, sh->sps->i_log2_max_frame_num );
+    if( !sh->sps->b_frame_mbs_only )
+    {
+        sh->b_field_pic = bs_read1( s );
+        if( sh->b_field_pic )
+        {
+            sh->b_bottom_field = bs_read1( s );
+        }
+    }
+
+    if( b_idr )
+    {
+        sh->i_idr_pic_id = bs_read_ue( s );
+    }
+
+    if( sh->sps->i_poc_type == 0 )
+    {
+        sh->i_poc_lsb = bs_read( s, sh->sps->i_log2_max_poc_lsb );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            sh->i_delta_poc_bottom = bs_read_se( s );
+        }
+    }
+    else if( sh->sps->i_poc_type == 1 && !sh->sps->b_delta_pic_order_always_zero )
+    {
+        sh->i_delta_poc[0] = bs_read_se( s );
+        if( sh->pps->b_pic_order && !sh->b_field_pic )
+        {
+            sh->i_delta_poc[1] = bs_read_se( s );
+        }
+    }
+
+    if( sh->pps->b_redundant_pic_cnt )
+    {
+        sh->i_redundant_pic_cnt = bs_read_ue( s );
+    }
+
+    if( sh->i_type == SLICE_TYPE_B )
+    {
+        sh->b_direct_spatial_mv_pred = bs_read1( s );
+    }
+
+    if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP || sh->i_type == SLICE_TYPE_B )
+    {
+        sh->b_num_ref_idx_override = bs_read1( s );
+
+        if( sh->b_num_ref_idx_override )
+        {
+            sh->i_num_ref_idx_l0_active = bs_read_ue( s ) + 1;
+            if( sh->i_type == SLICE_TYPE_B )
+            {
+                sh->i_num_ref_idx_l1_active = bs_read_ue( s ) + 1;
+            }
+        }
+    }
+
+    return bs_eof( s ) ? -1 : 0;
+}
+
+/* Parse the second part of the slice header (everything that follows
+ * dec_ref_pic_marking( )) into sh. sh->pps and sh->i_type must already have
+ * been set by the first part.
+ * Returns 0 on success, -1 when the slice uses a slice group map type 3..5
+ * with more than one slice group (slice_group_change_cycle is not parsed).
+ *
+ * Syntax elements that are absent from the stream are set to 0, the value
+ * H.264 infers for them, instead of keeping whatever the previous slice
+ * header left behind.
+ */
+static int x264_slice_header_part2_read( bs_t *s, x264_slice_header_t *sh )
+{
+    /* inferred values of the optional syntax elements */
+    sh->i_cabac_init_idc = 0;
+    sh->b_sp_for_swidth = 0;
+    sh->i_qs_delta = 0;
+    sh->i_disable_deblocking_filter_idc = 0;
+    sh->i_alpha_c0_offset = 0;
+    sh->i_beta_offset = 0;
+
+    if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I && sh->i_type != SLICE_TYPE_SI )
+    {
+        sh->i_cabac_init_idc = bs_read_ue( s );
+    }
+    sh->i_qp_delta = bs_read_se( s );
+
+    if( sh->i_type == SLICE_TYPE_SI || sh->i_type == SLICE_TYPE_SP )
+    {
+        if( sh->i_type == SLICE_TYPE_SP )
+        {
+            sh->b_sp_for_swidth = bs_read1( s );
+        }
+        sh->i_qs_delta = bs_read_se( s );
+    }
+
+    if( sh->pps->b_deblocking_filter_control )
+    {
+        sh->i_disable_deblocking_filter_idc = bs_read_ue( s );
+        if( sh->i_disable_deblocking_filter_idc != 1 )
+        {
+            /* the offsets are only transmitted when the filter is enabled */
+            sh->i_alpha_c0_offset = bs_read_se( s );
+            sh->i_beta_offset = bs_read_se( s );
+        }
+    }
+
+    if( sh->pps->i_num_slice_groups > 1 && sh->pps->i_slice_group_map_type >= 3 && sh->pps->i_slice_group_map_type <= 5 )
+    {
+        /* FIXME slice_group_change_cycle is not supported */
+        return -1;
+    }
+    return 0;
+}
+
+/* Build the default reference lists of the current picture and parse
+ * ref_pic_list_reordering( ).
+ *
+ * Slot 0 of h->freference becomes the frame being decoded (h->fdec). The
+ * other slots holding a picture (i_poc >= 0) are split between list 0
+ * (POC lower than the current one, sorted from higher to lower POC) and
+ * list 1 (POC higher than the current one, sorted from lower to higher POC).
+ * Both lists are then truncated to the number of active references.
+ *
+ * Returns 0 on success, -1 when the stream requests an explicit reordering,
+ * which is not implemented.
+ */
+static int x264_slice_header_ref_pic_reordering( x264_t *h, bs_t *s )
+{
+    /* The slice header carries the active reference counts (PPS default or
+     * per slice override) only for P/SP/B slices; fall back on the PPS
+     * values for the other slice types. */
+    const int b_has_ref_count = h->sh.i_type == SLICE_TYPE_P ||
+                                h->sh.i_type == SLICE_TYPE_SP ||
+                                h->sh.i_type == SLICE_TYPE_B;
+    int i_max_ref0 = h->pps->i_num_ref_idx_l0_active;
+    int i_max_ref1 = h->pps->i_num_ref_idx_l1_active;
+    int i, j;
+
+    if( b_has_ref_count )
+    {
+        i_max_ref0 = h->sh.i_num_ref_idx_l0_active;
+        i_max_ref1 = h->sh.i_num_ref_idx_l1_active;
+    }
+
+    /* decode into the slot that is no longer used as a reference */
+    h->fdec = h->freference[0];
+    h->fdec->i_poc = h->i_poc;
+
+    /* build ref list 0/1 */
+    h->i_ref0 = 0;
+    h->i_ref1 = 0;
+    for( i = 1; i < h->sps->i_num_ref_frames + 1; i++ )
+    {
+        if( h->freference[i]->i_poc >= 0 )
+        {
+            if( h->freference[i]->i_poc < h->fdec->i_poc )
+            {
+                h->fref0[h->i_ref0++] = h->freference[i];
+            }
+            else if( h->freference[i]->i_poc > h->fdec->i_poc )
+            {
+                h->fref1[h->i_ref1++] = h->freference[i];
+            }
+        }
+    }
+
+    /* Order ref0 from higher to lower poc (stable insertion sort) */
+    for( i = 1; i < h->i_ref0; i++ )
+    {
+        x264_frame_t *cur = h->fref0[i];
+
+        for( j = i; j > 0 && h->fref0[j-1]->i_poc < cur->i_poc; j-- )
+        {
+            h->fref0[j] = h->fref0[j-1];
+        }
+        h->fref0[j] = cur;
+    }
+    /* Order ref1 from lower to higher poc (stable insertion sort) for B-frame */
+    for( i = 1; i < h->i_ref1; i++ )
+    {
+        x264_frame_t *cur = h->fref1[i];
+
+        for( j = i; j > 0 && h->fref1[j-1]->i_poc > cur->i_poc; j-- )
+        {
+            h->fref1[j] = h->fref1[j-1];
+        }
+        h->fref1[j] = cur;
+    }
+
+    /* never use more references than the slice declares active */
+    if( h->i_ref0 > i_max_ref0 )
+    {
+        h->i_ref0 = i_max_ref0;
+    }
+    if( h->i_ref1 > i_max_ref1 )
+    {
+        h->i_ref1 = i_max_ref1;
+    }
+
+    //fprintf( stderr,"POC:%d ref0=%d POC0=%d\n", h->fdec->i_poc, h->i_ref0, h->i_ref0 > 0 ? h->fref0[0]->i_poc : -1 );
+
+
+    /* Now parse the stream and change the default order */
+    if( h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_SI )
+    {
+        /* reordering flag of list 0 */
+        int b_reorder = bs_read1( s );
+
+        if( b_reorder )
+        {
+            /* FIXME explicit reordering is not implemented */
+            return -1;
+        }
+    }
+    if( h->sh.i_type == SLICE_TYPE_B )
+    {
+        /* reordering flag of list 1 */
+        int b_reorder = bs_read1( s );
+        if( b_reorder )
+        {
+            /* FIXME explicit reordering is not implemented */
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* pred_weight_table( ) parsing is not implemented: nothing is read from s
+ * and the call always fails with -1. */
+static int x264_slice_header_pred_weight_table( x264_t *h, bs_t *s )
+{
+    (void)h;
+    (void)s;
+
+    return -1;
+}
+
+/* Parse dec_ref_pic_marking( ).
+ * For an IDR slice two flags are consumed (no_output_of_prior_pics, then
+ * long_term_reference_flag) and ignored for now (TODO).
+ * For any other slice one flag is consumed; when it is set (adaptive
+ * reference picture marking) the function fails with -1 as the marking
+ * commands are not parsed. Returns 0 otherwise. h is not used. */
+static int  x264_slice_header_dec_ref_pic_marking( x264_t *h, bs_t *s, int i_nal_type  )
+{
+    (void)h;
+
+    if( i_nal_type != NAL_SLICE_IDR )
+    {
+        /* adaptive_ref_pic_marking_mode: unsupported when set */
+        return bs_read1( s ) ? -1 : 0;
+    }
+
+    /* TODO honour both flags instead of dropping them */
+    (void)bs_read1( s );    /* no_output_of_prior_pics */
+    (void)bs_read1( s );    /* long_term_reference_flag */
+
+    return 0;
+}
+
+/****************************************************************************
+ * Decode a slice header and setup h for mb decoding.
+ *
+ *  h    decoder context; h->sh receives the parsed header
+ *  s    bitstream positioned at the start of the slice header
+ *  nal  NAL unit being decoded (its type and ref_idc are used)
+ *
+ * When the slice refers to another SPS/PPS than the active one, the frames
+ * and the macroblock array are released and allocated again for the new
+ * picture size. The POC of the picture is then computed (POC type 1 is not
+ * supported) and the rest of the header is parsed.
+ *
+ * Returns 0 on success, -1 on invalid or unsupported streams.
+ ****************************************************************************/
+static int x264_slice_header_decode( x264_t *h, bs_t *s, x264_nal_t *nal )
+{
+    /* read the first part of the slice */
+    if( x264_slice_header_part1_read( s, &h->sh,
+                                      h->sps_array, h->pps_array,
+                                      nal->i_type == NAL_SLICE_IDR ? 1 : 0 ) < 0 )
+    {
+        fprintf( stderr, "x264_slice_header_part1_read failed\n" );
+        return -1;
+    }
+
+    /* now reset h if needed for this frame */
+    if( h->sps != h->sh.sps || h->pps != h->sh.pps )
+    {
+        /* TODO */
+
+        /* Release what was allocated for the previous parameter sets. This
+         * has to happen while h->sps still points to the old SPS: the number
+         * of frames to delete is read from it. */
+        if( h->sps != NULL && h->picture->i_width != 0 && h->picture->i_height != 0 )
+        {
+            int i;
+
+            for( i = 0; i < h->sps->i_num_ref_frames + 1; i++ )
+            {
+                x264_frame_delete( h->freference[i]);
+            }
+            /* same deallocator as x264_decoder_close */
+            x264_free( h->mb );
+        }
+
+        h->sps = NULL;
+        h->pps = NULL;
+
+        h->picture->i_width = 0;
+        h->picture->i_height = 0;
+    }
+
+    /* and init if needed */
+    if( h->sps == NULL || h->pps == NULL )
+    {
+        int i;
+
+        h->sps = h->sh.sps;
+        h->pps = h->sh.pps;
+
+        h->param.i_width = h->picture->i_width = 16 * h->sps->i_mb_width;
+        h->param.i_height= h->picture->i_height= 16 * h->sps->i_mb_height;
+
+        fprintf( stderr, "x264: %dx%d\n", h->picture->i_width, h->picture->i_height );
+
+        h->mb = x264_macroblocks_new( h->sps->i_mb_width, h->sps->i_mb_height );
+
+        /* one slot per reference frame plus one for the frame being decoded */
+        for( i = 0; i < h->sps->i_num_ref_frames + 1; i++ )
+        {
+            h->freference[i] = x264_frame_new( h );
+            h->freference[i]->i_poc = -1;
+        }
+        h->fdec = h->freference[0];
+        h->i_ref0 = 0;
+        h->i_ref1 = 0;
+
+        h->i_poc_msb = 0;
+        h->i_poc_lsb = 0;
+        h->i_frame_offset = 0;
+        h->i_frame_num = 0;
+    }
+
+    /* calculate poc for current frame */
+    if( h->sps->i_poc_type == 0 )
+    {
+        int i_max_poc_lsb = 1 << h->sps->i_log2_max_poc_lsb;
+
+        /* detect a wrap around of poc_lsb, in either direction */
+        if( h->sh.i_poc_lsb < h->i_poc_lsb && h->i_poc_lsb - h->sh.i_poc_lsb >= i_max_poc_lsb/2 )
+        {
+            h->i_poc_msb += i_max_poc_lsb;
+        }
+        else if( h->sh.i_poc_lsb > h->i_poc_lsb  && h->sh.i_poc_lsb - h->i_poc_lsb > i_max_poc_lsb/2 )
+        {
+            h->i_poc_msb -= i_max_poc_lsb;
+        }
+        h->i_poc_lsb = h->sh.i_poc_lsb;
+
+        h->i_poc = h->i_poc_msb + h->sh.i_poc_lsb;
+    }
+    else if( h->sps->i_poc_type == 1 )
+    {
+        /* FIXME */
+        return -1;
+    }
+    else
+    {
+        /* the POC is derived from frame_num */
+        if( nal->i_type == NAL_SLICE_IDR )
+        {
+            h->i_frame_offset = 0;
+            h->i_poc = 0;
+        }
+        else
+        {
+            /* frame_num wrapped around */
+            if( h->sh.i_frame_num < h->i_frame_num )
+            {
+                h->i_frame_offset += 1 << h->sps->i_log2_max_frame_num;
+            }
+            if( nal->i_ref_idc > 0 )
+            {
+                h->i_poc = 2 * ( h->i_frame_offset + h->sh.i_frame_num );
+            }
+            else
+            {
+                h->i_poc = 2 * ( h->i_frame_offset + h->sh.i_frame_num ) - 1;
+            }
+        }
+        h->i_frame_num = h->sh.i_frame_num;
+    }
+
+    fprintf( stderr, "x264: pic type=%s poc:%d\n",
+             h->sh.i_type == SLICE_TYPE_I ? "I" : (h->sh.i_type == SLICE_TYPE_P ? "P" : "B?" ),
+             h->i_poc );
+
+    if( h->sh.i_type != SLICE_TYPE_I && h->sh.i_type != SLICE_TYPE_P )
+    {
+        fprintf( stderr, "only SLICE I/P supported\n" );
+        return -1;
+    }
+
+    /* read and do the ref pic reordering */
+    if( x264_slice_header_ref_pic_reordering( h, s ) < 0 )
+    {
+        return -1;
+    }
+
+    if( ( (h->sh.i_type == SLICE_TYPE_P || h->sh.i_type == SLICE_TYPE_SP) && h->sh.pps->b_weighted_pred  ) ||
+        ( h->sh.i_type == SLICE_TYPE_B && h->sh.pps->b_weighted_bipred ) )
+    {
+        if( x264_slice_header_pred_weight_table( h, s ) < 0 )
+        {
+            return -1;
+        }
+    }
+
+    if( nal->i_ref_idc != 0 )
+    {
+        /* a failure means unparsed syntax elements are left in the stream:
+         * going on would read the rest of the header at a wrong position */
+        if( x264_slice_header_dec_ref_pic_marking( h, s, nal->i_type ) < 0 )
+        {
+            return -1;
+        }
+    }
+
+    if( x264_slice_header_part2_read( s, &h->sh ) < 0 )
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Decode the macroblocks of the current slice, from h->sh.i_first_mb up to
+ * the last macroblock of the picture, then post process the decoded frame
+ * (border expansion, deblocking), expose its planes through h->picture and
+ * rotate the reference buffer.
+ * Returns the status of the last macroblock read: negative when a read
+ * failed, in which case the post processing is skipped. */
+static int x264_slice_data_decode( x264_t *h, bs_t *s )
+{
+    const int i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
+    const int b_intra_slice = h->sh.i_type == SLICE_TYPE_I || h->sh.i_type == SLICE_TYPE_SI;
+    int i_mb  = h->sh.i_first_mb;
+    int i_ret = 0;
+    int i;
+
+    /* TODO: alignment and cabac init when h->pps->b_cabac is set */
+
+    /* FIXME field decoding */
+    while( i_mb < i_mb_count )
+    {
+        x264_mb_context_t context;
+        x264_macroblock_t *mb = &h->mb[i_mb];
+
+        /* load neighbour */
+        x264_macroblock_context_load( h, mb, &context );
+
+        if( h->pps->b_cabac )
+        {
+            /* TODO: nothing specific is done yet for non intra slices */
+            i_ret = x264_macroblock_read_cabac( h, s, mb );
+        }
+        else
+        {
+            if( !b_intra_slice )
+            {
+                /* a run of skipped macroblocks precedes the coded one */
+                int i_skip;
+
+                for( i_skip = bs_read_ue( s ); i_skip > 0; i_skip-- )
+                {
+                    x264_macroblock_decode_skip( h, mb );
+
+                    /* next macroblock */
+                    if( ++i_mb >= i_mb_count )
+                    {
+                        break;
+                    }
+                    mb++;
+
+                    /* load neighbour */
+                    x264_macroblock_context_load( h, mb, &context );
+                }
+                /* the run reached the end of the picture */
+                if( i_mb >= i_mb_count )
+                {
+                    break;
+                }
+            }
+            i_ret = x264_macroblock_read_cavlc( h, s, mb );
+        }
+
+        if( i_ret < 0 )
+        {
+            fprintf( stderr, "x264_macroblock_read failed [%d,%d]\n", mb->i_mb_x, mb->i_mb_y );
+            break;
+        }
+
+        if( x264_macroblock_decode( h, mb ) < 0 )
+        {
+            fprintf( stderr, "x264_macroblock_decode failed\n" );
+            /* try to do some error correction ;) */
+        }
+
+        i_mb++;
+    }
+
+    if( i_ret < 0 )
+    {
+        return i_ret;
+    }
+
+    /* expand border for frame reference TODO avoid it when using b-frame */
+    x264_frame_expand_border( h->fdec );
+
+    /* apply deblocking filter to the current decoded picture */
+    if( !h->pps->b_deblocking_filter_control || h->sh.i_disable_deblocking_filter_idc != 1 )
+    {
+        x264_frame_deblocking_filter( h, h->sh.i_type );
+    }
+
+#if 0
+    /* expand border for frame reference TODO avoid it when using b-frame */
+    x264_frame_expand_border( h->fdec );
+#endif
+
+    /* the output picture borrows the planes of the decoded frame */
+    h->picture->i_plane = h->fdec->i_plane;
+    for( i = 0; i < h->picture->i_plane; i++ )
+    {
+        h->picture->plane[i]    = h->fdec->plane[i];
+        h->picture->i_stride[i] = h->fdec->i_stride[i];
+    }
+
+    /* move frame in the buffer FIXME won't work for B-frame:
+     * every slot is shifted by one and the last one is recycled as slot 0 */
+    h->fdec = h->freference[h->sps->i_num_ref_frames];
+    for( i = h->sps->i_num_ref_frames; i > 0; i-- )
+    {
+        h->freference[i] = h->freference[i-1];
+    }
+    h->freference[0] = h->fdec;
+
+    return i_ret;
+}
+
+/****************************************************************************
+ *
+ ******************************* x264 libs **********************************
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * x264_decoder_open: allocate and initialise a decoder context.
+ *
+ *  param  decoder parameters, copied into the context (param->cpu selects
+ *         the optimised functions)
+ *
+ * Returns the new context, or NULL when memory cannot be allocated.
+ * The context is released with x264_decoder_close.
+ ****************************************************************************/
+x264_t *x264_decoder_open   ( x264_param_t *param )
+{
+    x264_t *h = x264_malloc( sizeof( x264_t ) );
+    int i;
+
+    if( h == NULL )
+    {
+        return NULL;
+    }
+
+    /* start from a known state: the slice header, the frame pointers and
+     * the macroblock array are read before any stream data has set them */
+    memset( h, 0, sizeof( x264_t ) );
+
+    memcpy( &h->param, param, sizeof( x264_param_t ) );
+
+    h->cpu = param->cpu;
+
+    /* no SPS and PPS active yet */
+    h->sps = NULL;
+    h->pps = NULL;
+
+    for( i = 0; i < 32; i++ )
+    {
+        h->sps_array[i].i_id = -1;  /* invalidate it */
+    }
+    for( i = 0; i < 256; i++ )
+    {
+        h->pps_array[i].i_id = -1;  /* invalidate it */
+    }
+
+    h->picture = x264_malloc( sizeof( x264_picture_t ) );
+    if( h->picture == NULL )
+    {
+        x264_free( h );
+        return NULL;
+    }
+    /* a 0x0 picture means that no frame has been allocated yet */
+    h->picture->i_width = 0;
+    h->picture->i_height= 0;
+
+    /* init predict_XxX */
+    x264_predict_16x16_init( h->cpu, h->predict_16x16 );
+    x264_predict_8x8_init( h->cpu, h->predict_8x8 );
+    x264_predict_4x4_init( h->cpu, h->predict_4x4 );
+
+    x264_pixel_init( h->cpu, &h->pixf );
+    x264_dct_init( h->cpu, &h->dctf );
+
+    x264_mc_init( h->cpu, h->mc );
+
+    /* create the vlc tables (we could remove them from x264_t but it would
+     * require introducing a x264_init() for the whole library) */
+    for( i = 0; i < 5; i++ )
+    {
+        /* max 2 step */
+        h->x264_coeff_token_lookup[i] = x264_vlc_table_lookup_new( x264_coeff_token[i], 17*4, 4 );
+    }
+    /* max 2 step */
+    h->x264_level_prefix_lookup = x264_vlc_table_lookup_new( x264_level_prefix, 16, 8 );
+
+    for( i = 0; i < 15; i++ )
+    {
+        /* max 1 step */
+        h->x264_total_zeros_lookup[i] = x264_vlc_table_lookup_new( x264_total_zeros[i], 16, 9 );
+    }
+    for( i = 0;i < 3; i++ )
+    {
+        /* max 1 step */
+        h->x264_total_zeros_dc_lookup[i] = x264_vlc_table_lookup_new( x264_total_zeros_dc[i], 4, 3 );
+    }
+    for( i = 0;i < 7; i++ )
+    {
+        /* max 2 step */
+        h->x264_run_before_lookup[i] = x264_vlc_table_lookup_new( x264_run_before[i], 15, 6 );
+    }
+
+    return h;
+}
+
+/****************************************************************************
+ * x264_decoder_decode: decode one nal unit
+ *
+ * *pp_pic is set to the decoded picture when a slice has been fully decoded
+ * and to NULL in every other case. The value returned is the one of the
+ * parser run for the nal type (negative on failure); unhandled nal types
+ * return 0.
+ ****************************************************************************/
+int     x264_decoder_decode( x264_t *h,
+                             x264_picture_t **pp_pic, x264_nal_t *nal )
+{
+    bs_t bs;
+    int i_ret = 0;
+
+    /* no picture */
+    *pp_pic = NULL;
+
+    /* init bitstream reader */
+    bs_init( &bs, nal->p_payload, nal->i_payload );
+
+    /* an IDR slice resets the decoder before being handled as any slice */
+    if( nal->i_type == NAL_SLICE_IDR )
+    {
+        fprintf( stderr, "x264: NAL_SLICE_IDR\n" );
+        x264_slice_idr( h );
+    }
+
+    switch( nal->i_type )
+    {
+        case NAL_SLICE_IDR:
+        case NAL_SLICE:
+            i_ret = x264_slice_header_decode( h, &bs, nal );
+            if( i_ret < 0 )
+            {
+                fprintf( stderr, "x264: x264_slice_header_decode failed\n" );
+            }
+            /* redundant pictures are parsed but never decoded */
+            if( i_ret == 0 && h->sh.i_redundant_pic_cnt == 0 )
+            {
+                i_ret = x264_slice_data_decode( h, &bs );
+                if( i_ret < 0 )
+                {
+                    fprintf( stderr, "x264: x264_slice_data_decode failed\n" );
+                }
+                else
+                {
+                    *pp_pic = h->picture;
+                }
+            }
+            break;
+
+        case NAL_SPS:
+            i_ret = x264_sps_read( &bs, h->sps_array );
+            if( i_ret < 0 )
+            {
+                fprintf( stderr, "x264: x264_sps_read failed\n" );
+            }
+            break;
+
+        case NAL_PPS:
+            i_ret = x264_pps_read( &bs, h->pps_array );
+            if( i_ret < 0 )
+            {
+                fprintf( stderr, "x264: x264_pps_read failed\n" );
+            }
+            break;
+
+        case NAL_SLICE_DPA:
+        case NAL_SLICE_DPB:
+        case NAL_SLICE_DPC:
+            fprintf( stderr, "partitioned stream unsupported\n" );
+            i_ret = -1;
+            break;
+
+        case NAL_SEI:
+        default:
+            /* silently ignored */
+            break;
+    }
+
+    /* restore CPU state (before using float again) */
+    x264_cpu_restore( h->cpu );
+
+    return i_ret;
+}
+
+/****************************************************************************
+ * x264_decoder_close: release a context created by x264_decoder_open
+ *
+ * The frames and the macroblock array are only released when a picture size
+ * has been set, i.e. when a slice header allocated them.
+ ****************************************************************************/
+void    x264_decoder_close  ( x264_t *h )
+{
+    const int b_frames_allocated = h->picture->i_width != 0 && h->picture->i_height != 0;
+    int i_idx;
+
+    if( b_frames_allocated )
+    {
+        /* slots 0 .. i_num_ref_frames (inclusive) */
+        for( i_idx = 0; i_idx <= h->sps->i_num_ref_frames; i_idx++ )
+        {
+            x264_frame_delete( h->freference[i_idx]);
+        }
+        x264_free( h->mb );
+    }
+
+    /* free vlc table: same counts as in x264_decoder_open */
+    for( i_idx = 0; i_idx < 5; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_coeff_token_lookup[i_idx] );
+    }
+
+    x264_vlc_table_lookup_delete( h->x264_level_prefix_lookup );
+
+    for( i_idx = 0; i_idx < 15; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_total_zeros_lookup[i_idx] );
+    }
+
+    for( i_idx = 0; i_idx < 3; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_total_zeros_dc_lookup[i_idx] );
+    }
+
+    for( i_idx = 0; i_idx < 7; i_idx++ )
+    {
+        x264_vlc_table_lookup_delete( h->x264_run_before_lookup[i_idx] );
+    }
+
+    /* the picture first: it is reached through the context */
+    x264_free( h->picture );
+    x264_free( h );
+}
+
--- a/decoder/macroblock.c
+++ b/decoder/macroblock.c
--- a/decoder/macroblock.h
+++ b/decoder/macroblock.h
@ -0,0 +1,34 @@
+/*****************************************************************************
+ * macroblock.h: h264 decoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* Macroblock level entry points of the decoder. This header relies on
+ * x264_t, bs_t and x264_macroblock_t being declared by the including file. */
+#ifndef _DECODER_MACROBLOCK_H
+#define _DECODER_MACROBLOCK_H 1
+
+/* Read one macroblock from the bitstream s into mb. The slice decoder calls
+ * the cabac variant when the active PPS has b_cabac set and the cavlc one
+ * otherwise, and treats a negative return value as a failure. */
+int  x264_macroblock_read_cabac( x264_t *h, bs_t *s, x264_macroblock_t *mb );
+int  x264_macroblock_read_cavlc( x264_t *h, bs_t *s, x264_macroblock_t *mb );
+
+/* Decode a macroblock that has been read; a negative return value is
+ * reported as an error by the slice decoder. */
+int  x264_macroblock_decode( x264_t *h, x264_macroblock_t *mb );
+/* Decode a skipped macroblock; called once per macroblock of a skip run. */
+void x264_macroblock_decode_skip( x264_t *h, x264_macroblock_t *mb );
+
+#endif
+
--- a/decoder/set.c
+++ b/decoder/set.c
@ -0,0 +1,262 @@
+/*****************************************************************************
+ * x264: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "set.h"
+
+/* x264_sps_read: parse a sequence parameter set (SPS) from s.
+ *
+ * The values are written into sps_array[id], id being the SPS id read from
+ * the stream.  The fields that precede the id are held in locals until the
+ * id has been validated, so that a bad id does not touch any entry.
+ *
+ * return -1 if invalid, else the id.  When the error is found after the id
+ * has been validated, sps_array[id] is left partially overwritten and is
+ * invalidated (i_id = -1).
+ *
+ * VUI parameters are not parsed: b_vui is stored and nothing after it is
+ * read. */
+int x264_sps_read( bs_t *s, x264_sps_t sps_array[32] )
+{
+    x264_sps_t *sps;
+
+    int i_profile_idc;
+    int i_level_idc;
+
+    int b_constraint_set0;
+    int b_constraint_set1;
+    int b_constraint_set2;
+
+    int id;
+
+    i_profile_idc     = bs_read( s, 8 );
+    b_constraint_set0 = bs_read( s, 1 );
+    b_constraint_set1 = bs_read( s, 1 );
+    b_constraint_set2 = bs_read( s, 1 );
+
+    bs_skip( s, 5 );    /* reserved */
+    i_level_idc       = bs_read( s, 8 );
+
+
+    id = bs_read_ue( s );
+    if( bs_eof( s ) || id < 0 || id >= 32 )
+    {
+        /* the sps is invalid, no need to corrupt sps_array[0].
+         * id < 0 guards against a huge code wrapping to a negative int */
+        return -1;
+    }
+
+    sps = &sps_array[id];
+    sps->i_id = id;
+
+    /* put back the values parsed before the id was known */
+    sps->i_profile_idc     = i_profile_idc;
+    sps->i_level_idc       = i_level_idc;
+    sps->b_constraint_set0 = b_constraint_set0;
+    sps->b_constraint_set1 = b_constraint_set1;
+    sps->b_constraint_set2 = b_constraint_set2;
+
+    sps->i_log2_max_frame_num = bs_read_ue( s ) + 4;
+
+    sps->i_poc_type = bs_read_ue( s );
+    if( sps->i_poc_type == 0 )
+    {
+        sps->i_log2_max_poc_lsb = bs_read_ue( s ) + 4;
+    }
+    else if( sps->i_poc_type == 1 )
+    {
+        int i;
+        sps->b_delta_pic_order_always_zero = bs_read( s, 1 );
+        sps->i_offset_for_non_ref_pic = bs_read_se( s );
+        sps->i_offset_for_top_to_bottom_field = bs_read_se( s );
+        sps->i_num_ref_frames_in_poc_cycle = bs_read_ue( s );
+        if( sps->i_num_ref_frames_in_poc_cycle < 0 ||
+            sps->i_num_ref_frames_in_poc_cycle > 255 )
+        {
+            /* H.264 restricts this count to 0..255.  Anything else means a
+             * corrupt stream; clamping it would only leave every following
+             * field misaligned, so reject the SPS instead. */
+            goto error;
+        }
+        for( i = 0; i < sps->i_num_ref_frames_in_poc_cycle; i++ )
+        {
+            sps->i_offset_for_ref_frame[i] = bs_read_se( s );
+        }
+    }
+    else if( sps->i_poc_type != 2 )
+    {
+        /* only types 0, 1 and 2 exist (this also rejects negative values) */
+        goto error;
+    }
+
+    sps->i_num_ref_frames = bs_read_ue( s );
+    sps->b_gaps_in_frame_num_value_allowed = bs_read( s, 1 );
+    sps->i_mb_width = bs_read_ue( s ) + 1;
+    sps->i_mb_height= bs_read_ue( s ) + 1;
+    sps->b_frame_mbs_only = bs_read( s, 1 );
+    if( !sps->b_frame_mbs_only )
+    {
+        sps->b_mb_adaptive_frame_field = bs_read( s, 1 );
+    }
+    else
+    {
+        sps->b_mb_adaptive_frame_field = 0;
+    }
+    sps->b_direct8x8_inference = bs_read( s, 1 );
+
+    sps->b_crop = bs_read( s, 1 );
+    if( sps->b_crop )
+    {
+        sps->crop.i_left  = bs_read_ue( s );
+        sps->crop.i_right = bs_read_ue( s );
+        sps->crop.i_top   = bs_read_ue( s );
+        sps->crop.i_bottom= bs_read_ue( s );
+    }
+    else
+    {
+        sps->crop.i_left  = 0;
+        sps->crop.i_right = 0;
+        sps->crop.i_top   = 0;
+        sps->crop.i_bottom= 0;
+    }
+
+    /* FIXME: the VUI parameters that follow when b_vui is set are not parsed */
+    sps->b_vui = bs_read( s, 1 );
+
+    if( bs_eof( s ) )
+    {
+        /* no rbsp trailing */
+        fprintf( stderr, "incomplete SPS\n" );
+        goto error;
+    }
+
+    /* the size is reported in macroblocks, width x height, both in decimal */
+    fprintf( stderr, "x264_sps_read: sps:0x%x profile:%d/%d poc:%d ref:%d %dx%d crop:%d-%d-%d-%d\n",
+             sps->i_id,
+             sps->i_profile_idc, sps->i_level_idc,
+             sps->i_poc_type,
+             sps->i_num_ref_frames,
+             sps->i_mb_width, sps->i_mb_height,
+             sps->crop.i_left, sps->crop.i_right,
+             sps->crop.i_top, sps->crop.i_bottom );
+
+    return id;
+
+error:
+    /* invalidate this sps */
+    sps->i_id = -1;
+    return -1;
+}
+
+/* x264_pps_read: parse a picture parameter set (PPS) from s.
+ *
+ * The values are written into pps_array[id], id being the PPS id read from
+ * the stream.
+ *
+ * return -1 if invalid, else the id.  When the error is found after the id
+ * has been validated, pps_array[id] is left partially overwritten and is
+ * invalidated (i_id = -1).
+ *
+ * Slice groups (FMO) are only parsed, and reported as unsupported.  For map
+ * type 6 the slice_group_id entries are skipped, not stored. */
+int x264_pps_read( bs_t *s, x264_pps_t pps_array[256] )
+{
+    x264_pps_t *pps;
+    int id;
+    int i;
+
+    id = bs_read_ue( s );
+    if( bs_eof( s ) || id < 0 || id >= 256 )
+    {
+        /* id < 0 guards against a huge code wrapping to a negative int */
+        fprintf( stderr, "id invalid\n" );
+        return -1;
+    }
+    pps = &pps_array[id];
+    pps->i_id = id;
+    pps->i_sps_id = bs_read_ue( s );
+    if( pps->i_sps_id < 0 || pps->i_sps_id >= 32 )
+    {
+        goto error;
+    }
+    pps->b_cabac = bs_read( s, 1 );
+    pps->b_pic_order = bs_read( s, 1 );
+    pps->i_num_slice_groups = bs_read_ue( s ) + 1;
+    if( pps->i_num_slice_groups < 1 || pps->i_num_slice_groups > 8 )
+    {
+        /* H.264 allows at most 8 slice groups.  The count also bounds the
+         * loops filling the per-group arrays below, so it must never be
+         * taken unchecked from the stream. */
+        goto error;
+    }
+    if( pps->i_num_slice_groups > 1 )
+    {
+        fprintf( stderr, "FMO unsupported\n " );
+
+        pps->i_slice_group_map_type  =bs_read_ue( s );
+        if( pps->i_slice_group_map_type == 0 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                pps->i_run_length[i] = bs_read_ue( s );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 2 )
+        {
+            /* one rectangle for every group but the last one */
+            for( i = 0; i < pps->i_num_slice_groups - 1; i++ )
+            {
+                pps->i_top_left[i] = bs_read_ue( s );
+                pps->i_bottom_right[i] = bs_read_ue( s );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 3 ||
+                 pps->i_slice_group_map_type == 4 ||
+                 pps->i_slice_group_map_type == 5 )
+        {
+            pps->b_slice_group_change_direction = bs_read( s, 1 );
+            pps->i_slice_group_change_rate = bs_read_ue( s ) + 1;
+        }
+        else if( pps->i_slice_group_map_type == 6 )
+        {
+            /* each slice_group_id takes Ceil( Log2( i_num_slice_groups ) ) bits */
+            int i_id_bits = 0;
+            while( ( 1 << i_id_bits ) < pps->i_num_slice_groups )
+            {
+                i_id_bits++;
+            }
+
+            pps->i_pic_size_in_map_units = bs_read_ue( s ) + 1;
+            /* FIXME the ids are not stored; they are skipped so that the
+             * fields that follow stay aligned.  Stopping at the end of the
+             * data keeps a corrupt map size from spinning here. */
+            for( i = 0; i < pps->i_pic_size_in_map_units && !bs_eof( s ); i++ )
+            {
+                bs_skip( s, i_id_bits );
+            }
+        }
+    }
+    pps->i_num_ref_idx_l0_active = bs_read_ue( s ) + 1;
+    pps->i_num_ref_idx_l1_active = bs_read_ue( s ) + 1;
+    pps->b_weighted_pred = bs_read( s, 1 );
+    /* 2 bits despite the b_ prefix */
+    pps->b_weighted_bipred = bs_read( s, 2 );
+
+    pps->i_pic_init_qp = bs_read_se( s ) + 26;
+    pps->i_pic_init_qs = bs_read_se( s ) + 26;
+
+    pps->i_chroma_qp_index_offset = bs_read_se( s );
+
+    pps->b_deblocking_filter_control = bs_read( s, 1 );
+    pps->b_constrained_intra_pred = bs_read( s, 1 );
+    pps->b_redundant_pic_cnt = bs_read( s, 1 );
+
+    if( bs_eof( s ) )
+    {
+        /* no rbsp trailing */
+        fprintf( stderr, "incomplete PPS\n" );
+        goto error;
+    }
+    fprintf( stderr, "x264_pps_read: pps:0x%x sps:0x%x %s slice_groups=%d ref0:%d ref1:%d QP:%d QS:%d QC=%d DFC:%d CIP:%d RPC:%d\n",
+             pps->i_id, pps->i_sps_id,
+             pps->b_cabac ? "CABAC" : "CAVLC",
+             pps->i_num_slice_groups,
+             pps->i_num_ref_idx_l0_active,
+             pps->i_num_ref_idx_l1_active,
+             pps->i_pic_init_qp, pps->i_pic_init_qs, pps->i_chroma_qp_index_offset,
+             pps->b_deblocking_filter_control,
+             pps->b_constrained_intra_pred,
+             pps->b_redundant_pic_cnt );
+
+    return id;
+error:
+    /* invalidate this pps */
+    pps->i_id = -1;
+    return -1;
+}
+
--- a/decoder/set.h
+++ b/decoder/set.h
@ -0,0 +1,33 @@
+/*****************************************************************************
+ * set.h: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_SET_H
+#define _DECODER_SET_H 1
+
+/* Parse a sequence parameter set from s and store it in sps_array[id],
+ * id being the SPS id found in the stream.
+ * return -1 if invalid, else the id */
+int x264_sps_read( bs_t *s, x264_sps_t sps_array[32] );
+
+/* Parse a picture parameter set from s and store it in pps_array[id],
+ * id being the PPS id found in the stream.
+ * return -1 if invalid, else the id */
+int x264_pps_read( bs_t *s, x264_pps_t pps_array[256] );
+
+#endif
--- a/decoder/vlc.c
+++ b/decoder/vlc.c
@ -0,0 +1,236 @@
+/*****************************************************************************
+ * vlc.c: VLC lookup table generation.
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "vlc.h"
+
+
+/* Grow table->lookup by i_size entries.
+ * Returns the index of the first added entry, i.e. the previous entry count. */
+static int  vlc_table_realloc( x264_vlc_table_t *table, int i_size )
+{
+    const int i_first_new = table->i_lookup;
+
+    table->i_lookup = i_first_new + i_size;
+    table->lookup   = x264_realloc( table->lookup, sizeof( vlc_lookup_t ) * table->i_lookup );
+
+    return i_first_new;
+}
+
+/* Append to table->lookup one lookup level of (1 << i_lookup_bits) entries
+ * for the codes of vlc[0..i_nb_vlc-1] whose first i_prefix_length bits equal
+ * i_prefix_code, then recursively append the deeper levels those codes need.
+ *
+ * Entries of the level are filled as follows:
+ *   i_size > 0            : a code ends here; i_size is the number of bits it
+ *                           uses at this level and i_value its index in vlc[]
+ *   i_size < 0            : longer codes continue in a sub-level indexed with
+ *                           -i_size bits, starting at table->lookup[i_value]
+ *   i_size == 0, value -1 : no code matches
+ *
+ * Returns the index in table->lookup of the first entry of the new level. */
+static int vlc_table_create_part( x264_vlc_table_t *table, const vlc_t *vlc, int i_lookup_bits, int i_nb_vlc, int i_prefix_code, int i_prefix_length )
+{
+    int i;
+    int i_nb_lookup;
+    vlc_lookup_t *lookup;
+    int i_table_index;
+
+    i_nb_lookup = 1 << i_lookup_bits;
+
+    i_table_index = vlc_table_realloc( table, i_nb_lookup );
+    lookup = &table->lookup[i_table_index];
+
+    /* start with every entry marked as "no code" */
+    for( i = 0; i < i_nb_lookup; i++ )
+    {
+        lookup[i].i_value  = -1;
+        lookup[i].i_size = 0;
+    }
+
+    for( i = 0; i < i_nb_vlc; i++ )
+    {
+        int i_bits;
+        /* entries without a code length are unused */
+        if( vlc[i].i_size <= 0 )
+        {
+            continue;
+        }
+
+        /* i_bits: bits of this code that remain after the prefix; the code
+         * belongs to this level only if its leading bits are the prefix */
+        i_bits = vlc[i].i_size - i_prefix_length;
+        if( i_bits > 0 && ( vlc[i].i_bits >> i_bits ) == i_prefix_code )
+        {
+            if( i_bits <= i_lookup_bits )
+            {
+                int i_lookup_index;
+                int nb;
+
+                /* the code fits: it owns every index that starts with its
+                 * remaining bits, whatever the unused low bits are */
+                i_lookup_index = ( vlc[i].i_bits << ( i_lookup_bits - i_bits ) )%i_nb_lookup;
+                /* NOTE: this value is overwritten by the loop initialisation
+                 * just below and is never read */
+                nb = 1 << ( i_lookup_bits - i_bits );
+                for( nb = 0; nb < (1 << ( i_lookup_bits - i_bits)); nb++ )
+                {
+                    lookup[i_lookup_index].i_value = i; /* vlc[i].i_value; */
+                    lookup[i_lookup_index].i_size = i_bits;
+                    i_lookup_index++;
+                }
+            }
+            else
+            {
+                int i_bits_max;
+                int i_lookup_index;
+                /* need another table */
+                i_lookup_index = ( vlc[i].i_bits >> (i_bits - i_lookup_bits ) )%i_nb_lookup;
+
+                /* remember, as a negative size, the largest number of bits
+                 * still to decode among the codes sharing this entry */
+                i_bits_max =  -lookup[i_lookup_index].i_size;
+                if( i_bits_max < i_bits - i_lookup_bits )
+                {
+                    i_bits_max = i_bits - i_lookup_bits;
+                }
+                lookup[i_lookup_index].i_size = -i_bits_max;
+            }
+        }
+    }
+
+    /* create other level table */
+    for( i = 0; i < i_nb_lookup; i++ )
+    {
+        if( lookup[i].i_size < 0 )
+        {
+            int i_bits;
+            int i_index;
+            i_bits = -lookup[i].i_size;
+            /* a sub-level never uses more index bits than this level; longer
+             * codes get further levels through the recursion */
+            if( i_bits > i_lookup_bits )
+            {
+                lookup[i].i_size = -i_lookup_bits;
+                i_bits = i_lookup_bits;
+            }
+
+            i_index = vlc_table_create_part( table, vlc, i_bits, i_nb_vlc,
+                                             (i_prefix_code << i_lookup_bits)|i,
+                                              i_lookup_bits+i_prefix_length );
+            /* the recursive call grows table->lookup, which may move it:
+             * the pointer to this level has to be recomputed before use */
+            lookup = &table->lookup[i_table_index]; // reallocated
+            lookup[i].i_value = i_index;
+        }
+    }
+
+    return( i_table_index );
+}
+
+
+/* x264_vlc_table_lookup_new: build the lookup table for the i_vlc codes
+ * of vlc[].
+ *
+ * i_lookup_bits is the number of bits used to index the first level of the
+ * table; deeper levels are added for the codes longer than that.
+ *
+ * Returns the new table, to be released with x264_vlc_table_lookup_delete,
+ * or NULL if the table structure could not be allocated. */
+x264_vlc_table_t *x264_vlc_table_lookup_new( const vlc_t *vlc, int i_vlc, int i_lookup_bits )
+{
+    x264_vlc_table_t *table = x264_malloc( sizeof( *table ) );
+
+    if( table == NULL )
+    {
+        /* do not write through a failed allocation */
+        return NULL;
+    }
+
+    table->i_lookup_bits = i_lookup_bits;
+    table->i_lookup = 0;
+    table->lookup   = NULL;
+
+    /* first level: no prefix consumed yet */
+    vlc_table_create_part( table, vlc, i_lookup_bits, i_vlc, 0, 0 );
+
+    return table;
+}
+
+/* x264_vlc_table_lookup_delete: release a table created by
+ * x264_vlc_table_lookup_new, together with its entries.
+ * A NULL table is accepted and ignored. */
+void x264_vlc_table_lookup_delete( x264_vlc_table_t *table )
+{
+    if( table == NULL )
+    {
+        return;
+    }
+
+    x264_free( table->lookup );
+    x264_free( table );
+}
+
+#if 0
+/* Dump every entry of table to stderr as MKVLCLU( value, size ) initializers,
+ * four per line. */
+void x264_vlc_table_lookup_print( x264_vlc_table_t *table )
+{
+    const int i_last = table->i_lookup - 1;
+    int n;
+
+    fprintf( stderr, "       " );
+    for( n = 0; n <= i_last; n++ )
+    {
+        const vlc_lookup_t *entry = &table->lookup[n];
+
+        if( entry->i_value != -1 )
+        {
+            fprintf( stderr, " MKVLCLU( 0x%.3x, % 2d ),", entry->i_value, entry->i_size );
+        }
+        else
+        {
+            /* unused entry */
+            fprintf( stderr, " MKVLCLU(    -1,  0 )," );
+        }
+
+        /* break the line after every fourth entry, except after the last */
+        if( n % 4 == 3 && n < i_last )
+        {
+            fprintf( stderr, "\n       " );
+        }
+    }
+    fprintf( stderr, "\n" );
+}
+
+/* Stand-alone table generator (compiled out by the enclosing #if 0): builds
+ * the lookup table of each of the five x264_coeff_token code tables and dumps
+ * them to stderr as C initializers.
+ * NOTE(review): the typedef preamble is written to stdout while the tables go
+ * to stderr, and the preamble string has no '{' after "typedef struct". */
+int main(void)
+{
+    int i;
+    x264_vlc_table_t *table;
+
+
+    printf( "typedef struct\n    int i_value;\n    int i_size;\n} vlc_lookup_t;\n\n#define MKVLCLU(a,b) { .i_value=a, .i_size=b}" );
+
+    /* create vlc  entry table and then vlc_lookup_t table */
+
+    /* x264_coeff_token */
+    fprintf( stderr, "static const vlc_lookup_t x264_coeff_token_lookup[5][]=\n{\n" );
+    for( i = 0; i < 5; i++ )
+    {
+        fprintf( stderr, "    {\n" );
+        /* 17*4 codes per table, 6 bits for the first lookup level */
+        table = x264_vlc_table_lookup_new( x264_coeff_token[i], 17*4, 6 );
+        x264_vlc_table_lookup_print( table );
+        x264_vlc_table_lookup_delete( table );
+        fprintf( stderr, "    },\n" );
+    }
+    fprintf( stderr, "};\n" );
+
+/* Disabled: uses vlce and i_table, which are not declared in this function,
+ * and the helpers convert_vlc_to_vlce / do_vlc_table_create. */
+#if 0
+
+    vlce = convert_vlc_to_vlce( x264_level_prefix, 16 );
+    do_vlc_table_create( vlce, 16, "x264_level_prefix_lookup", 8 );
+    free( vlce );
+
+    for( i_table = 0; i_table < 15; i_table++ )
+    {
+        char name[512];
+        vlce = convert_vlc_to_vlce( x264_total_zeros[i_table], 16 );
+        sprintf( name, "x264_total_zeros_%d", i_table );
+        do_vlc_table_create( vlce, 16, name, 6 );
+
+        free( vlce );
+    }
+
+    for( i_table = 0; i_table < 3; i_table++ )
+    {
+        char name[512];
+
+        vlce = convert_vlc_to_vlce( x264_total_zeros_dc[i_table], 4 );
+        sprintf( name, "x264_total_zeros_dc_%d", i_table );
+        do_vlc_table_create( vlce, 4, name, 3 );
+
+        free( vlce );
+    }
+
+    for( i_table = 0; i_table < 7; i_table++ )
+    {
+        char name[512];
+        vlce = convert_vlc_to_vlce( x264_run_before[i_table], 15 );
+        sprintf( name, "x264_run_before_%d", i_table );
+        do_vlc_table_create( vlce, 15, name, 6 );
+
+        free( vlce );
+    }
+#endif
+    return 0;
+}
+
+#endif
--- a/decoder/vlc.h
+++ b/decoder/vlc.h
@ -0,0 +1,46 @@
+/*****************************************************************************
+ * vlc.h: h264 decoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: vlc.h,v 1.1 2004/06/03 19:27:07 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _DECODER_VLC_H
+#define _DECODER_VLC_H 1
+
+/* One entry of a multi-level VLC lookup table, as filled by vlc.c:
+ *   i_size > 0               : a code ends here; i_size is the number of bits
+ *                              it uses at this level, i_value its index in
+ *                              the vlc array the table was built from
+ *   i_size < 0               : the code continues in a sub-level indexed with
+ *                              -i_size bits, starting at entry i_value
+ *   i_size == 0, i_value -1  : no code */
+typedef struct
+{
+    int i_value;
+    int i_size;
+} vlc_lookup_t;
+
+struct x264_vlc_table_t
+{
+    /* number of bits used to index the first level */
+    int          i_lookup_bits;
+
+    /* number of entries in lookup[], all levels included */
+    int          i_lookup;
+    vlc_lookup_t *lookup;
+};
+
+/* Build the lookup table for the i_vlc codes of vlc[], using i_lookup_bits
+ * bits for the first level. */
+x264_vlc_table_t *x264_vlc_table_lookup_new( const vlc_t *vlc, int i_vlc, int i_lookup_bits );
+
+/* Release a table returned by x264_vlc_table_lookup_new and its entries. */
+void x264_vlc_table_lookup_delete( x264_vlc_table_t *table );
+
+#endif
+
--- a/doc/dct.txt
+++ b/doc/dct.txt
@ -0,0 +1,111 @@
+/****************************************************************************
+ * DCT/IDCT functions
+ ****************************************************************************/
+/* be careful: "dct" may be the same array as "luma" (i.e. dct_4x4( dct, dct ) is allowed) */
+/* 2x2 transform of the chroma DC coefficients: chroma is read, dct is
+ * written.  Every input is read before the first output is stored, so both
+ * arguments may be the same array. */
+static void dct_2x2_dc( int16_t dct[2][2], int16_t chroma[2][2] )
+{
+    /* first pass: sum and difference of each input row */
+    const int i_sum0  = chroma[0][0] + chroma[0][1];
+    const int i_diff0 = chroma[0][0] - chroma[0][1];
+    const int i_sum1  = chroma[1][0] + chroma[1][1];
+    const int i_diff1 = chroma[1][0] - chroma[1][1];
+
+    /* second pass: combine the two rows */
+    dct[0][0] = i_sum0  + i_sum1;
+    dct[0][1] = i_diff0 + i_diff1;
+    dct[1][0] = i_sum0  - i_sum1;
+    dct[1][1] = i_diff0 - i_diff1;
+}
+
+/* Same transform in the other direction: dct is read, chroma is written.
+ * No scaling is applied, so dct_2x2_dc followed by idct_2x2_dc yields four
+ * times the original values. */
+static void idct_2x2_dc( int16_t dct[2][2], int16_t chroma[2][2] )
+{
+    dct_2x2_dc( chroma, dct );
+}
+
+/* 4x4 transform of the luma DC coefficients: luma is read, dct is written.
+ * The result of the second pass is halved with rounding: ( x + 1 ) / 2.
+ * The input is fully consumed by the first pass, so both arguments may be
+ * the same array. */
+static void dct_4x4_dc( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int n;
+
+    /* first pass: transform each input row, store it as a column of tmp */
+    for( n = 0; n < 4; n++ )
+    {
+        const int s01 = luma[n][0] + luma[n][1];
+        const int d01 = luma[n][0] - luma[n][1];
+        const int s23 = luma[n][2] + luma[n][3];
+        const int d23 = luma[n][2] - luma[n][3];
+
+        tmp[0][n] = s01 + s23;
+        tmp[1][n] = s01 - s23;
+        tmp[2][n] = d01 - d23;
+        tmp[3][n] = d01 + d23;
+    }
+
+    /* second pass: same transform on each row of tmp, then halve */
+    for( n = 0; n < 4; n++ )
+    {
+        const int s01 = tmp[n][0] + tmp[n][1];
+        const int d01 = tmp[n][0] - tmp[n][1];
+        const int s23 = tmp[n][2] + tmp[n][3];
+        const int d23 = tmp[n][2] - tmp[n][3];
+
+        dct[0][n] = ( s01 + s23 + 1 ) / 2;
+        dct[1][n] = ( s01 - s23 + 1 ) / 2;
+        dct[2][n] = ( d01 - d23 + 1 ) / 2;
+        dct[3][n] = ( d01 + d23 + 1 ) / 2;
+    }
+}
+
+/* 4x4 forward transform: luma is read, dct is written.
+ * Each pass applies the rows (1,1,1,1) (2,1,-1,-2) (1,-1,-1,1) (1,-2,2,-1).
+ * The input is fully consumed by the first pass, so both arguments may be
+ * the same array. */
+static void dct_4x4( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int n;
+
+    /* first pass: transform each input row, store it as a column of tmp */
+    for( n = 0; n < 4; n++ )
+    {
+        const int s03 = luma[n][0] + luma[n][3];
+        const int d03 = luma[n][0] - luma[n][3];
+        const int s12 = luma[n][1] + luma[n][2];
+        const int d12 = luma[n][1] - luma[n][2];
+
+        tmp[0][n] =   s03 +   s12;
+        tmp[1][n] = 2*d03 +   d12;
+        tmp[2][n] =   s03 -   s12;
+        tmp[3][n] =   d03 - 2*d12;
+    }
+
+    /* second pass: same transform on each row of tmp */
+    for( n = 0; n < 4; n++ )
+    {
+        const int s03 = tmp[n][0] + tmp[n][3];
+        const int d03 = tmp[n][0] - tmp[n][3];
+        const int s12 = tmp[n][1] + tmp[n][2];
+        const int d12 = tmp[n][1] - tmp[n][2];
+
+        dct[0][n] =   s03 +   s12;
+        dct[1][n] = 2*d03 +   d12;
+        dct[2][n] =   s03 -   s12;
+        dct[3][n] =   d03 - 2*d12;
+    }
+}
+
+/* 4x4 inverse transform: dct is read, luma is written.
+ * The second pass rounds and scales its result with ( x + 32 ) >> 6.
+ * The input is fully consumed by the first pass, so both arguments may be
+ * the same array. */
+static void idct_4x4( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int n;
+
+    /* first pass: transform each input column, store it as a column of tmp */
+    for( n = 0; n < 4; n++ )
+    {
+        const int e = dct[0][n] + dct[2][n];
+        const int f = dct[0][n] - dct[2][n];
+        const int g = dct[1][n] + (dct[3][n]>>1);
+        const int h = (dct[1][n]>>1) - dct[3][n];
+
+        tmp[0][n] = e + g;
+        tmp[1][n] = f + h;
+        tmp[2][n] = f - h;
+        tmp[3][n] = e - g;
+    }
+
+    /* second pass: same transform on each row of tmp.
+     * XXX keep the shift: ">> 6" and "/ 64" do not give the same result */
+    for( n = 0; n < 4; n++ )
+    {
+        const int e = tmp[n][0] + tmp[n][2];
+        const int f = tmp[n][0] - tmp[n][2];
+        const int g = tmp[n][1] + (tmp[n][3]>>1);
+        const int h = (tmp[n][1]>>1) - tmp[n][3];
+
+        luma[n][0] = ( e + g + 32 ) >> 6;
+        luma[n][1] = ( f + h + 32 ) >> 6;
+        luma[n][2] = ( f - h + 32 ) >> 6;
+        luma[n][3] = ( e - g + 32 ) >> 6;
+    }
+}
+/* 4x4 inverse transform of the luma DC coefficients: dct is read, luma is
+ * written.  Unlike dct_4x4_dc, the result is neither rounded nor scaled.
+ * The input is fully consumed by the first pass, so both arguments may be
+ * the same array. */
+static void idct_4x4_dc( int16_t dct[4][4], int16_t luma[4][4] )
+{
+    int16_t tmp[4][4];
+    int n;
+
+    /* first pass: transform each input column, store it as a column of tmp */
+    for( n = 0; n < 4; n++ )
+    {
+        const int s01 = dct[0][n] + dct[1][n];
+        const int d01 = dct[0][n] - dct[1][n];
+        const int s23 = dct[2][n] + dct[3][n];
+        const int d23 = dct[2][n] - dct[3][n];
+
+        tmp[0][n] = s01 + s23;
+        tmp[1][n] = s01 - s23;
+        tmp[2][n] = d01 - d23;
+        tmp[3][n] = d01 + d23;
+    }
+
+    /* second pass: same transform on each row of tmp */
+    for( n = 0; n < 4; n++ )
+    {
+        const int s01 = tmp[n][0] + tmp[n][1];
+        const int d01 = tmp[n][0] - tmp[n][1];
+        const int s23 = tmp[n][2] + tmp[n][3];
+        const int d23 = tmp[n][2] - tmp[n][3];
+
+        luma[n][0] = s01 + s23;
+        luma[n][1] = s01 - s23;
+        luma[n][2] = d01 - d23;
+        luma[n][3] = d01 + d23;
+    }
+}
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@ -0,0 +1,29 @@
+/*****************************************************************************
+ * analyse.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: analyse.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ANALYSE_H
+#define _ANALYSE_H 1
+
+/* Encoder macroblock analysis entry point.  The definition is not part of
+ * this header.
+ * NOTE(review): presumably selects the coding mode of the current macroblock
+ * of h -- confirm against encoder/analyse.c. */
+void x264_macroblock_analyse( x264_t *h );
+
+#endif
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@ -0,0 +1,688 @@
+/*****************************************************************************
+ * cavlc.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: cavlc.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "../core/vlc.h"
+#include "macroblock.h"
+
+/* NOTE(review): presumably maps a coded block pattern (0..47) to the
+ * Exp-Golomb code number used to write it, for intra 4x4 macroblocks.  The
+ * table is not used in the part of the file reviewed here -- confirm. */
+static const uint8_t intra4x4_cbp_to_golomb[48]=
+{
+  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+ 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
+};
+/* NOTE(review): presumably the same mapping for inter macroblocks -- confirm. */
+static const uint8_t inter_cbp_to_golomb[48]=
+{
+  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+/* Index -> coordinate tables for the sixteen blocks of a 4x4 grid.
+ * block_idx_xy is the inverse mapping, indexed [x][y]:
+ *   block_idx_xy[ block_idx_x[i] ][ block_idx_y[i] ] == i
+ * NOTE(review): presumably i is the 4x4 luma block index inside a
+ * macroblock -- confirm. */
+static const uint8_t block_idx_x[16] =
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+/* Special values of the i_idx argument of block_residual_write_cavlc, used
+ * in place of a block index for the DC blocks */
+#define BLOCK_INDEX_CHROMA_DC   (-1)
+#define BLOCK_INDEX_LUMA_DC     (-2)
+
+/* Write the code word described by v to s: v.i_size bits, value v.i_bits. */
+static inline void bs_write_vlc( bs_t *s, vlc_t v )
+{
+    const vlc_t *code = &v;
+
+    bs_write( s, code->i_size, code->i_bits );
+}
+
+/****************************************************************************
+ * block_residual_write_cavlc:
+ *  Write to s the CAVLC coded residual of one block.
+ *
+ *  i_idx   : index of the block, used to predict the number of non zero
+ *            coefficients that selects the coeff_token table, or
+ *            BLOCK_INDEX_CHROMA_DC / BLOCK_INDEX_LUMA_DC for the DC blocks
+ *  l       : the i_count coefficients of the block
+ *  i_count : number of coefficients in l (the local arrays hold at most 16)
+ *
+ *  Written in order: coeff_token, the signs of the trailing +/-1
+ *  coefficients, the remaining levels, total_zeros, then the run_before
+ *  codes.  Nothing follows coeff_token when every coefficient is zero.
+ ****************************************************************************/
+static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, int i_count )
+{
+    int level[16], run[16];
+    int i_total, i_trailing;
+    int i_total_zero;
+    int i_last;
+    unsigned int i_sign;
+
+    int i;
+    int i_zero_left;
+    int i_suffix_length;
+
+    /* first find i_last */
+    i_last = i_count - 1;
+    while( i_last >= 0 && l[i_last] == 0 )
+    {
+        i_last--;
+    }
+
+    i_sign = 0;
+    i_total = 0;
+    i_trailing = 0;
+    i_total_zero = 0;
+
+    if( i_last >= 0 )
+    {
+        int b_trailing = 1;
+        int idx = 0;
+
+        /* level and run and total */
+        /* The coefficients are visited from the last non zero one down to
+         * index 0: level[0] is the last non zero coefficient, and run[idx]
+         * counts the zeros found just below level[idx]. */
+        while( i_last >= 0 )
+        {
+            level[idx] = l[i_last--];
+
+            run[idx] = 0;
+            while( i_last >= 0 && l[i_last] == 0 )
+            {
+                run[idx]++;
+                i_last--;
+            }
+
+            i_total++;
+            i_total_zero += run[idx];
+
+            /* trailing ones: up to three consecutive +/-1 at the start of
+             * level[]; their signs are collected in i_sign, first one in the
+             * highest bit, 1 meaning negative */
+            if( b_trailing && abs( level[idx] ) == 1 && i_trailing < 3 )
+            {
+                i_sign <<= 1;
+                if( level[idx] < 0 )
+                {
+                    i_sign |= 0x01;
+                }
+
+                i_trailing++;
+            }
+            else
+            {
+                b_trailing = 0;
+            }
+
+            idx++;
+        }
+    }
+
+    /* total/trailing */
+    if( i_idx == BLOCK_INDEX_CHROMA_DC )
+    {
+        /* the chroma DC block always uses table 4 */
+        bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
+    }
+    else
+    {
+        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
+        int nC;
+
+        /* the luma DC block uses the prediction of block 0 */
+        if( i_idx == BLOCK_INDEX_LUMA_DC )
+        {
+            nC = x264_mb_predict_non_zero_code( h, 0 );
+        }
+        else
+        {
+            nC = x264_mb_predict_non_zero_code( h, i_idx );
+        }
+
+        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total*4+i_trailing] );
+    }
+
+    /* nothing more to write for an all zero block */
+    if( i_total <= 0 )
+    {
+        return;
+    }
+
+    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
+    if( i_trailing > 0 )
+    {
+        /* one sign bit per trailing one */
+        bs_write( s, i_trailing, i_sign );
+    }
+    /* the remaining levels, each as a prefix code plus an optional suffix */
+    for( i = i_trailing; i < i_total; i++ )
+    {
+        int i_level_code;
+
+        /* calculate level code */
+        if( level[i] < 0 )
+        {
+            i_level_code = -2*level[i] - 1;
+        }
+        else /* if( level[i] > 0 ) */
+        {
+            i_level_code = 2 * level[i] - 2;
+        }
+        if( i == i_trailing && i_trailing < 3 )
+        {
+            i_level_code -=2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        }
+
+        if( ( i_level_code >> i_suffix_length ) < 14 )
+        {
+            /* common case: the prefix is the level code without its suffix bits */
+            bs_write_vlc( s, x264_level_prefix[i_level_code >> i_suffix_length] );
+            if( i_suffix_length > 0 )
+            {
+                bs_write( s, i_suffix_length, i_level_code );
+            }
+        }
+        else if( i_suffix_length == 0 && i_level_code < 30 )
+        {
+            /* prefix 14 without suffix length: a 4 bit suffix is written */
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, 4, i_level_code - 14 );
+        }
+        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
+        {
+            bs_write_vlc( s, x264_level_prefix[14] );
+            bs_write( s, i_suffix_length, i_level_code );
+        }
+        else
+        {
+            /* escape: prefix 15 followed by a 12 bit suffix */
+            bs_write_vlc( s, x264_level_prefix[15] );
+            i_level_code -= 15 << i_suffix_length;
+            if( i_suffix_length == 0 )
+            {
+                i_level_code -= 15;
+            }
+
+            /* a value that does not fit the 12 bits is only reported: the
+             * write below is done regardless */
+            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
+            {
+                fprintf( stderr, "OVERFLOW levelcode=%d\n", i_level_code );
+            }
+
+            bs_write( s, 12, i_level_code );    /* check overflow ?? */
+        }
+
+        /* adapt the suffix length for the next level */
+        if( i_suffix_length == 0 )
+        {
+            i_suffix_length++;
+        }
+        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
+        {
+            i_suffix_length++;
+        }
+    }
+
+    /* total_zeros is written only when the block is not full */
+    if( i_total < i_count )
+    {
+        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        {
+            bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
+        }
+        else
+        {
+            bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
+        }
+    }
+
+    /* run_before of every coefficient but the last written one, stopping as
+     * soon as no zero is left to place */
+    for( i = 0, i_zero_left = i_total_zero; i < i_total - 1; i++ )
+    {
+        int i_zl;
+
+        if( i_zero_left <= 0 )
+        {
+            break;
+        }
+
+        /* the table is selected by the number of zeros left, capped at 7 */
+        i_zl = X264_MIN( i_zero_left - 1, 6 );
+
+        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
+
+        i_zero_left -= run[i];
+    }
+}
+
+/*****************************************************************************
+ * cavlc_write_mvd:
+ *  Write the motion vector difference of one partition: x264_mb_predict_mv
+ *  gives the predicted vector, then (mv - prediction) is written with
+ *  bs_write_se, component [0] first.
+ *   i_list  : reference list (0 or 1)
+ *   idx     : index of the first 4x4 block of the partition
+ *   i_width : width handed to x264_mb_predict_mv (the callers use 4 for
+ *             16 pixels wide partitions, 2 for 8 and 1 for 4)
+ *****************************************************************************/
+static void cavlc_write_mvd( x264_t *h, bs_t *s, int i_list, int idx, int i_width )
+{
+    int mvp[2];
+
+    x264_mb_predict_mv( h, i_list, idx, i_width, mvp );
+    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
+    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
+}
+
+/*****************************************************************************
+ * x264_macroblock_write_cavlc:
+ *  Write the current macroblock (h->mb) into the bitstream s, in this order:
+ *   - mb_type (intra types are offset according to the slice type)
+ *   - prediction info: intra prediction modes, or sub partition types,
+ *     reference indexes and motion vector differences
+ *   - coded block pattern (not for I_16x16, where it is part of mb_type)
+ *   - qp delta and luma residual, then chroma residual
+ *  I_PCM macroblocks are written as raw samples taken from h->fenc and
+ *  nothing else.
+ *  On an unsupported slice type, or on a B_8x8 macroblock, a message is
+ *  printed on stderr and the function returns without writing anything.
+ *****************************************************************************/
+void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
+{
+    const int i_mb_type = h->mb.i_type;
+    int i_mb_i_offset;
+    int i;
+
+    /* Offset added to the mb_type of intra macroblocks, it depends on
+     * the slice type */
+    switch( h->sh.i_type )
+    {
+        case SLICE_TYPE_I:
+            i_mb_i_offset = 0;
+            break;
+        case SLICE_TYPE_P:
+            i_mb_i_offset = 5;
+            break;
+        case SLICE_TYPE_B:
+            i_mb_i_offset = 23;
+            break;
+        default:
+            fprintf( stderr, "internal error or slice unsupported\n" );
+            return;
+    }
+
+    /* Write:
+      - type
+      - prediction
+      - mv */
+    if( i_mb_type == I_PCM )
+    {
+        /* Untested */
+        bs_write_ue( s, i_mb_i_offset + 25 );
+
+        bs_align_0( s );
+        /* Luma */
+        for( i = 0; i < 16*16; i++ )
+        {
+            const int x = 16 * h->mb.i_mb_x + (i % 16);
+            const int y = 16 * h->mb.i_mb_y + (i / 16);
+            bs_write( s, 8, h->fenc->plane[0][y*h->mb.pic.i_stride[0]+x] );
+        }
+        /* Cb */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[1][y*h->mb.pic.i_stride[1]+x] );
+        }
+        /* Cr */
+        for( i = 0; i < 8*8; i++ )
+        {
+            const int x = 8 * h->mb.i_mb_x + (i % 8);
+            const int y = 8 * h->mb.i_mb_y + (i / 8);
+            bs_write( s, 8, h->fenc->plane[2][y*h->mb.pic.i_stride[2]+x] );
+        }
+        /* no cbp nor residual for a PCM macroblock */
+        return;
+    }
+    else if( i_mb_type == I_4x4 )
+    {
+        bs_write_ue( s, i_mb_i_offset + 0 );
+
+        /* Prediction: Luma */
+        for( i = 0; i < 16; i++ )
+        {
+            int i_pred = x264_mb_predict_intra4x4_mode( h, i );
+            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+
+            if( i_pred == i_mode)
+            {
+                bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
+            }
+            else
+            {
+                bs_write1( s, 0 );  /* b_prev_intra4x4_pred_mode */
+                /* the predicted mode is excluded, so the modes above it
+                 * are shifted down by one to fit in 3 bits */
+                if( i_mode < i_pred )
+                {
+                    bs_write( s, 3, i_mode );
+                }
+                else
+                {
+                    bs_write( s, 3, i_mode - 1 );
+                }
+            }
+        }
+        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+    }
+    else if( i_mb_type == I_16x16 )
+    {
+        /* prediction mode and cbp are carried by mb_type */
+        bs_write_ue( s, i_mb_i_offset + 1 + h->mb.i_intra16x16_pred_mode +
+                        h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
+        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+    }
+    else if( i_mb_type == P_L0 )
+    {
+        if( h->mb.i_partition == D_16x16 )
+        {
+            bs_write_ue( s, 0 );
+
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            }
+            cavlc_write_mvd( h, s, 0, 0, 4 );
+        }
+        else if( h->mb.i_partition == D_16x8 )
+        {
+            bs_write_ue( s, 1 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            }
+
+            cavlc_write_mvd( h, s, 0, 0, 4 );
+            cavlc_write_mvd( h, s, 0, 8, 4 );
+        }
+        else if( h->mb.i_partition == D_8x16 )
+        {
+            bs_write_ue( s, 2 );
+            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            }
+
+            cavlc_write_mvd( h, s, 0, 0, 2 );
+            cavlc_write_mvd( h, s, 0, 4, 2 );
+        }
+    }
+    else if( i_mb_type == P_8x8 )
+    {
+        int b_sub_ref0;
+
+        /* mb_type 4 is used when the four 8x8 blocks reference index 0,
+         * no reference index is written in that case */
+        if( h->mb.cache.ref[0][x264_scan8[0]] == 0 && h->mb.cache.ref[0][x264_scan8[4]] == 0 &&
+            h->mb.cache.ref[0][x264_scan8[8]] == 0 && h->mb.cache.ref[0][x264_scan8[12]] == 0 )
+        {
+            bs_write_ue( s, 4 );
+            b_sub_ref0 = 0;
+        }
+        else
+        {
+            bs_write_ue( s, 3 );
+            b_sub_ref0 = 1;
+        }
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    bs_write_ue( s, 0 );
+                    break;
+                case D_L0_8x4:
+                    bs_write_ue( s, 1 );
+                    break;
+                case D_L0_4x8:
+                    bs_write_ue( s, 2 );
+                    break;
+                case D_L0_4x4:
+                    bs_write_ue( s, 3 );
+                    break;
+            }
+        }
+        /* ref0 */
+        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        {
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[12]] );
+        }
+        /* mvd of every sub partition, 8x8 block after 8x8 block */
+        for( i = 0; i < 4; i++ )
+        {
+            switch( h->mb.i_sub_partition[i] )
+            {
+                case D_L0_8x8:
+                    cavlc_write_mvd( h, s, 0, 4*i, 2 );
+                    break;
+                case D_L0_8x4:
+                    cavlc_write_mvd( h, s, 0, 4*i+0, 2 );
+                    cavlc_write_mvd( h, s, 0, 4*i+2, 2 );
+                    break;
+                case D_L0_4x8:
+                    cavlc_write_mvd( h, s, 0, 4*i+0, 1 );
+                    cavlc_write_mvd( h, s, 0, 4*i+1, 1 );
+                    break;
+                case D_L0_4x4:
+                    cavlc_write_mvd( h, s, 0, 4*i+0, 1 );
+                    cavlc_write_mvd( h, s, 0, 4*i+1, 1 );
+                    cavlc_write_mvd( h, s, 0, 4*i+2, 1 );
+                    cavlc_write_mvd( h, s, 0, 4*i+3, 1 );
+                    break;
+            }
+        }
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        fprintf( stderr, "invalid/unhandled mb_type (B_8x8)\n" );
+        return;
+    }
+    else if( i_mb_type == B_DIRECT )
+    {
+        bs_write_ue( s, 0 );
+    }
+    else
+    {
+        /* All other B mode */
+        /* Motion Vector */
+        int i_list;
+
+        /* b_list[list][partition] is non zero when the partition uses the list */
+        int b_list[2][2];
+
+        /* init ref list utilisations */
+        for( i = 0; i < 2; i++ )
+        {
+            b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
+            b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
+        }
+
+
+        if( h->mb.i_partition == D_16x16 )
+        {
+            if( b_list[0][0] && b_list[1][0] )
+            {
+                bs_write_ue( s, 3 );
+            }
+            else if( b_list[1][0] )
+            {
+                bs_write_ue( s, 2 );
+            }
+            else
+            {
+                bs_write_ue( s, 1 );
+            }
+        }
+        else
+        {
+            /* mb_type of the two partitions B macroblocks (16x8 value,
+             * add 1 for 8x16), see Table 7-14 of the H.264 specification:
+             *   4: L0_L0    6: L1_L1    8: L0_L1   10: L1_L0
+             *  12: L0_Bi   14: L1_Bi   16: Bi_L0   18: Bi_L1   20: Bi_Bi */
+            const int i_8x16 = h->mb.i_partition == D_16x8 ? 0 : 1;
+
+            if( i_mb_type == B_BI_BI )
+            {
+                bs_write_ue( s, 20 + i_8x16 );
+            }
+            else if( b_list[0][0] && b_list[1][0] )
+            {
+                /* B_BI_LX* */
+                bs_write_ue( s, 16 + (b_list[0][1]?0:2) + i_8x16 );
+            }
+            else if( b_list[0][1] && b_list[1][1] )
+            {
+                /* B_LX_BI: the list of the FIRST partition selects between
+                 * L0_Bi (12) and L1_Bi (14). The second partition always
+                 * uses both lists here, so it cannot tell them apart. */
+                bs_write_ue( s, 12 + (b_list[0][0]?0:2) + i_8x16 );
+            }
+            else if( b_list[1][1] )
+            {
+                /* B_LX_L1 */
+                bs_write_ue( s, 6 + (b_list[0][0]?2:0) + i_8x16 );
+            }
+            else if( b_list[0][1] )
+            {
+                /* B_LX_L0 */
+                bs_write_ue( s, 4 + (b_list[0][0]?0:6) + i_8x16 );
+            }
+        }
+
+        /* reference indexes: all of list 0 first, then list 1 */
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            const int i_ref_max = i_list == 0 ? h->sh.i_num_ref_idx_l0_active : h->sh.i_num_ref_idx_l1_active;
+
+            if( i_ref_max > 1 )
+            {
+                switch( h->mb.i_partition )
+                {
+                    case D_16x16:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        break;
+                    case D_16x8:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[8]] );
+                        break;
+                    case D_8x16:
+                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
+                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[4]] );
+                        break;
+                }
+            }
+        }
+        /* motion vector differences: all of list 0 first, then list 1 */
+        for( i_list = 0; i_list < 2; i_list++ )
+        {
+            switch( h->mb.i_partition )
+            {
+                case D_16x16:
+                    if( b_list[i_list][0] ) cavlc_write_mvd( h, s, i_list, 0, 4 );
+                    break;
+                case D_16x8:
+                    if( b_list[i_list][0] ) cavlc_write_mvd( h, s, i_list, 0, 4 );
+                    if( b_list[i_list][1] ) cavlc_write_mvd( h, s, i_list, 8, 4 );
+                    break;
+                case D_8x16:
+                    if( b_list[i_list][0] ) cavlc_write_mvd( h, s, i_list, 0, 2 );
+                    if( b_list[i_list][1] ) cavlc_write_mvd( h, s, i_list, 4, 2 );
+                    break;
+            }
+        }
+    }
+
+    /* Coded block pattern */
+    if( i_mb_type == I_4x4 )
+    {
+        bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    }
+    else if( i_mb_type != I_16x16 )
+    {
+        bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    }
+
+    /* write residual */
+    if( i_mb_type == I_16x16 )
+    {
+        /* qp delta is always written for I_16x16 */
+        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
+
+        /* DC Luma */
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
+
+        if( h->mb.i_cbp_luma != 0 )
+        {
+            /* AC Luma */
+            for( i = 0; i < 16; i++ )
+            {
+                block_residual_write_cavlc( h, s, i, h->dct.block[i].residual_ac, 15 );
+            }
+        }
+    }
+    else if( h->mb.i_cbp_luma != 0 || h->mb.i_cbp_chroma != 0 )
+    {
+        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
+
+        /* one bit of i_cbp_luma per 8x8 block, ie per group of four 4x4 blocks */
+        for( i = 0; i < 16; i++ )
+        {
+            if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
+            {
+                block_residual_write_cavlc( h, s, i, h->dct.block[i].luma4x4, 16 );
+            }
+        }
+    }
+    if( h->mb.i_cbp_chroma != 0 )
+    {
+        /* Chroma DC residual present */
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
+        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
+        if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+        {
+            for( i = 0; i < 8; i++ )
+            {
+                block_residual_write_cavlc( h, s, 16 + i, h->dct.block[16+i].residual_ac, 15 );
+            }
+        }
+    }
+}
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@ -0,0 +1,859 @@
+/*****************************************************************************
+ * macroblock.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "macroblock.h"
+
+/* Position of the 16 luma 4x4 blocks of a macroblock, in 4x4 block units:
+ * block i is at column block_idx_x[i] and row block_idx_y[i] (the pixel
+ * offset of a block is built in this file as 4*x + 4*y*stride).
+ * block_idx_xy[x][y] is the reverse mapping, from a position to the block
+ * index. */
+static const uint8_t block_idx_x[16] =
+{
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] =
+{
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] =
+{
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+/* Quantisation multipliers, indexed [i_qscale % 6][y][x].
+ * quant_4x4 multiplies each coefficient by the entry found at its position,
+ * quant_4x4_dc and quant_2x2_dc only use the entry [0][0]. */
+static const int quant_mf[6][4][4] =
+{
+    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
+       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
+    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
+       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
+    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
+       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
+    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
+       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
+    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
+       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
+    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
+       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+};
+/* Table of 52 values (index 0..51): identity up to index 29, then growing
+ * more slowly up to 39.
+ * NOTE(review): presumably maps a luma qp to the qp used for chroma; the
+ * table is not used in the part of the file reviewed here - confirm against
+ * its users. */
+static const int i_chroma_qp_table[52] =
+{
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
+    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
+    39, 39
+};
+
+/****************************************************************************
+ * Scan and Quant functions
+ ****************************************************************************/
+//static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
+//static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
+
+/* Copy the 16 coefficients of a 4x4 block (dct[row][column]) into level[]
+ * following the zigzag order, dc coefficient included. */
+static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
+{
+    /* { row, column } of each coefficient, in scan order */
+    static const uint8_t i_zigzag[16][2] =
+    {
+        {0,0}, {0,1}, {1,0}, {2,0},
+        {1,1}, {0,2}, {0,3}, {1,2},
+        {2,1}, {3,0}, {3,1}, {2,2},
+        {1,3}, {2,3}, {3,2}, {3,3}
+    };
+    int k;
+
+    for( k = 0; k < 16; k++ )
+    {
+        level[k] = dct[i_zigzag[k][0]][i_zigzag[k][1]];
+    }
+}
+/* Copy the 15 ac coefficients of a 4x4 block (dct[row][column]) into level[]
+ * following the zigzag order; the dc coefficient dct[0][0] is left out. */
+static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
+{
+    /* { row, column } of each ac coefficient, in scan order */
+    static const uint8_t i_zigzag[15][2] =
+    {
+               {0,1}, {1,0}, {2,0},
+        {1,1}, {0,2}, {0,3}, {1,2},
+        {2,1}, {3,0}, {3,1}, {2,2},
+        {1,3}, {2,3}, {3,2}, {3,3}
+    };
+    int k;
+
+    for( k = 0; k < 15; k++ )
+    {
+        level[k] = dct[i_zigzag[k][0]][i_zigzag[k][1]];
+    }
+}
+
+/* Copy the 4 coefficients of a 2x2 dc block into level[], row after row. */
+static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
+{
+    int k;
+
+    for( k = 0; k < 4; k++ )
+    {
+        /* k >> 1 is the row, k & 1 the column */
+        level[k] = dct[k >> 1][k & 1];
+    }
+}
+
+
+/* Quantise a 4x4 block in place.
+ * Each coefficient becomes sign * ( ( round + |coef| * mf ) >> shift ) with
+ * shift = 15 + i_qscale / 6, mf taken from quant_mf[i_qscale % 6] and round
+ * equal to a third (b_intra) or a sixth (otherwise) of 1 << shift. */
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int i_shift = 15 + i_qscale / 6;
+    const int (*mf)[4] = quant_mf[i_qscale % 6];
+    const int i_round = ( 1 << i_shift ) / ( b_intra ? 3 : 6 );
+    int row, col;
+
+    for( row = 0; row < 4; row++ )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            const int i_coef  = dct[row][col];
+            const int i_abs   = i_coef > 0 ? i_coef : -i_coef;
+            const int i_level = ( i_round + i_abs * mf[row][col] ) >> i_shift;
+
+            /* the quantised level keeps the sign of the coefficient */
+            dct[row][col] = i_coef > 0 ? i_level : -i_level;
+        }
+    }
+}
+/* Quantise in place the 4x4 block of dc coefficients.
+ * Same formula for the 16 coefficients: the multiplier is
+ * quant_mf[i_qscale % 6][0][0], the shift is one bit more than in quant_4x4
+ * (16 + i_qscale / 6) and the rounding is a third of 1 << shift. */
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_shift = 16 + i_qscale / 6;
+    const int i_round = ( 1 << i_shift ) / 3;
+    const int i_mf    = quant_mf[i_qscale % 6][0][0];
+    int row, col;
+
+    for( row = 0; row < 4; row++ )
+    {
+        for( col = 0; col < 4; col++ )
+        {
+            const int i_coef  = dct[row][col];
+            const int i_abs   = i_coef > 0 ? i_coef : -i_coef;
+            const int i_level = ( i_round + i_abs * i_mf ) >> i_shift;
+
+            dct[row][col] = i_coef > 0 ? i_level : -i_level;
+        }
+    }
+}
+/* Quantise in place a 2x2 block of dc coefficients.
+ * Same formula as quant_4x4_dc (multiplier quant_mf[i_qscale % 6][0][0],
+ * shift 16 + i_qscale / 6) but the rounding is a third of 1 << shift when
+ * b_intra is set and a sixth otherwise. */
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    const int i_shift = 16 + i_qscale / 6;
+    const int i_round = ( 1 << i_shift ) / ( b_intra ? 3 : 6 );
+    const int i_mf    = quant_mf[i_qscale % 6][0][0];
+    int row, col;
+
+    for( row = 0; row < 2; row++ )
+    {
+        for( col = 0; col < 2; col++ )
+        {
+            const int i_coef  = dct[row][col];
+            const int i_abs   = i_coef > 0 ? i_coef : -i_coef;
+            const int i_level = ( i_round + i_abs * i_mf ) >> i_shift;
+
+            dct[row][col] = i_coef > 0 ? i_level : -i_level;
+        }
+    }
+}
+#if 0
+/* Disabled (#if 0) alternative versions of quant_4x4, quant_4x4_dc and
+ * quant_2x2_dc. The formulas are the same as in the active versions, but the
+ * rounding offset is num/den of the quantisation step, with num/den read
+ * from the tables below for each coefficient position (position [0][0] for
+ * the two dc versions) instead of the fixed 1/3 (intra) or 1/6 (inter).
+ * From a JVT doc */
+static const int f_deadzone_intra[4][4][2] = /* [num][den] */
+{
+    { {1,2}, {3,7}, {2,5}, {1,3} },
+    { {3,7}, {2,5}, {1,3}, {1,4} },
+    { {2,5}, {1,3}, {1,4}, {1,5} },
+    { {1,3}, {1,4}, {1,5}, {1,5} }
+};
+static const int f_deadzone_inter[4][4][2] = /* [num][den] */
+{
+    { {1,3}, {2,7}, {4,15},{2,9} },
+    { {2,7}, {4,15},{2,9}, {1,6} },
+    { {4,15},{2,9}, {1,6}, {1,7} },
+    { {2,9}, {1,6}, {1,7}, {2,15} }
+};
+
+
+static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
+{
+    const int(*f_deadzone)[4][4][2] = b_intra ? &f_deadzone_intra : &f_deadzone_inter;
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_mf = i_qscale % 6;
+
+    int x,y;
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+#if 0
+            const int f = b_intra ?
+                          (f_deadzone_intra[y][x][0] * ( 1 << i_qbits ) / f_deadzone_intra[y][x][1])
+                          :
+                          (f_deadzone_inter[y][x][0] * ( 1 << i_qbits ) / f_deadzone_inter[y][x][1]);
+#else
+            const int f = (*f_deadzone)[y][x][0] * ( 1 << i_qbits ) / (*f_deadzone)[y][x][1];
+#endif
+
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
+            }
+            else
+            {
+                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
+            }
+        }
+    }
+}
+
+static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
+{
+    const int i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    const int f2 = f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1];
+    int x,y;
+
+    for( y = 0; y < 4; y++ )
+    {
+        for( x = 0; x < 4; x++ )
+        {
+
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
+{
+    int const i_qbits = 15 + i_qscale / 6;
+    const int i_qmf = quant_mf[i_qscale%6][0][0];
+    const int f2 = b_intra ?
+                   (f_deadzone_intra[0][0][0] * ( 2 << i_qbits ) / f_deadzone_intra[0][0][1])
+                   :
+                   (f_deadzone_inter[0][0][0] * ( 2 << i_qbits ) / f_deadzone_inter[0][0][1]);
+    int x,y;
+    for( y = 0; y < 2; y++ )
+    {
+        for( x = 0; x < 2; x++ )
+        {
+            if( dct[y][x] > 0 )
+            {
+                dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
+            }
+            else
+            {
+                dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
+            }
+        }
+    }
+}
+
+
+#endif
+
+/* Return how many of the i_count first values of v are not zero. */
+static inline int array_non_zero_count( int *v, int i_count )
+{
+    int i_found = 0;
+    int k = 0;
+
+    while( k < i_count )
+    {
+        i_found += ( v[k++] != 0 );
+    }
+    return i_found;
+}
+
+/* (ref: JVT-B118)
+ * x264_mb_decimate_score: score telling if a block of i_max quantised
+ * coefficients is cheap enough to be replaced by an empty one (a low score
+ * means it can be set to null).
+ * The coefficients are walked from the last non zero one down to the first:
+ * any coefficient outside [-1,1] gives 9 at once, otherwise each coefficient
+ * adds a value depending on the number of zeros found just below it.
+ * Used in inter macroblock (luma and chroma)
+ *  luma: for a 8x8 block: if score < 4 -> null
+ *        for the complete mb: if score < 6 -> null
+ *  chroma: for the complete mb: if score < 7 -> null
+ */
+static int x264_mb_decimate_score( int *dct, int i_max )
+{
+    /* score added for a coefficient, indexed by the length of its zero run */
+    static const int i_run_score[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    int i_total = 0;
+    int i_pos;
+
+    /* go to the last non zero coefficient */
+    for( i_pos = i_max - 1; i_pos >= 0 && dct[i_pos] == 0; i_pos-- )
+    {
+    }
+
+    while( i_pos >= 0 )
+    {
+        int i_zeros = 0;
+
+        if( dct[i_pos] > 1 || dct[i_pos] < -1 )
+        {
+            return 9;
+        }
+
+        /* count the zeros between this coefficient and the previous one */
+        for( i_pos--; i_pos >= 0 && dct[i_pos] == 0; i_pos-- )
+        {
+            i_zeros++;
+        }
+        i_total += i_run_score[i_zeros];
+    }
+
+    return i_total;
+}
+
+/* Encode the luma 4x4 block idx of the current macroblock: sub4x4_dct is run
+ * on the source block (p_fenc) and on the block at the same place in p_fdec,
+ * the result is quantised as intra, stored in zigzag order in
+ * h->dct.block[idx].luma4x4, then dequantised and added back into p_fdec. */
+void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
+{
+    const int i_stride = h->mb.pic.i_stride[0];
+    /* offset of the 4x4 block inside the macroblock */
+    const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
+    uint8_t  *p_enc = h->mb.pic.p_fenc[0] + i_offset;
+    uint8_t  *p_dec = h->mb.pic.p_fdec[0] + i_offset;
+
+    int16_t coef[4][4];
+
+    h->dctf.sub4x4_dct( coef, p_enc, i_stride, p_dec, i_stride );
+
+    /* what will be written in the stream */
+    quant_4x4( coef, i_qscale, 1 );
+    scan_zigzag_4x4full( h->dct.block[idx].luma4x4, coef );
+
+    /* rebuild the block in fdec from the quantised values */
+    x264_mb_dequant_4x4( coef, i_qscale );
+    h->dctf.add4x4_idct( p_dec, i_stride, coef );
+}
+
+/* Encode the luma of an intra 16x16 macroblock.
+ * The dc coefficient of each of the 16 4x4 blocks is gathered in a separate
+ * 4x4 matrix which goes through dct4x4dc / quant_4x4_dc and is stored in
+ * h->dct.luma16x16_dc; the 15 ac coefficients of each block are stored in
+ * h->dct.block[i].residual_ac. The dequantised values are then added back
+ * into p_fdec. */
+static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
+{
+    const int i_stride = h->mb.pic.i_stride[0];
+    uint8_t  *p_enc = h->mb.pic.p_fenc[0];
+    uint8_t  *p_dec = h->mb.pic.p_fdec[0];
+
+    /* coef[0] is the matrix of dc, coef[1..16] are the 16 luma blocks */
+    int16_t coef[1+16][4][4];
+    int16_t (*dc)[4]     = coef[0];
+    int16_t (*blk)[4][4] = &coef[1];
+
+    int n;
+
+    h->dctf.sub16x16_dct( blk, p_enc, i_stride, p_dec, i_stride );
+
+    for( n = 0; n < 16; n++ )
+    {
+        /* keep the dc apart before the block is quantised */
+        dc[block_idx_y[n]][block_idx_x[n]] = blk[n][0][0];
+
+        /* quant/scan/dequant of the ac */
+        quant_4x4( blk[n], i_qscale, 1 );
+        scan_zigzag_4x4( h->dct.block[n].residual_ac, blk[n] );
+        x264_mb_dequant_4x4( blk[n], i_qscale );
+    }
+
+    /* transform, quantise and store the dc */
+    h->dctf.dct4x4dc( dc );
+    quant_4x4_dc( dc, i_qscale );
+    scan_zigzag_4x4full( h->dct.luma16x16_dc, dc );
+
+    /* and back */
+    h->dctf.idct4x4dc( dc );
+    x264_mb_dequant_4x4_dc( dc, i_qscale );  /* XXX not inversed */
+
+    /* put the rebuilt dc back in its block */
+    for( n = 0; n < 16; n++ )
+    {
+        blk[n][0][0] = dc[block_idx_y[n]][block_idx_x[n]];
+    }
+
+    /* output samples to fdec */
+    h->dctf.add16x16_idct( p_dec, i_stride, blk );
+}
+
+/* Encode the two chroma 8x8 planes of the current macroblock.
+ * For each plane the dc of the four 4x4 blocks is gathered in a 2x2 matrix
+ * (stored in h->dct.chroma_dc[ch]) and the ac of each block is stored in
+ * h->dct.block[16+i+ch*4].residual_ac. When b_inter is set and the sum of
+ * the decimate scores of the four blocks is below 7, all the ac of the plane
+ * are dropped. The dequantised values are added back into p_fdec. */
+static void x264_mb_encode_8x8( x264_t *h, int b_inter, int i_qscale )
+{
+    const int b_intra = b_inter ? 0 : 1;
+    int ch;
+
+    for( ch = 0; ch < 2; ch++ )
+    {
+        const int i_stride = h->mb.pic.i_stride[1+ch];
+        uint8_t  *p_enc = h->mb.pic.p_fenc[1+ch];
+        uint8_t  *p_dec = h->mb.pic.p_fdec[1+ch];
+        int      i_score = 0;
+        int      n, k;
+
+        int16_t dc[2][2];
+        int16_t ac[4][4][4];
+
+        h->dctf.sub8x8_dct( ac, p_enc, i_stride, p_dec, i_stride );
+
+        for( n = 0; n < 4; n++ )
+        {
+            int *p_level = h->dct.block[16+n+ch*4].residual_ac;
+
+            /* keep the dc apart before the block is quantised */
+            dc[block_idx_y[n]][block_idx_x[n]] = ac[n][0][0];
+
+            quant_4x4( ac[n], i_qscale, b_intra );
+            scan_zigzag_4x4( p_level, ac[n] );
+            x264_mb_dequant_4x4( ac[n], i_qscale );
+
+            if( b_inter )
+            {
+                i_score += x264_mb_decimate_score( p_level, 15 );
+            }
+        }
+
+        /* transform, quantise and store the dc */
+        h->dctf.dct2x2dc( dc );
+        quant_2x2_dc( dc, i_qscale, b_intra );
+        scan_zigzag_2x2_dc( h->dct.chroma_dc[ch], dc );
+
+        /* and back */
+        h->dctf.idct2x2dc( dc );
+        x264_mb_dequant_2x2_dc( dc, i_qscale );  /* XXX not inversed */
+
+        if( b_inter && i_score < 7 )
+        {
+            /* Near null chroma 8x8 block so make it null (bits saving) */
+            for( n = 0; n < 4; n++ )
+            {
+                int *p_level = h->dct.block[16+n+ch*4].residual_ac;
+
+                for( k = 0; k < 15; k++ )
+                {
+                    p_level[k] = 0;
+                }
+            }
+            memset( ac, 0, sizeof( ac ) );
+        }
+
+        /* put the rebuilt dc back in its block */
+        for( n = 0; n < 4; n++ )
+        {
+            ac[n][0][0] = dc[block_idx_y[n]][block_idx_x[n]];
+        }
+
+        /* output samples to fdec */
+        h->dctf.add8x8_idct( p_dec, i_stride, ac );
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode_pskip:
+ *  Encode an already marked skip block: the three planes of p_fdec are
+ *  filled by motion compensation from p_fref[0][0] with the mv found in the
+ *  cache for block 0, then cbp and non zero counts are all cleared.
+ *****************************************************************************/
+void x264_macroblock_encode_pskip( x264_t *h )
+{
+    const int i_mvx = h->mb.cache.mv[0][x264_scan8[0]][0];
+    const int i_mvy = h->mb.cache.mv[0][x264_scan8[0]][1];
+    int i_plane;
+    int i_blk;
+
+    /* Motion compensation XXX probably unneeded */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
+                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
+                    i_mvx, i_mvy, 16, 16 );
+
+    /* Chroma MC, plane 1 then plane 2 */
+    for( i_plane = 1; i_plane < 3; i_plane++ )
+    {
+        h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][i_plane], h->mb.pic.i_stride[i_plane],
+                          h->mb.pic.p_fdec[i_plane],       h->mb.pic.i_stride[i_plane],
+                          i_mvx, i_mvy, 8, 8 );
+    }
+
+    /* nothing is coded for this macroblock */
+    h->mb.i_cbp_luma = 0x00;
+    h->mb.i_cbp_chroma = 0x00;
+
+    for( i_blk = 0; i_blk < 16+8; i_blk++ )
+    {
+        h->mb.cache.non_zero_count[x264_scan8[i_blk]] = 0;
+    }
+
+    /* store cbp */
+    h->mb.cbp[h->mb.i_mb_xy] = 0;
+}
+
+/*****************************************************************************
+ * x264_macroblock_encode:
+ *  Encode the current macroblock according to h->mb.i_type:
+ *   - P_SKIP is delegated to x264_macroblock_encode_pskip,
+ *   - I_16x16 and I_4x4 are predicted then coded by the intra helpers,
+ *   - any other type is handled as an inter MB: motion compensation,
+ *     transform, quantization and coefficient decimation.
+ *  Chroma is coded next, then the luma/chroma coded block patterns and the
+ *  non zero counts are derived from the resulting coefficients and stored.
+ *  Finally a P_L0 16x16 MB left without any residual is turned into a
+ *  P_SKIP when its vector matches the predicted skip vector.
+ *****************************************************************************/
+void x264_macroblock_encode( x264_t *h )
+{
+    int i_cbp_dc = 0;
+    int i_qscale;
+    int i;
+
+    if( h->mb.i_type == P_SKIP )
+    {
+        /* A bit special */
+        x264_macroblock_encode_pskip( h );
+        return;
+    }
+
+    /* quantification scale */
+    i_qscale = h->mb.qp[h->mb.i_mb_xy];
+
+    if( h->mb.i_type == I_16x16 )
+    {
+        const int i_mode = h->mb.i_intra16x16_pred_mode;
+        /* do the right prediction */
+        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+        /* encode the 16x16 macroblock */
+        x264_mb_encode_i16x16( h, i_qscale );
+
+        /* fix the pred mode value */
+        h->mb.i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[i_mode];
+    }
+    else if( h->mb.i_type == I_4x4 )
+    {
+        for( i = 0; i < 16; i++ )
+        {
+            const int i_dst = h->mb.pic.i_stride[0];
+            /* top left pixel of 4x4 block i inside the decoded macroblock */
+            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst];
+            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+
+            /* Do the right prediction */
+            h->predict_4x4[i_mode]( p_dst, i_dst );
+
+            /* encode one 4x4 block */
+            x264_mb_encode_i4x4( h, i, i_qscale );
+
+            /* fix the pred mode value */
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix[i_mode];
+        }
+    }
+    else    /* Inter MB */
+    {
+        int16_t dct4x4[16][4][4];
+
+        int i8x8, i4x4, idx;
+        int i_decimate_mb = 0;
+
+        /* Motion compensation */
+        x264_mb_mc( h );
+
+        /* residual (source - prediction) of the 16 luma 4x4 blocks */
+        h->dctf.sub16x16_dct( dct4x4,
+                              h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        {
+            int i_decimate_8x8;
+
+            /* encode the four 4x4 blocks of this 8x8 block */
+            i_decimate_8x8 = 0;
+            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                idx = i8x8 * 4 + i4x4;
+
+                /* the scanned levels are kept for the bitstream, dct4x4 is
+                 * dequantized for the reconstruction done below */
+                quant_4x4( dct4x4[idx], i_qscale, 0 );
+                scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
+                x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+
+                i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
+            }
+
+            /* decimate this 8x8 block: with a score below 4, drop both its
+             * levels and its dequantized coefficients */
+            i_decimate_mb += i_decimate_8x8;
+            if( i_decimate_8x8 < 4 )
+            {
+                for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    int x, y;
+                    idx = i8x8 * 4 + i4x4;
+                    for( i = 0; i < 16; i++ )
+                    {
+                        h->dct.block[idx].luma4x4[i] = 0;
+                    }
+                    for( x = 0; x < 4; x++ )
+                    {
+                        for( y = 0; y < 4; y++ )
+                        {
+                            dct4x4[idx][x][y] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        if( i_decimate_mb < 6 )
+        {
+            /* whole MB decimated: no level is kept and no residual is added,
+             * the prediction stays as it is in p_fdec */
+            for( idx = 0; idx < 16; idx++ )
+            {
+                for( i = 0; i < 16; i++ )
+                {
+                    h->dct.block[idx].luma4x4[i] = 0;
+                }
+            }
+        }
+        else
+        {
+            /* add the dequantized residual to the prediction */
+            h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 );
+        }
+    }
+
+    /* encode chroma: the chroma qp is looked up from the luma qp plus the
+     * pps offset, clipped to [0,51] */
+    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    if( IS_INTRA( h->mb.i_type ) )
+    {
+        const int i_mode = h->mb.i_chroma_pred_mode;
+        /* do the right prediction */
+        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
+        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
+
+        /* fix the pred mode value */
+        h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8_fix[i_mode];
+    }
+
+    /* encode the 8x8 blocks */
+    x264_mb_encode_8x8( h, !IS_INTRA( h->mb.i_type ), i_qscale );
+
+    /* Calculate the Luma/Chroma pattern and non_zero_count */
+    if( h->mb.i_type == I_16x16 )
+    {
+        /* I_16x16: only the 15 AC levels of each block are counted and the
+         * luma cbp is all or nothing */
+        h->mb.i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+            if( nz > 0 )
+            {
+                h->mb.i_cbp_luma = 0x0f;
+            }
+        }
+    }
+    else
+    {
+        /* one cbp bit per 8x8 block (i/4), set as soon as one of its 4x4
+         * blocks has a non zero level */
+        h->mb.i_cbp_luma = 0x00;
+        for( i = 0; i < 16; i++ )
+        {
+            const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+            if( nz > 0 )
+            {
+                h->mb.i_cbp_luma |= 1 << (i/4);
+            }
+        }
+    }
+
+    /* Calculate the chroma pattern */
+    h->mb.i_cbp_chroma = 0x00;
+    for( i = 0; i < 8; i++ )
+    {
+        const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
+        if( nz > 0 )
+        {
+            h->mb.i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
+        }
+    }
+    /* no AC level: check whether one of the two chroma DC blocks is coded
+     * (each count is compared with 0 on its own) */
+    if( h->mb.i_cbp_chroma == 0x00 &&
+        ( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 ||
+          array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 ) )
+    {
+        h->mb.i_cbp_chroma = 0x01;    /* dc only */
+    }
+
+    if( h->param.b_cabac )
+    {
+        /* DC flags, only computed with cabac:
+         * bit 0: luma 16x16 DC, bit 1: chroma_dc[0], bit 2: chroma_dc[1] */
+        if( h->mb.i_type == I_16x16 && array_non_zero_count( h->dct.luma16x16_dc, 16 ) > 0 )
+            i_cbp_dc = 0x01;
+        else
+            i_cbp_dc = 0x00;
+
+        if( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 )
+            i_cbp_dc |= 0x02;
+        if( array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 )
+            i_cbp_dc |= 0x04;
+    }
+
+    /* store cbp: luma in bits 0-3, chroma from bit 4, DC flags from bit 8 */
+    h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
+
+    /* Check for P_SKIP
+     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
+     *      (if multiple mv give same result)*/
+    if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
+        h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
+        h->mb.qp[h->mb.i_mb_xy] == h->mb.i_last_qp )
+    {
+        /* a skip also requires reference 0 and the predicted skip vector */
+        if( h->mb.cache.ref[0][x264_scan8[0]] == 0 )
+        {
+            int mvp[2];
+
+            x264_mb_predict_mv_pskip( h, mvp );
+            if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
+                h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
+            {
+                h->mb.type[h->mb.i_mb_xy] = h->mb.i_type = P_SKIP;
+            }
+        }
+    }
+}
+
+/*****************************************************************************
+ * x264_macroblock_probe_pskip:
+ *  Check if the current MB could be encoded as a P_SKIP. The predicted skip
+ *  vector is used for the motion compensation and the residual is tested
+ *  with the qp stored for this MB (it supposes you use the previous QP).
+ *  Returns 1 when the MB can be skipped, 0 otherwise.
+ *****************************************************************************/
+int x264_macroblock_probe_pskip( x264_t *h )
+{
+    DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
+    DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
+    DECLARE_ALIGNED( int,     dctscan[16], 16 );
+
+    /* quantification scale (luma) */
+    const int i_luma_qp = h->mb.qp[h->mb.i_mb_xy];
+    int i_chroma_qp;
+
+    int mvp[2];
+    int i_pos;
+    int i_score;
+    int i_blk;
+    int i_plane;
+
+    /* Get the MV */
+    x264_mb_predict_mv_pskip( h, mvp );
+
+    /* Special case, need to clip the vector: horizontal component ... */
+    i_pos = 16 * h->mb.i_mb_x + mvp[0];
+    if( i_pos < -24 )
+    {
+        mvp[0] = -24 - 16*h->mb.i_mb_x;
+    }
+    else if( i_pos > 16 * h->sps->i_mb_width + 24 )
+    {
+        mvp[0] = 16 * ( h->sps->i_mb_width - h->mb.i_mb_x ) + 24;
+    }
+
+    /* ... then vertical component */
+    i_pos = 16 * h->mb.i_mb_y + mvp[1];
+    if( i_pos < -24 )
+    {
+        mvp[1] = -24 - 16*h->mb.i_mb_y;
+    }
+    else if( i_pos > 16 * h->sps->i_mb_height + 8 )
+    {
+        mvp[1] = 16 * ( h->sps->i_mb_height - h->mb.i_mb_y ) + 8;
+    }
+
+    /* Luma: motion compensation then transform of the difference */
+    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
+                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
+                    mvp[0], mvp[1], 16, 16 );
+
+    h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                  h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+    /* Accumulate the decimation score over the 16 luma 4x4 blocks and give
+     * up as soon as it reaches 6 */
+    i_score = 0;
+    for( i_blk = 0; i_blk < 16; i_blk++ )
+    {
+        quant_4x4( dct4x4[i_blk], i_luma_qp, 0 );
+        scan_zigzag_4x4full( dctscan, dct4x4[i_blk] );
+
+        i_score += x264_mb_decimate_score( dctscan, 16 );
+        if( i_score >= 6 )
+        {
+            /* not as P_SKIP */
+            return 0;
+        }
+    }
+
+    /* Chroma uses its own qp, looked up from the luma one */
+    i_chroma_qp = i_chroma_qp_table[x264_clip3( i_luma_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+
+    for( i_plane = 1; i_plane <= 2; i_plane++ )
+    {
+        const int i_stride = h->mb.pic.i_stride[i_plane];
+        uint8_t  *p_enc = h->mb.pic.p_fenc[i_plane];
+        uint8_t  *p_dec = h->mb.pic.p_fdec[i_plane];
+
+        h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][i_plane], i_stride,
+                          p_dec,                           i_stride,
+                          mvp[0], mvp[1], 8, 8 );
+
+        h->dctf.sub8x8_dct( dct4x4, p_enc, i_stride, p_dec, i_stride );
+
+        /* DC: gather the DC of the four 4x4 blocks; any level left after
+         * quantization forbids the skip */
+        for( i_blk = 0; i_blk < 4; i_blk++ )
+        {
+            dct2x2[i_blk >> 1][i_blk & 1] = dct4x4[i_blk][0][0];
+        }
+        h->dctf.dct2x2dc( dct2x2 );
+        quant_2x2_dc( dct2x2, i_chroma_qp, 0 );
+        if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
+        {
+            /* can't be */
+            return 0;
+        }
+
+        /* AC: the score restarts at 0 for each plane, the limit is 7 */
+        i_score = 0;
+        for( i_blk = 0; i_blk < 4; i_blk++ )
+        {
+            quant_4x4( dct4x4[i_blk], i_chroma_qp, 0 );
+            scan_zigzag_4x4( dctscan, dct4x4[i_blk] );
+
+            i_score += x264_mb_decimate_score( dctscan, 15 );
+            if( i_score >= 7 )
+            {
+                return 0;
+            }
+        }
+    }
+
+    return 1;
+}
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@ -0,0 +1,38 @@
+/*****************************************************************************
+ * macroblock.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: macroblock.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ENCODER_MACROBLOCK_H
+#define _ENCODER_MACROBLOCK_H 1
+
+#include "../core/macroblock.h"
+
+/* Returns 1 when the current macroblock could be coded as a P_SKIP, 0 otherwise */
+int x264_macroblock_probe_pskip( x264_t *h );
+
+/* Encode the current macroblock: prediction, residual, cbp and non zero counts */
+void x264_macroblock_encode      ( x264_t *h );
+/* Bitstream writers, defined outside this header.
+ * NOTE(review): presumably one per entropy coder (CABAC / CAVLC) -- confirm */
+void x264_macroblock_write_cabac ( x264_t *h, bs_t *s );
+void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
+
+/* NOTE(review): presumably codes the skip flag b_skip with CABAC -- confirm
+ * against the definition */
+void x264_cabac_mb_skip( x264_t *h, int b_skip );
+
+#endif
+
--- a/encoder/me.c
+++ b/encoder/me.c
@ -0,0 +1,194 @@
+/*****************************************************************************
+ * me.c: h264 encoder library (Motion Estimation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: me.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "me.h"
+
+/* Full pel motion search for the block described by m.
+ * Starts from the predicted vector (and from the optional candidate mvc when
+ * it is cheaper), then runs at most 16 steps of a one pixel diamond using
+ * sad + lm * mv bits. On return m->mv holds the best vector in quarter pel
+ * units and m->cost its satd based cost. */
+void x264_me_search( x264_t *h, x264_me_t *m )
+{
+    /* diamond neighbours, in test order: up, down, left, right */
+    static const int dia_dx[4] = {  0, 0, -1, 1 };
+    static const int dia_dy[4] = { -1, 1,  0, 0 };
+
+    const int i_pixel  = m->i_pixel;
+    const int i_stride = m->i_stride;
+    const int i_range  = m->i_mv_range;
+
+    int      i_mx, i_my;    /* current best vector, full pel */
+    int      i_best_cost;
+    uint8_t *p_best;        /* reference pixels at (i_mx,i_my) */
+    int      i_step;
+
+    /* init with mvp, rounded to full pel and clipped to the mv range.
+     * XXX: only the starting points are clipped, the diamond steps are not
+     * checked; vectors that could leave the padded picture (accelerator, ...)
+     * have to be clipped by the caller. */
+    i_mx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, -i_range, i_range );
+    i_my = x264_clip3( ( m->mvp[1] + 2 ) >> 2, -i_range, i_range );
+
+    p_best      = &m->p_fref[i_my * i_stride + i_mx];
+    i_best_cost = h->pixf.sad[i_pixel]( m->p_fenc, i_stride, p_best, i_stride );
+
+    /* try a candidate if provided */
+    if( m->b_mvc )
+    {
+        const int i_cx = x264_clip3( ( m->mvc[0] + 2 ) >> 2, -i_range, i_range );
+        const int i_cy = x264_clip3( ( m->mvc[1] + 2 ) >> 2, -i_range, i_range );
+        uint8_t *p_cand = &m->p_fref[i_cy*i_stride+i_cx];
+        const int i_cand_cost =
+            h->pixf.sad[i_pixel]( m->p_fenc, i_stride, p_cand, i_stride ) +
+            m->lm * ( bs_size_se( m->mvc[0] - m->mvp[0] ) + bs_size_se( m->mvc[1] - m->mvp[1] ) );
+
+        if( i_cand_cost < i_best_cost )
+        {
+            i_mx        = i_cx;
+            i_my        = i_cy;
+            i_best_cost = i_cand_cost;
+            p_best      = p_cand;
+        }
+    }
+
+    /* diamond: move to the cheapest neighbour until none improves */
+    for( i_step = 0; i_step < 16; i_step++ )
+    {
+        int cost[4];
+        int i_dir;
+        int i_best_dir = 0;
+
+        for( i_dir = 0; i_dir < 4; i_dir++ )
+        {
+            const int dx = dia_dx[i_dir];
+            const int dy = dia_dy[i_dir];
+
+            cost[i_dir] = h->pixf.sad[i_pixel]( m->p_fenc, i_stride,
+                                                &p_best[dy*i_stride+dx], i_stride ) +
+                          m->lm * ( bs_size_se( ((i_mx+dx)<<2) - m->mvp[0] ) +
+                                    bs_size_se( ((i_my+dy)<<2) - m->mvp[1] ) );
+        }
+
+        /* on equal costs the first direction tested wins */
+        for( i_dir = 1; i_dir < 4; i_dir++ )
+        {
+            if( cost[i_dir] < cost[i_best_dir] )
+                i_best_dir = i_dir;
+        }
+
+        if( i_best_cost <= cost[i_best_dir] )
+            break;
+
+        i_best_cost = cost[i_best_dir];
+        i_mx   += dia_dx[i_best_dir];
+        i_my   += dia_dy[i_best_dir];
+        p_best += dia_dy[i_best_dir] * i_stride + dia_dx[i_best_dir];
+    }
+
+    /* -> qpel mv */
+    m->mv[0] = i_mx << 2;
+    m->mv[1] = i_my << 2;
+
+    /* compute the real cost (satd instead of sad) */
+    m->cost = h->pixf.satd[i_pixel]( m->p_fenc, i_stride, p_best, i_stride ) +
+                m->lm * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
+                          bs_size_se( m->mv[1] - m->mvp[1] ) );
+}
+
+/* Sub pel refinement of m->mv.
+ * Two passes are done around the current best vector, first with a step of
+ * 2 then with a step of 1 (m->mv units): the four neighbours are motion
+ * compensated into temporary blocks, scored with satd + lm * mv bits, and
+ * the best one is kept when it beats m->cost. m->mv and m->cost are updated. */
+void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
+{
+    /* neighbours, in test order: up, down, left, right */
+    static const int dir_x[4] = {  0, 0, -1, 1 };
+    static const int dir_y[4] = { -1, 1,  0, 0 };
+
+    const int i_width  = x264_pixel_size[m->i_pixel].w;
+    const int i_height = x264_pixel_size[m->i_pixel].h;
+
+    DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
+    int cost[4];
+
+    int i_mvx = m->mv[0];
+    int i_mvy = m->mv[1];
+    int i_step;
+
+    for( i_step = 2; i_step >= 1; i_step-- )
+    {
+        int i_dir;
+        int i_best = 0;
+
+        /* build the four candidate predictions (stride 16 in pix[]) */
+        for( i_dir = 0; i_dir < 4; i_dir++ )
+        {
+            h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[i_dir], 16,
+                            i_mvx + i_step * dir_x[i_dir],
+                            i_mvy + i_step * dir_y[i_dir],
+                            i_width, i_height );
+        }
+
+        /* score them */
+        for( i_dir = 0; i_dir < 4; i_dir++ )
+        {
+            cost[i_dir] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[i_dir], 16 ) +
+                          m->lm * ( bs_size_se( i_mvx + i_step * dir_x[i_dir] - m->mvp[0] ) +
+                                    bs_size_se( i_mvy + i_step * dir_y[i_dir] - m->mvp[1] ) );
+        }
+
+        /* on equal costs the first direction tested wins */
+        for( i_dir = 1; i_dir < 4; i_dir++ )
+        {
+            if( cost[i_dir] < cost[i_best] )
+                i_best = i_dir;
+        }
+
+        /* move only on a strict improvement */
+        if( cost[i_best] < m->cost )
+        {
+            m->cost = cost[i_best];
+            i_mvx += i_step * dir_x[i_best];
+            i_mvy += i_step * dir_y[i_best];
+        }
+    }
+
+    m->mv[0] = i_mvx;
+    m->mv[1] = i_mvy;
+}
--- a/encoder/me.h
+++ b/encoder/me.h
@ -0,0 +1,52 @@
+/*****************************************************************************
+ * me.h: h264 encoder library (Motion Estimation)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: me.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ME_H
+#define _ME_H 1
+
+/* Description of one motion search: filled by the caller, completed by
+ * x264_me_search and x264_me_refine_qpel. */
+typedef struct
+{
+    /* input */
+    int      i_pixel;   /* PIXEL_WxH, index into the sad/satd function tables */
+    int      lm;        /* lambda motion, weight applied to the mv bit size */
+
+    uint8_t *p_fref;    /* reference pixels, addressed as p_fref[y*i_stride+x] */
+    uint8_t *p_fenc;    /* source pixels of the block to match */
+    int      i_stride;  /* stride used for both p_fref and p_fenc */
+
+    /* limit applied to the full pel starting vectors of x264_me_search */
+    int i_mv_range;
+
+    /* predicted vector (quarter pel): starting point and mv cost reference */
+    int mvp[2];
+
+    /* when b_mvc is non zero, mvc is an extra candidate vector to try */
+    int b_mvc;
+    int mvc[2];
+
+    /* output */
+    int cost;           /* satd + lm * nbits */
+    int mv[2];          /* best vector found, quarter pel */
+} x264_me_t;
+
+/* Full pel diamond search: writes m->mv (in quarter pel units) and m->cost */
+void x264_me_search( x264_t *h, x264_me_t *m );
+/* Refines m->mv with a step of 2 then 1, updating m->mv and m->cost */
+void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
+
+#endif
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@ -0,0 +1,96 @@
+/*****************************************************************************
+ * ratecontrol.c: h264 encoder library (Rate Control)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ratecontrol.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../core/common.h"
+#include "ratecontrol.h"
+
+
+/* Allocate a rate control context and initialise it from the encoder
+ * parameters.
+ * Returns the new context, or NULL when the allocation fails. The context is
+ * released with x264_ratecontrol_delete.
+ * i_slice_type is left unset here, x264_ratecontrol_start assigns it. */
+x264_ratecontrol_t *x264_ratecontrol_new( x264_param_t *param )
+{
+    x264_ratecontrol_t *rc = x264_malloc( sizeof( x264_ratecontrol_t ) );
+
+    /* never write through a failed allocation */
+    if( rc == NULL )
+    {
+        return NULL;
+    }
+
+    /* fall back to 25 fps when no usable frame rate is configured */
+    rc->fps = param->f_fps > 0.1 ? param->f_fps : 25.0f;
+    rc->i_iframe = param->i_iframe;
+    /* NOTE(review): presumably kbit/s -> bit/s -- confirm the unit of param->i_bitrate */
+    rc->i_bitrate = param->i_bitrate * 1000;
+
+    rc->i_qp_last = 26;
+    rc->i_qp      = param->i_qp_constant;
+
+    rc->i_frames  = 0;
+    rc->i_size    = 0;
+
+    return rc;
+}
+
+/* Release a context created by x264_ratecontrol_new */
+void x264_ratecontrol_delete( x264_ratecontrol_t *rc )
+{
+    x264_free( rc );
+}
+
+/* Record the slice type about to be encoded */
+void x264_ratecontrol_start( x264_ratecontrol_t *rc, int i_slice_type )
+{
+    rc->i_slice_type = i_slice_type;
+}
+
+/* Return the qp to use, i.e. rc->i_qp limited to the [1,51] interval */
+int  x264_ratecontrol_qp( x264_ratecontrol_t *rc )
+{
+    const int i_qp = rc->i_qp;
+
+    return x264_clip3( i_qp, 1, 51 );
+}
+
+/* Called with the size in bits of what has just been coded.
+ * Currently a no-op: the function returns at once and the qp adaptation
+ * below is compiled out, so rc->i_qp keeps the value set at creation. */
+void x264_ratecontrol_end( x264_ratecontrol_t *rc, int bits )
+{
+    /* rate control is disabled, nothing is updated */
+    return;
+#if 0
+    /* Disabled draft: compares the average frame size with the per frame
+     * target and moves the qp by at most 2 around the previous value.
+     * NOTE(review): i_avg / i_target and i_target / i_avg divide by zero
+     * when either value is 0 -- guard them before enabling this code. */
+    int i_avg;
+    int i_target = rc->i_bitrate / rc->fps;
+    int i_qp = rc->i_qp;
+
+    rc->i_qp_last = rc->i_qp;
+    rc->i_frames++;
+    rc->i_size += bits / 8;
+
+    /* average size of a frame so far, in bits */
+    i_avg = 8 * rc->i_size / rc->i_frames;
+
+    if( rc->i_slice_type == SLICE_TYPE_I )
+    {
+        /* I slices get twice the target */
+        i_target = i_target * 20 / 10;
+    }
+
+    /* react only outside a +/- 10% band around the target */
+    if( i_avg > i_target * 11 / 10 )
+    {
+        i_qp = rc->i_qp + ( i_avg / i_target - 1 );
+    }
+    else if( i_avg < i_target * 9 / 10 )
+    {
+        i_qp = rc->i_qp - ( i_target / i_avg - 1 );
+    }
+
+    rc->i_qp = x264_clip3( i_qp, rc->i_qp_last - 2, rc->i_qp_last + 2 );
+#endif
+}
+
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@ -0,0 +1,52 @@
+/*****************************************************************************
+ * ratecontrol.h: h264 encoder library (Rate Control)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: ratecontrol.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _RATECONTROL_H
+#define _RATECONTROL_H 1
+
+/* Rate control context, created by x264_ratecontrol_new */
+struct x264_ratecontrol_t
+{
+    float fps;          /* frame rate from the parameters, 25 when not usable */
+    int   i_iframe;     /* copy of the i_iframe parameter */
+
+    int i_bitrate;      /* i_bitrate parameter multiplied by 1000 */
+    int i_qp_last;      /* initialised to 26 */
+    int i_qp;           /* initialised from i_qp_constant, see x264_ratecontrol_qp */
+
+    int i_slice_type;   /* set by x264_ratecontrol_start */
+
+    /* counters, zeroed at creation; only the disabled code of
+     * x264_ratecontrol_end updates them */
+    int     i_frames;
+    int64_t i_size;
+
+};
+
+
+/* Creation from the encoder parameters / destruction of a context */
+x264_ratecontrol_t *x264_ratecontrol_new   ( x264_param_t * );
+void                x264_ratecontrol_delete( x264_ratecontrol_t * );
+
+/* start records the slice type, qp returns the qp clipped to [1,51],
+ * end receives the coded size in bits (currently it does nothing) */
+void x264_ratecontrol_start( x264_ratecontrol_t *, int i_slice_type );
+int  x264_ratecontrol_qp( x264_ratecontrol_t * );
+void x264_ratecontrol_end( x264_ratecontrol_t *, int bits );
+
+#endif
+
--- a/encoder/set.c
+++ b/encoder/set.c
@ -0,0 +1,382 @@
+/*****************************************************************************
+ * set: h264 encoder (SPS and PPS init and write)
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../x264.h"
+#include "../core/bs.h"
+#include "../core/set.h"
+
+/*
+ * x264_sps_init: fill *sps (sequence parameter set number i_id) from the
+ * encoder parameters in *param.
+ *
+ *  - the profile is Main when CABAC or B-frames are requested, Baseline
+ *    otherwise;
+ *  - log2_max_frame_num is sized from i_idrframe * i_iframe (minimum 4, plus
+ *    one bit of margin);
+ *  - cropping is enabled when the width or height is not a multiple of 16;
+ *  - b_vui (and vui.i_sar_width/height) is set only for a usable,
+ *    non-square sample aspect ratio.
+ *
+ * Nothing is written to a bitstream here; see x264_sps_write.
+ */
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
+{
+    sps->i_id               = i_id;
+
+    if( param->b_cabac || param->i_bframe > 0 )
+        sps->i_profile_idc      = PROFILE_MAIN;
+    else
+        sps->i_profile_idc      = PROFILE_BASELINE;
+
+    sps->i_level_idc        = 21;               /* FIXME ? */
+    sps->b_constraint_set0  = 0;
+    sps->b_constraint_set1  = 0;
+    sps->b_constraint_set2  = 0;
+
+    sps->i_log2_max_frame_num = 4;  /* at least 4 */
+    while( (1 << sps->i_log2_max_frame_num) <= param->i_idrframe * param->i_iframe )
+    {
+        sps->i_log2_max_frame_num++;
+    }
+    sps->i_log2_max_frame_num++;    /* just in case */
+
+    sps->i_poc_type = 0;
+    if( sps->i_poc_type == 0 )
+    {
+        sps->i_log2_max_poc_lsb = sps->i_log2_max_frame_num + 1;    /* max poc = 2*frame_num */
+    }
+    else if( sps->i_poc_type == 1 )
+    {
+        /* Not reachable while i_poc_type is forced to 0 just above. */
+        int i;
+
+        /* FIXME */
+        sps->b_delta_pic_order_always_zero = 1;
+        sps->i_offset_for_non_ref_pic = 0;
+        sps->i_offset_for_top_to_bottom_field = 0;
+        sps->i_num_ref_frames_in_poc_cycle = 0;
+
+        for( i = 0; i < sps->i_num_ref_frames_in_poc_cycle; i++ )
+        {
+            sps->i_offset_for_ref_frame[i] = 0;
+        }
+    }
+
+    sps->i_num_ref_frames = param->i_frame_reference + 1; /* +1 for 2 ref in B */
+    sps->b_gaps_in_frame_num_value_allowed = 0;
+    /* picture size in macroblocks, rounded up */
+    sps->i_mb_width = ( param->i_width + 15 ) / 16;
+    sps->i_mb_height= ( param->i_height + 15 )/ 16;
+    sps->b_frame_mbs_only = 1;
+    sps->b_mb_adaptive_frame_field = 0;
+    sps->b_direct8x8_inference = 0;
+    if( sps->b_frame_mbs_only == 0 )
+    {
+        sps->b_direct8x8_inference = 1;
+    }
+
+    if( param->i_width % 16 != 0 || param->i_height % 16 != 0 )
+    {
+        /* The picture is coded as whole 16x16 macroblocks; crop the padding
+         * added on the right and bottom.  The offsets are halved as before
+         * (presumably the 2-sample crop unit of 4:2:0 frame pictures).
+         * The trailing "% 16" matters when only ONE dimension is unaligned:
+         * without it the aligned one would get (16 - 0) / 2 = 8, i.e. a
+         * whole macroblock of real picture would be cropped away. */
+        sps->b_crop = 1;
+        sps->crop.i_left    = 0;
+        sps->crop.i_right   = ( ( 16 - param->i_width % 16 ) % 16 ) / 2;
+        sps->crop.i_top     = 0;
+        sps->crop.i_bottom  = ( ( 16 - param->i_height % 16 ) % 16 ) / 2;
+    }
+    else
+    {
+        sps->b_crop = 0;
+        sps->crop.i_left    = 0;
+        sps->crop.i_right   = 0;
+        sps->crop.i_top     = 0;
+        sps->crop.i_bottom  = 0;
+    }
+
+    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
+    {
+        int w = param->vui.i_sar_width;
+        int h = param->vui.i_sar_height;
+        int a = w, b = h;
+
+        /* Euclid: on exit a == gcd( w, h ) */
+        while( b != 0 )
+        {
+            int t = a;
+
+            a = b;
+            b = t % b;
+        }
+
+        w /= a;
+        h /= a;
+        /* x264_sps_write stores each term in 16 bits: halve until both fit */
+        while( w > 65535 || h > 65535 )
+        {
+            w /= 2;
+            h /= 2;
+        }
+
+        if( w == 0 || h == 0 )
+        {
+            fprintf( stderr, "x264: cannot create valid sample aspect ratio\n" );
+            sps->b_vui = 0;
+        }
+        else if( w == h )
+        {
+            fprintf( stderr, "x264: no need for a SAR\n" );
+            sps->b_vui = 0;
+        }
+        else
+        {
+            fprintf( stderr, "x264: using SAR=%d/%d\n", w, h );
+            sps->b_vui = 1;
+            sps->vui.i_sar_width = w;
+            sps->vui.i_sar_height= h;
+        }
+    }
+    else
+    {
+        sps->b_vui = 0;
+    }
+}
+
+
+/*
+ * x264_sps_write: serialise *sps into the bitstream s, field by field, and
+ * finish with bs_rbsp_trailing().  The VUI part is emitted only when
+ * sps->b_vui is set and then carries nothing but the sample aspect ratio.
+ */
+void x264_sps_write( bs_t *s, x264_sps_t *sps )
+{
+    /* aspect_ratio_idc codes for the predefined w:h pairs;
+     * the { 0, 0, -1 } entry terminates the list */
+    static const struct { int w, h; int sar; } sar_tab[] =
+    {
+        { 1,   1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
+        { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
+        { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
+        { 160,99, 13}, { 0, 0, -1 }
+    };
+    int k;
+
+    /* profile, constraint flags, reserved bits, level */
+    bs_write( s, 8, sps->i_profile_idc );
+    bs_write( s, 1, sps->b_constraint_set0 );
+    bs_write( s, 1, sps->b_constraint_set1 );
+    bs_write( s, 1, sps->b_constraint_set2 );
+    bs_write( s, 5, 0 );    /* reserved */
+    bs_write( s, 8, sps->i_level_idc );
+
+    bs_write_ue( s, sps->i_id );
+    bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
+
+    /* picture order count: type, then the type specific fields */
+    bs_write_ue( s, sps->i_poc_type );
+    switch( sps->i_poc_type )
+    {
+        case 0:
+            bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 );
+            break;
+
+        case 1:
+            bs_write( s, 1, sps->b_delta_pic_order_always_zero );
+            bs_write_se( s, sps->i_offset_for_non_ref_pic );
+            bs_write_se( s, sps->i_offset_for_top_to_bottom_field );
+            bs_write_ue( s, sps->i_num_ref_frames_in_poc_cycle );
+            for( k = 0; k < sps->i_num_ref_frames_in_poc_cycle; k++ )
+            {
+                bs_write_se( s, sps->i_offset_for_ref_frame[k] );
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    /* reference frames and picture geometry (in macroblocks) */
+    bs_write_ue( s, sps->i_num_ref_frames );
+    bs_write( s, 1, sps->b_gaps_in_frame_num_value_allowed );
+    bs_write_ue( s, sps->i_mb_width - 1 );
+    bs_write_ue( s, sps->i_mb_height - 1);
+    bs_write( s, 1, sps->b_frame_mbs_only );
+    if( sps->b_frame_mbs_only == 0 )
+    {
+        bs_write( s, 1, sps->b_mb_adaptive_frame_field );
+    }
+    bs_write( s, 1, sps->b_direct8x8_inference );
+
+    /* cropping rectangle, only when enabled */
+    bs_write( s, 1, sps->b_crop );
+    if( sps->b_crop )
+    {
+        bs_write_ue( s, sps->crop.i_left );
+        bs_write_ue( s, sps->crop.i_right );
+        bs_write_ue( s, sps->crop.i_top );
+        bs_write_ue( s, sps->crop.i_bottom );
+    }
+
+    bs_write( s, 1, sps->b_vui );
+    if( !sps->b_vui )
+    {
+        bs_rbsp_trailing( s );
+        return;
+    }
+
+    bs_write1( s, 1 );      /* aspect_ratio_info_present_flag */
+
+    /* stop on the matching predefined ratio, or on the terminator */
+    k = 0;
+    while( sar_tab[k].sar != -1 &&
+           ( sar_tab[k].w != sps->vui.i_sar_width ||
+             sar_tab[k].h != sps->vui.i_sar_height ) )
+    {
+        k++;
+    }
+
+    if( sar_tab[k].sar == -1 )
+    {
+        /* no predefined code: extended form with explicit 16 bit terms */
+        bs_write( s, 8, 255);   /* aspect_ratio_idc (extended) */
+        bs_write( s, 16, sps->vui.i_sar_width );
+        bs_write( s, 16, sps->vui.i_sar_height );
+    }
+    else
+    {
+        bs_write( s, 8, sar_tab[k].sar );
+    }
+
+    bs_write1( s, 0 );      /* overscan_info_present_flag */
+
+    bs_write1( s, 0 );      /* video_signal_type_present_flag */
+#if 0
+    bs_write( s, 3, 5 );    /* unspecified video format */
+    bs_write1( s, 1 );      /* video full range flag */
+    bs_write1( s, 0 );      /* colour description present flag */
+#endif
+    bs_write1( s, 0 );      /* chroma_loc_info_present_flag */
+    bs_write1( s, 0 );      /* timing_info_present_flag */
+    bs_write1( s, 0 );      /* nal_hrd_parameters_present_flag */
+    bs_write1( s, 0 );      /* vcl_hrd_parameters_present_flag */
+    bs_write1( s, 0 );      /* pic_struct_present_flag */
+    bs_write1( s, 0 );      /* bitstream_restriction_flag */
+
+    bs_rbsp_trailing( s );
+}
+
+/*
+ * x264_pps_init: fill *pps (picture parameter set number i_id), attached to
+ * *sps through its id.  Only b_cabac is taken from *param; every other
+ * field gets a fixed default.
+ */
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
+{
+    int n;
+
+    pps->i_id = i_id;
+    pps->i_sps_id = sps->i_id;
+    pps->b_cabac = param->b_cabac;
+
+    pps->b_pic_order = 0;
+    pps->i_num_slice_groups = 1;
+
+    /* Slice group defaults: not reached while a single group is forced above. */
+    if( pps->i_num_slice_groups > 1 )
+    {
+        pps->i_slice_group_map_type = 0;
+        switch( pps->i_slice_group_map_type )
+        {
+            case 0:
+                for( n = 0; n < pps->i_num_slice_groups; n++ )
+                {
+                    pps->i_run_length[n] = 1;
+                }
+                break;
+
+            case 2:
+                for( n = 0; n < pps->i_num_slice_groups; n++ )
+                {
+                    pps->i_top_left[n] = 0;
+                    pps->i_bottom_right[n] = 0;
+                }
+                break;
+
+            case 3:
+            case 4:
+            case 5:
+                pps->b_slice_group_change_direction = 0;
+                pps->i_slice_group_change_rate = 0;
+                break;
+
+            case 6:
+                pps->i_pic_size_in_map_units = 1;
+                for( n = 0; n < pps->i_pic_size_in_map_units; n++ )
+                {
+                    pps->i_slice_group_id[n] = 0;
+                }
+                break;
+
+            default:
+                break;
+        }
+    }
+
+    /* one active reference per list, no weighted prediction */
+    pps->i_num_ref_idx_l0_active = 1;
+    pps->i_num_ref_idx_l1_active = 1;
+    pps->b_weighted_pred = 0;
+    pps->b_weighted_bipred = 0;
+
+    /* x264_pps_write stores both values as an offset from 26 */
+    pps->i_pic_init_qp = 26;
+    pps->i_pic_init_qs = 26;
+    pps->i_chroma_qp_index_offset = 0;
+
+    /* set unconditionally; no encoder parameter is consulted for it */
+    pps->b_deblocking_filter_control = 1;
+    pps->b_constrained_intra_pred = 0;
+    pps->b_redundant_pic_cnt = 0;
+}
+
+/*
+ * x264_pps_write: serialise *pps into the bitstream s, field by field, and
+ * finish with bs_rbsp_trailing().  The slice group section is emitted only
+ * when more than one slice group is configured.
+ */
+void x264_pps_write( bs_t *s, x264_pps_t *pps )
+{
+    bs_write_ue( s, pps->i_id );
+    bs_write_ue( s, pps->i_sps_id );
+
+    bs_write( s, 1, pps->b_cabac );
+    bs_write( s, 1, pps->b_pic_order );
+    bs_write_ue( s, pps->i_num_slice_groups - 1 );
+
+    if( pps->i_num_slice_groups > 1 )
+    {
+        int i;
+
+        bs_write_ue( s, pps->i_slice_group_map_type );
+        if( pps->i_slice_group_map_type == 0 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                bs_write_ue( s, pps->i_run_length[i] - 1 );
+            }
+        }
+        else if( pps->i_slice_group_map_type == 2 )
+        {
+            for( i = 0; i < pps->i_num_slice_groups; i++ )
+            {
+                bs_write_ue( s, pps->i_top_left[i] );
+                bs_write_ue( s, pps->i_bottom_right[i] );
+            }
+        }
+        else if( pps->i_slice_group_map_type >= 3 &&
+                 pps->i_slice_group_map_type <= 5 )
+        {
+            bs_write( s, 1, pps->b_slice_group_change_direction );
+            /* slice_group_change_rate_minus1: the rate, not the direction
+             * flag that was written here before */
+            bs_write_ue( s, pps->i_slice_group_change_rate - 1 );
+        }
+        else if( pps->i_slice_group_map_type == 6 )
+        {
+            /* each slice_group_id takes Ceil( Log2( num_slice_groups ) ) bits;
+             * at least 1 here because i_num_slice_groups > 1 */
+            int i_id_bits = 0;
+
+            while( ( 1 << i_id_bits ) < pps->i_num_slice_groups )
+            {
+                i_id_bits++;
+            }
+
+            bs_write_ue( s, pps->i_pic_size_in_map_units - 1 );
+            for( i = 0; i < pps->i_pic_size_in_map_units; i++ )
+            {
+                bs_write( s, i_id_bits, pps->i_slice_group_id[i] );
+            }
+        }
+    }
+
+    bs_write_ue( s, pps->i_num_ref_idx_l0_active - 1 );
+    bs_write_ue( s, pps->i_num_ref_idx_l1_active - 1 );
+    bs_write( s, 1, pps->b_weighted_pred );
+    bs_write( s, 2, pps->b_weighted_bipred );   /* 2 bit field */
+
+    /* both initial quantiser values are coded as an offset from 26 */
+    bs_write_se( s, pps->i_pic_init_qp - 26 );
+    bs_write_se( s, pps->i_pic_init_qs - 26 );
+    bs_write_se( s, pps->i_chroma_qp_index_offset );
+
+    bs_write( s, 1, pps->b_deblocking_filter_control );
+    bs_write( s, 1, pps->b_constrained_intra_pred );
+    bs_write( s, 1, pps->b_redundant_pic_cnt );
+
+    bs_rbsp_trailing( s );
+}
+
--- a/encoder/set.h
+++ b/encoder/set.h
@ -0,0 +1,32 @@
+/*****************************************************************************
+ * set.h: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: set.h,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _ENCODER_SET_H
+#define _ENCODER_SET_H 1
+
+/* *_init fill the parameter set structure from the encoder parameters
+ * (the PPS also records the id of the SPS it refers to); *_write serialise
+ * the structure into the bitstream s and end it with the RBSP trailing bits. */
+void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
+void x264_sps_write( bs_t *s, x264_sps_t *sps );
+void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
+void x264_pps_write( bs_t *s, x264_pps_t *pps );
+
+#endif
--- a/extras/getopt.c
+++ b/extras/getopt.c
@ -0,0 +1,503 @@
+/*	$NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $	*/
+
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#define REPLACE_GETOPT
+
+#define _DIAGASSERT(x) do {} while (0)
+
+#ifdef REPLACE_GETOPT
+#ifdef __weak_alias
+__weak_alias(getopt,_getopt)
+#endif
+int opterr = 1;	/* if error message should be printed */
+int optind = 1;	/* index into parent argv vector */
+int optopt = '?';	/* character checked for validity */
+int optreset;	/* reset getopt */
+char *optarg;	/* argument associated with option */
+#endif
+
+#ifdef __weak_alias
+__weak_alias(getopt_long,_getopt_long)
+#endif
+
+/* Program name printed in front of every warning (see _vwarnx).
+ * NOTE(review): __argv looks like the C runtime's global argument vector on
+ * Windows toolchains -- confirm for each supported compiler. */
+#ifndef __CYGWIN__
+#define __progname __argv[0]
+#else
+extern char *__progname;
+#endif
+
+/* The macros below read a variable named `options' (the short option
+ * string) from the scope they are expanded in. */
+
+/* a leading '-' or '+' in the option string is a mode flag, not an option */
+#define IGNORE_FIRST	(*options == '-' || *options == '+')
+/* warnings are printed when opterr is set and the option string does not
+ * start with ':' (or, after a mode flag, is not followed by ':') */
+#define PRINT_ERROR	((opterr) && ((*options != ':') \
+				      || (IGNORE_FIRST && options[1] != ':')))
+
+/* true UNLESS the environment variable POSIXLY_INCORRECT_GETOPT is set:
+ * POSIX behaviour is the default in this copy */
+#define IS_POSIXLY_CORRECT (getenv("POSIXLY_INCORRECT_GETOPT") == NULL)
+
+/* reorder argv so that non-options end up after the options */
+#define PERMUTE         (!IS_POSIXLY_CORRECT && !IGNORE_FIRST)
+/* XXX: GNU ignores PC if *options == '-' */
+/* hand each non-option back to the caller as the argument of option INORDER */
+#define IN_ORDER        (!IS_POSIXLY_CORRECT && *options == '-')
+
+/* return values */
+#define	BADCH	(int)'?'
+/* ':' when the option string asks for it with a leading ':', '?' otherwise */
+#define	BADARG		((IGNORE_FIRST && options[1] == ':') \
+			 || (*options == ':') ? (int)':' : (int)'?')
+#define INORDER (int)1
+
+static char EMSG[1];
+
+static int getopt_internal (int, char * const *, const char *);
+static int gcd (int, int);
+static void permute_args (int, int, int, char * const *);
+
+static char *place = EMSG; /* option letter processing */
+
+/* XXX: set optreset to 1 rather than these two */
+static int nonopt_start = -1; /* first non option argument (for permute) */
+static int nonopt_end = -1;   /* first option after non options (for permute) */
+
+/* Error messages */
+static const char recargchar[] = "option requires an argument -- %c";
+static const char recargstring[] = "option requires an argument -- %s";
+static const char ambig[] = "ambiguous option -- %.*s";
+static const char noarg[] = "option doesn't take an argument -- %.*s";
+static const char illoptchar[] = "unknown option -- %c";
+static const char illoptstring[] = "unknown option -- %s";
+
+/*
+ * Print "<progname>: " to stderr, then the message formatted from fmt/ap
+ * (skipped when fmt is NULL), then a newline.
+ */
+static void
+_vwarnx(const char *fmt, va_list ap)
+{
+	(void)fprintf(stderr, "%s: ", __progname);
+
+	if (fmt) {
+		(void)vfprintf(stderr, fmt, ap);
+	}
+
+	(void)fprintf(stderr, "\n");
+}
+
+/*
+ * printf-style front end of _vwarnx: collects the variable arguments and
+ * forwards them.
+ */
+static void
+warnx(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	_vwarnx(fmt, args);
+	va_end(args);
+}
+
+/*
+ * Greatest common divisor of a and b, by repeated remainder.
+ * b must not be zero: the very first step evaluates a % b.
+ */
+static int
+gcd(int a, int b)
+{
+	int rem;
+
+	for (rem = a % b; rem != 0; rem = a % b) {
+		a = b;
+		b = rem;
+	}
+
+	return b;
+}
+
+/*
+ * Swap two adjacent blocks of nargv in place: the non-options in
+ * [panonopt_start, panonopt_end) and the options in [panonopt_end, opt_end).
+ * Afterwards the options come first; the order inside each block is kept.
+ * The rotation is done as gcd(#non-options, #options) independent cycles.
+ */
+static void
+permute_args(int panonopt_start, int panonopt_end, int opt_end,
+    char * const *nargv)
+{
+	/* LINTED const cast */
+	char **argv_rw = (char **)nargv;
+	int n_nonopts = panonopt_end - panonopt_start;
+	int n_opts = opt_end - panonopt_end;
+	int n_cycles = gcd(n_nonopts, n_opts);
+	int cycle_len = (opt_end - panonopt_start) / n_cycles;
+	int c;
+
+	_DIAGASSERT(nargv != NULL);
+
+	for (c = 0; c < n_cycles; c++) {
+		int first = panonopt_end + c;
+		int cur = first;
+		int step;
+
+		for (step = 0; step < cycle_len; step++) {
+			char *tmp;
+
+			/* an option slot moves back over the non-options,
+			 * a non-option slot moves forward over the options */
+			cur += (cur >= panonopt_end) ? -n_nonopts : n_opts;
+
+			tmp = argv_rw[cur];
+			argv_rw[cur] = argv_rw[first];
+			argv_rw[first] = tmp;
+		}
+	}
+}
+
+/*
+ * getopt_internal --
+ *	Parse argc/argv argument vector.  Called by user level routines.
+ *  Returns -2 if -- is found (can be long option or end of options marker).
+ *
+ *	Other results: -1 when the argument vector is exhausted or, without
+ *	permutation, at the first non-option; INORDER (with optarg set) for a
+ *	non-option in IN_ORDER mode; BADCH for an unknown option letter;
+ *	BADARG when a required argument is missing; otherwise the option
+ *	letter.  -2 is also returned for "-W" when options contains "W;".
+ *	State kept between calls: place, nonopt_start, nonopt_end and the
+ *	globals optind, optarg, optopt and optreset.
+ */
+static int
+getopt_internal(nargc, nargv, options)
+	int nargc;
+	char * const *nargv;
+	const char *options;
+{
+	char *oli;				/* option letter list index */
+	int optchar;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	optarg = NULL;
+
+	/*
+	 * XXX Some programs (like rsyncd) expect to be able to
+	 * XXX re-initialize optind to 0 and have getopt_long(3)
+	 * XXX properly function again.  Work around this braindamage.
+	 */
+	if (optind == 0)
+		optind = 1;
+
+	if (optreset)
+		nonopt_start = nonopt_end = -1;
+start:
+	/* place is empty when the previous argv element is used up */
+	if (optreset || !*place) {		/* update scanning pointer */
+		optreset = 0;
+		if (optind >= nargc) {          /* end of argument vector */
+			place = EMSG;
+			if (nonopt_end != -1) {
+				/* do permutation, if we have to */
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			else if (nonopt_start != -1) {
+				/*
+				 * If we skipped non-options, set optind
+				 * to the first of them.
+				 */
+				optind = nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		/* anything not starting with '-', and a lone "-", is a non-option */
+		if ((*(place = nargv[optind]) != '-')
+		    || (place[1] == '\0')) {    /* found non-option */
+			place = EMSG;
+			if (IN_ORDER) {
+				/*
+				 * GNU extension:
+				 * return non-option as argument to option 1
+				 */
+				optarg = nargv[optind++];
+				return INORDER;
+			}
+			if (!PERMUTE) {
+				/*
+				 * if no permutation wanted, stop parsing
+				 * at first non-option
+				 */
+				return -1;
+			}
+			/* do permutation */
+			if (nonopt_start == -1)
+				nonopt_start = optind;
+			else if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				nonopt_start = optind -
+				    (nonopt_end - nonopt_start);
+				nonopt_end = -1;
+			}
+			optind++;
+			/* process next argument */
+			goto start;
+		}
+		/* first option after a run of skipped non-options: close the run */
+		if (nonopt_start != -1 && nonopt_end == -1)
+			nonopt_end = optind;
+		if (place[1] && *++place == '-') {	/* found "--" */
+			/* leave place just past "--" for the caller to inspect */
+			place++;
+			return -2;
+		}
+	}
+	/* look the next letter up, skipping a leading '-' or '+' mode flag */
+	if ((optchar = (int)*place++) == (int)':' ||
+	    (oli = strchr(options + (IGNORE_FIRST ? 1 : 0), optchar)) == NULL) {
+		/* option letter unknown or ':' */
+		if (!*place)
+			++optind;
+		if (PRINT_ERROR)
+			warnx(illoptchar, optchar);
+		optopt = optchar;
+		return BADCH;
+	}
+	if (optchar == 'W' && oli[1] == ';') {		/* -W long-option */
+		/* XXX: what if no long options provided (called by getopt)? */
+		if (*place)
+			return -2;
+
+		if (++optind >= nargc) {	/* no arg */
+			place = EMSG;
+			if (PRINT_ERROR)
+				warnx(recargchar, optchar);
+			optopt = optchar;
+			return BADARG;
+		} else				/* white space */
+			place = nargv[optind];
+		/*
+		 * Handle -W arg the same as --arg (which causes getopt to
+		 * stop parsing).
+		 */
+		return -2;
+	}
+	if (*++oli != ':') {			/* doesn't take argument */
+		/* advance to the next argv element once this one is used up */
+		if (!*place)
+			++optind;
+	} else {				/* takes (optional) argument */
+		optarg = NULL;
+		if (*place)			/* no white space */
+			optarg = place;
+		/* XXX: disable test for :: if PC? (GNU doesn't) */
+		else if (oli[1] != ':') {	/* arg not optional */
+			if (++optind >= nargc) {	/* no arg */
+				place = EMSG;
+				if (PRINT_ERROR)
+					warnx(recargchar, optchar);
+				optopt = optchar;
+				return BADARG;
+			} else
+				optarg = nargv[optind];
+		}
+		/* the rest of this argv element was consumed as the argument */
+		place = EMSG;
+		++optind;
+	}
+	/* dump back option letter */
+	return optchar;
+}
+
+#ifdef REPLACE_GETOPT
+/*
+ * getopt --
+ *	Parse argc/argv argument vector (short options only).
+ *
+ *	Thin wrapper around getopt_internal: its "found --" result (-2) is
+ *	turned into the end-of-options result (-1) after stepping over the
+ *	"--" and moving any skipped non-options behind the options.
+ *
+ * [eventually this will replace the real getopt]
+ */
+int
+getopt(int nargc, char * const *nargv, const char *options)
+{
+	int result;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+
+	result = getopt_internal(nargc, nargv, options);
+	if (result != -2)
+		return result;
+
+	/* step over the "--" element */
+	++optind;
+
+	/*
+	 * We found an option (--), so if we skipped non-options,
+	 * we have to permute.
+	 */
+	if (nonopt_end != -1) {
+		permute_args(nonopt_start, nonopt_end, optind, nargv);
+		optind -= nonopt_end - nonopt_start;
+	}
+	nonopt_start = nonopt_end = -1;
+
+	return -1;
+}
+#endif
+
+/*
+ * getopt_long --
+ *	Parse argc/argv argument vector.
+ *
+ *	Short options are handled by getopt_internal.  When it reports -2
+ *	("--" or "-W"), the text left in place is matched against
+ *	long_options; unambiguous abbreviations are accepted and an exact
+ *	match always wins.  On a match: if the entry has a flag pointer,
+ *	*flag is set to val and 0 is returned, otherwise val is returned;
+ *	*idx (when idx is not NULL) receives the index of the entry.
+ *	Returns -1 for a bare "--", BADCH for an unknown or ambiguous name
+ *	and BADARG for a missing or unexpected argument.
+ */
+int
+getopt_long(nargc, nargv, options, long_options, idx)
+	int nargc;
+	char * const *nargv;
+	const char *options;
+	const struct option *long_options;
+	int *idx;
+{
+	int retval;
+
+	_DIAGASSERT(nargv != NULL);
+	_DIAGASSERT(options != NULL);
+	_DIAGASSERT(long_options != NULL);
+	/* idx may be NULL */
+
+	if ((retval = getopt_internal(nargc, nargv, options)) == -2) {
+		char *current_argv, *has_equal;
+		size_t current_argv_len;
+		int i, match;
+
+		/* place holds the text after "--" (or the -W argument) */
+		current_argv = place;
+		match = -1;
+
+		optind++;
+		place = EMSG;
+
+		if (*current_argv == '\0') {		/* found "--" */
+			/*
+			 * We found an option (--), so if we skipped
+			 * non-options, we have to permute.
+			 */
+			if (nonopt_end != -1) {
+				permute_args(nonopt_start, nonopt_end,
+				    optind, nargv);
+				optind -= nonopt_end - nonopt_start;
+			}
+			nonopt_start = nonopt_end = -1;
+			return -1;
+		}
+		if ((has_equal = strchr(current_argv, '=')) != NULL) {
+			/* argument found (--option=arg) */
+			current_argv_len = has_equal - current_argv;
+			/* has_equal now points at the argument text */
+			has_equal++;
+		} else
+			current_argv_len = strlen(current_argv);
+
+		for (i = 0; long_options[i].name; i++) {
+			/* find matching long option */
+			if (strncmp(current_argv, long_options[i].name,
+			    current_argv_len))
+				continue;
+
+			if (strlen(long_options[i].name) ==
+			    (unsigned)current_argv_len) {
+				/* exact match */
+				match = i;
+				break;
+			}
+			if (match == -1)		/* partial match */
+				match = i;
+			else {
+				/* ambiguous abbreviation */
+				if (PRINT_ERROR)
+					warnx(ambig, (int)current_argv_len,
+					     current_argv);
+				optopt = 0;
+				return BADCH;
+			}
+		}
+		if (match != -1) {			/* option found */
+			/* "--name=value" given for an option that takes none */
+			if (long_options[match].has_arg == no_argument
+			    && has_equal) {
+				if (PRINT_ERROR)
+					warnx(noarg, (int)current_argv_len,
+					     current_argv);
+				/*
+				 * XXX: GNU sets optopt to val regardless of
+				 * flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				return BADARG;
+			}
+			if (long_options[match].has_arg == required_argument ||
+			    long_options[match].has_arg == optional_argument) {
+				if (has_equal)
+					optarg = has_equal;
+				else if (long_options[match].has_arg ==
+				    required_argument) {
+					/*
+					 * optional argument doesn't use
+					 * next nargv
+					 */
+					optarg = nargv[optind++];
+				}
+			}
+			if ((long_options[match].has_arg == required_argument)
+			    && (optarg == NULL)) {
+				/*
+				 * Missing argument; leading ':'
+				 * indicates no error should be generated
+				 */
+				if (PRINT_ERROR)
+					warnx(recargstring, current_argv);
+				/*
+				 * XXX: GNU sets optopt to val regardless
+				 * of flag
+				 */
+				if (long_options[match].flag == NULL)
+					optopt = long_options[match].val;
+				else
+					optopt = 0;
+				/* undo the optind++ done when taking nargv[optind] above */
+				--optind;
+				return BADARG;
+			}
+		} else {			/* unknown option */
+			if (PRINT_ERROR)
+				warnx(illoptstring, current_argv);
+			optopt = 0;
+			return BADCH;
+		}
+		/* report the match: through *flag (returning 0) or as val */
+		if (long_options[match].flag) {
+			*long_options[match].flag = long_options[match].val;
+			retval = 0;
+		} else
+			retval = long_options[match].val;
+		if (idx)
+			*idx = match;
+	}
+	return retval;
+}
--- a/extras/getopt.h
+++ b/extras/getopt.h
@ -0,0 +1,179 @@
+/* Declarations for getopt.
+   Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
+
+#ifndef _GETOPT_H
+
+#ifndef __need_getopt
+# define _GETOPT_H 1
+#endif
+
+/* If __GNU_LIBRARY__ is not already defined, either we are being used
+   standalone, or this is the first header included in the source file.
+   If we are being used with glibc, we need to include <features.h>, but
+   that does not exist if we are standalone.  So: if __GNU_LIBRARY__ is
+   not defined, include <ctype.h>, which will pull in <features.h> for us
+   if it's from glibc.  (Why ctype.h?  It's guaranteed to exist and it
+   doesn't flood the namespace with stuff the way some other headers do.)  */
+#if !defined __GNU_LIBRARY__
+# include <ctype.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+   When `getopt' finds an option that takes an argument,
+   the argument value is returned here.
+   Also, when `ordering' is RETURN_IN_ORDER,
+   each non-option ARGV-element is returned here.  */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+   This is used for communication to and from the caller
+   and for communication between successive calls to `getopt'.
+
+   On entry to `getopt', zero means this is the first call; initialize.
+
+   When `getopt' returns -1, this is the index of the first of the
+   non-option elements that the caller should itself scan.
+
+   Otherwise, `optind' communicates from one call to the next
+   how much of ARGV has been scanned so far.  */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+   for unrecognized options.  */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized.  */
+
+extern int optopt;
+
+#ifndef __need_getopt
+/* Describe the long-named options requested by the application.
+   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+   of `struct option' terminated by an element containing a name which is
+   zero.
+
+   The field `has_arg' is:
+   no_argument		(or 0) if the option does not take an argument,
+   required_argument	(or 1) if the option requires an argument,
+   optional_argument 	(or 2) if the option takes an optional argument.
+
+   If the field `flag' is not NULL, it points to a variable that is set
+   to the value given in the field `val' when the option is found, but
+   left unchanged if the option is not found.
+
+   To have a long-named option do something other than set an `int' to
+   a compiled-in constant, such as set a value from `optarg', set the
+   option's `flag' field to zero and its `val' field to a nonzero
+   value (the equivalent single-letter option character, if there is
+   one).  For long options that have a zero `flag' field, `getopt'
+   returns the contents of the `val' field.  */
+
+struct option
+{
+  /* Long option name, without the leading dashes; a null name ends the
+     vector.  */
+# if (defined __STDC__ && __STDC__) || defined __cplusplus
+  const char *name;
+# else
+  char *name;
+# endif
+  /* has_arg can't be an enum because some compilers complain about
+     type mismatches in all the code that assumes it is an int.  */
+  int has_arg;
+  /* If not NULL, *flag is set to VAL when the option is found.  */
+  int *flag;
+  /* Value stored through FLAG, or returned when FLAG is NULL.  */
+  int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct option'.  */
+
+# define no_argument		0
+# define required_argument	1
+# define optional_argument	2
+#endif	/* need getopt */
+
+
+/* Get definitions and prototypes for functions to process the
+   arguments in ARGV (ARGC of them, minus the program name) for
+   options given in OPTS.
+
+   Return the option character from OPTS just read.  Return -1 when
+   there are no more options.  For unrecognized options, or options
+   missing arguments, `optopt' is set to the option letter, and '?' is
+   returned.
+
+   The OPTS string is a list of characters which are recognized option
+   letters, optionally followed by colons, specifying that that letter
+   takes an argument, to be placed in `optarg'.
+
+   If a letter in OPTS is followed by two colons, its argument is
+   optional.  This behavior is specific to the GNU `getopt'.
+
+   The argument `--' causes premature termination of argument
+   scanning, explicitly telling `getopt' that there are no more
+   options.
+
+   If OPTS begins with `-', then non-option arguments are treated as
+   arguments to the option '\1'.  This behavior is specific to the GNU
+   `getopt'.  */
+
+#if (defined __STDC__ && __STDC__) || defined __cplusplus
+# ifdef __GNU_LIBRARY__
+/* Many other libraries have conflicting prototypes for getopt, with
+   differences in the consts, in stdlib.h.  To avoid compilation
+   errors, only prototype getopt for the GNU C library.  */
+extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
+# else /* not __GNU_LIBRARY__ */
+extern int getopt ();
+# endif /* __GNU_LIBRARY__ */
+
+# ifndef __need_getopt
+extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
+		        const struct option *__longopts, int *__longind);
+extern int getopt_long_only (int __argc, char *const *__argv,
+			     const char *__shortopts,
+		             const struct option *__longopts, int *__longind);
+
+/* Internal only.  Users should not call this directly.  */
+extern int _getopt_internal (int __argc, char *const *__argv,
+			     const char *__shortopts,
+		             const struct option *__longopts, int *__longind,
+			     int __long_only);
+# endif
+#else /* not __STDC__ */
+extern int getopt ();
+# ifndef __need_getopt
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+# endif
+#endif /* __STDC__ */
+
+#ifdef	__cplusplus
+}
+#endif
+
+/* Make sure we later can get all the definitions and declarations.  */
+#undef __need_getopt
+
+#endif /* getopt.h */
--- a/extras/stdint.h
+++ b/extras/stdint.h
@ -0,0 +1,186 @@
+/* ISO C9x  7.18  Integer types <stdint.h>
+ * Based on ISO/IEC SC22/WG14 9899 Committee draft (SC22 N2794)
+ *
+ *  THIS SOFTWARE IS NOT COPYRIGHTED
+ *
+ *  Contributor: Danny Smith <danny_r_smith_2001@yahoo.co.nz>
+ *
+ *  This source code is offered for use in the public domain. You may
+ *  use, modify or distribute it freely.
+ *
+ *  This code is distributed in the hope that it will be useful but
+ *  WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
+ *  DISCLAIMED. This includes but is not limited to warranties of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ *  Date: 2000-12-02
+ */
+
+
+#ifndef _STDINT_H
+#define _STDINT_H
+#define __need_wint_t
+#define __need_wchar_t
+#include <stddef.h>
+
+/* Integer typedefs for compilers that ship no <stdint.h>.
+ * The 64-bit types rely on the compiler-specific `__int64' keyword, and
+ * the widths chosen below assume 8-bit char, 16-bit short and 32-bit int.
+ */
+
+/* 7.18.1.1  Exact-width integer types */
+typedef signed char int8_t;
+typedef unsigned char   uint8_t;
+typedef short  int16_t;
+typedef unsigned short  uint16_t;
+typedef int  int32_t;
+typedef unsigned   uint32_t;
+typedef __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+
+/* 7.18.1.2  Minimum-width integer types */
+typedef signed char int_least8_t;
+typedef unsigned char   uint_least8_t;
+typedef short  int_least16_t;
+typedef unsigned short  uint_least16_t;
+typedef int  int_least32_t;
+typedef unsigned   uint_least32_t;
+typedef __int64  int_least64_t;
+typedef unsigned __int64   uint_least64_t;
+
+/*  7.18.1.3  Fastest minimum-width integer types
+ *  Not actually guaranteed to be fastest for all purposes
+ *  Here we use the exact-width types for 8 and 16-bit ints.
+ *
+ *  int_fast8_t is spelled `signed char' rather than plain `char': the
+ *  signedness of plain char is implementation-defined, and a signed type
+ *  is required for INT_FAST8_MIN (-128) to be representable.
+ */
+typedef signed char int_fast8_t;
+typedef unsigned char uint_fast8_t;
+typedef short  int_fast16_t;
+typedef unsigned short  uint_fast16_t;
+typedef int  int_fast32_t;
+typedef unsigned  int  uint_fast32_t;
+typedef __int64  int_fast64_t;
+typedef unsigned __int64   uint_fast64_t;
+
+/* 7.18.1.4  Integer types capable of holding object pointers */
+/* Left disabled: these definitions would only be correct with 32-bit
+   pointers.  */
+/*typedef int intptr_t;
+typedef unsigned uintptr_t;*/
+
+/* 7.18.1.5  Greatest-width integer types */
+typedef __int64  intmax_t;
+typedef unsigned __int64   uintmax_t;
+
+/* 7.18.2  Limits of specified-width integer types */
+/* C++ only sees these macros when __STDC_LIMIT_MACROS is defined before
+   inclusion; C always sees them.  */
+#if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS)
+
+/* 7.18.2.1  Limits of exact-width integer types */
+#define INT8_MIN (-128) 
+#define INT16_MIN (-32768)
+/* Written as (-MAX - 1) because the literal 2147483648 does not fit in a
+   32-bit int before the unary minus is applied.  */
+#define INT32_MIN (-2147483647 - 1)
+#define INT64_MIN  (-9223372036854775807LL - 1)
+
+#define INT8_MAX 127
+#define INT16_MAX 32767
+#define INT32_MAX 2147483647
+#define INT64_MAX 9223372036854775807LL
+
+#define UINT8_MAX 0xff /* 255U */
+#define UINT16_MAX 0xffff /* 65535U */
+#define UINT32_MAX 0xffffffff  /* 4294967295U */
+#define UINT64_MAX 0xffffffffffffffffULL /* 18446744073709551615ULL */
+
+/* 7.18.2.2  Limits of minimum-width integer types */
+#define INT_LEAST8_MIN INT8_MIN
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST64_MIN INT64_MIN
+
+#define INT_LEAST8_MAX INT8_MAX
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MAX INT64_MAX
+
+#define UINT_LEAST8_MAX UINT8_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+
+/* 7.18.2.3  Limits of fastest minimum-width integer types */
+#define INT_FAST8_MIN INT8_MIN
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST64_MIN INT64_MIN
+
+#define INT_FAST8_MAX INT8_MAX
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MAX INT64_MAX
+
+#define UINT_FAST8_MAX UINT8_MAX
+#define UINT_FAST16_MAX UINT16_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+#define UINT_FAST64_MAX UINT64_MAX
+
+/* 7.18.2.4  Limits of integer types capable of holding
+    object pointers */ 
+/* NOTE(review): these use the 32-bit limits, i.e. they assume 32-bit
+   pointers; the matching intptr_t/uintptr_t typedefs are commented out
+   in this header.  */
+#define INTPTR_MIN INT32_MIN
+#define INTPTR_MAX INT32_MAX
+#define UINTPTR_MAX UINT32_MAX
+
+/* 7.18.2.5  Limits of greatest-width integer types */
+#define INTMAX_MIN INT64_MIN
+#define INTMAX_MAX INT64_MAX
+#define UINTMAX_MAX UINT64_MAX
+
+/* 7.18.3  Limits of other integer types */
+/* NOTE(review): ptrdiff_t, sig_atomic_t and size_t are all taken to be
+   32 bits wide here -- confirm for the target platform.  */
+#define PTRDIFF_MIN INT32_MIN
+#define PTRDIFF_MAX INT32_MAX
+
+#define SIG_ATOMIC_MIN INT32_MIN
+#define SIG_ATOMIC_MAX INT32_MAX
+
+#define SIZE_MAX UINT32_MAX
+
+#ifndef WCHAR_MIN  /* also in wchar.h */ 
+#define WCHAR_MIN 0
+/* The cast makes this macro unusable inside #if directives.  */
+#define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */
+#endif
+
+/*
+ * wint_t is unsigned short for compatibility with MS runtime
+ */
+#define WINT_MIN 0
+/* As with WCHAR_MAX, the cast makes this unusable inside #if.  */
+#define WINT_MAX ((wint_t)-1) /* UINT16_MAX */
+
+#endif /* !defined ( __cplusplus) || defined __STDC_LIMIT_MACROS */
+
+
+/* 7.18.4  Macros for integer constants */
+/* C++ only sees these macros when __STDC_CONSTANT_MACROS is defined
+   before inclusion; C always sees them.  */
+#if !defined ( __cplusplus) || defined (__STDC_CONSTANT_MACROS)
+
+/* 7.18.4.1  Macros for minimum-width integer constants
+
+    According to Douglas Gwyn <gwyn@arl.mil>:
+	"This spec was changed in ISO/IEC 9899:1999 TC1; in ISO/IEC
+	9899:1999 as initially published, the expansion was required
+	to be an integer constant of precisely matching type, which
+	is impossible to accomplish for the shorter types on most
+	platforms, because C99 provides no standard way to designate
+	an integer constant with width less than that of type int.
+	TC1 changed this to require just an integer constant
+	*expression* with *promoted* type."
+
+	The trick used here is from Clive D W Feather.
+
+	Each macro expands to (X_MAX - X_MAX + (val)): the subtraction
+	contributes zero to the value, while the usual arithmetic
+	conversions against X_MAX determine the type of the result.
+*/
+
+#define INT8_C(val) (INT_LEAST8_MAX-INT_LEAST8_MAX+(val))
+#define INT16_C(val) (INT_LEAST16_MAX-INT_LEAST16_MAX+(val))
+#define INT32_C(val) (INT_LEAST32_MAX-INT_LEAST32_MAX+(val))
+#define INT64_C(val) (INT_LEAST64_MAX-INT_LEAST64_MAX+(val))
+
+#define UINT8_C(val) (UINT_LEAST8_MAX-UINT_LEAST8_MAX+(val))
+#define UINT16_C(val) (UINT_LEAST16_MAX-UINT_LEAST16_MAX+(val))
+#define UINT32_C(val) (UINT_LEAST32_MAX-UINT_LEAST32_MAX+(val))
+#define UINT64_C(val) (UINT_LEAST64_MAX-UINT_LEAST64_MAX+(val))
+
+/* 7.18.4.2  Macros for greatest-width integer constants */
+#define INTMAX_C(val) (INTMAX_MAX-INTMAX_MAX+(val))
+#define UINTMAX_C(val) (UINTMAX_MAX-UINTMAX_MAX+(val))
+
+#endif  /* !defined ( __cplusplus) || defined __STDC_CONSTANT_MACROS */
+
+#endif
--- a/testing/checkasm.c
+++ b/testing/checkasm.c
@ -0,0 +1,347 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#
+#include "core/common.h"
+#ifdef HAVE_MMXEXT
+#include "core/i386/pixel.h"
+#include "core/i386/dct.h"
+#include "core/i386/mc.h"
+#endif
+#ifdef HAVE_ALTIVEC
+#include "core/ppc/pixel.h"
+#endif
+
+/* buf1, buf2: initialised to random data by main(); tests only read them
+ * and must not write into them */
+uint8_t * buf1, * buf2;
+/* buf3, buf4: scratch buffers used to store the output of the C and the
+ * optimised routines respectively */
+uint8_t * buf3, * buf4;
+
+/* check_pixel: compare the optimised pixel routines against the C reference.
+ *
+ * For every slot of sad[], satd[] and avg[] that the optimised table
+ * provides, the C and optimised versions are run on the same 32-byte-stride
+ * input (buf1/buf2) and their results compared.
+ *
+ * Returns 0 when every comparison matched, -1 otherwise.
+ */
+static int check_pixel()
+{
+    x264_pixel_function_t pixel_c = {{0},{0},{0}};
+    x264_pixel_function_t pixel_asm = {{0}, {0},{0}};
+    int ret = 0, ok;
+    int i;
+
+    memset( &pixel_asm, 0, sizeof( x264_pixel_function_t ) );
+    x264_pixel_init( 0, &pixel_c );
+#ifdef HAVE_MMXEXT
+    x264_pixel_init( X264_CPU_MMX|X264_CPU_MMXEXT, &pixel_asm );
+#endif
+#ifdef HAVE_ALTIVEC
+    x264_pixel_altivec_init( &pixel_asm );
+#endif
+
+    /* sad: scalar results must be equal */
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        int res_c, res_asm;
+        if( pixel_asm.sad[i] )
+        {
+            res_c   = pixel_c.sad[i]( buf1, 32, buf2, 32 );
+            res_asm =  pixel_asm.sad[i]( buf1, 32, buf2, 32 );
+            if( res_c != res_asm )
+            {
+                ok = 0;
+                fprintf( stderr, "sad[%d]: %d != %d [FAILED]\n", i, res_c, res_asm );
+            }
+        }
+    }
+    if( ok )
+        fprintf( stderr, " - pixel sad :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel sad :           [FAILED]\n" );
+    }
+
+    /* satd: scalar results must be equal */
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        int res_c, res_asm;
+        if( pixel_asm.satd[i] )
+        {
+            res_c   = pixel_c.satd[i]( buf1, 32, buf2, 32 );
+            res_asm = pixel_asm.satd[i]( buf1, 32, buf2, 32 );
+            if( res_c != res_asm )
+            {
+                ok = 0;
+                fprintf( stderr, "satd[%d]: %d != %d [FAILED]\n", i, res_c, res_asm );
+            }
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - pixel satd :          [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel satd :          [FAILED]\n" );
+    }
+
+    /* avg: both versions start from identical copies of buf1 and must leave
+     * identical output behind.  The avg[] entries themselves are called here
+     * (calling satd[] would never modify buf3/buf4, so the comparison could
+     * never fail).
+     * NOTE(review): assumes avg[] takes (dst, dst_stride, src, src_stride),
+     * the same argument shape used for sad/satd above -- confirm against
+     * x264_pixel_function_t. */
+    for( i = 0, ok = 1; i < 7; i++ )
+    {
+        if( pixel_asm.avg[i] )
+        {
+            memcpy( buf3, buf1, 32*32 );
+            memcpy( buf4, buf1, 32*32 );
+            pixel_c.avg[i]( buf3, 32, buf2, 32 );
+            pixel_asm.avg[i]( buf4, 32, buf2, 32 );
+            if( memcmp( buf3, buf4, 32*32 ) )
+            {
+                ok = 0;
+                fprintf( stderr, "avg[%d]: [FAILED]\n", i );
+            }
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - pixel avg :           [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - pixel avg :           [FAILED]\n" );
+    }
+
+    return ret;
+}
+
+/* check_dct: compare the optimised DCT/IDCT routines against the C reference.
+ *
+ * Each routine present in the optimised table is run next to its C
+ * counterpart on the same input and the outputs are compared with memcmp.
+ * Returns 0 when every comparison matched, -1 otherwise.
+ */
+static int check_dct()
+{
+    x264_dct_function_t dct_c;
+    x264_dct_function_t dct_asm;
+    int ret = 0, ok;
+    /* Coefficient storage: dct1 receives the C output, dct2 the optimised one */
+    int16_t dct1[16][4][4] __attribute((aligned(16)));
+    int16_t dct2[16][4][4] __attribute((aligned(16)));
+
+    /* Entries left NULL in dct_asm are skipped by every test below */
+    memset( &dct_asm, 0, sizeof( dct_asm ) );
+    x264_dct_init( 0, &dct_c );
+#ifdef HAVE_MMXEXT
+    x264_dct_init( X264_CPU_MMX|X264_CPU_MMXEXT, &dct_asm );
+#endif
+/* TEST_DCT: run dct_c.name into t1 and dct_asm.name into t2 (both reading
+ * buf1/buf2 with stride 32) and compare the first `size' bytes. */
+#define TEST_DCT( name, t1, t2, size ) \
+    if( dct_asm.name ) \
+    { \
+        dct_c.name( t1, buf1, 32, buf2, 32 ); \
+        dct_asm.name( t2, buf1, 32, buf2, 32 ); \
+        if( memcmp( t1, t2, size ) ) \
+        { \
+            ok = 0; \
+            fprintf( stderr, #name " [FAILED]\n" ); \
+        } \
+    }
+    ok = 1;
+    /* sizes are in bytes: 16 int16_t coefficients per 4x4 block */
+    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
+    TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
+    TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
+    if( ok )
+        fprintf( stderr, " - sub_dctXxX :          [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - sub_dctXxX :          [FAILED]\n" );
+    }
+#undef TEST_DCT
+
+/* TEST_IDCT: start buf3 and buf4 from identical copies of buf1, apply the C
+ * routine to buf3 and the optimised one to buf4 with the same coefficients
+ * `t', then compare the two 32x32 buffers. */
+#define TEST_IDCT( name, t ) \
+    if( dct_asm.name ) \
+    { \
+        memcpy( buf3, buf1, 32*32 ); \
+        memcpy( buf4, buf1, 32*32 ); \
+        dct_c.name( buf3, 32, t ); \
+        dct_asm.name( buf4, 32, t ); \
+        if( memcmp( buf3, buf4, 32*32 ) ) \
+        { \
+            ok = 0; \
+            fprintf( stderr, #name " [FAILED]\n" ); \
+        } \
+    }
+    ok = 1;
+    /* NOTE(review): dct1 only holds defined values here if one of the
+     * sub*_dct tests above actually ran and filled it. */
+    TEST_IDCT( add4x4_idct, dct1[0] );
+    TEST_IDCT( add8x8_idct, dct1 );
+    TEST_IDCT( add16x16_idct, dct1 );
+    if( ok )
+        fprintf( stderr, " - add_idctXxX :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - add_idctXxX :         [FAILED]\n" );
+    }
+#undef TEST_IDCT
+
+    /* 4x4 DC transforms operate in place; the block-local dct1/dct2 below
+     * shadow the outer arrays and start from identical fixed values */
+    ok = 1;
+    if( dct_asm.dct4x4dc )
+    {
+        int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+        int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+
+        dct_c.dct4x4dc( dct1 );
+        dct_asm.dct4x4dc( dct2 );
+        if( memcmp( dct1, dct2, 32 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - dct4x4dc :        [FAILED]\n" );
+        }
+    }
+    if( dct_asm.idct4x4dc )
+    {
+        int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+        int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
+
+        dct_c.idct4x4dc( dct1 );
+        dct_asm.idct4x4dc( dct2 );
+        if( memcmp( dct1, dct2, 32 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - idct4x4dc :        [FAILED]\n" );
+        }
+    }
+    if( ok )
+        fprintf( stderr, " - (i)dct4x4dc :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - (i)dct4x4dc :         [FAILED]\n" );
+    }
+
+    /* 2x2 DC transforms: same scheme on 2x2 blocks (4 coefficients, 8 bytes) */
+    ok = 1;
+    if( dct_asm.dct2x2dc )
+    {
+        int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+        int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+
+        dct_c.dct2x2dc( dct1 );
+        dct_asm.dct2x2dc( dct2 );
+        if( memcmp( dct1, dct2, 4*2 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - dct2x2dc :        [FAILED]\n" );
+        }
+    }
+    if( dct_asm.idct2x2dc )
+    {
+        int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+        int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
+
+        dct_c.idct2x2dc( dct1 );
+        dct_asm.idct2x2dc( dct2 );
+        if( memcmp( dct1, dct2, 4*2 ) )
+        {
+            ok = 0;
+            fprintf( stderr, " - idct2x2dc :       [FAILED]\n" );
+        }
+    }
+
+    if( ok )
+        fprintf( stderr, " - (i)dct2x2dc :         [OK]\n" );
+    else {
+        ret = -1;
+        fprintf( stderr, " - (i)dct2x2dc :         [FAILED]\n" );
+    }
+
+
+    return ret;
+}
+
+/* Run the reference and the optimised motion-compensation function on the
+ * same source (stride 32) into two pre-filled destinations (stride 16) and
+ * report whether the first 16x16 bytes of the outputs differ. */
+static int mc_outputs_differ( x264_mc_function_t pf_ref, x264_mc_function_t pf_opt,
+                              uint8_t *p_src, uint8_t *p_out_ref, uint8_t *p_out_opt,
+                              int i_mvx, int i_mvy, int i_w, int i_h )
+{
+    memset( p_out_ref, 0xCD, i_h * 16 );
+    pf_ref( p_src, 32, p_out_ref, 16, i_mvx, i_mvy, i_w, i_h );
+    memset( p_out_opt, 0xCD, i_h * 16 );
+    pf_opt( p_src, 32, p_out_opt, 16, i_mvx, i_mvy, i_w, i_h );
+
+    return memcmp( p_out_ref, p_out_opt, 16*16 ) != 0;
+}
+
+/* check_mc: compare the optimised motion-compensation functions (index 0 is
+ * reported as luma, index 1 as chroma) against the C reference for every
+ * (dx,dy) in 0..3 x 0..3 and a fixed list of block sizes.
+ * Returns 0 when everything matched, -1 otherwise. */
+static int check_mc()
+{
+    x264_mc_function_t mc_c[2] = {0};
+    x264_mc_function_t mc_asm[2] = {0};
+    /* start two rows and two columns inside the 32x32 buffers */
+    uint8_t *p_src      = &buf1[2*32+2];
+    uint8_t *p_out_c    = &buf3[2*32+2];
+    uint8_t *p_out_asm  = &buf4[2*32+2];
+    int ok[2] = { 1, 1 };
+    int ret = 0;
+    int i_mv;
+
+    x264_mc_init( 0, mc_c );
+#ifdef HAVE_MMXEXT
+    x264_mc_mmxext_init( mc_asm );
+#endif
+
+    memset( buf3, 0, 32*32 );
+    memset( buf4, 0, 32*32 );
+
+    /* Do the MC: skipped entirely for slots the optimised table lacks */
+#define MC_TEST( t, w, h ) \
+    do { \
+        if( mc_asm[t] && \
+            mc_outputs_differ( mc_c[t], mc_asm[t], p_src, p_out_c, p_out_asm, dx, dy, w, h ) ) \
+        { \
+            fprintf( stderr, "mc["#t"][mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h );   \
+            ok[t] = 0; \
+        } \
+    } while( 0 )
+
+    /* i_mv walks dy = 0..3 (outer) and dx = 0..3 (inner) */
+    for( i_mv = 0; i_mv < 4 * 4; i_mv++ )
+    {
+        const int dx = i_mv & 3;
+        const int dy = i_mv >> 2;
+
+        MC_TEST( 0, 16, 16 );
+        MC_TEST( 0, 16, 8 );
+        MC_TEST( 0, 8, 16 );
+        MC_TEST( 0, 8, 8 );
+        MC_TEST( 0, 8, 4 );
+        MC_TEST( 0, 4, 8 );
+        MC_TEST( 0, 4, 4 );
+
+        MC_TEST( 1, 8, 8 );
+        MC_TEST( 1, 8, 4 );
+        MC_TEST( 1, 4, 8 );
+        MC_TEST( 1, 4, 4 );
+        MC_TEST( 1, 4, 2 );
+        MC_TEST( 1, 2, 4 );
+        MC_TEST( 1, 2, 2 );
+    }
+#undef MC_TEST
+
+    if( !ok[0] )
+    {
+        ret = -1;
+        fprintf( stderr, " - mc luma :             [FAILED]\n" );
+    }
+    else
+        fprintf( stderr, " - mc luma :             [OK]\n" );
+
+    if( !ok[1] )
+    {
+        ret = -1;
+        fprintf( stderr, " - mc chroma :           [FAILED]\n" );
+    }
+    else
+        fprintf( stderr, " - mc chroma :           [OK]\n" );
+
+    return ret;
+}
+
+/* main: allocate four 32x32 byte buffers, fill buf1/buf2 with pseudo-random
+ * bytes, then run the pixel, dct and mc comparisons.
+ * Returns 0 when every test passed, -1 otherwise (including allocation
+ * failure). */
+int main()
+{
+    int ret;
+    int i;
+
+    /* defined() is used so that an empty -DHAVE_ALTIVEC definition does not
+     * turn the directive into a preprocessing error */
+#if defined(HAVE_MMXEXT)
+    fprintf( stderr, "x264: MMXEXT against C\n" );
+#elif defined(HAVE_ALTIVEC)
+    fprintf( stderr, "x264: ALTIVEC against C\n" );
+#endif
+
+    buf1 = x264_malloc( 1024 ); /* 32 x 32 */
+    buf2 = x264_malloc( 1024 );
+    buf3 = x264_malloc( 1024 );
+    buf4 = x264_malloc( 1024 );
+    if( buf1 == NULL || buf2 == NULL || buf3 == NULL || buf4 == NULL )
+    {
+        fprintf( stderr, "x264: cannot allocate test buffers\n" );
+        return -1;
+    }
+
+    srand( x264_mdate() );
+
+    for( i = 0; i < 1024; i++ )
+    {
+        /* mask rather than "% 0xFF": the modulo form can never produce 255,
+         * so the all-ones byte would never be exercised */
+        buf1[i] = rand() & 0xFF;
+        buf2[i] = rand() & 0xFF;
+        buf3[i] = buf4[i] = 0;
+    }
+
+    /* each check returns 0 or -1, so the sum is 0 only if all passed */
+    ret = check_pixel() +
+          check_dct() +
+          check_mc();
+
+    if( ret == 0 )
+    {
+        fprintf( stderr, "x264: All tests passed Yeah :)\n" );
+        return 0;
+    }
+    fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
+    return -1;
+}
+
--- a/testing/edge-detec.c
+++ b/testing/edge-detec.c
--- a/testing/macroblock-dz.c
+++ b/testing/macroblock-dz.c
--- a/tools/.cvsignore
+++ b/tools/.cvsignore
@ -0,0 +1,2 @@
+xyuv
+avc2avi
--- a/tools/Jamfile
+++ b/tools/Jamfile
@ -0,0 +1,9 @@
+# Jamfile
+
+# Little tool to embed h264 into avi
+Main avc2avi : avc2avi.c ;
+
+# Little YUV I420 player
+LINKLIBS = `sdl-config --libs` ;
+Main xyuv : xyuv.c ;
+
--- a/tools/avc2avi.c
+++ b/tools/avc2avi.c
@ -0,0 +1,820 @@
+/*****************************************************************************
+ * avc2avi.c: raw h264 -> AVI
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: avc2avi.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <signal.h>
+#define _GNU_SOURCE
+#include <getopt.h>
+
+#ifdef _MSC_VER
+#include <io.h>     /* _setmode() */
+#include <fcntl.h>  /* _O_BINARY */
+#endif
+
+#include "../core/bs.h"
+
+/* Size in bytes of the input window `data' that main() refills from the
+ * input stream; also the size allocated for a decoded NAL payload. */
+#define DATA_MAX 3000000
+/* Input window: raw byte stream read from the input file */
+uint8_t data[DATA_MAX];
+
+/* Ctrl-C handler */
+/* i_ctrl_c is set to 1 by SigIntHandler and polled by main()'s loop.  It is
+ * declared volatile sig_atomic_t because that is the object type the C
+ * standard allows a signal handler to write to, and volatile keeps the
+ * polling loop from caching the value. */
+static volatile sig_atomic_t i_ctrl_c = 0;
+
+/* SigIntHandler: signal handler installed for SIGINT; only raises the flag.
+ * `a' is the signal number and is not used. */
+static void    SigIntHandler( int a )
+{
+    (void)a;
+    i_ctrl_c = 1;
+}
+
+/* cfg_t: command line configuration filled in by Parse() */
+typedef struct
+{
+    char *psz_fin;      /* input path; NULL, "" or "-" selects the standard stream */
+    char *psz_fout;     /* output path; NULL, "" or "-" selects the standard stream */
+
+    float f_fps;        /* frame rate, 25.0 unless overridden with -f */
+    char  fcc[4];       /* codec fourcc, not NUL terminated; "h264" by default,
+                         * space padded when a shorter one is given with -c */
+} cfg_t;
+
+/* vbuf_t: growable byte buffer used to accumulate the NALs of one frame */
+typedef struct
+{
+    int i_data;         /* number of bytes currently stored */
+    int i_data_max;     /* allocated size of p_data in bytes */
+    uint8_t *p_data;    /* heap storage, grown by vbuf_add() */
+} vbuf_t;
+
+/* vbuf_init allocates the storage, vbuf_add appends i_data bytes from
+ * p_data, vbuf_reset empties the buffer without releasing the storage. */
+void vbuf_init( vbuf_t * );
+void vbuf_add( vbuf_t *, int i_data, void *p_data );
+void vbuf_reset( vbuf_t * );
+
+/* avi_t: state of the AVI writer */
+typedef struct
+{
+    FILE *f;            /* output stream every avi_write_* helper writes to */
+
+    float f_fps;        /* frame rate written into the header */
+    char  fcc[4];       /* codec fourcc written into the stream header */
+
+    int   i_width;      /* picture size; main() copies it from the parsed */
+    int   i_height;     /* SPS just before calling avi_end() */
+
+    /* NOTE(review): presumably file offsets/sizes of the movi list and of the
+     * whole RIFF chunk; avi_write_header() writes i_riff - 8 as the RIFF size
+     * when i_riff > 0 and 0xFFFFFFFF otherwise -- confirm the others against
+     * avi_init()/avi_end(). */
+    int64_t i_movi;
+    int64_t i_movi_end;
+    int64_t i_riff;
+
+    int      i_frame;   /* written as the frame count in the avih header */
+    int      i_idx_max; /* NOTE(review): presumably capacity of idx -- confirm */
+    uint32_t *idx;      /* NOTE(review): presumably index entries -- confirm */
+} avi_t;
+
+/* main() calls avi_init once, avi_write once per assembled frame (the int
+ * is the key frame flag) and avi_end once at the end. */
+void avi_init( avi_t *, FILE *, float, char fcc[4] );
+void avi_write( avi_t *, vbuf_t *, int  );
+void avi_end( avi_t * );
+
+/* NAL unit types as stored in nal_t.i_type */
+enum nal_unit_type_e
+{
+    NAL_UNKNOWN = 0,
+    NAL_SLICE   = 1,
+    NAL_SLICE_DPA   = 2,
+    NAL_SLICE_DPB   = 3,
+    NAL_SLICE_DPC   = 4,
+    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+    NAL_SEI         = 6,    /* ref_idc == 0 */
+    NAL_SPS         = 7,
+    NAL_PPS         = 8
+    /* ref_idc == 0 for 6,9,10,11,12 */
+};
+/* NAL priorities as stored in nal_t.i_ref_idc */
+enum nal_priority_e
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+    NAL_PRIORITY_LOW        = 1,
+    NAL_PRIORITY_HIGH       = 2,
+    NAL_PRIORITY_HIGHEST    = 3,
+};
+
+/* nal_t: one decoded NAL unit, filled in by nal_decode() */
+typedef struct
+{
+    int i_ref_idc;  /* nal_priority_e */
+    int i_type;     /* nal_unit_type_e */
+
+    /* This data are raw payload */
+    int     i_payload;      /* payload size in bytes */
+    uint8_t *p_payload;     /* payload storage; main() allocates DATA_MAX bytes */
+} nal_t;
+
+/* h264_t: the small amount of stream state the parser keeps between NALs */
+typedef struct
+{
+    int i_width;                /* picture size in pixels from the last SPS, */
+    int i_height;               /* 0 until one has been parsed */
+
+    int i_nal_type;             /* type of the previously parsed NAL, -1 initially */
+    int i_ref_idc;              /* ref_idc of the previously parsed NAL, -1 initially */
+    int i_idr_pic_id;           /* idr_pic_id of the last IDR slice, -1 initially */
+    int i_frame_num;            /* frame_num of the last slice, -1 initially */
+
+    int b_key;                  /* set from the slice type of the last slice */
+    int i_log2_max_frame_num;   /* bit width of frame_num, taken from the SPS */
+} h264_t;
+
+/* h264_parser_init resets the state; h264_parser_parse inspects one NAL and
+ * sets *pb_nal_start to 1 when that NAL starts a new frame. */
+void h264_parser_init( h264_t * );
+void h264_parser_parse( h264_t *h, nal_t *n, int *pb_nal_start );
+
+
+/* Fills `nal' from i_data bytes at p_data (defined later in this file) */
+static int nal_decode( nal_t *nal, void *p_data, int i_data );
+
+/* Command line handling and per-NAL bookkeeping, defined below */
+static void Help( void );
+static int  Parse( int argc, char **argv, cfg_t * );
+static int  ParseNAL( nal_t *nal, avi_t *a, h264_t *h, int *pb_slice );
+
+/****************************************************************************
+ * main: read a raw h264 byte stream, split it into frames, write an AVI
+ ****************************************************************************
+ * The input is scanned for 00 00 01 start codes.  Each NAL is decoded just
+ * far enough (SPS / slice header) to detect frame boundaries; the NALs of a
+ * frame are accumulated in a vbuf_t and handed to avi_write() when the next
+ * frame starts.  The loop stops at end of input or on Ctrl-C.
+ *
+ * Returns 0 on success, -1 on command line, file or allocation errors.
+ ****************************************************************************/
+int main( int argc, char **argv )
+{
+    cfg_t cfg;
+
+    FILE    *fout;
+    FILE    *fin;
+
+    vbuf_t  vb;
+    avi_t   avi;
+    h264_t  h264;
+
+    nal_t nal;
+    int i_frame;
+    int i_data;
+    int b_eof;
+    int b_key;
+    int b_slice;
+
+#ifdef _MSC_VER
+    _setmode(_fileno(stdin), _O_BINARY);    /* thanks to Marcos Morais <morais at dee.ufcg.edu.br> */
+    _setmode(_fileno(stdout), _O_BINARY);
+#endif
+
+    /* Parse command line */
+    if( Parse( argc, argv, &cfg ) < 0 )
+    {
+        return -1;
+    }
+
+    /* Open input */
+    if( cfg.psz_fin == NULL || *cfg.psz_fin == '\0' || !strcmp( cfg.psz_fin, "-" ) )
+        fin = stdin;
+    else
+        fin = fopen( cfg.psz_fin, "rb" );
+    if( fin == NULL )
+    {
+        fprintf( stderr, "cannot open input file\n" );
+        return -1;
+    }
+
+    /* Open output: the default is stdout, as documented by Help() */
+    if( cfg.psz_fout == NULL || *cfg.psz_fout == '\0' || !strcmp( cfg.psz_fout, "-" ) )
+        fout = stdout;
+    else
+        fout = fopen( cfg.psz_fout, "wb" );
+    if( fout == NULL )
+    {
+        fprintf( stderr, "cannot open output file\n" );
+        fclose( fin );
+        return -1;
+    }
+
+    /* Alloc space for a nal, used for decoding pps/sps/slice header.
+     * Done before anything is written so that a failure leaves no partial
+     * header behind. */
+    nal.p_payload = malloc( DATA_MAX );
+    if( nal.p_payload == NULL )
+    {
+        fprintf( stderr, "cannot allocate nal buffer\n" );
+        fclose( fin );
+        fclose( fout );
+        return -1;
+    }
+
+    /* Init avi */
+    avi_init( &avi, fout, cfg.f_fps, cfg.fcc );
+
+    /* Init parser */
+    h264_parser_init( &h264 );
+
+    /* Control-C handler */
+    signal( SIGINT, SigIntHandler );
+
+    /* Init data */
+    b_eof = 0;
+    b_key = 0;
+    b_slice = 0;
+    i_frame = 0;
+    i_data  = 0;
+
+    vbuf_init( &vb );
+
+    /* split frame */
+    while( !i_ctrl_c )
+    {
+        uint8_t *p, *p_next, *end;
+        int i_size;
+
+        /* fill buffer */
+        if( i_data < DATA_MAX && !b_eof )
+        {
+            int i_read = fread( &data[i_data], 1, DATA_MAX - i_data, fin );
+            if( i_read <= 0 )
+                b_eof = 1;
+            else
+                i_data += i_read;
+        }
+        /* not even room for a start code: nothing left to do */
+        if( i_data < 3 )
+            break;
+
+        end = &data[i_data];
+
+        /* Search begin of a NAL */
+        p = &data[0];
+        while( p < end - 3 )
+        {
+            if( p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x01 )
+            {
+                break;
+            }
+            p++;
+        }
+
+        if( p >= end - 3 )
+        {
+            /* no start code in the whole window: drop it */
+            fprintf( stderr, "garbage (i_data = %d)\n", i_data );
+            i_data = 0;
+            continue;
+        }
+
+        /* Search end of NAL */
+        p_next = p + 3;
+        while( p_next < end - 3 )
+        {
+            if( p_next[0] == 0x00 && p_next[1] == 0x00 && p_next[2] == 0x01 )
+            {
+                break;
+            }
+            p_next++;
+        }
+
+        /* No further start code and the window is not full: the NAL runs up
+         * to the end of the available data */
+        if( p_next == end - 3 && i_data < DATA_MAX )
+            p_next = end;
+
+        /* Compute NAL size */
+        i_size = p_next - p - 3;
+        if( i_size <= 0 )
+        {
+            if( b_eof )
+                break;
+
+            fprintf( stderr, "nal too large (FIXME) ?\n" );
+            i_data = 0;
+            continue;
+        }
+
+        /* Nal start at p+3 with i_size length; only the first 2048 bytes
+         * are decoded since only the headers are inspected */
+        nal_decode( &nal, p +3, i_size < 2048 ? i_size : 2048 );
+
+        /* key flag of the frame accumulated so far, sampled before the
+         * parser updates it for the current NAL */
+        b_key = h264.b_key;
+
+        /* An SPS/PPS following slice data closes the current frame */
+        if( b_slice && vb.i_data && ( nal.i_type == NAL_SPS || nal.i_type == NAL_PPS ) )
+        {
+            avi_write( &avi, &vb, b_key );
+            vbuf_reset( &vb );
+            b_slice = 0;
+        }
+
+        /* Parse SPS/PPS/Slice */
+        if( ParseNAL( &nal, &avi, &h264, &b_slice ) && vb.i_data > 0 )
+        {
+            avi_write( &avi, &vb, b_key );
+            vbuf_reset( &vb );
+        }
+
+        /* fprintf( stderr, "nal:%d ref:%d\n", nal.i_type, nal.i_ref_idc ); */
+
+        /* Append NAL to buffer (start code included) */
+        vbuf_add( &vb, i_size + 3, p );
+
+        /* Remove this nal */
+        memmove( &data[0], p_next, end - p_next );
+        i_data -= p_next - &data[0];
+    }
+
+    /* flush the last frame */
+    if( vb.i_data > 0 )
+    {
+        avi_write( &avi, &vb, h264.b_key );
+    }
+
+    avi.i_width  = h264.i_width;
+    avi.i_height = h264.i_height;
+
+    avi_end( &avi );
+
+    /* free mem */
+    free( nal.p_payload );
+    free( vb.p_data );
+
+    fclose( fin );
+    fclose( fout );
+
+    return 0;
+}
+
+/*****************************************************************************
+ * Help: print the usage text on stderr
+ *****************************************************************************/
+static void Help( void )
+{
+    /* The text holds no conversion specification, so it is emitted verbatim */
+    static const char psz_usage[] =
+             "avc2avi\n"
+             "Syntax: avc2avi [options] [ -i input.h264 ] [ -o output.avi ]\n"
+             "\n"
+             "  -h, --help                  Print this help\n"
+             "\n"
+             "  -i, --input                 Specify input file (default: stdin)\n"
+             "  -o, --output                Specify output file (default: stdout)\n"
+             "\n"
+             "  -f, --fps <float>           Set FPS (default: 25.0)\n"
+             "  -c, --codec <string>        Set the codec fourcc (default: 'h264')\n"
+             "\n";
+
+    fputs( psz_usage, stderr );
+}
+
+/*****************************************************************************
+ * Parse: fill *cfg from the command line
+ *****************************************************************************
+ * Defaults: no input/output path (standard streams), 25 fps, fourcc "h264".
+ * Returns 0 on success and -1 when help was requested, an option is unknown,
+ * the fps value is not a positive number, or memory runs out.
+ *****************************************************************************/
+static int  Parse( int argc, char **argv, cfg_t *cfg )
+{
+    static const struct option long_options[] =
+    {
+        { "help",   no_argument,       NULL, 'h' },
+        { "input",  required_argument, NULL, 'i' },
+        { "output", required_argument, NULL, 'o' },
+        { "fps",    required_argument, NULL, 'f' },
+        { "codec",  required_argument, NULL, 'c' },
+        {0, 0, 0, 0}
+    };
+
+    /* Set default values */
+    cfg->psz_fin = NULL;
+    cfg->psz_fout = NULL;
+    cfg->f_fps = 25.0;
+    memcpy( cfg->fcc, "h264", 4 );
+
+    /* Parse command line options */
+    opterr = 0; /* no error message from getopt itself */
+    for( ;; )
+    {
+        int long_options_index;
+        int c;
+
+        c = getopt_long( argc, argv, "hi:o:f:c:",
+                         long_options, &long_options_index);
+
+        if( c == -1 )
+        {
+            break;
+        }
+
+        switch( c )
+        {
+            case 'h':
+                Help();
+                return -1;
+
+            case 0:
+                break;
+            case 'i':
+                /* a repeated option replaces the previous value */
+                free( cfg->psz_fin );
+                cfg->psz_fin = strdup( optarg );
+                if( cfg->psz_fin == NULL )
+                {
+                    fprintf( stderr, "out of memory\n" );
+                    return -1;
+                }
+                break;
+            case 'o':
+                free( cfg->psz_fout );
+                cfg->psz_fout = strdup( optarg );
+                if( cfg->psz_fout == NULL )
+                {
+                    fprintf( stderr, "out of memory\n" );
+                    return -1;
+                }
+                break;
+            case 'f':
+            {
+                /* The frame rate is later used as a divisor when the header
+                 * is written, so it must parse as a number and be positive.
+                 * Trailing characters after the number are tolerated, as
+                 * they were with atof(). */
+                char   *psz_end;
+                double d_fps = strtod( optarg, &psz_end );
+
+                if( psz_end == optarg || !( d_fps > 0.0 ) )
+                {
+                    fprintf( stderr, "invalid fps (%s)\n", optarg );
+                    return -1;
+                }
+                cfg->f_fps = d_fps;
+                break;
+            }
+            case 'c':
+            {
+                /* fourcc: at most 4 characters, space padded on the right */
+                size_t i_len = strlen( optarg );
+
+                memset( cfg->fcc, ' ', 4 );
+                memcpy( cfg->fcc, optarg, i_len < 4 ? i_len : 4 );
+                break;
+            }
+
+            default:
+                fprintf( stderr, "unknown option (%c)\n", optopt );
+                return -1;
+        }
+    }
+
+
+    return 0;
+}
+
+/*****************************************************************************
+ * h264_parser_*:
+ *****************************************************************************/
+/* h264_parser_init: put the parser in its "nothing seen yet" state:
+ * unknown picture size, no key frame, and -1 for every value that is later
+ * compared against the fields of incoming NALs. */
+void h264_parser_init( h264_t *h )
+{
+    /* values that are compared against later NALs */
+    h->i_frame_num  = -1;
+    h->i_idr_pic_id = -1;
+    h->i_ref_idc    = -1;
+    h->i_nal_type   = -1;
+
+    /* values taken from the stream headers */
+    h->i_log2_max_frame_num = 0;
+    h->b_key    = 0;
+    h->i_height = 0;
+    h->i_width  = 0;
+}
+/* h264_parser_parse: inspect one NAL and update the parser state.
+ *
+ *  h             parser state (picture size, frame_num, idr_pic_id, ...)
+ *  nal           decoded NAL (type, ref_idc and payload)
+ *  pb_nal_start  set to 1 when this NAL starts a new frame (SPS, PPS, a slice
+ *                whose frame_num differs from the previous one, or an IDR
+ *                slice following an IDR slice with another idr_pic_id),
+ *                0 otherwise
+ *
+ * An SPS updates i_log2_max_frame_num and the picture size; a slice updates
+ * b_key, i_frame_num and, for IDR slices, i_idr_pic_id.
+ */
+void h264_parser_parse( h264_t *h, nal_t *nal, int *pb_nal_start )
+{
+    bs_t s;
+    *pb_nal_start = 0;
+
+    if( nal->i_type == NAL_SPS || nal->i_type == NAL_PPS )
+        *pb_nal_start = 1;
+
+    bs_init( &s, nal->p_payload, nal->i_payload );
+    if( nal->i_type == NAL_SPS )
+    {
+        int i_tmp;
+
+        /* NOTE(review): presumably profile_idc (8), three constraint flags,
+         * five reserved bits and level_idc (8) -- confirm against the spec */
+        bs_skip( &s, 8 + 1+1+1 + 5 + 8 );
+        /* sps id */
+        bs_read_ue( &s );
+        /* Read i_log2_max_frame_num (coded minus 4) */
+        h->i_log2_max_frame_num = bs_read_ue( &s ) + 4;
+        /* Read poc_type */
+        i_tmp = bs_read_ue( &s );
+        if( i_tmp == 0 )
+        {
+            /* skip i_log2_max_poc_lsb */
+            bs_read_ue( &s );
+        }
+        else if( i_tmp == 1 )
+        {
+            int i_cycle;
+            /* skip b_delta_pic_order_always_zero */
+            bs_skip( &s, 1 );
+            /* skip i_offset_for_non_ref_pic */
+            bs_read_se( &s );
+            /* skip i_offset_for_top_to_bottom_field */
+            bs_read_se( &s );
+            /* read i_num_ref_frames_in_poc_cycle */
+            i_cycle = bs_read_ue( &s );
+            if( i_cycle > 256 ) i_cycle = 256;
+            /* The counter must be decremented: one offset is skipped per
+             * cycle entry, otherwise this loop never terminates */
+            while( i_cycle-- > 0 )
+            {
+                /* skip i_offset_for_ref_frame */
+                bs_read_se(&s );
+            }
+        }
+        /* i_num_ref_frames */
+        bs_read_ue( &s );
+        /* b_gaps_in_frame_num_value_allowed */
+        bs_skip( &s, 1 );
+
+        /* Read size (coded in macroblocks minus 1) */
+        h->i_width  = 16 * ( bs_read_ue( &s ) + 1 );
+        h->i_height = 16 * ( bs_read_ue( &s ) + 1 );
+
+        /* b_frame_mbs_only */
+        i_tmp = bs_read( &s, 1 );
+        if( i_tmp == 0 )
+        {
+            /* one more flag is present when not frame_mbs_only */
+            bs_skip( &s, 1 );
+        }
+        /* b_direct8x8_inference */
+        bs_skip( &s, 1 );
+
+        /* crop ? */
+        i_tmp = bs_read( &s, 1 );
+        if( i_tmp )
+        {
+            /* left */
+            h->i_width -= 2 * bs_read_ue( &s );
+            /* right */
+            h->i_width -= 2 * bs_read_ue( &s );
+            /* top */
+            h->i_height -= 2 * bs_read_ue( &s );
+            /* bottom */
+            h->i_height -= 2 * bs_read_ue( &s );
+        }
+
+        /* vui: ignored */
+    }
+    else if( nal->i_type >= NAL_SLICE && nal->i_type <= NAL_SLICE_IDR )
+    {
+        int i_tmp;
+
+        /* i_first_mb */
+        bs_read_ue( &s );
+        /* picture type */
+        switch( bs_read_ue( &s ) )
+        {
+            case 0: case 5: /* P */
+            case 1: case 6: /* B */
+            case 3: case 8: /* SP */
+                h->b_key = 0;
+                break;
+            case 2: case 7: /* I */
+                h->b_key = 1;
+                break;
+            case 4: case 9: /* ? */
+                h->b_key = 1;
+                break;
+            default:
+                /* unknown slice type: keep the previous key flag */
+                break;
+        }
+        /* pps id */
+        bs_read_ue( &s );
+
+        /* frame num */
+        i_tmp = bs_read( &s, h->i_log2_max_frame_num );
+
+        if( i_tmp != h->i_frame_num )
+            *pb_nal_start = 1;
+
+        h->i_frame_num = i_tmp;
+
+        if( nal->i_type == NAL_SLICE_IDR )
+        {
+            /* idr_pic_id: two consecutive IDR slices with different ids
+             * belong to different frames */
+            i_tmp = bs_read_ue( &s );
+            if( h->i_nal_type == NAL_SLICE_IDR && h->i_idr_pic_id != i_tmp )
+                *pb_nal_start = 1;
+
+            h->i_idr_pic_id = i_tmp;
+        }
+    }
+    /* remember this NAL for the comparison done on the next one */
+    h->i_nal_type = nal->i_type;
+    h->i_ref_idc = nal->i_ref_idc;
+}
+
+
+/* ParseNAL: run the parser on one NAL and decide whether the frame being
+ * accumulated must be flushed first.
+ *
+ * Returns 1 when the NAL starts a new frame while slice data is pending
+ * (in which case *pb_slice is cleared), 0 otherwise.  *pb_slice is then set
+ * again if the NAL itself is a slice.  `a' is not used.
+ */
+static int  ParseNAL( nal_t *nal, avi_t *a, h264_t *h, int *pb_slice )
+{
+    int b_new_frame;
+    int b_flush;
+
+    h264_parser_parse( h, nal, &b_new_frame );
+
+    b_flush = ( b_new_frame && *pb_slice ) ? 1 : 0;
+    if( b_flush )
+        *pb_slice = 0;
+
+    switch( nal->i_type )
+    {
+        case NAL_SLICE:
+        case NAL_SLICE_DPA:
+        case NAL_SLICE_DPB:
+        case NAL_SLICE_DPC:
+        case NAL_SLICE_IDR:
+            *pb_slice = 1;
+            break;
+        default:
+            break;
+    }
+
+    return b_flush;
+}
+
+/*****************************************************************************
+ * vbuf: variable buffer
+ *****************************************************************************/
+/* vbuf_init: start with an empty buffer of 10000 bytes.
+ * If the allocation fails the buffer is left with a NULL pointer and a
+ * capacity of 0. */
+void vbuf_init( vbuf_t *v )
+{
+    v->i_data = 0;
+    v->i_data_max = 10000;
+    v->p_data = malloc( v->i_data_max );
+    if( v->p_data == NULL )
+    {
+        /* Record a zero capacity so that the first vbuf_add() goes through
+         * its grow path (realloc) instead of copying into a NULL buffer */
+        v->i_data_max = 0;
+    }
+}
+/* vbuf_add: append i_data bytes from p_data, growing the storage when the
+ * new total would reach the current capacity.
+ * A non-positive i_data is ignored.  The function cannot report failure to
+ * its caller, so running out of memory terminates the program with a
+ * message. */
+void vbuf_add( vbuf_t *v, int i_data, void *p_data )
+{
+    if( i_data <= 0 )
+        return;
+
+    if( i_data + v->i_data >= v->i_data_max )
+    {
+        int     i_new_max = v->i_data_max + i_data;
+        /* keep the old pointer until realloc is known to have succeeded */
+        uint8_t *p_new = realloc( v->p_data, i_new_max );
+
+        if( p_new == NULL )
+        {
+            fprintf( stderr, "vbuf_add: out of memory\n" );
+            exit( 1 );
+        }
+        v->p_data = p_new;
+        v->i_data_max = i_new_max;
+    }
+    memcpy( &v->p_data[v->i_data], p_data, i_data );
+
+    v->i_data += i_data;
+}
+/* vbuf_reset: forget the stored bytes; the allocated storage and its
+ * capacity are kept for reuse */
+void vbuf_reset( vbuf_t *v )
+{
+    v->i_data = 0;
+}
+
+/*****************************************************************************
+ * avi:
+ *****************************************************************************/
+/* avi_write_uint16: write w to the output as two bytes, least significant
+ * byte first */
+void avi_write_uint16( avi_t *a, uint16_t w )
+{
+    int i_shift;
+
+    for( i_shift = 0; i_shift < 16; i_shift += 8 )
+    {
+        fputc( ( w >> i_shift ) & 0xff, a->f );
+    }
+}
+
+/* Write a 32 bit value to a->f, least significant byte first. */
+void avi_write_uint32( avi_t *a, uint32_t dw )
+{
+    int i_shift;
+
+    for( i_shift = 0; i_shift < 32; i_shift += 8 )
+        fputc( ( dw >> i_shift ) & 0xff, a->f );
+}
+
+/* Write the four characters of fcc to a->f, in order. */
+void avi_write_fourcc( avi_t *a, char fcc[4] )
+{
+    int i;
+
+    for( i = 0; i < 4; i++ )
+        fputc( fcc[i], a->f );
+}
+
+/* Bits of the flags word of the 'avih' chunk; avi_write_header() sets all
+ * three of them. */
+#define AVIF_HASINDEX       0x00000010  /* an index is stored at the end of the file */
+#define AVIF_ISINTERLEAVED  0x00000100
+#define AVIF_TRUSTCKTYPE    0x00000800  /* chunk type may be used to find key frames */
+
+/* Bit of the flags word of an index entry; avi_write() sets it for key
+ * frames. */
+#define AVIIF_KEYFRAME      0x00000010L /* this frame is a key frame.*/
+
+/* Write the complete AVI header at the current position of a->f: the RIFF
+ * preamble, the 'hdrl' list ('avih' main header plus one 'strl' list holding
+ * 'strh' and 'strf' for the single video stream) and the opening of the
+ * 'movi' list.
+ *
+ * The function is called twice: by avi_init() while the sizes are still
+ * unknown (a->i_riff and a->i_movi_end are 0, so 0xFFFFFFFF placeholders are
+ * written) and by avi_end() after seeking back to offset 0, to patch in the
+ * final sizes and frame count. Both calls must emit exactly the same number
+ * of bytes.
+ *
+ * The trailing comments give the field names of the Microsoft AVI RIFF
+ * layout that each value lines up with. */
+void avi_write_header( avi_t *a )
+{
+    avi_write_fourcc( a, "RIFF" );
+    avi_write_uint32( a, a->i_riff > 0 ? a->i_riff - 8 : 0xFFFFFFFF ); /* file size minus this 8 byte preamble */
+    avi_write_fourcc( a, "AVI " );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  4 + 4*16 + 12 + 4*16 + 4*12 ); /* 'hdrl' tag + avih chunk + strl list header + strh chunk + strf chunk */
+    avi_write_fourcc( a, "hdrl" );
+
+    /* main header: 14 dwords */
+    avi_write_fourcc( a, "avih" );
+    avi_write_uint32( a, 4*16 - 8 );
+    avi_write_uint32( a, 1000000 / a->f_fps );  /* dwMicroSecPerFrame */
+    avi_write_uint32( a, 0xffffffff );          /* dwMaxBytesPerSec */
+    avi_write_uint32( a, 0 );                   /* dwPaddingGranularity */
+    avi_write_uint32( a, AVIF_HASINDEX|AVIF_ISINTERLEAVED|AVIF_TRUSTCKTYPE); /* dwFlags */
+    avi_write_uint32( a, a->i_frame );          /* dwTotalFrames */
+    avi_write_uint32( a, 0 );                   /* dwInitialFrames */
+    avi_write_uint32( a, 1 );                   /* dwStreams */
+    avi_write_uint32( a, 1000000 );             /* dwSuggestedBufferSize */
+    avi_write_uint32( a, a->i_width );          /* dwWidth */
+    avi_write_uint32( a, a->i_height );         /* dwHeight */
+    avi_write_uint32( a, 0 );                   /* dwReserved[4] */
+    avi_write_uint32( a, 0 );
+    avi_write_uint32( a, 0 );
+    avi_write_uint32( a, 0 );
+
+    avi_write_fourcc( a, "LIST" );
+    avi_write_uint32( a,  4 + 4*16 + 4*12 );    /* 'strl' tag + strh chunk + strf chunk */
+    avi_write_fourcc( a, "strl" );
+
+    /* stream header: 56 bytes */
+    avi_write_fourcc( a, "strh" );
+    avi_write_uint32( a,  4*16 - 8 );
+    avi_write_fourcc( a, "vids" );              /* fccType */
+    avi_write_fourcc( a, a->fcc );              /* fccHandler */
+    avi_write_uint32( a, 0 );                   /* dwFlags */
+    avi_write_uint32( a, 0 );                   /* wPriority + wLanguage */
+    avi_write_uint32( a, 0 );                   /* dwInitialFrames */
+    avi_write_uint32( a, 1000 );                /* dwScale */
+    avi_write_uint32( a, a->f_fps * 1000 );     /* dwRate (rate / scale = fps) */
+    avi_write_uint32( a, 0 );                   /* dwStart */
+    avi_write_uint32( a, a->i_frame );          /* dwLength */
+    avi_write_uint32( a, 1024*1024 );           /* dwSuggestedBufferSize */
+    avi_write_uint32( a, -1 );                  /* dwQuality */
+    avi_write_uint32( a, a->i_width * a->i_height ); /* dwSampleSize */
+    avi_write_uint32( a, 0 );                   /* rcFrame: left, top (two 16 bit values) */
+    avi_write_uint16( a, a->i_width );          /* rcFrame: right */
+    avi_write_uint16( a, a->i_height );         /* rcFrame: bottom */
+
+    /* stream format: a 40 byte BITMAPINFOHEADER */
+    avi_write_fourcc( a, "strf" );
+    avi_write_uint32( a,  4*12 - 8 );
+    avi_write_uint32( a,  4*12 - 8 );           /* biSize */
+    avi_write_uint32( a,  a->i_width );         /* biWidth */
+    avi_write_uint32( a,  a->i_height );        /* biHeight */
+    avi_write_uint16( a,  1 );                  /* biPlanes */
+    avi_write_uint16( a,  24 );                 /* biBitCount */
+    avi_write_fourcc( a,  a->fcc );             /* biCompression */
+    avi_write_uint32( a, a->i_width * a->i_height ); /* biSizeImage */
+    avi_write_uint32( a,  0 );                  /* biXPelsPerMeter */
+    avi_write_uint32( a,  0 );                  /* biYPelsPerMeter */
+    avi_write_uint32( a,  0 );                  /* biClrUsed */
+    avi_write_uint32( a,  0 );                  /* biClrImportant */
+
+    avi_write_fourcc( a, "LIST" );
+    /* a->i_movi is the offset just after the 'movi' tag, hence the + 4 */
+    avi_write_uint32( a,  a->i_movi_end > 0 ? a->i_movi_end - a->i_movi + 4: 0xFFFFFFFF );
+    avi_write_fourcc( a, "movi" );
+}
+
+/* Emit the 'idx1' chunk: its tag, its size (16 bytes per written frame) and
+ * the index entries accumulated in a->idx by avi_write(). */
+void avi_write_idx( avi_t *a )
+{
+    /* chunk header */
+    avi_write_fourcc( a, "idx1" );
+    avi_write_uint32( a,  a->i_frame * 16 );
+
+    /* chunk payload, one 16 byte record per frame */
+    fwrite( a->idx, 16, a->i_frame, a->f );
+}
+
+/* Initialise the muxer state and write a provisional header to f.
+ *
+ *  a     : state to initialise
+ *  f     : output stream, already opened by the caller
+ *  f_fps : frame rate stored in the header
+ *  fcc   : four character codec code (4 bytes are copied)
+ *
+ * On return a->i_movi holds the file offset that follows the header. */
+void avi_init( avi_t *a, FILE *f, float f_fps, char fcc[4] )
+{
+    /* parameters supplied by the caller */
+    a->f = f;
+    a->f_fps = f_fps;
+    memcpy( a->fcc, fcc, 4 );
+
+    /* empty index */
+    a->idx = NULL;
+    a->i_idx_max = 0;
+    a->i_frame = 0;
+
+    /* picture size and file offsets are not known yet */
+    a->i_width = 0;
+    a->i_height = 0;
+    a->i_riff = 0;
+    a->i_movi = 0;
+    a->i_movi_end = 0;
+
+    /* placeholder header, rewritten by avi_end() */
+    avi_write_header( a );
+
+    a->i_movi = ftell( a->f );
+}
+
+/* Store dw into the 4 bytes starting at _p, least significant byte first. */
+static void avi_set_dw( void *_p, uint32_t dw )
+{
+    uint8_t *p_out = _p;
+    int i;
+
+    for( i = 0; i < 4; i++ )
+        p_out[i] = ( dw >> ( 8 * i ) ) & 0xff;
+}
+
+/* Write the content of v as one '00dc' video chunk and record it in the
+ * in-memory index.
+ *
+ *  a     : muxer state; a->i_frame is incremented
+ *  v     : chunk payload (v->i_data bytes at v->p_data)
+ *  b_key : non-zero to flag the index entry as a key frame
+ *
+ * The index table grows by 1000 entries at a time. If it cannot be grown,
+ * the existing table is left untouched, an error is printed and the program
+ * exits (the function has no way to report failure).
+ *
+ * NOTE(review): the offset stored in the index entry is the absolute file
+ * position returned by ftell(); confirm that the intended readers accept
+ * absolute rather than 'movi'-relative offsets. */
+void avi_write( avi_t *a, vbuf_t *v, int b_key )
+{
+    int64_t i_pos = ftell( a->f );
+
+    /* chunk header */
+    avi_write_fourcc( a, "00dc" );
+    avi_write_uint32( a, v->i_data );
+
+    fwrite( v->p_data, v->i_data, 1, a->f );
+
+    if( v->i_data&0x01 )
+    {
+        /* pad to an even size */
+        fputc( 0, a->f );
+    }
+
+    /* Append idx chunk */
+    if( a->i_idx_max <= a->i_frame )
+    {
+        /* go through a temporary so the old table is not lost on failure */
+        void *p_new = realloc( a->idx, ( a->i_idx_max + 1000 ) * 16 );
+        if( p_new == NULL )
+        {
+            fprintf( stderr, "avi_write: out of memory\n" );
+            exit( 1 );
+        }
+        a->idx = p_new;
+        a->i_idx_max += 1000;
+    }
+
+    /* one entry is 4 dwords: chunk id, flags, offset, size */
+    memcpy( &a->idx[4*a->i_frame+0], "00dc", 4 );
+    avi_set_dw( &a->idx[4*a->i_frame+1], b_key ? AVIIF_KEYFRAME : 0 );
+    avi_set_dw( &a->idx[4*a->i_frame+2], i_pos );
+    avi_set_dw( &a->idx[4*a->i_frame+3], v->i_data );
+
+    a->i_frame++;
+}
+
+/* Finish the file: append the index, rewrite the header with the final
+ * sizes and print a short summary on stderr. */
+void avi_end( avi_t *a )
+{
+    /* the movi list stops where the index starts */
+    a->i_movi_end = ftell( a->f );
+    avi_write_idx( a );
+
+    /* total length, now that the index has been written */
+    a->i_riff = ftell( a->f );
+
+    /* overwrite the provisional header */
+    fseek( a->f, 0, SEEK_SET );
+    avi_write_header( a );
+
+    fprintf( stderr,
+             "avi file written\n"
+             "  - codec: %4.4s\n"
+             "  - size: %dx%d\n"
+             "  - fps: %.3f\n"
+             "  - frames: %d\n",
+             a->fcc, a->i_width, a->i_height, a->f_fps, a->i_frame );
+}
+
+/*****************************************************************************
+ * nal:
+ *****************************************************************************/
+/* Decode one NAL unit: read the header byte and copy the rest of the data
+ * to nal->p_payload with the 0x03 escape bytes removed.
+ *
+ *  nal    : receives i_type, i_ref_idc, the unescaped bytes (written to the
+ *           caller provided nal->p_payload buffer) and i_payload, the number
+ *           of bytes written
+ *  p_data : NAL data, starting at the header byte
+ *  i_data : number of bytes at p_data
+ *
+ * Returns 0 on success, -1 (with i_payload set to 0) when there is no input.
+ *
+ * Every 00 00 03 sequence is reduced to 00 00, including one that ends the
+ * NAL. The payload length is measured from the start of nal->p_payload, the
+ * buffer dst actually walks through. */
+int nal_decode( nal_t *nal, void *p_data, int i_data )
+{
+    uint8_t *src = p_data;
+    uint8_t *end;
+    uint8_t *dst = nal->p_payload;
+
+    if( src == NULL || i_data <= 0 )
+    {
+        nal->i_payload = 0;
+        return -1;
+    }
+    end = &src[i_data];
+
+    nal->i_type    = src[0]&0x1f;
+    nal->i_ref_idc = (src[0] >> 5)&0x03;
+
+    src++;
+
+    while( src < end )
+    {
+        /* three bytes must be readable before testing for an escape */
+        if( end - src >= 3 && src[0] == 0x00 && src[1] == 0x00  && src[2] == 0x03 )
+        {
+            *dst++ = 0x00;
+            *dst++ = 0x00;
+
+            src += 3;
+            continue;
+        }
+        *dst++ = *src++;
+    }
+
+    nal->i_payload = dst - (uint8_t*)nal->p_payload;
+    return 0;
+}
+
--- a/tools/x264-rd.sh
+++ b/tools/x264-rd.sh
@ -0,0 +1,32 @@
+#!/bin/sh
+
+X264="../x264"
+YUV="/usr/src/yuv/af-720x576.yuv"
+OUT="/tmp/x264-$$.h264"
+
+DAT="x264-rd.dat"
+
+OPTS="-c"
+
+# Init
+rm -f "$DAT"
+echo "#QP kb/s   PSNR Y     U     V     fps" > $DAT
+
+for qp in `seq 1 51`
+do
+    LOG="/tmp/x264-$qp-$$.log"
+    # clean
+    rm -f "$LOG"
+    # encode
+    $X264 "$YUV" -o "$OUT" --qp $qp $OPTS 2> "$LOG"
+    # gather stats
+    cat "$LOG" |
+    grep '^x264: overall' |
+    sed 's/^x264: overall PSNR Y:\([[:digit:]]*\.[[:digit:]]*\) U:\([[:digit:]]*\.[[:digit:]]*\) V:\([[:digit:]]*\.[[:digit:]]*\) kb\/s:\([[:digit:]]*\.[[:digit:]]*\) fps:\([[:digit:]]*\.[[:digit:]]*\)$/\1 \2 \3 \4 \5/g' |
+    awk -v QP=$qp '{ printf( "%2d %7.1f      %5.2f %5.2f %5.2f %5.3f\n", QP, $4, $1, $2, $3, $5 ); }' >> $DAT
+done
+
+# Clean
+rm -f "$OUT"
+rm -f "$LOG"
+
--- a/tools/xyuv.c
+++ b/tools/xyuv.c
@ -0,0 +1,607 @@
+/*****************************************************************************
+ * xyuv.c: an SDL YUV 4:2:0 planar viewer.
+ *****************************************************************************
+ * Copyright (C) 2004 Laurent Aimar
+ * $Id: xyuv.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <SDL/SDL.h>
+
+#define YUV_MAX 20
+#define SDL_TITLE "xyuv: %s - %d/%d - %.2ffps"
+/* Whole state of the viewer; a single instance (the global xyuv) exists. */
+typedef struct
+{
+    /* globals */
+    int     i_width;        /* width of one input picture, in pixels */
+    int     i_height;       /* height of one input picture, in pixels */
+    int     i_frame_size;   /* bytes per input frame: 3 * width * height / 2 */
+    int     i_frame;        /* frame currently shown, counted from 1 */
+    int     i_frames;       /* largest frame count among the opened files */
+    float   f_fps;          /* playback rate */
+
+    float   f_y;            /* factor applied to the luma samples; 0.0 disables it */
+
+    int     b_pause;        /* playback is paused */
+    int     b_grid;         /* draw the 16x16 grid */
+    int     b_split;        /* show the Y/U/V planes side by side */
+    int     b_diff;         /* show the difference between two files */
+    int     i_join;         /* column where file 1 (left) meets file 2 (right); used when > 0 */
+
+    /* Constructed picture */
+    int     i_wall_width;   /* in picture count */
+
+    /* YUV files */
+    int     i_yuv;          /* number of used entries in yuv[] */
+    struct
+    {
+        char    *name;      /* strdup()ed file name */
+        FILE    *f;         /* handles */
+        int     i_frames;   /* frames count */
+
+        /* Position in the whole picture */
+        int     x, y;
+    } yuv[YUV_MAX];
+
+    /* SDL */
+    int i_sdl_width;        /* size of the overlay (the constructed picture) */
+    int i_sdl_height;
+
+    int i_display_width;    /* size of the window, follows resize events */
+    int i_display_height;
+    char *title;            /* window caption, allocated by asprintf() */
+
+    SDL_Surface *screen;
+    SDL_Overlay *overlay;
+
+    /* buffer holding one input frame (i_frame_size bytes) */
+    uint8_t *pic;
+
+} xyuv_t;
+
+/* Viewer state with its start-up defaults; members that are not listed are
+ * zero-initialised. */
+xyuv_t xyuv = {
+    /* picture size and playback position */
+    .i_width  = 0,
+    .i_height = 0,
+    .i_frame  = 1,
+    .i_frames = 0,
+    .f_fps    = 25.0,
+
+    /* display options */
+    .f_y     = 0.0,
+    .b_pause = 0,
+    .b_grid  = 0,
+    .b_split = 0,
+    .b_diff  = 0,
+    .i_join  = -1,
+
+    .i_wall_width = 0,
+
+    /* no file opened yet */
+    .i_yuv = 0,
+
+    .title = NULL,
+    .pic   = NULL,
+};
+
+/* Print the command line syntax on stderr. The option names listed here are
+ * the ones main() accepts. */
+static void help( void )
+{
+    fprintf( stderr,
+             "Syntax: xyuv [options] file [file2 ...]\n"
+             "\n"
+             "      --help                  Print this help\n"
+             "\n"
+             "  -s, --size <WIDTHxHEIGHT>   Set input size\n"
+             "  -w, --width <integer>       Set width\n"
+             "  -h, --height <integer>      Set height\n"
+             "\n"
+             "  -S, --split                 Show split Y/U/V planes\n"
+             "  -d, --diff                  Show difference (only 2 files) in split mode\n"
+             "  -j, --join <integer>        Show file 1 left of column <integer> and\n"
+             "                              file 2 right of it (only 2 files)\n"
+             "\n"
+             "  -y <float>                  Set Y factor\n"
+             "\n"
+             "  -g, --grid                  Show a grid (macroblock 16x16)\n"
+             "  -W <integer>                Set wall width (in picture count)\n"
+             "  -f, --fps <float>           Set fps\n"
+             "\n" );
+}
+
+
+static void xyuv_display( xyuv_t *xyuv, int i_frame );
+
+/* Entry point of the viewer.
+ *
+ * Parses the options into the global xyuv state, opens the input files (at
+ * most YUV_MAX), works out the picture size (from the options or from a
+ * WIDTHxHEIGHT pattern in the first file name), lays the pictures out in the
+ * SDL overlay and then runs the display/event loop until the user quits.
+ *
+ * Returns 0 after --help and -1 on a usage or initialisation error; quitting
+ * from the event loop calls exit( 1 ). */
+int main( int argc, char **argv )
+{
+    int i;
+
+    /* Parse command line */
+    for( i = 1; i < argc; i++ ) {
+        if( !strcasecmp( argv[i], "--help" ) ) {
+            help();
+            return 0;
+        }
+        if( !strcmp( argv[i], "-d" ) || !strcasecmp( argv[i], "--diff" ) ) {
+            xyuv.b_diff = 1;
+        } else if( !strcmp( argv[i], "-S" ) || !strcasecmp( argv[i], "--split" ) ) {
+            xyuv.b_split = 1;
+        } else if( !strcmp( argv[i], "-f" ) || !strcasecmp( argv[i], "--fps" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.f_fps = atof( argv[++i] );
+        } else if( !strcmp( argv[i], "-h" ) || !strcasecmp( argv[i], "--height" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_height = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-w" ) || !strcasecmp( argv[i], "--width" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_width = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-s" ) || !strcasecmp( argv[i], "--size" ) ) {
+            char *p;
+
+            if( i >= argc -1 ) goto err_missing_arg;
+
+            xyuv.i_width = strtol( argv[++i], &p, 0 );
+            /* one separator character must follow the width; without this
+             * check the height would be read past the end of the string */
+            if( *p == '\0' ) {
+                fprintf( stderr, "invalid size '%s' (expected WIDTHxHEIGHT)\n", argv[i] );
+                return -1;
+            }
+            xyuv.i_height = atoi( p + 1 );
+        } else if( !strcmp( argv[i], "-W" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_wall_width = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-y" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.f_y = atof( argv[++i] );
+        } else if( !strcmp( argv[i], "-j" ) || !strcasecmp( argv[i], "--join" ) ) {
+            if( i >= argc -1 ) goto err_missing_arg;
+            xyuv.i_join = atoi( argv[++i] );
+        } else if( !strcmp( argv[i], "-g" ) || !strcasecmp( argv[i], "--grid" ) ) {
+            xyuv.b_grid = 1;
+        } else {
+            FILE *f;
+
+            /* xyuv.yuv[] only has room for YUV_MAX files */
+            if( xyuv.i_yuv >= YUV_MAX ) {
+                fprintf( stderr, "too many files (max %d), ignoring %s\n", YUV_MAX, argv[i] );
+                continue;
+            }
+
+            f = fopen( argv[i], "rb" );
+            if( !f ) {
+                fprintf( stderr, "cannot open YUV %s\n", argv[i] );
+            } else {
+                xyuv.yuv[xyuv.i_yuv].name = strdup( argv[i] );
+                xyuv.yuv[xyuv.i_yuv].f = f;
+                xyuv.yuv[xyuv.i_yuv].i_frames = 0;
+
+                xyuv.i_yuv++;
+            }
+        }
+    }
+
+    if( xyuv.i_yuv == 0 ) {
+        fprintf( stderr, "no file to display\n" );
+        return -1;
+    }
+    /* the frame period is computed as 1000 / fps in the display loop */
+    if( !( xyuv.f_fps > 0.0 ) ) {
+        fprintf( stderr, "invalid fps\n" );
+        return -1;
+    }
+    if( xyuv.i_width == 0 || xyuv.i_height == 0 ) {
+        char *psz = xyuv.yuv[0].name;
+        char *num;
+        char *x;
+        /* See if we find widthxheight in the file name */
+        for( ;; )
+        {
+            if( !( x = strchr( psz+1, 'x' ) ) )
+            {
+                break;
+            }
+            /* walk back over the digits that precede the 'x' */
+            num = x;
+            while( num > psz && num[-1] >= '0' && num[-1] <= '9' )
+                num--;
+
+            if( num != x && x[1] >= '0' && x[1] <= '9' )
+            {
+                xyuv.i_width = atoi( num );
+                xyuv.i_height = atoi( x+1 );
+                break;
+            }
+            psz = x;
+        }
+        fprintf( stderr, "file name gives %dx%d\n", xyuv.i_width, xyuv.i_height );
+    }
+    if( xyuv.i_width <= 0 || xyuv.i_height <= 0 ) {
+        fprintf( stderr, "invalid or missing frames size\n" );
+        return -1;
+    }
+    if( xyuv.b_diff && xyuv.i_yuv != 2 ) {
+        fprintf( stderr, "--diff works only with 2 files\n" );
+        return -1;
+    }
+    /* -1 means that --join was not given */
+    if( xyuv.i_join != -1 &&
+        ( xyuv.i_yuv != 2 || xyuv.i_join < 1 || xyuv.i_join >= xyuv.i_width ) ) {
+        fprintf( stderr, "--join works only with two files and range is [1, width-1]\n" );
+        return -1;
+    }
+    /* keep the join column even so that it also falls on a chroma sample */
+    if( xyuv.i_join > 0 && xyuv.i_join % 2 != 0 ) {
+        if( xyuv.i_join + 1 < xyuv.i_width )
+            xyuv.i_join++;
+        else
+            xyuv.i_join--;
+    }
+
+    /* Now check frames */
+    fprintf( stderr, "displaying :\n" );
+    xyuv.i_frames = 0;
+    xyuv.i_frame_size = 3 * xyuv.i_width * xyuv.i_height / 2;
+    for( i = 0; i < xyuv.i_yuv; i++ ) {
+        /* Ugly, but avoids using fstat */
+        fseek( xyuv.yuv[i].f, 0, SEEK_END );
+
+        xyuv.yuv[i].i_frames = ftell( xyuv.yuv[i].f ) / xyuv.i_frame_size;
+
+        fseek( xyuv.yuv[i].f, 0, SEEK_SET );
+
+        fprintf( stderr, " - '%s' : %d frames\n", xyuv.yuv[i].name, xyuv.yuv[i].i_frames );
+
+        if( xyuv.i_frames < xyuv.yuv[i].i_frames )
+            xyuv.i_frames = xyuv.yuv[i].i_frames;
+    }
+
+    if( xyuv.i_frames == 0 ) {
+        fprintf( stderr, "no frames to display\n" );
+    }
+
+    xyuv.pic = malloc( xyuv.i_frame_size );
+    if( xyuv.pic == NULL ) {
+        fprintf( stderr, "out of memory\n" );
+        return -1;
+    }
+
+    /* calculate SDL view */
+    if( xyuv.i_wall_width > xyuv.i_yuv ) {
+        xyuv.i_wall_width = xyuv.i_yuv;
+    }
+    if( xyuv.i_wall_width <= 0 ) {
+        /* not given (or nonsensical): smallest width giving a square wall */
+        xyuv.i_wall_width = 0;
+        while( xyuv.i_wall_width < xyuv.i_yuv && xyuv.i_wall_width * xyuv.i_wall_width < xyuv.i_yuv ) {
+            xyuv.i_wall_width++;
+        }
+    }
+
+    for( i = 0; i < xyuv.i_yuv; i++ ) {
+        if( xyuv.b_diff || xyuv.i_join > 0 ) {
+            xyuv.yuv[i].x = 0;
+            xyuv.yuv[i].y = 0;
+        } else if( xyuv.b_split ) {
+            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * 3 * xyuv.i_width / 2;
+            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
+        } else {
+            xyuv.yuv[i].x = (i%xyuv.i_wall_width) * xyuv.i_width;
+            xyuv.yuv[i].y = (i/xyuv.i_wall_width) * xyuv.i_height;
+        }
+    }
+    if( xyuv.b_diff ) {
+        xyuv.i_sdl_width = 3 * xyuv.i_width / 2;
+        xyuv.i_sdl_height= xyuv.i_height;
+    } else if( xyuv.i_join > 0 ) {
+        xyuv.i_sdl_width = xyuv.i_width;
+        xyuv.i_sdl_height= xyuv.i_height;
+    } else if( xyuv.b_split ) {
+        xyuv.i_sdl_width = xyuv.i_wall_width * 3 * xyuv.i_width / 2;
+        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
+    } else {
+        xyuv.i_sdl_width = xyuv.i_wall_width * xyuv.i_width;
+        xyuv.i_sdl_height= xyuv.i_height * ( ( xyuv.i_yuv  + xyuv.i_wall_width - 1 ) / xyuv.i_wall_width );
+    }
+    xyuv.i_display_width = xyuv.i_sdl_width;
+    xyuv.i_display_height = xyuv.i_sdl_height;
+
+    /* Open SDL */
+    if( SDL_Init( SDL_INIT_EVENTTHREAD|SDL_INIT_NOPARACHUTE|SDL_INIT_VIDEO) ) {
+        fprintf( stderr, "cannot init SDL\n" );
+        return -1;
+    }
+
+    SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, 100 );
+    SDL_EventState( SDL_KEYUP, SDL_IGNORE );
+
+    xyuv.screen = SDL_SetVideoMode( xyuv.i_sdl_width, xyuv.i_sdl_height, 0,
+                                    SDL_HWSURFACE|SDL_RESIZABLE|
+                                    SDL_ASYNCBLIT|SDL_HWACCEL );
+    if( xyuv.screen == NULL ) {
+        fprintf( stderr, "SDL_SetVideoMode failed\n" );
+        return -1;
+    }
+
+    SDL_LockSurface( xyuv.screen );
+    xyuv.overlay = SDL_CreateYUVOverlay( xyuv.i_sdl_width, xyuv.i_sdl_height,
+                                         SDL_YV12_OVERLAY,
+                                         xyuv.screen );
+    /* the overlay must be checked before its planes are touched */
+    if( xyuv.overlay == NULL ) {
+        SDL_UnlockSurface( xyuv.screen );
+        fprintf( stderr, "recon: SDL_CreateYUVOverlay failed\n" );
+        return -1;
+    }
+    /* reset with black */
+    memset( xyuv.overlay->pixels[0],   0, xyuv.overlay->pitches[0] * xyuv.i_sdl_height );
+    memset( xyuv.overlay->pixels[1], 128, xyuv.overlay->pitches[1] * xyuv.i_sdl_height / 2);
+    memset( xyuv.overlay->pixels[2], 128, xyuv.overlay->pitches[2] * xyuv.i_sdl_height / 2);
+    SDL_UnlockSurface( xyuv.screen );
+
+    for( ;; ) {
+        SDL_Event event;
+        int64_t i_start = SDL_GetTicks();
+        int i_wait;
+
+        if( !xyuv.b_pause ) {
+            xyuv_display( &xyuv, xyuv.i_frame );
+        }
+
+        /* handle events until the frame period has elapsed */
+        for( ;; ) {
+            int b_refresh = 0;
+            while( SDL_PollEvent( &event ) )  {
+                switch( event.type )
+                {
+                    case SDL_QUIT:
+                        exit( 1 );
+
+                    case SDL_KEYDOWN:
+                        switch( event.key.keysym.sym )
+                        {
+                            case SDLK_q:
+                            case SDLK_ESCAPE:
+                                exit(1);
+
+                            case SDLK_f:
+                                SDL_WM_ToggleFullScreen( xyuv.screen );
+                                break;
+
+                            case SDLK_g:
+                                if( xyuv.b_grid )
+                                    xyuv.b_grid = 0;
+                                else
+                                    xyuv.b_grid = 1;
+                                if( xyuv.b_pause )
+                                    b_refresh = 1;
+                                break;
+
+                            case SDLK_SPACE:
+                                if( xyuv.b_pause )
+                                    xyuv.b_pause = 0;
+                                else
+                                    xyuv.b_pause = 1;
+                                break;
+                            case SDLK_LEFT:
+                                if( xyuv.i_frame > 1 ) xyuv.i_frame--;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_RIGHT:
+                                if( xyuv.i_frame < xyuv.i_frames ) xyuv.i_frame++;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_HOME:
+                                xyuv.i_frame = 1;
+                                if( xyuv.b_pause )
+                                    b_refresh = 1;
+                                break;
+
+                            case SDLK_END:
+                                xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_UP:
+                                xyuv.i_frame += xyuv.i_frames / 20;
+                                if( xyuv.i_frame > xyuv.i_frames )
+                                    xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_DOWN:
+                                xyuv.i_frame -= xyuv.i_frames / 20;
+                                if( xyuv.i_frame < 1 )
+                                    xyuv.i_frame = 1;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_PAGEUP:
+                                xyuv.i_frame += xyuv.i_frames / 10;
+                                if( xyuv.i_frame > xyuv.i_frames )
+                                    xyuv.i_frame = xyuv.i_frames;
+                                b_refresh = 1;
+                                break;
+
+                            case SDLK_PAGEDOWN:
+                                xyuv.i_frame -= xyuv.i_frames / 10;
+                                if( xyuv.i_frame < 1 )
+                                    xyuv.i_frame = 1;
+                                b_refresh = 1;
+                                break;
+
+                            default:
+                                break;
+                        }
+                        break;
+                    case SDL_VIDEORESIZE:
+                        xyuv.i_display_width = event.resize.w;
+                        xyuv.i_display_height = event.resize.h;
+                        xyuv.screen = SDL_SetVideoMode( xyuv.i_display_width, xyuv.i_display_height, 0,
+                                                        SDL_HWSURFACE|SDL_RESIZABLE|
+                                                        SDL_ASYNCBLIT|SDL_HWACCEL );
+                        xyuv_display( &xyuv, xyuv.i_frame );
+                        break;
+
+                    default:
+                        break;
+                }
+            }
+            /* seeking pauses playback and redraws the selected frame */
+            if( b_refresh ) {
+                xyuv.b_pause = 1;
+                xyuv_display( &xyuv, xyuv.i_frame );
+            }
+            /* wait, in slices of at most 200 ms so that events are still polled */
+            i_wait = 1000 / xyuv.f_fps - ( SDL_GetTicks() - i_start);
+            if( i_wait < 0 )
+                break;
+            else if( i_wait > 200 )
+                SDL_Delay( 200 );
+            else {
+                SDL_Delay( i_wait );
+                break;
+            }
+        }
+        if( !xyuv.b_pause ) {
+            /* next frame */
+            if( xyuv.i_frame == xyuv.i_frames )
+                xyuv.b_pause = 1;
+            else if( xyuv.i_frame < xyuv.i_frames )
+                xyuv.i_frame++;
+        }
+    }
+
+
+    return 0;
+
+err_missing_arg:
+    fprintf( stderr, "missing arg for option=%s\n", argv[i] );
+    return -1;
+}
+
+
+/* Load frame i_frame (counted from 1) of every input file, compose the
+ * pictures into the overlay according to the current mode (diff, split,
+ * join or plain wall), apply the optional luma factor and grid, display the
+ * result and update the window caption.
+ *
+ * Files that do not have that many frames, or whose frame cannot be read,
+ * are skipped. Nothing is done when i_frame is outside [1, i_frames]. */
+static void xyuv_display( xyuv_t *xyuv, int i_frame )
+{
+    SDL_Rect rect;
+    int i_picture = 0;
+    int i;
+
+    if( i_frame < 1 || i_frame > xyuv->i_frames )
+        return;
+
+    xyuv->i_frame = i_frame;
+
+    /* Load and copy picture data */
+    for( i = 0; i < xyuv->i_yuv; i++ ) {
+        int i_plane;
+
+        if( i_frame - 1 >= xyuv->yuv[i].i_frames )
+            continue;
+
+        /* the offset is computed as a long: an int product overflows once
+         * the file position passes 2 GiB */
+        if( fseek( xyuv->yuv[i].f, (long)(xyuv->i_frame-1) * xyuv->i_frame_size, SEEK_SET ) != 0 ||
+            fread( xyuv->pic, xyuv->i_frame_size, 1, xyuv->yuv[i].f ) != 1 ) {
+            fprintf( stderr, "cannot read frame %d of '%s'\n", i_frame, xyuv->yuv[i].name );
+            continue;
+        }
+        i_picture++;
+
+        SDL_LockYUVOverlay( xyuv->overlay );
+
+        if( xyuv->b_diff || xyuv->b_split ) {
+            /* Reset UV */
+            for( i_plane = 1; i_plane < 3; i_plane++ ) {
+                memset( xyuv->overlay->pixels[i_plane], 128, xyuv->overlay->pitches[i_plane] * xyuv->overlay->h / 2 );
+            }
+            /* All three input planes are drawn into the Y plane of the
+             * overlay: the luma on the left, the two chroma planes stacked
+             * to its right. */
+
+            for( i_plane = 0; i_plane < 3; i_plane++ ) {
+                int div = i_plane == 0 ? 1 : 2;
+                uint8_t *src = xyuv->pic;
+                uint8_t *dst = xyuv->overlay->pixels[0] +
+                                (xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[0] );
+                int j;
+                if( i_plane == 1 ) {
+                    src +=  5*xyuv->i_width * xyuv->i_height/4;
+                    dst += xyuv->i_width;
+                } else if( i_plane == 2 ) {
+                    src += xyuv->i_width * xyuv->i_height;
+                    dst += xyuv->i_width + xyuv->i_height / 2 * xyuv->overlay->pitches[0];
+                }
+
+                for( j = 0; j < xyuv->i_height / div; j++ ) {
+                    if( i_picture == 1 || xyuv->b_split ) {
+                        memcpy( dst, src, xyuv->i_width / div );
+                    } else {
+                        /* diff mode, second picture: absolute difference
+                         * with what the first picture left in the overlay */
+                        int k;
+                        for( k = 0; k < xyuv->i_width / div; k++ ) {
+                            dst[k] = abs( dst[k] - src[k]);
+                        }
+                    }
+                    src += xyuv->i_width / div;
+                    dst += xyuv->overlay->pitches[0];
+                }
+            }
+        } else {
+            for( i_plane = 0; i_plane < 3; i_plane++ ) {
+                int div = i_plane == 0 ? 1 : 2;
+                uint8_t *src = xyuv->pic;
+                uint8_t *dst = xyuv->overlay->pixels[i_plane] +
+                                ((xyuv->yuv[i].x + xyuv->yuv[i].y * xyuv->overlay->pitches[i_plane] ) / div );
+                int w = xyuv->i_width / div;
+                int j;
+
+                /* overlay plane 1 is fed from the last quarter of the input
+                 * frame, plane 2 from the quarter that follows the luma */
+                if( i_plane == 1 ) {
+                    src +=  5*xyuv->i_width * xyuv->i_height/4;
+                } else if( i_plane == 2 ) {
+                    src += xyuv->i_width * xyuv->i_height;
+                }
+                if( xyuv->i_join > 0 ) {
+                    if( i_picture > 1 ) {
+                        /* second picture: columns right of the join */
+                        src += xyuv->i_join / div;
+                        dst += xyuv->i_join / div;
+                        w = (xyuv->i_width - xyuv->i_join) /div;
+                    } else {
+                        /* first picture: columns left of the join */
+                        w = xyuv->i_join / div;
+                    }
+                }
+
+                for( j = 0; j < xyuv->i_height / div; j++ ) {
+                    memcpy( dst, src, w );
+                    src += xyuv->i_width / div;
+                    dst += xyuv->overlay->pitches[i_plane];
+                }
+            }
+        }
+
+        SDL_UnlockYUVOverlay( xyuv->overlay );
+    }
+
+    /* scale the luma by f_y, clamped to [0, 255] */
+    if( xyuv->f_y != 0.0 ) {
+        uint8_t *pix = xyuv->overlay->pixels[0];
+        int j;
+
+        for( j = 0; j < xyuv->i_sdl_height; j++ ) {
+            int k;
+            for( k = 0; k < xyuv->i_sdl_width; k++ ) {
+                int v= pix[k] * xyuv->f_y;
+                if( v > 255 )
+                    pix[k] = 255;
+                else if( v < 0 )
+                    pix[k] = 0;
+                else
+                    pix[k] = v;
+            }
+            pix += xyuv->overlay->pitches[0];
+        }
+    }
+    /* dotted grid: one black luma sample every 4 pixels along the 16x16
+     * block boundaries */
+    if( xyuv->b_grid ) {
+        int x, y;
+
+        for( y = 0; y < xyuv->i_sdl_height; y += 4 ) {
+            uint8_t *p = xyuv->overlay->pixels[0] + y * xyuv->overlay->pitches[0];
+            for( x = 0; x < xyuv->i_sdl_width; x += 4 ) {
+                if( x%16== 0 || y%16 == 0 )
+                    p[x] = 0;
+            }
+        }
+    }
+
+    /* Update display */
+    rect.x = 0;
+    rect.y = 0;
+    rect.w = xyuv->i_display_width;
+    rect.h = xyuv->i_display_height;
+    SDL_DisplayYUVOverlay( xyuv->overlay, &rect );
+
+    /* Display title */
+    free( xyuv->title );
+    /* on failure asprintf() leaves the pointer undefined: reset it so that
+     * the next call does not free garbage */
+    if( asprintf( &xyuv->title, SDL_TITLE, xyuv->yuv[0].name, xyuv->i_frame, xyuv->i_frames, xyuv->f_fps ) < 0 ) {
+        xyuv->title = NULL;
+        return;
+    }
+    SDL_WM_SetCaption( xyuv->title, "" );
+}
+
+
+
+
--- a/vfw/build/cygwin/Makefile
+++ b/vfw/build/cygwin/Makefile
@ -0,0 +1,117 @@
+##############################################################################
+#
+# Makefile for x264 VFW driver
+#
+# Author: XviD project:
+#            - ??? <cutka at szm.sk>,
+#            - Edouard Gomez <ed.gomez at free.fr>
+#            - Peter Ross <pross@xvid.org>
+# Ported to x264 by Laurent Aimar <fenrir@via.ecp.fr>
+#
+# $Id: Makefile,v 1.1 2004/06/03 19:29:33 fenrir Exp $
+##############################################################################
+
+# Dll to build
+DLL=x264vfw.dll
+
+# Current dir
+DIR_CUR=$(shell pwd)
+
+# Paths to the include files, the library and the sources
+DIR_INC=$(DIR_CUR)/../../..
+DIR_LIB=$(DIR_CUR)/../../..
+DIR_SRC=$(DIR_CUR)/../..
+
+# Sources
+SRC_C= codec.c config.c driverproc.c
+SRC_RES= resource.rc
+
+# Alias
+RM= rm -rf
+WINDRES=windres
+
+##############################################################################
+# CFLAGS
+##############################################################################
+
+# Constants which should not be modified
+# The `mingw-runtime` package is required when building with -mno-cygwin
+CFLAGS += -I$(DIR_SRC)/w32api -I$(DIR_INC)
+CFLAGS += -D_WIN32_IE=0x0500
+CFLAGS += -mno-cygwin
+
+# Optional Compiler options
+CFLAGS += -Wall
+CFLAGS += -O2
+CFLAGS += -fstrength-reduce
+CFLAGS += -finline-functions
+CFLAGS += -fgcse
+CFLAGS += -freduce-all-givs
+CFLAGS += -ffast-math
+
+##############################################################################
+# Compiler flags for linking stage
+##############################################################################
+
+LDFLAGS += -L$(DIR_LIB) -lx264
+
+##############################################################################
+# Rules
+##############################################################################
+
+OBJECTS = $(SRC_C:.c=.obj)
+OBJECTS+= $(SRC_RES:.rc=.obj)
+
+.SUFFIXES: .obj .rc .c
+
+DIR_BUILD= $(DIR_CUR)/bin
+VPATH = $(DIR_SRC):$(DIR_BUILD)
+
+# These targets do not produce a file of the same name: declare them phony
+# so that a stray file called e.g. "clean" or "install" cannot disable them.
+.PHONY: all clean install uninstall
+
+all: $(DLL)
+
+$(DIR_BUILD):
+	@echo " D: $(DIR_BUILD)"
+	@mkdir -p $(DIR_BUILD)
+
+.rc.obj:
+	@echo " W: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(WINDRES) \
+	--include-dir=$(DIR_SRC) \
+	--input-format=rc \
+	--output-format=coff \
+	-o $(DIR_BUILD)/$@ $<
+
+.c.obj:
+	@echo " C: $(@D)/$(<F)"
+	@mkdir -p $(DIR_BUILD)/$(@D)
+	@$(CC) $(CFLAGS) -c -o $(DIR_BUILD)/$@ $<
+
+$(DLL): $(DIR_BUILD) $(OBJECTS)
+	@echo " L: $(@F)"
+	@cp $(DIR_SRC)/driverproc.def $(DIR_BUILD)/driverproc.def
+	@cd $(DIR_BUILD) && \
+	$(CC) \
+	-mno-cygwin -shared -Wl,-dll,--out-implib,$@.a,--enable-stdcall-fixup \
+	-o $@ \
+	$(OBJECTS) driverproc.def \
+	-lgdi32 -lwinmm -lcomdlg32 -lcomctl32 $(LDFLAGS) 
+
+clean:
+	@echo " Cl: Object files and target lib"
+	@$(RM) $(DIR_BUILD)
+
+install:
+	@echo " I: x264vfw.dll"
+	@cp $(DIR_BUILD)/$(DLL) $(DLL)
+	@cp $(DIR_SRC)/build/win32/bin/x264vfw.inf .
+	@rundll32.exe setupapi,InstallHinfSection DefaultInstall 132 ./x264vfw.inf
+	@rm $(DLL)
+	@rm x264vfw.inf
+
+uninstall:
+	@echo " U: x264vfw.dll"
+	@cp $(DIR_SRC)/build/win32/bin/x264vfw.inf .
+	@rundll32.exe setupapi,InstallHinfSection Remove_x264 132 ./x264vfw.inf
+	@rm x264vfw.inf
+
--- a/vfw/build/win32/bin/x264vfw.inf
+++ b/vfw/build/win32/bin/x264vfw.inf
@ -0,0 +1,91 @@
+; x264 Codec install
+
+[Version]
+Signature = "$CHICAGO$"
+Class = MEDIA
+
+[SourceDisksNames]
+1="x264 Codec Install Disk",, 0001
+
+[SourceDisksFiles]
+x264vfw.dll=1
+x264vfw.inf=1
+
+[Installable.Drivers]
+x264 = 1:x264vfw.dll, "vidc.X264", "x264 H.264 Video Codec" , , ,
+
+; Default install section: copies the INF and the codec DLL, adds the driver
+; line to system.ini, deletes the per-user settings key listed in
+; H264.DelConfig and adds the H264.AddReg/H264.AddReg9x registry entries.
+[DefaultInstall]
+CopyFiles=H264.Copy.Inf,H264.Copy
+Updateinis = H264.Updateini
+DelReg = H264.DelConfig
+addreg = H264.AddReg,H264.AddReg9x,H264.DoReg
+MediaType = SOFTWARE
+
+; NT/x86 flavour of the install section: same file copies and settings
+; clean-up, but uses H264.AddRegNT and has no system.ini update.
+[DefaultInstall.ntx86]
+CopyFiles=H264.Copy.Inf,H264.Copy
+DelReg = H264.DelConfig
+addreg = H264.AddReg,H264.AddRegNT,H264.DoReg
+MediaType = SOFTWARE
+
+; Uninstall section (named in the UninstallString values): removes the
+; registry entries, the copied files and the system.ini driver line.
+[Remove_x264]
+AddReg = H264.Unregister
+DelReg = H264.DelReg
+DelFiles = H264.Copy,H264.Copy.Inf
+UpdateInis = H264.DelIni
+
+[H264.Copy]
+x264vfw.dll
+
+[H264.Copy.Inf]
+x264vfw.inf
+
+[H264.UpdateIni]
+system.ini, drivers32,,"vidc.X264=x264vfw.dll"
+
+[H264.DelIni]
+system.ini, drivers32,"vidc.X264=x264vfw.dll",
+
+[H264.AddReg]
+
+[H264.AddReg9x]
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,Description,,%x264%
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,Driver,,x264vfw.dll
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264,FriendlyName,,"x264"
+
+HKLM,%UnInstallPath%,DisplayName,,%UninstallDispName%
+HKLM,%UnInstallPath%,UninstallString,,"%10%\rundll.exe setupx.dll,InstallHinfSection Remove_x264 132 %17%\%InfFile%"
+
+[H264.AddRegNT]
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers.desc,x264vfw.dll,,%x264%
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers32,vidc.X264,,x264vfw.dll
+
+HKLM,%UnInstallPath%,DisplayName,,%UninstallDispName%
+HKLM,%UnInstallPath%,UninstallString,,"%11%\rundll32.exe setupapi,InstallHinfSection Remove_x264 132 %17%\%InfFile%"
+
+[H264.DoReg]
+;HKLM,Software\Microsoft\Windows\CurrentVersion\RunOnce\Setup,"Registering x264 Direct Show ;Decoder...",,"%11%\regsvr32.exe /s %11%\x264.ax"
+
+[H264.DelReg]
+HKLM,SYSTEM\CurrentControlSet\Control\MediaResources\icm\vidc.X264
+
+HKLM,SOFTWARE\Microsoft\Windows NT\CurrentVersion\drivers.desc,x264vfw.dll,,""
+HKLM,%UnInstallPath%
+
+[H264.Unregister]
+;HKLM,Software\Microsoft\Windows\CurrentVersion\RunOnce\Setup,"Unregistering x264 Direct Show ;Decoder...",,"%11%\regsvr32.exe /s /u %11%\x264.ax"
+
+[H264.DelConfig]
+HKCU,Software\GNU\x264
+
+; Target directories for the file lists. The DLL goes to directory id 11;
+; the INF goes to directory id 17, which is where the UninstallString
+; entries look for it (%17%\%InfFile%).
+[DestinationDirs]
+DefaultDestDir = 11	; LDID_SYS
+H264.Copy = 11
+H264.Copy.Inf = 17
+
+[Strings]
+x264="x264 H.264 Video Codec"
+InfFile="x264vfw.inf"
+UninstallDispName="x264 H.264/AVC CODEC"
+UnInstallPath="Software\Microsoft\Windows\CurrentVersion\Uninstall\x264"
+MediaClassName="Media Devices"
+mfgname="Fenrir, Justin, CM"
--- a/vfw/build/win32/x264vfw.dsp
+++ b/vfw/build/win32/x264vfw.dsp
@ -0,0 +1,135 @@
+# Microsoft Developer Studio Project File - Name="x264vfw" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Dynamic-Link Library" 0x0102
+
+CFG=x264vfw - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "x264vfw.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "x264vfw.mak" CFG="x264vfw - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "x264vfw - Win32 Release" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE "x264vfw - Win32 Debug" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+MTL=midl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "x264vfw - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "obj/Release"
+# PROP Intermediate_Dir "obj/Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /c
+# ADD CPP /nologo /MT /W3 /GX /O2 /I "../../../extras" /I "../../.." /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /c
+# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386
+# ADD LINK32 winmm.lib vfw32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386 /out:"bin/x264vfw.dll"
+
+!ELSEIF  "$(CFG)" == "x264vfw - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "obj/Debug"
+# PROP Intermediate_Dir "obj/Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /GZ /c
+# ADD CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /I "../../../extras" /I "../../.." /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "VFW_EXPORTS" /YX /FD /GZ /c
+# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 winmm.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /out:"bin/x264vfw.dll" /pdbtype:sept
+
+!ENDIF 
+
+# Begin Target
+
+# Name "x264vfw - Win32 Release"
+# Name "x264vfw - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\codec.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\config.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\driverproc.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\driverproc.def
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\resource.rc
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\resource.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\x264vfw.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# Begin Source File
+
+SOURCE=..\..\..\build\win32\bin\libx264.lib
+# End Source File
+# End Target
+# End Project
--- a/vfw/build/win32/x264vfw.dsw
+++ b/vfw/build/win32/x264vfw.dsw
@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "x264vfw"=.\x264vfw.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
--- a/vfw/codec.c
+++ b/vfw/codec.c
@ -0,0 +1,276 @@
+/*****************************************************************************
+ * codec.c: vfw x264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: codec.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "x264vfw.h"
+
+/* get_csp:
+ *  Map a bitmap header onto the matching x264 colorspace.
+ *  Returns X264_CSP_NONE when the format is not supported. */
+static int get_csp( BITMAPINFOHEADER *hdr )
+{
+    /* RGB input with a non-negative height gets the vertical flip flag */
+    const int   i_flip = ( hdr->biHeight >= 0 ) ? X264_CSP_VFLIP : 0;
+    const DWORD i_fcc  = hdr->biCompression;
+
+    /* Planar 4:2:0 */
+    if( i_fcc == FOURCC_I420 || i_fcc == FOURCC_IYUV )
+        return X264_CSP_I420;
+    if( i_fcc == FOURCC_YV12 )
+        return X264_CSP_YV12;
+
+    /* Packed 4:2:2 */
+    if( i_fcc == FOURCC_YUYV || i_fcc == FOURCC_YUY2 )
+        return X264_CSP_YUYV;
+
+    /* Uncompressed RGB: only 24 and 32 bits per pixel are handled */
+    if( i_fcc == BI_RGB )
+    {
+        if( hdr->biBitCount == 24 )
+            return X264_CSP_BGR | i_flip;
+        if( hdr->biBitCount == 32 )
+            return X264_CSP_BGRA | i_flip;
+    }
+
+    return X264_CSP_NONE;
+}
+
+/* compress_query:
+ *  Test that we can do the compression.
+ *
+ *  lpbiInput  : format of the frames to compress
+ *  lpbiOutput : requested compressed format, or NULL to query the input only
+ *
+ *  Returns ICERR_OK when the conversion is supported, ICERR_BADFORMAT when
+ *  the input colorspace is unknown, the dimensions differ or are not
+ *  multiples of 16, or the output fourcc is not the configured one. */
+LRESULT compress_query( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    BITMAPINFOHEADER *inhdr = &lpbiInput->bmiHeader;
+    BITMAPINFOHEADER *outhdr;
+    CONFIG           *config = &codec->config;
+
+    if( get_csp( inhdr ) == X264_CSP_NONE )
+        return ICERR_BADFORMAT;
+
+    /* Input-only query: nothing more to check */
+    if( lpbiOutput == NULL )
+        return ICERR_OK;
+
+    /* Only touch the output header once we know it exists */
+    outhdr = &lpbiOutput->bmiHeader;
+
+    if( inhdr->biWidth != outhdr->biWidth ||
+        inhdr->biHeight != outhdr->biHeight )
+        return ICERR_BADFORMAT;
+
+    /* We need x16 width/height */
+    if( inhdr->biWidth % 16 != 0 || inhdr->biHeight % 16 != 0 )
+        return ICERR_BADFORMAT;
+
+    /* The OUTPUT must use our fourcc; the input fourcc is a raw colorspace
+     * (already validated by get_csp) and can never match it */
+    if( outhdr->biCompression != mmioFOURCC( config->fcc[0], config->fcc[1],
+                                             config->fcc[2], config->fcc[3] ) )
+        return ICERR_BADFORMAT;
+
+    return ICERR_OK;
+}
+
+/* compress_get_format:
+ *  Describe the compressed format produced for lpbiInput.
+ *  With a NULL lpbiOutput the size needed for the output header is returned;
+ *  otherwise the header is filled in and ICERR_OK is returned.
+ *  ICERR_BADFORMAT is returned for an unsupported input colorspace. */
+LRESULT compress_get_format( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    CONFIG           *config = &codec->config;
+    BITMAPINFOHEADER *inhdr  = &lpbiInput->bmiHeader;
+    BITMAPINFOHEADER *outhdr = &lpbiOutput->bmiHeader;
+
+    if( get_csp( inhdr ) == X264_CSP_NONE )
+        return ICERR_BADFORMAT;
+
+    /* Caller only wants to know how much room the format needs */
+    if( lpbiOutput == NULL )
+        return sizeof(BITMAPINFOHEADER);
+
+    /* Start from the input header, then override what differs */
+    memcpy( outhdr, inhdr, sizeof( BITMAPINFOHEADER ) );
+
+    outhdr->biSize          = sizeof( BITMAPINFOHEADER );
+    outhdr->biCompression   = mmioFOURCC( config->fcc[0], config->fcc[1],
+                                          config->fcc[2], config->fcc[3] );
+    outhdr->biXPelsPerMeter = 0;
+    outhdr->biYPelsPerMeter = 0;
+    outhdr->biClrUsed       = 0;
+    outhdr->biClrImportant  = 0;
+
+    /* Needs the dimensions copied above */
+    outhdr->biSizeImage = compress_get_size( codec, lpbiInput, lpbiOutput );
+
+    return ICERR_OK;
+}
+
+/* compress_get_size:
+ *  Return the size in bytes to reserve for one compressed frame of the
+ *  lpbiOutput dimensions: 6 bytes per pixel (2 * width * height * 3).
+ *  A negative (top-down) height is treated by its magnitude so that the
+ *  result is never negative. codec and lpbiInput are not used. */
+LRESULT compress_get_size( CODEC *codec, BITMAPINFO *lpbiInput, BITMAPINFO *lpbiOutput )
+{
+    LONG i_width  = lpbiOutput->bmiHeader.biWidth;
+    LONG i_height = lpbiOutput->bmiHeader.biHeight;
+
+    if( i_height < 0 )
+        i_height = -i_height;
+
+    return 2 * i_width * i_height * 3;
+}
+
+/* compress_frames_info:
+ *  Remember the timing of the sequence about to be compressed:
+ *  fbase receives icf->dwRate and fincr receives icf->dwScale.
+ *  Always returns ICERR_OK. */
+LRESULT compress_frames_info(CODEC * codec, ICCOMPRESSFRAMES * icf )
+{
+    codec->fbase = icf->dwRate;
+    codec->fincr = icf->dwScale;
+
+    return ICERR_OK;
+}
+
+/* compress_begin:
+ *  (Re)create the x264 encoder for the given input format, using the
+ *  settings stored in codec->config. Any encoder left open by a previous
+ *  call is closed first.
+ *  Returns ICERR_OK, or ICERR_ERROR if the encoder cannot be opened.
+ *  lpbiOutput is not used. */
+LRESULT compress_begin(CODEC * codec, BITMAPINFO * lpbiInput, BITMAPINFO * lpbiOutput )
+{
+    CONFIG *config = &codec->config;
+    x264_param_t param;
+
+    /* Destroy previous handle */
+    if( codec->h != NULL )
+    {
+        x264_encoder_close( codec->h );
+        codec->h = NULL;
+    }
+
+    /* Get default param */
+    x264_param_default( &param );
+
+    /* Set params: TODO to complete */
+    param.i_width = lpbiInput->bmiHeader.biWidth;
+    /* biHeight is negative for top-down bitmaps: the encoder needs the
+     * magnitude, the orientation is carried by the colorspace flags */
+    param.i_height= lpbiInput->bmiHeader.biHeight < 0 ?
+                        -lpbiInput->bmiHeader.biHeight :
+                         lpbiInput->bmiHeader.biHeight;
+
+    /* fbase holds dwRate and fincr holds dwScale (see compress_frames_info),
+     * so frames per second = fbase / fincr. Keep the default rate when the
+     * timing was never supplied or is degenerate. */
+    if( codec->fbase > 0 && codec->fincr > 0 )
+        param.f_fps   = (float)codec->fbase / (float)codec->fincr;
+
+    param.i_frame_reference = config->i_refmax;
+    param.i_idrframe = config->i_idrframe;
+    param.i_iframe   = config->i_iframe;
+    param.i_qp_constant = config->i_qp;
+    param.b_deblocking_filter = config->b_filter;
+    param.b_cabac = config->b_cabac;
+
+    /* Partition analysis: start from nothing and add what is enabled */
+    param.analyse.intra = 0;
+    param.analyse.inter = 0;
+    if( config->b_psub16x16 )
+        param.analyse.inter |= X264_ANALYSE_PSUB16x16;
+    if( config->b_psub8x8 )
+        param.analyse.inter |= X264_ANALYSE_PSUB8x8;
+    if( config->b_i4x4 )
+    {
+        param.analyse.intra |= X264_ANALYSE_I4x4;
+        param.analyse.inter |= X264_ANALYSE_I4x4;
+    }
+
+    /* Placeholder: no mode changes the parameters yet */
+    switch( config->mode )
+    {
+        case 0: /* 1 PASS */
+            break;
+        default:
+            break;
+    }
+
+    /* Open the encoder */
+    codec->h = x264_encoder_open( &param );
+    if( codec->h == NULL )
+        return ICERR_ERROR;
+
+    return ICERR_OK;
+}
+
+/* compress_end:
+ *  Close the encoder if one is open and forget its handle.
+ *  Always returns ICERR_OK. */
+LRESULT compress_end(CODEC * codec)
+{
+    /* Nothing to release */
+    if( codec->h == NULL )
+        return ICERR_OK;
+
+    x264_encoder_close( codec->h );
+    codec->h = NULL;
+
+    return ICERR_OK;
+}
+
+/* compress:
+ *  Encode the frame described by icc and write the resulting NAL units,
+ *  back to back, into icc->lpOutput.
+ *
+ *  On success icc->lpbiOutput->biSizeImage is set to the number of bytes
+ *  written and, when icc->lpdwFlags is provided, it receives
+ *  AVIIF_KEYFRAME for IDR pictures and 0 otherwise.
+ *
+ *  Returns ICERR_OK, ICERR_ERROR when no encoder is open, or
+ *  ICERR_BADFORMAT when the input colorspace is not supported. */
+LRESULT compress( CODEC *codec, ICCOMPRESS *icc )
+{
+    BITMAPINFOHEADER *inhdr = icc->lpbiInput;
+    BITMAPINFOHEADER *outhdr = icc->lpbiOutput;
+
+    x264_picture_t pic;
+
+    int        i_nal;
+    x264_nal_t *nal;
+    int        i_out;
+
+    int i;
+
+    /* No encoder: compress_begin was not called, or it failed */
+    if( codec->h == NULL )
+        return ICERR_ERROR;
+
+    /* Init the picture */
+    memset( &pic, 0, sizeof( x264_picture_t ) );
+    pic.img.i_csp = get_csp( inhdr );
+
+    /* For now biWidth can be divided by 16 so no problem */
+    switch( pic.img.i_csp & X264_CSP_MASK )
+    {
+        case X264_CSP_I420:
+        case X264_CSP_YV12:
+            /* Three planes stored one after the other in the input buffer:
+             * a full-size plane followed by two quarter-size planes */
+            pic.img.i_plane = 3;
+            pic.img.i_stride[0] = inhdr->biWidth;
+            pic.img.i_stride[1] =
+            pic.img.i_stride[2] = inhdr->biWidth / 2;
+
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            pic.img.plane[1]    = pic.img.plane[0] + inhdr->biWidth * inhdr->biHeight;
+            pic.img.plane[2]    = pic.img.plane[1] + inhdr->biWidth * inhdr->biHeight / 4;
+            break;
+
+        case X264_CSP_YUYV:
+            /* Packed, 2 bytes per pixel */
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 2 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        case X264_CSP_BGR:
+            /* Packed, 3 bytes per pixel */
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 3 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        case X264_CSP_BGRA:
+            /* Packed, 4 bytes per pixel */
+            pic.img.i_plane = 1;
+            pic.img.i_stride[0] = 4 * inhdr->biWidth;
+            pic.img.plane[0]    = (uint8_t*)icc->lpInput;
+            break;
+
+        default:
+            return ICERR_BADFORMAT;
+    }
+
+    /* encode it */
+    /* NOTE(review): the return value is ignored - confirm whether
+     * x264_encoder_encode can report a failure that should be propagated */
+    x264_encoder_encode( codec->h, &nal, &i_nal, &pic );
+
+    /* create bitstream */
+    i_out = 0;
+    for( i = 0; i < i_nal; i++ )
+    {
+        /* Room left in the output buffer, replaced by the NAL size */
+        int i_size = outhdr->biSizeImage - i_out;
+        x264_nal_encode( (uint8_t*)icc->lpOutput + i_out, &i_size, 1, &nal[i] );
+
+        i_out += i_size;
+    }
+    outhdr->biSizeImage = i_out;
+
+    /* Set key frame only for IDR, as they are real synch point, I frame
+       aren't always synch point (ex: with multi refs, ref marking) */
+    if( icc->lpdwFlags != NULL )
+    {
+        if( pic.i_type == X264_TYPE_IDR )
+            *icc->lpdwFlags = AVIIF_KEYFRAME;
+        else
+            *icc->lpdwFlags = 0;
+    }
+
+    return ICERR_OK;
+}
+
--- a/vfw/config.c
+++ b/vfw/config.c
@ -0,0 +1,443 @@
+/*****************************************************************************
+ * config.c: vfw x264 encoder
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: config.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/**************************************************************************
+ *
+ *  History:
+ *
+ *  2004.05.14  CBR encode mode support
+ *
+ **************************************************************************/
+
+#include "x264vfw.h"
+#include <stdio.h>  /* sprintf */
+#include <commctrl.h>
+
+/* Registry: settings live under X264_REG_KEY\X264_REG_PARENT\X264_REG_CHILD */
+#define X264_REG_KEY	HKEY_CURRENT_USER
+#define X264_REG_PARENT "Software\\GNU\\x264"
+#define X264_REG_CHILD  "x264"
+#define X264_REG_CLASS  "config"
+
+/* window controls: upper bounds of the bitrate and quantizer sliders */
+#define BITRATE_MAX		5000
+#define QUANT_MAX		51
+
+/* description: caption and text of the "load defaults" confirmation box */
+#define X264_NAME		"x264"
+#define X264_DEF_TEXT	"Are you sure you want to load default values"
+
+/* Registry handling */
+
+/* One integer setting: registry value name, where it is stored in the
+ * CONFIG being loaded/saved, and the value used when the registry has none */
+typedef struct
+{
+    char *reg_value;
+    int  *config_int;
+    int  default_int;
+} reg_int_t;
+
+/* One string setting: registry value name, destination buffer, and the
+ * string used when the registry has none */
+typedef struct
+{
+    char *reg_value;
+    char *config_str;
+    char *default_str;
+} reg_str_t;
+
+/* Staging copy of the configuration: the tables below point into it, and
+ * config_reg_load/config_reg_save copy it to/from the caller's CONFIG */
+CONFIG reg;
+
+/* Integer settings: registry value name, target field in reg, default */
+static const reg_int_t reg_int_table[] =
+{
+    /* Main dialog */
+    { "bitrate",        &reg.bitrate,           800 },
+    { "quantizer",      &reg.i_qp,              26 },
+    { "encoding_type",  &reg.i_encoding_type,   1 },
+
+    /* Advance dialog */
+    { "cabac",          &reg.b_cabac,           1 },
+    { "loop_filter",    &reg.b_filter,          1 },
+    { "idrframe",       &reg.i_idrframe,        1 },
+    { "iframe",         &reg.i_iframe,          150 },
+    { "refmax",         &reg.i_refmax,          1 },
+
+    /* analysis */
+    {"i4x4",            &reg.b_i4x4,            1 },
+    {"psub16x16",       &reg.b_psub16x16,       1 },
+    {"psub8x8",         &reg.b_psub8x8,         1 }
+};
+
+/* String settings; the load/save code transfers exactly 5 bytes per entry
+ * (4 fourcc characters plus the terminator) */
+static const reg_str_t reg_str_table[] =
+{
+    { "fourcc",         reg.fcc,                "x264" }
+};
+
+/* config_reg_load:
+ *  Fill *config from the registry. Every setting that cannot be read
+ *  (missing value, or the settings key itself cannot be opened) takes the
+ *  default listed in reg_int_table / reg_str_table.
+ *  The values are staged in the global reg and then copied to *config. */
+void config_reg_load( CONFIG *config )
+{
+    HKEY    hKey = NULL;
+    DWORD   i_size;
+    int     i;
+    int     b_open;
+
+    /* The key does not exist until settings are saved for the first time:
+     * remember whether we have a valid handle instead of using garbage */
+    b_open = RegOpenKeyEx( X264_REG_KEY, X264_REG_PARENT "\\" X264_REG_CHILD,
+                           0, KEY_READ, &hKey ) == ERROR_SUCCESS;
+
+    /* Read all integers */
+    for( i = 0; i < (int)( sizeof( reg_int_table )/sizeof( reg_int_t) ); i++ )
+    {
+        i_size = sizeof( int );
+        if( !b_open ||
+            RegQueryValueEx( hKey, reg_int_table[i].reg_value, 0, 0,
+                             (LPBYTE)reg_int_table[i].config_int,
+                             &i_size ) != ERROR_SUCCESS )
+            *reg_int_table[i].config_int = reg_int_table[i].default_int;
+    }
+
+    /* Read strings */
+    for( i = 0; i < (int)( sizeof( reg_str_table )/sizeof( reg_str_t) ); i++ )
+    {
+        i_size = 5;   /* fourcc + 1 FIXME ugly */
+        if( !b_open ||
+            RegQueryValueEx( hKey, reg_str_table[i].reg_value, 0, 0,
+                             (LPBYTE)reg_str_table[i].config_str,
+                             &i_size ) != ERROR_SUCCESS )
+            memcpy( reg_str_table[i].config_str,
+                    reg_str_table[i].default_str, 5 );
+    }
+
+    if( b_open )
+        RegCloseKey( hKey );
+
+    memcpy( config, &reg, sizeof( CONFIG ) );
+}
+
+/* config_reg_save:
+ *  Write *config to the registry, creating the settings key if needed.
+ *  Nothing is written (and the global reg is left alone) when the key
+ *  cannot be created or opened. */
+void config_reg_save( CONFIG *config )
+{
+    const reg_int_t *p_int;
+    const reg_str_t *p_str;
+    const reg_int_t *p_int_end = reg_int_table +
+                                 sizeof( reg_int_table )/sizeof( reg_int_t);
+    const reg_str_t *p_str_end = reg_str_table +
+                                 sizeof( reg_str_table )/sizeof( reg_str_t);
+    HKEY  hKey;
+    DWORD i_disposition;
+    LONG  i_ret;
+
+    i_ret = RegCreateKeyEx( X264_REG_KEY,
+                            X264_REG_PARENT "\\" X264_REG_CHILD,
+                            0,
+                            X264_REG_CLASS,
+                            REG_OPTION_NON_VOLATILE,
+                            KEY_WRITE,
+                            0, &hKey, &i_disposition );
+    if( i_ret != ERROR_SUCCESS )
+        return;
+
+    /* The tables point into reg, so stage the caller's values there */
+    memcpy( &reg, config, sizeof( CONFIG ) );
+
+    /* Integers are stored as REG_DWORD */
+    for( p_int = reg_int_table; p_int < p_int_end; p_int++ )
+    {
+        RegSetValueEx( hKey, p_int->reg_value, 0, REG_DWORD,
+                       (LPBYTE)p_int->config_int, sizeof( int ) );
+    }
+
+    /* Strings are stored as REG_SZ, 5 bytes each (FIXME) */
+    for( p_str = reg_str_table; p_str < p_str_end; p_str++ )
+    {
+        RegSetValueEx( hKey, p_str->reg_value, 0, REG_SZ,
+                       (LPBYTE)p_str->config_str, 5 );
+    }
+
+    RegCloseKey( hKey );
+}
+
+/* config_reg_defaults:
+ *  Reset the stored settings and reload *config.
+ *
+ *  The settings key is deleted (best effort), then *config is reloaded:
+ *  with the key gone, config_reg_load falls back to the built-in defaults.
+ *  The result is written back with config_reg_save.
+ *
+ *  A missing parent key is not an error: it simply means nothing was ever
+ *  saved. If the key exists but cannot be deleted, the reload returns
+ *  whatever is still stored rather than leaving *config zeroed. */
+void config_reg_defaults( CONFIG *config )
+{
+    HKEY hKey;
+
+    /* Just in case */
+    memset( config, 0, sizeof( CONFIG ) );
+
+    if( RegOpenKeyEx( X264_REG_KEY, X264_REG_PARENT, 0, KEY_ALL_ACCESS,
+                      &hKey ) == ERROR_SUCCESS )
+    {
+        /* Always release the handle, whether or not the delete worked */
+        RegDeleteKey( hKey, X264_REG_CHILD );
+        RegCloseKey( hKey );
+    }
+
+    config_reg_load( config );
+    config_reg_save( config );
+}
+
+/* main_enable_item:
+ *  Enable the controls that belong to the selected encoding type, grey out
+ *  the others, and (re)apply the ranges of both sliders. */
+static void main_enable_item( HWND hDlg, CONFIG * config )
+{
+    const int i_type = config->i_encoding_type;
+
+    /* 0 = 1 pass bitrate based, 1 = 1 pass quantizer based.
+     * 2 pass (type 2) is not yet implemented: controls are left untouched */
+    if( i_type == 0 || i_type == 1 )
+    {
+        const BOOL b_bitrate = ( i_type == 0 ) ? TRUE : FALSE;
+        const BOOL b_quant   = ( i_type == 1 ) ? TRUE : FALSE;
+
+        EnableWindow( GetDlgItem( hDlg, IDC_BITRATEEDIT ), b_bitrate );
+        EnableWindow( GetDlgItem( hDlg, IDC_BITRATESLIDER ), b_bitrate );
+
+        EnableWindow( GetDlgItem( hDlg, IDC_QUANTEDIT ), b_quant );
+        EnableWindow( GetDlgItem( hDlg, IDC_QUANTSLIDER ), b_quant );
+    }
+
+    /* Slider ranges: [0, BITRATE_MAX] and [0, QUANT_MAX] */
+    SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETRANGE, TRUE,
+                        (LPARAM) MAKELONG( 0, BITRATE_MAX ) );
+    SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETRANGE, TRUE,
+                        (LPARAM) MAKELONG( 0, QUANT_MAX ) );
+}
+
+/* main_update_dlg:
+ *  Refresh the main dialog from config: edit boxes, the encoding type
+ *  radio group, then both slider positions. */
+static void main_update_dlg( HWND hDlg, CONFIG * config )
+{
+    int i_radio = 0;
+    int b_known = 1;
+
+    SetDlgItemInt( hDlg, IDC_BITRATEEDIT, config->bitrate, FALSE );
+    SetDlgItemInt( hDlg, IDC_QUANTEDIT, config->i_qp, FALSE );
+
+    /* Pick the radio button matching the encoding type */
+    switch( config->i_encoding_type )
+    {
+    case 0 : /* 1 Pass, Bitrate Based */
+        i_radio = IDC_RADIOBITRATE;
+        break;
+    case 1 : /* 1 Pass, Quantizer Based */
+        i_radio = IDC_RADIOQUANT;
+        break;
+    case 2 : /* 2 Pass */
+        i_radio = IDC_RADIOTWOPASS;
+        break;
+    default : /* unknown type: leave the radio group alone */
+        b_known = 0;
+        break;
+    }
+    if( b_known )
+        CheckRadioButton( hDlg,
+                          IDC_RADIOBITRATE, IDC_RADIOTWOPASS, i_radio );
+
+    SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETPOS, TRUE,
+                        config->bitrate );
+    SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETPOS, TRUE,
+                        config->i_qp );
+}
+
+
+/* Main config dialog procedure.
+ *  The CONFIG being edited is received as lParam of WM_INITDIALOG and kept
+ *  in the dialog's GWL_USERDATA slot for the following messages.
+ *  Returns 1 for the messages listed in the outer switch, 0 otherwise.
+ *  NOTE(review): for any handled message arriving before WM_INITDIALOG
+ *  config would be whatever GWL_USERDATA holds - confirm this cannot
+ *  happen. */
+BOOL CALLBACK callback_main( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    CONFIG* config = (CONFIG*)GetWindowLong(hDlg, GWL_USERDATA);
+
+    switch( uMsg )
+    {
+    case WM_INITDIALOG :
+        /* Store the CONFIG before touching any control, so that the
+         * notifications handled below can already reach it */
+        SetWindowLong( hDlg, GWL_USERDATA, lParam );
+        config = (CONFIG*)lParam;
+
+        main_enable_item( hDlg, config );
+        main_update_dlg( hDlg, config );
+
+        break;
+
+    case WM_COMMAND:
+        switch ( HIWORD( wParam ) )
+        {
+        case BN_CLICKED :
+            switch( LOWORD( wParam ) )
+            {
+            case IDOK :
+                /* b_save records whether the dialog was accepted */
+                config->b_save = TRUE;
+                EndDialog( hDlg, LOWORD(wParam) );
+                break;
+            case IDCANCEL :
+                config->b_save = FALSE;
+                EndDialog( hDlg, LOWORD(wParam) );
+                break;
+            case IDC_ADVANCED :
+                /* The advanced dialog edits the same CONFIG in place */
+                DialogBoxParam( g_hInst, MAKEINTRESOURCE(IDD_ADVANCED),
+                                (HWND)lParam, callback_advanced,
+                                (LPARAM)config );
+                break;
+            case IDC_DEFAULTS :
+                /* Ask for confirmation before resetting the settings */
+                if( MessageBox( hDlg, X264_DEF_TEXT, X264_NAME, MB_YESNO ) == IDYES )
+                {
+                    config_reg_defaults( config );
+                    main_enable_item( hDlg, config );
+                    main_update_dlg( hDlg, config );
+                }
+                break;
+            case IDC_RADIOBITRATE :
+                config->i_encoding_type = 0; /* 1 Pass, Bitrate Mode=0 */
+                main_enable_item( hDlg, config );
+                main_update_dlg( hDlg, config );
+                break;
+            case IDC_RADIOQUANT :
+                config->i_encoding_type = 1; /* 1 Pass, Quantizer Mode=1 */
+                main_enable_item( hDlg, config );
+                main_update_dlg( hDlg, config );
+                break;
+            case IDC_RADIOTWOPASS :
+                config->i_encoding_type = 2; /* 2 Pass Mode=2 */
+                main_enable_item( hDlg,  config );
+                main_update_dlg( hDlg, config );
+                break;
+            }
+            break;
+        case EN_CHANGE :
+            /* An edit box changed: store the value and move its slider */
+            switch( LOWORD( wParam ) )
+            {
+            case IDC_BITRATEEDIT :
+                config->bitrate = GetDlgItemInt( hDlg, IDC_BITRATEEDIT, FALSE, FALSE );
+                SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_SETPOS, TRUE, config->bitrate );
+                break;
+            case IDC_QUANTEDIT :
+                config->i_qp = GetDlgItemInt( hDlg, IDC_QUANTEDIT, FALSE, FALSE );
+                SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_SETPOS, TRUE, config->i_qp );
+                break;
+            }
+            break;
+        default:
+            break;
+        }
+        break;
+
+        /* A slider moved: store its position and refresh its edit box */
+        case WM_HSCROLL :
+            if( (HWND) lParam == GetDlgItem( hDlg, IDC_BITRATESLIDER ) )
+            {
+                config->bitrate = SendDlgItemMessage( hDlg, IDC_BITRATESLIDER, TBM_GETPOS, 0, 0 );
+                SetDlgItemInt( hDlg, IDC_BITRATEEDIT, config->bitrate, FALSE );
+
+            }
+            else if( (HWND) lParam == GetDlgItem( hDlg, IDC_QUANTSLIDER ) )
+            {
+                config->i_qp = SendDlgItemMessage( hDlg, IDC_QUANTSLIDER, TBM_GETPOS, 0, 0 );
+                SetDlgItemInt( hDlg, IDC_QUANTEDIT, config->i_qp, FALSE );
+            }
+            break;
+
+    default :
+        return 0;
+    }
+
+    return 1;
+}
+
+/* About dialog procedure.
+ *  Shows the core build number with the compilation date/time, opens the
+ *  home page when its static control is clicked and closes on OK/Cancel.
+ *  Returns 1 for WM_INITDIALOG and WM_COMMAND, 0 for everything else. */
+BOOL CALLBACK callback_about( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    if( uMsg == WM_INITDIALOG )
+    {
+        char psz_build[1024];
+
+        sprintf( psz_build, "Core %d, build %s %s", X264_BUILD, __DATE__, __TIME__ );
+        SetDlgItemText( hDlg, IDC_BUILD,  psz_build );
+        return 1;
+    }
+
+    if( uMsg == WM_COMMAND )
+    {
+        const WORD i_id = LOWORD(wParam);
+
+        if( i_id == IDC_HOMEPAGE && HIWORD(wParam) == STN_CLICKED )
+        {
+            ShellExecute( hDlg, "open", X264_WEBSITE, NULL, NULL, SW_SHOWNORMAL );
+        }
+        else if( i_id == IDOK || i_id == IDCANCEL )
+        {
+            EndDialog( hDlg, i_id );
+        }
+        return 1;
+    }
+
+    return 0;
+}
+
+/* adv_update_dlg:
+ *  Refresh the advanced dialog from config: check boxes, the three numeric
+ *  fields and the fourcc text. */
+static void adv_update_dlg( HWND hDlg, CONFIG * config )
+{
+    char psz_fcc[5];
+    UINT i_cabac  = BST_UNCHECKED;
+    UINT i_filter = BST_UNCHECKED;
+    int  i;
+
+    if( config->b_cabac )
+        i_cabac = BST_CHECKED;
+    if( config->b_filter )
+        i_filter = BST_CHECKED;
+
+    CheckDlgButton( hDlg, IDC_CABAC, i_cabac );
+    CheckDlgButton( hDlg, IDC_LOOPFILTER, i_filter );
+
+    SetDlgItemInt( hDlg, IDC_IDRFRAMES, config->i_idrframe, FALSE );
+    SetDlgItemInt( hDlg, IDC_IFRAMES, config->i_iframe, FALSE );
+    SetDlgItemInt( hDlg, IDC_KEYFRAME, config->i_refmax, FALSE );
+
+    /* Build a terminated copy of the four fourcc characters */
+    for( i = 0; i < 4; i++ )
+        psz_fcc[i] = config->fcc[i];
+    psz_fcc[4] = '\0';
+
+    SetDlgItemText( hDlg, IDC_FOURCC, psz_fcc );
+}
+
+
+/* Advanced configuration dialog procedure.
+ *  The CONFIG being edited is received as lParam of WM_INITDIALOG and kept
+ *  in the dialog's GWL_USERDATA slot. Every change made in a control is
+ *  written to that CONFIG immediately.
+ *  Returns 1 for WM_INITDIALOG and WM_COMMAND, 0 otherwise.
+ *  NOTE(review): only IDOK ends the dialog; IDCANCEL is not handled here -
+ *  confirm that this is intended. */
+BOOL CALLBACK callback_advanced( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam )
+{
+    CONFIG* config = (CONFIG*)GetWindowLong(hDlg, GWL_USERDATA);
+
+    switch( uMsg )
+    {
+    case WM_INITDIALOG :
+        /* Store the CONFIG before the controls are filled in, so that the
+         * notifications handled below can already reach it */
+        SetWindowLong( hDlg, GWL_USERDATA, lParam );
+        config = (CONFIG*)lParam;
+
+        adv_update_dlg( hDlg, config );
+        break;
+
+    case WM_COMMAND:
+        switch ( HIWORD( wParam ) )
+        {
+        case BN_CLICKED :
+            switch( LOWORD( wParam ) )
+            {
+            case IDOK :
+                EndDialog( hDlg, LOWORD( wParam ) );
+                break;
+            case IDC_CABAC :
+                config->b_cabac = ( IsDlgButtonChecked( hDlg, IDC_CABAC ) == BST_CHECKED );
+                break;
+            case IDC_LOOPFILTER :
+                config->b_filter = ( IsDlgButtonChecked( hDlg, IDC_LOOPFILTER ) == BST_CHECKED );
+                break;
+            }
+            break;
+        case EN_CHANGE :
+            switch( LOWORD( wParam ) )
+            {
+            case IDC_IDRFRAMES :
+                config->i_idrframe = GetDlgItemInt( hDlg, IDC_IDRFRAMES, FALSE, FALSE );
+                break;
+            case IDC_IFRAMES :
+                config->i_iframe = GetDlgItemInt( hDlg, IDC_IFRAMES, FALSE, FALSE );
+                break;
+            case IDC_KEYFRAME :
+                /* The IDC_KEYFRAME control holds the max reference count */
+                config->i_refmax = GetDlgItemInt( hDlg, IDC_KEYFRAME, FALSE, FALSE );
+                break;
+            case IDC_FOURCC :
+                /* At most 4 characters plus the terminator */
+                GetDlgItemText( hDlg, IDC_FOURCC, config->fcc, 5 );
+                break;
+            }
+            break;
+        }
+        break;
+    default :
+        return 0;
+    }
+    return 1;
+}
+
--- a/vfw/driverproc.c
+++ b/vfw/driverproc.c
@ -0,0 +1,234 @@
+/*****************************************************************************
+ * driverproc.c: vfw x264 wrapper
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: driverproc.c,v 1.1 2004/06/03 19:27:09 fenrir Exp $
+ *
+ * Authors: Justin Clay
+ *          Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "x264vfw.h"
+
+/* Module handle of this DLL; DllMain records it and the dialogs are created
+ * from it. */
+HINSTANCE g_hInst;
+
+
+/* DLL entry point: stores the module handle in g_hInst on every notification
+ * and always reports success. */
+BOOL WINAPI DllMain( HANDLE hModule, DWORD ul_reason_for_call, LPVOID lpReserved )
+{
+    /* neither the notification reason nor the reserved argument is used */
+    (void)ul_reason_for_call;
+    (void)lpReserved;
+
+    g_hInst = (HINSTANCE) hModule;
+
+    return TRUE;
+}
+
+/* Entry point through which a vfw host talks to the codec.
+ *
+ * dwDriverId is the value handed back from DRV_OPEN (the address of the
+ * CODEC instance), uMsg the DRV_xxx / ICM_xxx message and lParam1/lParam2 its
+ * message-specific arguments.  Messages without a case below are forwarded
+ * to DefDriverProc. */
+LRESULT WINAPI DriverProc( DWORD dwDriverId, HDRVR hDriver, UINT uMsg, LPARAM lParam1, LPARAM lParam2 )
+{
+    CODEC *codec = (CODEC *)dwDriverId;
+
+    switch( uMsg )
+    {
+        case DRV_LOAD:
+        case DRV_FREE:
+            return DRV_OK;
+
+        case DRV_OPEN:
+        {
+            ICOPEN *icopen = (ICOPEN *)lParam2;
+
+            /* only video compressors are provided */
+            if( icopen != NULL && icopen->fccType != ICTYPE_VIDEO )
+                return DRV_CANCEL;
+
+            if( ( codec = malloc( sizeof( CODEC ) ) ) == NULL )
+            {
+                if( icopen != NULL )
+                    icopen->dwError = ICERR_MEMORY;
+                return 0;
+            }
+
+            memset( codec, 0, sizeof( CODEC ) );
+            config_reg_load( &codec->config );
+            codec->h = NULL;
+
+            if( icopen != NULL )
+                icopen->dwError = ICERR_OK;
+            /* the instance pointer becomes the driver id of later calls */
+            return (LRESULT)codec;
+        }
+
+        case DRV_CLOSE:
+            /* From xvid: compress_end/decompress_end don't always get called */
+            compress_end(codec);
+            free( codec );
+            return DRV_OK;
+
+        case DRV_DISABLE:
+        case DRV_ENABLE:
+            return DRV_OK;
+
+        case DRV_INSTALL:
+        case DRV_REMOVE:
+            return DRV_OK;
+
+        case DRV_QUERYCONFIGURE:
+        case DRV_CONFIGURE:
+            return DRV_CANCEL;
+
+        /* info */
+        case ICM_GETINFO:
+        {
+            ICINFO *icinfo = (ICINFO *)lParam1;
+
+            /* lParam2 is the size of the caller's buffer: a full ICINFO
+             * must never be written into something smaller */
+            if( icinfo == NULL || lParam2 < (LPARAM)sizeof( ICINFO ) )
+                return 0;
+
+            /* return a description */
+            icinfo->dwSize       = sizeof( ICINFO );
+            icinfo->fccType      = ICTYPE_VIDEO;
+            icinfo->fccHandler   = FOURCC_X264;
+            icinfo->dwFlags      = VIDCF_COMPRESSFRAMES | VIDCF_FASTTEMPORALC;
+
+            icinfo->dwVersion    = 0;
+            icinfo->dwVersionICM = ICVERSION;
+
+            wcscpy( icinfo->szName, X264_NAME_L);
+            wcscpy( icinfo->szDescription, X264_DESC_L);
+
+            return sizeof( ICINFO ); /* number of bytes filled in */
+        }
+
+        case ICM_ABOUT:
+            /* lParam1 == -1 only asks whether an about box exists */
+            if( lParam1 != -1 )
+            {
+                DialogBoxParam(g_hInst, MAKEINTRESOURCE(IDD_ABOUT), (HWND)lParam1, callback_about, 0 );
+            }
+            return ICERR_OK;
+
+        case ICM_CONFIGURE:
+            /* lParam1 == -1 only asks whether a configuration dialog exists */
+            if( lParam1 != -1 )
+            {
+                CONFIG temp;
+
+                /* the dialog edits a copy; it is committed and saved only
+                 * if the dialog sets b_save */
+                codec->config.b_save = FALSE;
+                memcpy( &temp, &codec->config, sizeof(CONFIG) );
+
+                DialogBoxParam( g_hInst, MAKEINTRESOURCE(IDD_MAINCONFIG), (HWND)lParam1, callback_main, (LPARAM)&temp );
+
+                if( temp.b_save )
+                {
+                    memcpy( &codec->config, &temp, sizeof(CONFIG) );
+                    config_reg_save( &codec->config );
+                }
+            }
+            return ICERR_OK;
+
+        case ICM_GETSTATE:
+            /* a NULL buffer is a query for the required size */
+            if( (void*)lParam1 == NULL )
+            {
+                return sizeof( CONFIG );
+            }
+            /* lParam2 is the size of the caller's buffer */
+            if( lParam2 < (LPARAM)sizeof( CONFIG ) )
+            {
+                return ICERR_BADSIZE;
+            }
+            memcpy( (void*)lParam1, &codec->config, sizeof( CONFIG ) );
+            return ICERR_OK;
+
+        case ICM_SETSTATE:
+            /* a NULL buffer asks for the stored defaults */
+            if( (void*)lParam1 == NULL )
+            {
+                config_reg_load( &codec->config );
+                return 0;
+            }
+            /* lParam2 is the size of the supplied block: anything shorter
+             * than CONFIG cannot be copied without reading past its end,
+             * so it is ignored */
+            if( lParam2 < (LPARAM)sizeof( CONFIG ) )
+            {
+                return 0;
+            }
+            memcpy( &codec->config, (void*)lParam1, sizeof( CONFIG ) );
+            /* NOTE(review): the vfw documentation describes the success
+             * value as the number of bytes used; 0 is kept here because it
+             * is what this code has always returned -- confirm with hosts
+             * before changing it. */
+            return 0;
+
+        /* not sure the difference, private/public data? */
+        case ICM_GET:
+        case ICM_SET:
+            return ICERR_OK;
+
+
+        /* older-style config */
+        case ICM_GETDEFAULTQUALITY:
+        case ICM_GETQUALITY:
+        case ICM_SETQUALITY:
+        case ICM_GETBUFFERSWANTED:
+        case ICM_GETDEFAULTKEYFRAMERATE:
+            return ICERR_UNSUPPORTED;
+
+
+        /* compressor */
+        case ICM_COMPRESS_QUERY:
+            return compress_query(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_GET_FORMAT:
+            return compress_get_format(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_GET_SIZE:
+            return compress_get_size(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_FRAMES_INFO:
+            return compress_frames_info(codec, (ICCOMPRESSFRAMES *)lParam1);
+
+        case ICM_COMPRESS_BEGIN:
+            return compress_begin(codec, (BITMAPINFO *)lParam1, (BITMAPINFO *)lParam2);
+
+        case ICM_COMPRESS_END:
+            return compress_end(codec);
+
+        case ICM_COMPRESS:
+            return compress(codec, (ICCOMPRESS *)lParam1);
+
+        /* decompressor : not implemented */
+        case ICM_DECOMPRESS_QUERY:
+        case ICM_DECOMPRESS_GET_FORMAT:
+        case ICM_DECOMPRESS_BEGIN:
+        case ICM_DECOMPRESS_END:
+        case ICM_DECOMPRESS:
+        case ICM_DECOMPRESS_GET_PALETTE:
+        case ICM_DECOMPRESS_SET_PALETTE:
+        case ICM_DECOMPRESSEX_QUERY:
+        case ICM_DECOMPRESSEX_BEGIN:
+        case ICM_DECOMPRESSEX_END:
+        case ICM_DECOMPRESSEX:
+            return ICERR_UNSUPPORTED;
+
+#if 0
+        /* VFWEXT entry point : XXX what's that ? */
+        case ICM_USER+0x0fff :
+            if (lParam1 == VFWEXT_CONFIGURE_INFO) {
+                VFWEXT_CONFIGURE_INFO_T * info = (VFWEXT_CONFIGURE_INFO_T*)lParam2;
+                DPRINTF("%i %i %i %i %i %i",
+                    info->ciWidth, info->ciHeight,
+                    info->ciRate, info->ciScale,
+                    info->ciActiveFrame, info->ciFrameCount);
+
+                codec->config.ci_valid = 1;
+                memcpy(&codec->config.ci, (void*)lParam2, sizeof(VFWEXT_CONFIGURE_INFO_T));
+                return ICERR_OK;
+            }
+            return ICERR_UNSUPPORTED;
+#endif
+
+        default:
+            return DefDriverProc( dwDriverId, hDriver, uMsg, lParam1, lParam2 );
+    }
+}
+
+/* Opens a codec instance through DriverProc, shows its configuration dialog
+ * with the desktop window as parent, then closes the instance again.  None
+ * of the four arguments is used. */
+void WINAPI Configure(HWND hwnd, HINSTANCE hinst, LPTSTR lpCmdLine, int nCmdShow)
+{
+    DWORD id = DriverProc(0, 0, DRV_OPEN, 0, 0);
+
+    /* DRV_OPEN failed: nothing to configure and nothing to close */
+    if (id == (DWORD)NULL)
+        return;
+
+    DriverProc(id, 0, ICM_CONFIGURE, (LPARAM)GetDesktopWindow(), 0);
+    DriverProc(id, 0, DRV_CLOSE, 0, 0);
+}
--- a/vfw/resource.h
+++ b/vfw/resource.h
@ -0,0 +1,52 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Developer Studio generated include file.
+// Used by resource.rc
+//
+// Dialog template ids (IDD_xxx) and control ids (IDC_xxx).
+// Several names below share one numeric value (for example IDC_CABAC,
+// IDC_BITRATEEDIT2 and IDC_QUANTEDIT are all 1005).
+// NOTE(review): presumably the aliases belong to different dialogs; confirm
+// against resource.rc before handling two of them in one dialog procedure.
+#define IDD_DIALOG1                     101
+#define IDD_MAINCONFIG                  101
+#define IDD_ADVANCED                    102
+#define IDD_ABOUT                       103
+#define IDC_BITRATESLIDER               1002
+#define IDC_BITRATEEDIT                 1003
+#define IDC_BITRATESLIDER2              1004
+#define IDC_QUANTSLIDER                 1004
+#define IDC_CABAC                       1005
+#define IDC_BITRATEEDIT2                1005
+#define IDC_QUANTEDIT                   1005
+#define IDC_LOOPFILTER                  1007
+#define IDC_BITRATELOW                  1009
+#define IDC_BITRATELOW2                 1010
+#define IDC_BITRATEHIGH                 1011
+#define IDC_BFRAMES                     1012
+#define IDC_BITRATEHIGH2                1012
+#define IDC_IDRFRAMES                   1012
+#define IDC_BFRAMES2                    1013
+#define IDC_IFRAMES                     1013
+#define IDC_EDIT3                       1014
+#define IDC_KEYFRAME                    1014
+#define IDC_DEFAULTS                    1016
+#define IDC_CHECK3                      1017
+#define IDC_ADVANCED                    1018
+#define IDC_RADIO1                      1022
+#define IDC_RADIOBITRATE                1022
+#define IDC_MODE                        1023
+#define IDC_RADIOQUALITY                1024
+#define IDC_RADIOQUANT                  1024
+#define IDC_RADIOTWOPASS                1026
+#define IDC_USEADVANCED                 1029
+#define IDC_ADVDEFAULTS                 1030
+#define IDC_HOMEPAGE                    1034
+#define IDC_X264                        1035
+#define IDC_BUILD                       1036
+#define IDC_FOURCC                      1039
+
+// Next default values for new objects
+// 
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        104
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1040
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
--- a/vfw/x264vfw.h
+++ b/vfw/x264vfw.h
@ -0,0 +1,103 @@
+#ifndef _X264_VFW_H
+#define _X264_VFW_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <windows.h>
+#include <vfw.h>
+
+#include <x264.h>
+
+#include "resource.h"
+
+/* Name */
+#define X264_NAME_L     L"x264"
+#define X264_DESC_L     L"x264 - H264/AVC encoder"
+
+/* Codec fcc */
+#define FOURCC_X264 mmioFOURCC('X','2','6','4')
+
+/* yuv 4:2:0 planar */
+#define FOURCC_I420 mmioFOURCC('I','4','2','0')
+#define FOURCC_IYUV mmioFOURCC('I','Y','U','V')
+#define FOURCC_YV12 mmioFOURCC('Y','V','1','2')
+
+/* yuv 4:2:2 packed */
+#define FOURCC_YUY2 mmioFOURCC('Y','U','Y','2')
+#define FOURCC_YUYV mmioFOURCC('Y','U','Y','V')
+
+#define X264_WEBSITE	"http://lyra.via.ecp.fr/"
+
+/* CONFIG: vfw config
+ *
+ * DriverProc copies the whole structure byte-for-byte when it answers
+ * ICM_GETSTATE and ICM_SETSTATE, so the field layout is also the layout of
+ * the state block exchanged with the host application.
+ */
+typedef struct
+{
+    /********** ATTENTION **********/
+    int mode;                   /* Vidomi directly accesses these vars */
+    int bitrate;
+    int desired_size;           /* please try to avoid modifications here */
+    char stats[MAX_PATH];
+    /*******************************/
+
+    /* Our config */
+    int i_refmax;               /* edited through the IDC_KEYFRAME control */
+    int i_idrframe;             /* edited through the IDC_IDRFRAMES control */
+    int i_iframe;               /* edited through the IDC_IFRAMES control */
+
+    int i_qp;
+    int b_filter;               /* state of the IDC_LOOPFILTER check box */
+
+    int b_cabac;                /* state of the IDC_CABAC check box */
+
+    int b_i4x4;
+    int b_psub16x16;
+    int b_psub8x8;
+
+    /* vfw interface */
+    /* cleared before the main dialog opens; the edited copy is committed and
+     * passed to config_reg_save() only if the dialog leaves it set */
+    int b_save;
+    /* fourcc used */
+    char fcc[4+1];              /* four characters plus a terminating NUL */
+    int  i_encoding_type;
+} CONFIG;
+
+/* CODEC: vfw codec instance
+ *
+ * DriverProc allocates and zeroes one on DRV_OPEN and frees it on DRV_CLOSE;
+ * its address is the driver id handed back to the host.
+ */
+typedef struct
+{
+    CONFIG config;      /* loaded with config_reg_load() at DRV_OPEN */
+
+    /* handle */
+    /* NULL right after DRV_OPEN; presumably created by compress_begin() and
+     * released by compress_end() -- TODO confirm in the codec source */
+    x264_t *h;
+
+    /* XXX: needed ? */
+    unsigned int fincr;
+    unsigned int fbase;
+} CODEC;
+
+/* Compress functions */
+/* DriverProc forwards the ICM message named next to each function and hands
+ * its result back to the host unchanged. */
+LRESULT compress_query(CODEC *, BITMAPINFO *, BITMAPINFO *);        /* ICM_COMPRESS_QUERY */
+LRESULT compress_get_format(CODEC *, BITMAPINFO *, BITMAPINFO *);   /* ICM_COMPRESS_GET_FORMAT */
+LRESULT compress_get_size(CODEC *, BITMAPINFO *, BITMAPINFO *);     /* ICM_COMPRESS_GET_SIZE */
+LRESULT compress_frames_info(CODEC *, ICCOMPRESSFRAMES *);          /* ICM_COMPRESS_FRAMES_INFO */
+LRESULT compress_begin(CODEC *, BITMAPINFO *, BITMAPINFO *);        /* ICM_COMPRESS_BEGIN */
+LRESULT compress_end(CODEC *);                                      /* ICM_COMPRESS_END, also called on DRV_CLOSE */
+LRESULT compress(CODEC *, ICCOMPRESS *);                            /* ICM_COMPRESS */
+
+
+/* config functions */
+/* config_reg_load() fills a CONFIG when an instance is opened or a host
+ * resets the state; config_reg_save() is called after the configuration
+ * dialog asked for its changes to be kept.
+ * NOTE(review): the names suggest the Windows registry as storage -- confirm
+ * in the implementation. */
+void config_reg_load( CONFIG * config );
+void config_reg_save( CONFIG * config );
+
+
+/* Dialog callbacks */
+/* dialog procedures passed to DialogBoxParam() */
+BOOL CALLBACK callback_about( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+BOOL CALLBACK callback_main ( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+BOOL CALLBACK callback_advanced( HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam );
+
+/* Dll instance */
+/* set by DllMain, used as the module of the dialog resources */
+extern HINSTANCE g_hInst;
+
+#endif
+
--- a/x264.c
+++ b/x264.c
@ -0,0 +1,558 @@
+/*****************************************************************************
+ * x264: h264 encoder/decoder testing program.
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: x264.c,v 1.1 2004/06/03 19:24:12 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <math.h>
+
+#include <signal.h>
+#define _GNU_SOURCE
+#include <getopt.h>
+
+#ifdef _MSC_VER
+#include <io.h>     /* _setmode() */
+#include <fcntl.h>  /* _O_BINARY */
+#endif
+
+#include "x264.h"
+#include "core/common.h"
+
+/* Size in bytes of the scratch buffer below. */
+#define DATA_MAX 3000000
+/* Scratch buffer: Encode() has x264_nal_encode() write each NAL here before
+ * the bytes are written to the output file. */
+uint8_t data[DATA_MAX];
+
+/* Ctrl-C handler */
+/* Stop flag: set to 1 by SigIntHandler and polled by the encoding loop.
+ * volatile sig_atomic_t is the object type a signal handler may portably
+ * write to; a plain int gives no such guarantee. */
+static volatile sig_atomic_t i_ctrl_c = 0;
+static void    SigIntHandler( int a )
+{
+    (void)a;    /* the signal number is not needed */
+    i_ctrl_c = 1;
+}
+
+static void Help( void );
+static int  Parse( int argc, char **argv, x264_param_t  *param, FILE **p_fin, FILE **p_fout, int *pb_decompress );
+static int  Encode( x264_param_t  *param, FILE *fyuv,  FILE *fout );
+static int  Decode( x264_param_t  *param, FILE *fh26l, FILE *fout );
+
+/****************************************************************************
+ * main: parse the command line, then run the encoder or the decoder and
+ * return its result (-1 if the command line was rejected).
+ ****************************************************************************/
+int main( int argc, char **argv )
+{
+    x264_param_t param;
+    FILE    *fin;
+    FILE    *fout;
+    int     b_decompress;
+
+#ifdef _MSC_VER
+    /* switch the standard streams to binary mode */
+    _setmode(_fileno(stdin), _O_BINARY);    /* thanks to Marcos Morais <morais at dee.ufcg.edu.br> */
+    _setmode(_fileno(stdout), _O_BINARY);
+#endif
+
+    /* library defaults, with the frame rate set to 25 */
+    x264_param_default( &param );
+    param.f_fps = 25.0;
+
+    /* Parse command line */
+    if( Parse( argc, argv, &param, &fin, &fout, &b_decompress ) < 0 )
+        return -1;
+
+    /* Control-C handler */
+    signal( SIGINT, SigIntHandler );
+
+    return b_decompress ? Decode( &param, fin, fout )
+                        : Encode( &param, fin, fout );
+}
+
+/*****************************************************************************
+ * Help: print the build number and the usage text on stderr.
+ *****************************************************************************/
+static void Help( void )
+{
+    /* only the first line needs formatting */
+    fprintf( stderr, "x264 build:0x%4.4x\n", X264_BUILD );
+
+    /* the rest is fixed text (it contains no conversion specifier) */
+    fputs( "Syntax: x264 [options] [-o out.h26l] in.yuv widthxheigh\n"
+           "\n"
+           "  -h, --help                  Print this help\n"
+           "\n"
+           "  -I, --idrframe <integer>    Each 'number' I frames are IDR frames\n"
+           "  -i, --iframe <integer>      Frequency of I frames\n"
+           "  -b, --bframe <integer>      Number of B-frames between I and P\n"
+           "\n"
+           "  -c, --cabac                 Enable CABAC\n"
+           "  -r, --ref <integer>         Number of references\n"
+           "  -n, --nf                    Disable loop filter\n"
+           "  -f, --filter <alpha:beta>   Loop filter AplhaCO and Beta parameters\n"
+           "  -q, --qp <integer>          Set QP\n"
+           "  -B, --bitrate <integer>     Set bitrate [broken]\n"
+           "\n"
+           "  -A, --analyse <string>      Analyse options:\n"
+           "                                  - i4x4\n"
+           "                                  - psub16x16,psub8x8\n"
+           "                                  - none, all\n"
+           "\n"
+           "  -s, --sar width:height      Specify Sample Aspect Ratio\n"
+           "  -o, --output                Specify output file\n"
+           "\n"
+           "      --no-asm                Disable any CPU optims\n"
+           "\n",
+           stderr );
+}
+
+/*****************************************************************************
+ * FindSizeInName: look for a "<digits>x<digits>" pattern in a file name.
+ * Returns a pointer to the first digit of the width, or NULL if the name
+ * contains no such pattern.
+ *****************************************************************************/
+static char *FindSizeInName( char *psz )
+{
+    while( *psz )
+    {
+        char *p_end = psz;
+
+        while( *p_end >= '0' && *p_end <= '9' ) p_end++;
+
+        /* a run of digits directly followed by 'x' and another digit */
+        if( p_end != psz && p_end[0] == 'x' && p_end[1] >= '0' && p_end[1] <= '9' )
+            return psz;
+
+        /* always move forward: past the digit run, or past one character */
+        psz = ( p_end != psz ) ? p_end : psz + 1;
+    }
+    return NULL;
+}
+
+/*****************************************************************************
+ * Parse: read the command line.
+ *
+ * Fills param from the options, opens the input into *p_fin ("-" selects
+ * stdin) and, with -o, the output into *p_fout (stdout otherwise), and sets
+ * *pb_decompress when -x is given.  When encoding, the picture size comes
+ * from the argument after the file name or, failing that, from a
+ * "<width>x<height>" pattern inside the file name.
+ *
+ * Returns 0 on success and -1 when help was printed or the command line is
+ * not usable.
+ *****************************************************************************/
+static int  Parse( int argc, char **argv,
+                   x264_param_t  *param,
+                   FILE **p_fin, FILE **p_fout, int *pb_decompress )
+{
+    char *psz_filename = NULL;
+
+    /* Default output */
+    *p_fout = stdout;
+    *p_fin  = stdin;
+    *pb_decompress = 0;
+
+    /* Parse command line options */
+    opterr = 0; // no error message
+    for( ;; )
+    {
+        int long_options_index;
+        static struct option long_options[] =
+        {
+            { "help",    no_argument,       NULL, 'h' },
+            { "bitrate", required_argument, NULL, 'B' },
+            { "bframe",  required_argument, NULL, 'b' },
+            { "iframe",  required_argument, NULL, 'i' },
+            { "idrframe",required_argument, NULL, 'I' },
+            { "nf",      no_argument,       NULL, 'n' },
+            { "filter",  required_argument, NULL, 'f' },
+            { "cabac",   no_argument,       NULL, 'c' },
+            { "qp",      required_argument, NULL, 'q' },
+            { "ref",     required_argument, NULL, 'r' },
+            { "no-asm",  no_argument,       NULL, 'C' },
+            { "sar",     required_argument, NULL, 's' },
+            { "output",  required_argument, NULL, 'o' },
+            { "analyse", required_argument, NULL, 'A' },
+            {0, 0, 0, 0}
+        };
+
+        int c;
+
+        /* "f:" is listed so that the short form of --filter is accepted */
+        c = getopt_long( argc, argv, "hi:I:b:r:cxB:q:nf:o:s:A:",
+                         long_options, &long_options_index);
+
+        if( c == -1 )
+        {
+            break;
+        }
+
+        switch( c )
+        {
+            case 'h':
+                Help();
+                return -1;
+
+            case 0:
+                break;
+            case 'B':
+                param->i_bitrate = atol( optarg );
+                break;
+            case 'b':
+                param->i_bframe = atol( optarg );
+                break;
+            case 'i':
+                param->i_iframe = atol( optarg );
+                break;
+            case 'I':
+                param->i_idrframe = atol( optarg );
+                break;
+            case 'n':
+                param->b_deblocking_filter = 0;
+                break;
+            case 'f':
+            {
+                /* <alpha:beta>: beta starts after the ':' */
+                char *p = strchr( optarg, ':' );
+                if( p )
+                {
+                    param->i_deblocking_filter_alphac0 = atoi( optarg );
+                    param->i_deblocking_filter_beta = atoi( p + 1 );
+                }
+                break;
+            }
+            case 'q':
+                param->i_qp_constant = atoi( optarg );
+                break;
+            case 'r':
+                param->i_frame_reference = atoi( optarg );
+                break;
+            case 'c':
+                param->b_cabac = 1;
+                break;
+            case 'x':
+                *pb_decompress = 1;
+                break;
+            case 'C':
+                param->cpu = 0;
+                break;
+            case'o':
+                if( ( *p_fout = fopen( optarg, "wb" ) ) == NULL )
+                {
+                    fprintf( stderr, "cannot open output file `%s'\n", optarg );
+                    return -1;
+                }
+                break;
+            case 's':
+            {
+                char *p = strchr( optarg, ':' );
+                if( p )
+                {
+                    param->vui.i_sar_width = atoi( optarg );
+                    param->vui.i_sar_height = atoi( p + 1 );
+                }
+                break;
+            }
+            case 'A':
+                param->analyse.inter = 0;
+                if( strstr( optarg, "none" ) )  param->analyse.inter = 0x000000;
+                if( strstr( optarg, "all" ) )   param->analyse.inter = X264_ANALYSE_I4x4|X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8;
+
+                if( strstr( optarg, "i4x4" ) )      param->analyse.inter |= X264_ANALYSE_I4x4;
+                if( strstr( optarg, "psub16x16" ) ) param->analyse.inter |= X264_ANALYSE_PSUB16x16;
+                if( strstr( optarg, "psub8x8" ) )   param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+                break;
+
+            default:
+                fprintf( stderr, "unknown option (%c)\n", optopt );
+                return -1;
+        }
+    }
+
+    /* Get the file name */
+    if( optind > argc - 1 )
+    {
+        Help();
+        return -1;
+    }
+    psz_filename = argv[optind++];
+
+    if( !(*pb_decompress) )
+    {
+        char *psz_size;
+        char *p;
+        int  b_from_name = 0;
+
+        if( optind > argc - 1 )
+        {
+            /* no size argument: try to parse the file name */
+            psz_size = FindSizeInName( psz_filename );
+            if( psz_size == NULL )
+            {
+                Help();
+                return -1;
+            }
+            b_from_name = 1;
+        }
+        else
+        {
+            psz_size = argv[optind++];
+        }
+
+        /* <width><separator><height>, always decimal so that a leading
+         * zero is not taken for an octal prefix */
+        param->i_width  = strtol( psz_size, &p, 10 );
+        /* the separator is one character; if the string already ended
+         * there is no height and nothing more may be read */
+        param->i_height = ( *p != '\0' ) ? strtol( p + 1, &p, 10 ) : 0;
+
+        if( param->i_width <= 0 || param->i_height <= 0 )
+        {
+            fprintf( stderr, "invalid picture size `%s'\n", psz_size );
+            return -1;
+        }
+
+        if( b_from_name )
+        {
+            fprintf( stderr, "x264: file name gives %dx%d\n", param->i_width, param->i_height );
+        }
+    }
+
+    /* open the input */
+    if( !strcmp( psz_filename, "-" ) )
+    {
+        *p_fin = stdin;
+        optind++;
+    }
+    else if( ( *p_fin = fopen( psz_filename, "rb" ) ) == NULL )
+    {
+        fprintf( stderr, "could not open input file '%s'\n", psz_filename );
+        return -1;
+    }
+
+    return 0;
+}
+
+/*****************************************************************************
+ * Decode: placeholder for the decoder front end.
+ *
+ * Prints a notice on stderr and returns -1 without using any of its
+ * arguments.  Everything after that return sits inside "#if 0" and is never
+ * compiled: it is an unfinished draft that uses h, pic, i_frame, i_start and
+ * i_end without declaring them.
+ *****************************************************************************/
+static int  Decode( x264_param_t  *param, FILE *fh26l, FILE *fout )
+{
+    fprintf( stderr, "decompressor not working (help is welcome)\n" );
+    return -1;
+#if 0
+    /* draft: read the stream into data[], split it at 00 00 01 start codes
+     * and pass each NAL to the decoder */
+    x264_nal_t nal;
+    int i_data;
+    int b_eof;
+
+    //param.cpu = 0;
+    if( ( h = x264_decoder_open( &param ) ) == NULL )
+    {
+        fprintf( stderr, "x264_decoder_open failed\n" );
+        return -1;
+    }
+
+    i_start = x264_mdate();
+    b_eof = 0;
+    i_frame = 0;
+    i_data  = 0;
+    nal.p_payload = malloc( DATA_MAX );
+
+    while( !i_ctrl_c )
+    {
+        uint8_t *p, *p_next, *end;
+        int i_size;
+        /* fill buffer */
+        if( i_data < DATA_MAX && !b_eof )
+        {
+            int i_read = fread( &data[i_data], 1, DATA_MAX - i_data, fh26l );
+            if( i_read <= 0 )
+            {
+                b_eof = 1;
+            }
+            else
+            {
+                i_data += i_read;
+            }
+        }
+
+        if( i_data < 3 )
+        {
+            break;
+        }
+
+        end = &data[i_data];
+
+        /* extract one nal */
+        p = &data[0];
+        while( p < end - 3 )
+        {
+            if( p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x01 )
+            {
+                break;
+            }
+            p++;
+        }
+
+        if( p >= end - 3 )
+        {
+            fprintf( stderr, "garbage (i_data = %d)\n", i_data );
+            i_data = 0;
+            continue;
+        }
+
+        /* look for the start code of the following nal */
+        p_next = p + 3;
+        while( p_next < end - 3 )
+        {
+            if( p_next[0] == 0x00 && p_next[1] == 0x00 && p_next[2] == 0x01 )
+            {
+                break;
+            }
+            p_next++;
+        }
+
+        if( p_next == end - 3 && i_data < DATA_MAX )
+        {
+            p_next = end;
+        }
+
+        /* decode this nal */
+        i_size = p_next - p - 3;
+        if( i_size <= 0 )
+        {
+            if( b_eof )
+            {
+                break;
+            }
+            fprintf( stderr, "nal too large (FIXME) ?\n" );
+            i_data = 0;
+            continue;
+        }
+
+        x264_nal_decode( &nal, p +3, i_size );
+
+        /* decode the content of the nal */
+        x264_decoder_decode( h, &pic, &nal );
+
+        if( pic != NULL )
+        {
+            int i;
+
+            i_frame++;
+
+            /* write every plane line by line; planes after the first are
+             * written at half width and half height */
+            for( i = 0; i < pic->i_plane;i++ )
+            {
+                int i_line;
+                int i_div;
+
+                i_div = i==0 ? 1 : 2;
+                for( i_line = 0; i_line < pic->i_height/i_div; i_line++ )
+                {
+                    fwrite( pic->plane[i]+i_line*pic->i_stride[i], 1, pic->i_width/i_div, fout );
+                }
+            }
+        }
+
+        /* drop the consumed bytes from the front of the buffer */
+        memmove( &data[0], p_next, end - p_next );
+        i_data -= p_next - &data[0];
+    }
+
+    i_end = x264_mdate();
+    free( nal.p_payload );
+    fprintf( stderr, "\n" );
+
+    x264_decoder_close( h );
+
+    fclose( fh26l );
+    if( fout != stdout )
+    {
+        fclose( fout );
+    }
+    if( i_frame > 0 )
+    {
+        double fps = (double)i_frame * (double)1000000 /
+                     (double)( i_end - i_start );
+        fprintf( stderr, "decoded %d frames %ffps\n", i_frame, fps );
+    }
+#endif
+}
+
+/*****************************************************************************
+ * Encode: read I420 frames from fyuv, encode them, write the NALs to fout.
+ *
+ * The loop ends at the end of the input (an incomplete trailing frame is
+ * dropped), on Ctrl-C, or when the encoder reports an error.  fyuv is closed
+ * afterwards, and so is fout unless it is stdout.
+ *
+ * Returns 0 on success, -1 if the encoder could not be opened or failed on
+ * a frame.
+ *****************************************************************************/
+static int  Encode( x264_param_t  *param, FILE *fyuv, FILE *fout )
+{
+    x264_t *h;
+    x264_picture_t pic;
+
+    /* bytes per frame in the luma plane and in each chroma plane */
+    const size_t i_luma   = (size_t)param->i_width * param->i_height;
+    const size_t i_chroma = i_luma / 4;
+
+    int     i_frame;
+    int     i_ret = 0;
+    int64_t i_start, i_end;
+    int64_t i_file;
+
+    if( ( h = x264_encoder_open( param ) ) == NULL )
+    {
+        fprintf( stderr, "x264_encoder_open failed\n" );
+        return -1;
+    }
+
+    /* Create a new pic */
+    x264_picture_alloc( &pic, X264_CSP_I420, param->i_width, param->i_height );
+
+    i_start = x264_mdate();
+    for( i_frame = 0, i_file = 0; i_ctrl_c == 0 ; i_frame++ )
+    {
+        int         i_nal;
+        x264_nal_t  *nal;
+
+        int         i;
+
+        /* read a frame: fread returns the number of bytes it delivered, so
+         * anything but the full plane size means the input has ended or
+         * the last frame is truncated (an empty picture never matches) */
+        if( i_luma == 0 ||
+            fread( pic.img.plane[0], 1, i_luma,   fyuv ) != i_luma ||
+            fread( pic.img.plane[1], 1, i_chroma, fyuv ) != i_chroma ||
+            fread( pic.img.plane[2], 1, i_chroma, fyuv ) != i_chroma )
+        {
+            break;
+        }
+
+        /* Do not force any parameters */
+        pic.i_type = X264_TYPE_AUTO;
+        if( x264_encoder_encode( h, &nal, &i_nal, &pic ) < 0 )
+        {
+            /* nal and i_nal were never initialised here, so they must not
+             * be read after a failure: stop encoding */
+            fprintf( stderr, "x264_encoder_encode failed\n" );
+            i_ret = -1;
+            break;
+        }
+
+        for( i = 0; i < i_nal; i++ )
+        {
+            int i_size;
+            int i_data;
+
+            i_data = DATA_MAX;
+            if( ( i_size = x264_nal_encode( data, &i_data, 1, &nal[i] ) ) > 0 )
+            {
+                i_file += fwrite( data, 1, i_size, fout );
+            }
+            else if( i_size < 0 )
+            {
+                fprintf( stderr,
+                         "need to increase buffer size (size=%d)\n", -i_size );
+            }
+        }
+    }
+    i_end = x264_mdate();
+    x264_picture_clean( &pic );
+    x264_encoder_close( h );
+    fprintf( stderr, "\n" );
+
+    fclose( fyuv );
+    if( fout != stdout )
+    {
+        fclose( fout );
+    }
+
+    if( i_frame > 0 )
+    {
+        double fps = (double)i_frame * (double)1000000 /
+                     (double)( i_end - i_start );
+
+        /* bitrate at the configured frame rate; the value is converted
+         * explicitly to the type that %lld expects */
+        fprintf( stderr, "encoded %d frames %ffps %lld kb/s\n", i_frame, fps,
+                 (long long)( (double)i_file * 8 * param->f_fps / i_frame / 1000 ) );
+    }
+
+    return i_ret;
+}
+
+
--- a/x264.h
+++ b/x264.h
@ -0,0 +1,249 @@
+/*****************************************************************************
+ * x264.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2003 Laurent Aimar
+ * $Id: x264.h,v 1.1 2004/06/03 19:24:12 fenrir Exp $
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _X264_H
+#define _X264_H 1
+
+#define X264_BUILD 0x0008  /* build number of this header; NOTE(review): presumably bumped on API changes - confirm */
+
+/* x264_t:
+ *      opaque handle used by both the encoder and the decoder API;
+ *      the structure itself is not defined in this header */
+typedef struct x264_t x264_t;
+
+/****************************************************************************
+ * Initialisation structure and function.
+ ****************************************************************************/
+/* CPU flags
+ *      Capability bits for x264_param_t.cpu. Every flag is a distinct bit,
+ *      so several of them can be OR-ed together.
+ */
+#define X264_CPU_MMX        0x000001    /* mmx */
+#define X264_CPU_MMXEXT     0x000002    /* mmx-ext */
+#define X264_CPU_SSE        0x000004    /* sse */
+#define X264_CPU_SSE2       0x000008    /* sse 2 */
+#define X264_CPU_3DNOW      0x000010    /* 3dnow! */
+#define X264_CPU_3DNOWEXT   0x000020    /* 3dnow! ext */
+#define X264_CPU_ALTIVEC    0x000040    /* altivec */
+
+/* Analyse flags
+ *      Distinct bits that can be OR-ed together.
+ *      NOTE(review): presumably stored in x264_param_t.analyse.intra (I4x4)
+ *      and x264_param_t.analyse.inter (PSUB*) - confirm against the encoder.
+ */
+#define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
+#define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
+#define X264_ANALYSE_PSUB8x8    0x0020  /* Analyse p8x4, p4x8, p4x4 */
+
+/* Colorspace type
+ *      The colorspace ids below all fit inside X264_CSP_MASK;
+ *      X264_CSP_VFLIP is a separate bit outside that mask.
+ */
+#define X264_CSP_MASK           0x00ff  /* mask covering the colorspace ids below */
+#define X264_CSP_NONE           0x0000  /* Invalid mode     */
+#define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
+#define X264_CSP_I422           0x0002  /* yuv 4:2:2 planar */
+#define X264_CSP_I444           0x0003  /* yuv 4:4:4 planar */
+#define X264_CSP_YV12           0x0004  /* yuv 4:2:0 planar */
+#define X264_CSP_YUYV           0x0005  /* yuv 4:2:2 packed */
+#define X264_CSP_RGB            0x0006  /* rgb 24bits       */
+#define X264_CSP_BGR            0x0007  /* bgr 24bits       */
+#define X264_CSP_BGRA           0x0008  /* bgr 32bits       */
+#define X264_CSP_VFLIP          0x1000  /* NOTE(review): presumably OR-ed with an id to mark a vertically flipped image - confirm */
+
+/* Slice type
+ *      Values for x264_picture_t.i_type (x264.c sets it to X264_TYPE_AUTO
+ *      before every x264_encoder_encode call).
+ */
+#define X264_TYPE_AUTO          0x0000  /* Let x264 choose the right type */
+#define X264_TYPE_IDR           0x0001
+#define X264_TYPE_I             0x0002
+#define X264_TYPE_P             0x0003
+#define X264_TYPE_B             0x0004
+
+typedef struct  /* x264_param_t: encoder configuration; x264_encoder_open copies it */
+{
+    /* CPU flags (X264_CPU_* bits) */
+    unsigned int cpu;
+
+    /* Video Properties */
+    int         i_width;
+    int         i_height;
+    int         i_csp;  /* CSP of encoded bitstream (X264_CSP_*), only i420 supported */
+
+    struct
+    {
+        /* they will be reduced to be 0 < x <= 65535 and prime
+         * (NOTE(review): "prime" presumably means relatively prime to each
+         * other, i.e. the ratio is reduced - confirm) */
+        int         i_sar_height;
+        int         i_sar_width;
+    } vui;
+
+    float       f_fps;  /* Used for rate control only */
+
+    /* Bitstream parameters */
+    int         i_frame_reference;  /* Maximum number of reference frames */
+    int         i_idrframe; /* every i_idrframe-th I frame is marked as IDR */
+    int         i_iframe;   /* every i_iframe-th frame is intra */
+    int         i_bframe;   /* how many B-frames between 2 reference pictures */
+
+    int         b_deblocking_filter;    /* NOTE(review): b_ prefix suggests an on/off switch - confirm */
+    int         i_deblocking_filter_alphac0;    /* [-6, 6] -6 light filter, 6 strong */
+    int         i_deblocking_filter_beta;       /* [-6, 6]  same meaning as alphac0 above */
+
+    int         b_cabac;    /* NOTE(review): b_ prefix suggests an on/off switch - confirm */
+    int         i_cabac_init_idc;
+
+    int         i_qp_constant;  /* valid range: 1-51 */
+    int         i_bitrate;      /* not working yet */
+
+    /* Encoder analyser parameters */
+    struct
+    {
+        unsigned int intra;     /* intra flags (NOTE(review): presumably X264_ANALYSE_* bits - confirm) */
+        unsigned int inter;     /* inter flags (NOTE(review): presumably X264_ANALYSE_* bits - confirm) */
+    } analyse;
+
+} x264_param_t;
+
+/* x264_param_default:
+ *      fill the caller-provided x264_param_t with default values and do
+ *      CPU detection.
+ *      NOTE(review): the detected flags presumably end up in the cpu
+ *      field - confirm */
+void    x264_param_default( x264_param_t * );
+
+/****************************************************************************
+ * Picture structures and functions.
+ ****************************************************************************/
+typedef struct  /* x264_image_t: raw image data, embedded in x264_picture_t as img */
+{
+    int     i_csp;          /* colorspace; NOTE(review): presumably one of X264_CSP_* - confirm */
+
+    int     i_plane;        /* NOTE(review): presumably the number of planes in use - confirm */
+    int     i_stride[4];    /* NOTE(review): presumably the stride of each plane - confirm */
+    uint8_t *plane[4];      /* per-plane data pointers; x264.c reads an I420 frame into plane[0..2] */
+} x264_image_t;
+
+typedef struct  /* x264_picture_t: one picture handed to x264_encoder_encode */
+{
+    /* In: force picture type (X264_TYPE_*, if not auto) XXX: ignored for now
+     * Out: type of the picture encoded */
+    int     i_type;
+    /* In: force quantizer when > 0
+     * (NOTE(review): the name suggests the value is qp + 1 - confirm) */
+    int     i_qpplus1;
+    /* In: user pts, Out: pts of encoded picture (user) */
+    int64_t i_pts;
+
+    /* In: raw data */
+    x264_image_t img;
+} x264_picture_t;
+
+/* x264_picture_alloc:
+ *  allocate data for a picture of the given colorspace (e.g. X264_CSP_I420)
+ *  and size. You must call x264_picture_clean on it when done. */
+void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
+
+/* x264_picture_clean:
+ *  free the resources associated with an x264_picture_t; use it ONLY on
+ *  pictures that were allocated with x264_picture_alloc */
+void x264_picture_clean( x264_picture_t *pic );
+
+/****************************************************************************
+ * NAL structure and functions:
+ ****************************************************************************/
+/* nal unit types: values for x264_nal_t.i_type */
+enum nal_unit_type_e
+{
+    NAL_UNKNOWN = 0,
+    NAL_SLICE   = 1,
+    NAL_SLICE_DPA   = 2,
+    NAL_SLICE_DPB   = 3,
+    NAL_SLICE_DPC   = 4,
+    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+    NAL_SEI         = 6,    /* ref_idc == 0 */
+    NAL_SPS         = 7,
+    NAL_PPS         = 8
+    /* ref_idc == 0 for 6,9,10,11,12
+     * (types 9 to 12 have no enumerator in this enum) */
+};
+enum nal_priority_e     /* values for x264_nal_t.i_ref_idc */
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+    NAL_PRIORITY_LOW        = 1,
+    NAL_PRIORITY_HIGH       = 2,
+    NAL_PRIORITY_HIGHEST    = 3,
+};
+
+typedef struct  /* x264_nal_t: one NAL unit, as passed to x264_nal_encode / x264_nal_decode */
+{
+    int i_ref_idc;  /* nal_priority_e */
+    int i_type;     /* nal_unit_type_e */
+
+    /* Raw payload of the NAL
+     * (NOTE(review): i_payload is presumably its size in bytes - confirm) */
+    int     i_payload;
+    uint8_t *p_payload;
+} x264_nal_t;
+
+/* x264_nal_encode:
+ *      encode a nal into a buffer, setting the size.
+ *      if b_annexeb then a long sync word is added
+ *      Usage in x264.c: the int pointed to by the second argument is set to
+ *      the buffer capacity before the call; a return value > 0 is used as
+ *      the number of bytes to write out, a return value < 0 is reported as
+ *      "need to increase buffer size" with the negated value as the size.
+ *      XXX: it currently doesn't check for overflow
+ *      (NOTE(review): this contradicts the negative-return handling in
+ *      x264.c - confirm which one is current) */
+int x264_nal_encode( void *, int *, int b_annexeb, x264_nal_t *nal );
+
+/* x264_nal_decode:
+ *      decode a buffer nal into a x264_nal_t
+ *      NOTE(review): the void * / int pair is presumably the input buffer
+ *      and its size - confirm */
+int x264_nal_decode( x264_nal_t *nal, void *, int );
+
+/****************************************************************************
+ * Encoder functions:
+ ****************************************************************************/
+
+/* x264_encoder_open:
+ *      create a new encoder handle, all parameters from x264_param_t are copied.
+ *      x264.c treats a NULL return value as failure. */
+x264_t *x264_encoder_open   ( x264_param_t * );
+/* x264_encoder_headers:
+ *      return the SPS and PPS that will be used for the whole stream
+ *      NOTE(review): the x264_nal_t ** / int * pair presumably receives the
+ *      nal array and its count, as for x264_encoder_encode - confirm */
+int     x264_encoder_headers( x264_t *, x264_nal_t **, int * );
+/* x264_encoder_encode:
+ *      encode one picture.
+ *      As used by x264.c: the x264_nal_t ** and int * arguments receive the
+ *      produced nal array and the number of nals in it, and a return
+ *      value < 0 is reported as a failure. */
+int     x264_encoder_encode ( x264_t *, x264_nal_t **, int *, x264_picture_t * );
+/* x264_encoder_close:
+ *      close an encoder handle obtained from x264_encoder_open */
+void    x264_encoder_close  ( x264_t * );
+
+/* XXX: decoder isn't working so no need to export it
+ *      (the prototypes below are compiled out by the #if 0) */
+#if 0
+/****************************************************************************
+ * Decoder functions:
+ ****************************************************************************
+ * XXX: Not yet working so do not try ...
+ ****************************************************************************/
+/* x264_decoder_open:
+ *      NOTE(review): presumably the decoder counterpart of
+ *      x264_encoder_open - confirm
+ */
+x264_t *x264_decoder_open   ( x264_param_t * );
+/* x264_decoder_decode:
+ *      NOTE(review): presumably decodes one nal and hands back a picture
+ *      through the x264_picture_t ** argument - confirm
+ */
+int     x264_decoder_decode ( x264_t *, x264_picture_t **, x264_nal_t * );
+/* x264_decoder_close:
+ *      NOTE(review): presumably releases a handle from x264_decoder_open -
+ *      confirm
+ */
+void    x264_decoder_close  ( x264_t * );
+#endif
+
+/****************************************************************************
+ * Private stuff for internal usage:
+ *  only visible when __X264__ is defined.
+ *  DECLARE_ALIGNED( type, var, n ) declares `var` of type `type` with an
+ *  alignment of n: __declspec(align(n)) when _MSC_VER is defined,
+ *  __attribute__((aligned(n))) otherwise.
+ *  When _MSC_VER is defined, `inline` is also mapped to __inline.
+ ****************************************************************************/
+#ifdef __X264__
+#   ifdef _MSC_VER
+#       define inline __inline
+#       define DECLARE_ALIGNED( type, var, n ) __declspec(align(n)) type var
+#   else
+#       define DECLARE_ALIGNED( type, var, n ) type var __attribute__((aligned(n)))
+#   endif
+#endif
+
+#endif