Nesvideos-piece

Nesvideos-piece is my toolset for creating AVI videos from any particular emulator with a simple interface.
This document is divided into three parts:
This core is identical for all emulators.
Linking requirements:

How to modify the emulator

  1. Architectural setup:
    1. Ensure that your emulator does not skip or duplicate frames. Disable all features that try to throttle the emulation speed to match the host system's speed.
    2. Ensure that your emulator produces perfectly synchronized audio (synchronized to emulated system, NOT to host system). For example, if your emulated system (not host!) runs at 60 fps and you are outputting audio at 44100 Hz, the emulator must output exactly 735 (44100/60) samples per frame, regardless of the host system speed.
  2. API setup:
    1. Enable AVI logging: set LoggingEnabled=2; after the proper movie file has been loaded.
    2. Specify the recording command: Use the function NESVideoSetVideoCmd(). At simplest, a call such as NESVideoSetVideoCmd(getenv("VIDEOCMD")); will do fine. This delegates the specifying of the video encoding command to the user, who has to supply the VIDEOCMD environment variable as specified above.
  3. Feed data:
    1. Call NESVideoLoggingVideo() at each video frame.
    2. Call NESVideoLoggingAudio() each time you have new audio data.
    3. If the video or audio specifications (fps, resolution, sampling rate, etc.) change and you cannot resample them to a common rate, call NESVideoNextAVI() to start a new AVI. You can also call the function voluntarily whenever you want to split the movie into multiple AVIs.
Notes:

Source code of the core feature set

nesvideos-piece.hh

This is the interface to nesvideos-piece. It includes functions and global variables that are to be accessed by the emulator.
#ifndef NESVPIECEhh
#define NESVPIECEhh

/* Always defined to 1 by this header. */
#define NESVIDEOS_LOGGING 1

#ifdef __cplusplus
extern "C" {
#endif

/* Is video logging enabled? 0=no, 1=yes, 2=active. Default value: 0 */
/* The logging functions below do nothing unless this is at least 2. */
extern int LoggingEnabled;

/* Get and set the video recording command (shell command). */
/* The returned pointer refers to storage owned by the library. */
extern const char* NESVideoGetVideoCmd(void);
extern void NESVideoSetVideoCmd(const char *cmd);

/* Save 1 frame of video.
 * bpp selects the pixel format of *data:
 *   32 = BGR32 (converted to 24-bit before encoding), 24 = BGR24,
 *   16 = BGR16, 15 = BGR15, 12 = I420, 17 = YUY2,
 *   0  = keep the format given on a previous call (initially 16).
 * FPS is scaled by 24 bits (*0x1000000).
 * Does not do anything if LoggingEnabled<2. */
extern void NESVideoLoggingVideo
    (const void*data, unsigned width, unsigned height,
     unsigned fps_scaled,
     unsigned bpp);

/* Save nsamples samples of audio.
 * rate, bits and chans are taken from the first call made for each AVI;
 * the values passed on later calls for the same AVI are not used.
 * Does not do anything if LoggingEnabled<2.
 * The interval of calling this function is not important, as long as all the audio
 * data is eventually written without too big delay (5 seconds is too big)
 * This function may be called multiple times per video frame, or once per a few video
 * frames, or anything in between. Just that all audio data must be written exactly once,
 * and in order. */
extern void NESVideoLoggingAudio
    (const void*data,
     unsigned rate, unsigned bits, unsigned chans,
     unsigned nsamples);
/* nsamples*chans*(bits/8) = bytes in *data. */

/* Requests current AVI to be closed and new be started */
/* Use when encoding parameters have changed */
/* Declared with (void) so that C callers get a real prototype. */
extern void NESVideoNextAVI(void);

#ifdef __cplusplus
}
#endif

#endif

nesvideos-piece.cc

This is the source code of the functionality behind nesvideos-piece.
#define THREAD_SAFETY

#include <cmath>
#include <string>
#include <vector>
#include <deque>
#include <list>
#include <map>

#include <unistd.h>   // mknod, unlink, write
#include <stdio.h>
#include <sys/stat.h> // S_IFIFO
#include <fcntl.h>    // fcntl
#include <sys/poll.h> // poll
#include <stdlib.h>   // setenv
#include <string.h>   // strrchr
#include <errno.h>
#include <glob.h>

#include <gd.h>

#ifdef THREAD_SAFETY
# include <pthread.h>
#endif

/* Note: This module assumes everyone uses BGR16 as display depth */

/* Lengths of the logo animation, in seconds: they are multiplied by the
 * frame rate to get a frame count and by the audio rate to get a sample count.
 * HEADER  = part sent before any emulator video is shown,
 * OVERLAP = part that is blended on top of the emulator video.
 * Both are currently 0, which disables the logo animation entirely.
 * The commented-out lines are earlier settings kept for reference. */
//#define LOGO_LENGTH_HEADER  (1.2)
//#define LOGO_LENGTH_OVERLAP (10.0-LOGO_LENGTH_HEADER)
//#define LOGO_LENGTH_HEADER  (1.1)
//#define LOGO_LENGTH_OVERLAP (6.3-LOGO_LENGTH_HEADER)
//#define LOGO_LENGTH_HEADER  (1.4)
#define LOGO_LENGTH_OVERLAP (0)
#define LOGO_LENGTH_HEADER (0)

/* Shell command used for launching the encoder; set by NESVideoSetVideoCmd(). */
static std::string VIDEO_CMD = "";
/*
-rawvideo on:fps=60:format=0x42475220:w=256:h=224:size=$[1024*224]
-audiofile "+AUDIO_FN+"
*/
/* Name of the FIFO that carries the audio to the encoder.
 * The Construct object below prefixes it with the current working directory. */
static std::string AUDIO_FN = "s.log";

/* Set when writing to the encoder fails; all further logging is then skipped. */
static bool Terminate=false;
/* Number of AVIs closed so far; substituted for VIDEONUMBER in the command. */
static unsigned videonumber = 0;

#ifdef THREAD_SAFETY
/* Held by the public API functions that take a lock for the duration of the call. */
static pthread_mutex_t APIlock = PTHREAD_MUTEX_INITIALIZER;
#endif

/* Performs one write() of at most `length` bytes from `buf` to the
 * descriptor behind `fp` and returns the number of bytes accepted.
 *   - EINTR:  the write is simply attempted again.
 *   - EAGAIN: nothing could be written right now; returns 0.
 *   - other errors: reported with perror(), the global Terminate flag
 *     is raised, and 0 is returned.
 */
static unsigned NonblockWrite(FILE* fp, const unsigned char*buf, unsigned length)
{
    for(;;)
    {
        const int written = write(fileno(fp), buf, length);
        if(written != -1)
            return written;

        switch(errno)
        {
            case EAGAIN:
                /* The receiver is not ready; the caller will buffer the data. */
                return 0;
            case EINTR:
                /* Interrupted by a signal before anything was written. */
                continue;
            default:
                perror("write");
                Terminate=true;
                return 0;
        }
    }
}
/* Blocks until at least one of the two streams can accept more data.
 *
 * Returns a bitmask: bit 0 (value 1) is set if f1 is writable,
 *                    bit 1 (value 2) is set if f2 is writable.
 * Returns 0 if poll() failed for a reason other than a signal
 * interruption; callers treat 0 as "give up for now".
 */
static int WaitUntilOneIsWritable(FILE*f1, FILE*f2)
{
    struct pollfd po[2] = { {fileno(f1),POLLOUT,0}, {fileno(f2),POLLOUT,0} };

    /* A signal arriving during the wait is not an error: wait again. */
    int ready;
    do {
        ready = poll(po, 2, -1);
    } while(ready == -1 && errno == EINTR);

    /* On failure the revents fields carry no information. */
    if(ready <= 0) return 0;

    return ((po[0].revents & POLLOUT) ? 1 : 0)
         | ((po[1].revents & POLLOUT) ? 2 : 0);
}

/* Pixel format codes passed to the encoder in the "format=" setting.
 * The BGR codes are the letters 'B','G','R' followed by the bit depth
 * in the lowest byte; I420 and YUY2 are four-letter codes. */
#define BGR32 0x42475220  // BGR32 fourcc
#define BGR24 0x42475218  // BGR24 fourcc
#define BGR16 0x42475210  // BGR16 fourcc
#define BGR15 0x4247520F  // BGR15 fourcc
#define I420  0x30323449  // I420 fourcc
#define YUY2  0x32595559  // YUY2 fourcc

/* Format of the frames currently being fed in. Both are updated by
 * NESVideoLoggingVideo(). INPUT_BPP uses 12 for I420 and 17 for YUY2. */
static unsigned USE_FOURCC = BGR16;
static unsigned INPUT_BPP  = 16;

/* Expand a value into a comma-separated list of its bytes, lowest byte first
 * (u32, u16), or a 4-character string into its characters (s4).
 * Not referenced in the code visible here. */
#define u32(n) (n)&255,((n)>>8)&255,((n)>>16)&255,((n)>>24)&255
#define u16(n) (n)&255,((n)>>8)&255
#define s4(s) s[0],s[1],s[2],s[3]

/* Scale factor of the fps_scaled values: fps = fps_scaled / FPS_SCALE. */
static const unsigned FPS_SCALE = 0x1000000;

/* Runs once during static initialization: turns AUDIO_FN into an absolute
 * path by prefixing it with the current working directory.
 *
 * If the working directory cannot be determined, AUDIO_FN is left as
 * the relative name instead of being built from an unfilled buffer.
 */
static struct Construct
{
    Construct()
    {
        char Buf[4096];
        if(!getcwd(Buf,sizeof(Buf)))
        {
            perror("getcwd");
            return;
        }
        AUDIO_FN = Buf + std::string("/") + AUDIO_FN;
    }
} Construct;

/* One encoding session.
 *
 * Video is written to the standard input of the encoder process (started
 * with popen() from VIDEO_CMD); audio is written to the FIFO AUDIO_FN.
 * Both are written in non-blocking mode. Whatever the receiver does not
 * accept immediately is queued in VideoBuffer / AudioBuffer and flushed
 * on later calls.
 *
 * The encoder is launched only once both the video and the audio
 * parameters are known, because both go onto its command line.
 *
 * If the encoder cannot be launched or the FIFO cannot be opened, the
 * global Terminate flag is raised and the session stops doing anything.
 */
class AVI
{
    FILE* vidfp;  /* pipe to the encoder (popen) */
    FILE* audfp;  /* audio FIFO (AUDIO_FN) */
    
    bool KnowVideo;           /* true after the first Video() call */
    unsigned vid_width;
    unsigned vid_height;
    unsigned vid_fps_scaled;  /* fps * FPS_SCALE */
    std::list<std::vector<unsigned char> > VideoBuffer; /* unsent video, oldest first */
    unsigned VidBufSize;      /* total bytes in VideoBuffer */
    
    bool KnowAudio;           /* true after the first Audio() call */
    unsigned aud_rate;
    unsigned aud_chans;
    unsigned aud_bits;
    std::list<std::vector<unsigned char> > AudioBuffer; /* unsent audio, oldest first */
    unsigned AudBufSize;      /* total bytes in AudioBuffer */
    
    /* The object owns two FILE handles; copying it would close them twice. */
    AVI(const AVI&);
    AVI& operator=(const AVI&);
    
public:
    AVI() :
        vidfp(NULL),
        audfp(NULL),
        KnowVideo(false), vid_width(0), vid_height(0), vid_fps_scaled(0),
        VidBufSize(0),
        KnowAudio(false), aud_rate(0), aud_chans(0), aud_bits(0),
        AudBufSize(0)
    {
    }
    /* Flushes while both queues still hold data, then closes both streams
     * and removes the FIFO. Data left in only one of the queues is dropped. */
    ~AVI()
    {
        /* CheckFlushing() does nothing once Terminate is set,
         * so the loop must stop in that case or it would never end. */
        while(VidBufSize && AudBufSize && !Terminate)
        {
            CheckFlushing();
        }
        if(audfp) fclose(audfp);
        if(vidfp) pclose(vidfp);
        unlink(AUDIO_FN.c_str());
    }
    
    /* Queues or writes nsamples samples of audio from d.
     * r=rate, b=bits per sample, c=channels; these are stored on the
     * first call only and the stored values are used thereafter. */
    void Audio(unsigned r,unsigned b,unsigned c,
               const unsigned char*d, unsigned nsamples)
    {
        if(Terminate) return;
        if(!KnowAudio)
        {
            aud_rate = r;
            aud_chans = c;
            aud_bits = b;
            KnowAudio = true;
        }
        CheckFlushing();
        
        unsigned bytes = nsamples * aud_chans * (aud_bits / 8);
        
        unsigned wrote = 0;
        if(KnowVideo && AudioBuffer.empty())
        {
            /* Make sure the streams exist before writing, same as Video() does. */
            CheckBegin();
            if(Terminate) return;
            wrote = NonblockWrite(audfp, d, bytes);
        }
        if(wrote < bytes)
        {
            /* Keep the part that was not accepted. */
            unsigned remain = bytes-wrote;
            AudioBuffer.push_back(std::vector<unsigned char>(d+wrote, d+bytes));
            AudBufSize += remain;
        }
        CheckFlushing();
    }
    /* Queues or writes one frame of video from d.
     * w=width, h=height, f=fps*FPS_SCALE; these are stored on the
     * first call only and the stored values are used thereafter.
     * The frame size in bytes is derived from the global INPUT_BPP. */
    void Video(unsigned w,unsigned h,unsigned f, const unsigned char*d)
    {
        if(Terminate) return;
        if(!KnowVideo)
        {
            vid_width      = w;
            vid_height     = h;
            vid_fps_scaled = f;
            KnowVideo = true;
        }
        CheckFlushing();
        
        /* 15 (BGR15) and 17 (YUY2) both occupy 16 bits per pixel. */
        unsigned bpp   = INPUT_BPP; if(bpp == 15 || bpp == 17) bpp = 16;
        unsigned bytes = vid_width * vid_height * bpp / 8;
        
        unsigned wrote = 0;
        if(KnowAudio && VideoBuffer.empty())
        {
            CheckBegin();
            if(Terminate) return;
            wrote = NonblockWrite(vidfp, d, bytes);
        }
        
        if(wrote < bytes)
        {
            /* Keep the part that was not accepted. */
            unsigned remain = bytes-wrote;
            VideoBuffer.push_back(std::vector<unsigned char>(d+wrote, d+bytes));
            VidBufSize += remain;
        }
        CheckFlushing();
    }

private:
    /* Attempts to write the oldest chunk of List to fp, removing whatever
     * was accepted and decreasing Size accordingly.
     * fp is passed as a reference because it may be NULL
     * prior to calling, and this function changes it (through CheckBegin).
     * `what` is only a label for debugging. */
    template<typename BufType>
    void FlushBufferSome(BufType& List, unsigned& Size, FILE*& fp, const char* what)
    {
        (void)what;
        
        typename BufType::iterator i;
        for(;;)
        {
            if(List.empty() || Terminate) return;
            
            i = List.begin();
            if(!i->empty()) break;
            
            /* Discard empty chunks */
            List.erase(i);
        }
        std::vector<unsigned char>& buf = *i;
        
        unsigned bytes = buf.size();
        
        CheckBegin();
        if(Terminate || !fp) return; /* the streams could not be opened */
        
        unsigned ate = NonblockWrite(fp, &buf[0], bytes);
        
        buf.erase(buf.begin(), buf.begin()+ate);
        
        Size -= ate;
        
        if(buf.empty())
        {
            List.erase(i);
        }
    }

    /* Moves queued data towards the encoder.
     * When both queues hold data, waits until either stream is writable
     * and keeps going until one of the queues is empty.
     * Otherwise makes a single non-blocking attempt on each queue. */
    void CheckFlushing()
    {
        if(KnowAudio && KnowVideo && !Terminate)
        {
            if(!AudioBuffer.empty() && !VideoBuffer.empty())
            {
                CheckBegin();
                if(Terminate) return;
                
                do {
                    /* vidfp = &1, audfp = &2 */
                    int attempt = WaitUntilOneIsWritable(vidfp, audfp);
                    
                    if(attempt <= 0) break; /* Some kind of error can cause this */

                    // Flush Video
                    if(attempt&1) FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid");
                    
                    // Flush Audio
                    if(attempt&2) FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud");
                    
                    /* A write error raises Terminate, after which nothing is
                     * consumed anymore; stop instead of polling forever. */
                } while (!Terminate && !AudioBuffer.empty() && !VideoBuffer.empty());
            }
            else
            {
                FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid");
                FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud");
            }
        }
    }
    /* Builds the value of the encoder's -rawvideo option. */
    std::string GetMEncoderRawvideoParam() const
    {
        char Buf[512];
        unsigned bpp   = INPUT_BPP; if(bpp == 15 || bpp == 17) bpp = 16;
        snprintf(Buf, sizeof(Buf), "fps=%g:format=0x%04X:w=%u:h=%u:size=%u",
            vid_fps_scaled / (double)FPS_SCALE,
            USE_FOURCC,
            vid_width,
            vid_height,
            vid_width*vid_height * bpp/8);
        return Buf;
    }
    /* Builds the value of the encoder's -rawaudio option. */
    std::string GetMEncoderRawaudioParam() const
    {
        char Buf[512];
        snprintf(Buf, sizeof(Buf), "channels=%u:rate=%u:samplesize=%u:bitrate=%u",
            aud_chans,
            aud_rate,
            aud_bits/8,
            aud_rate*aud_chans*(aud_bits/8) );
        return Buf;
    }
    /* Builds the shell command from VIDEO_CMD:
     * the first NESVSETTINGS is replaced with the input-format options,
     * and every VIDEONUMBER with the running number of the AVI.
     * The resulting command is also printed to stderr. */
    std::string GetMEncoderCommand() const
    {
        std::string mandatory = "-audiofile " + AUDIO_FN
                              + " -audio-demuxer rawaudio"
                              + " -demuxer rawvideo"
                              + " -rawvideo " + GetMEncoderRawvideoParam()
                              + " -rawaudio " + GetMEncoderRawaudioParam()
                              ;
        std::string cmd = VIDEO_CMD;

        /* The placeholder names are split in two so that this source
         * file itself does not contain them literally. */
        std::string::size_type p = cmd.find("NESV""SETTINGS");
        if(p != cmd.npos)
            cmd.replace(p, 4+8, mandatory);
        else
            fprintf(stderr, "Warning: NESVSETTINGS not found in videocmd\n");
        
        char videonumstr[64];
        snprintf(videonumstr, sizeof(videonumstr), "%u", videonumber);
        
        for(;;)
        {
            p = cmd.find("VIDEO""NUMBER");
            if(p == cmd.npos) break;
            cmd.replace(p, 5+6, videonumstr);
        }
        
        fprintf(stderr, "Launch: %s\n", cmd.c_str()); fflush(stderr);
        
        return cmd;
    }

    /* Creates the audio FIFO, launches the encoder and opens the FIFO,
     * unless that has been done already. Both streams are switched to
     * non-blocking mode. On failure, Terminate is raised. */
    void CheckBegin()
    {
        if(Terminate) return;
        
        if(!audfp)
        {
            unlink(AUDIO_FN.c_str());
            if(mknod(AUDIO_FN.c_str(), S_IFIFO|0666, 0) != 0)
                perror(AUDIO_FN.c_str());
        }
        
        if(!vidfp)
        {
            /* Note: popen does not accept b/t in mode param */
            setenv("LD_PRELOAD", "", 1);
            vidfp = popen(GetMEncoderCommand().c_str(), "w");
            if(!vidfp)
            {
                perror("Launch failed");
                /* Without an encoder there is nobody to read the FIFO, and
                 * opening it for writing would wait forever. Give up. */
                Terminate = true;
                return;
            }
            fcntl(fileno(vidfp), F_SETFL, O_WRONLY | O_NONBLOCK);
        }
        
        while(!audfp)
        {
            audfp = fopen(AUDIO_FN.c_str(), "wb");
            if(audfp)
            {
                fcntl(fileno(audfp), F_SETFL, O_WRONLY | O_NONBLOCK);
                break;
            }
            
            /* perror() may change errno, so take a copy first. */
            const int open_error = errno;
            perror(AUDIO_FN.c_str());
            if(open_error != ESTALE)
            {
                /* Writing to a NULL stream would crash; stop logging. */
                Terminate = true;
                return;
            }
            /* ESTALE: try again */
        }
    }
};

/* The AVI currently being written, or 0 if none is open.
 * Note that this variable has the same name as the class; after this
 * point the class must be referred to as "class AVI". */
static AVI* AVI = 0;

/* Dimensions of the current video frame, in pixels. Set by
 * NESVideoLoggingVideo() and read by the logo and conversion helpers. */
namespace LogoInfo
{
    unsigned width;
    unsigned height;
}

#include "quantize.hh"
#include "rgbtorgb.hh"

extern "C"
{
    /* Logging switch read by NESVideoLoggingVideo() and NESVideoLoggingAudio():
     * both return immediately while the value is below 2. */
    int LoggingEnabled = 0; /* 0=no, 1=yes, 2=recording! */

    /* Returns the currently configured recording command.
     * The pointer refers to the internal storage of VIDEO_CMD. */
    const char* NESVideoGetVideoCmd()
    {
        const std::string& current_command = VIDEO_CMD;
        return current_command.c_str();
    }
    /* Sets the shell command that is used for launching the encoder.
     *
     * cmd may be NULL, which is treated as an empty command. This matters
     * because a typical caller passes getenv("VIDEOCMD") directly, and
     * getenv() returns NULL when the variable is not set.
     */
    void NESVideoSetVideoCmd(const char *cmd)
    {
#ifdef THREAD_SAFETY
        struct ScopedLock
        { ScopedLock() { 
                         pthread_mutex_lock(&APIlock);
                       }
          ~ScopedLock() {
                         pthread_mutex_unlock(&APIlock); }
        } ScopedLock;
#endif

        /* Constructing a std::string from a null pointer is undefined. */
        VIDEO_CMD = cmd ? cmd : "";
    }
    
    static class AVI& GetAVIptr()
    {
        if(!AVI)
        {
            fprintf(stderr, "Starting new AVI (num %u)\n", videonumber);
            AVI = new class AVI;
        }
        return *AVI;
    }
    
    /* Closes the AVI being written, if any, and advances the video number.
     * The next AVI is created when more data is logged. */
    void NESVideoNextAVI()
    {
#ifdef THREAD_SAFETY
        /* Holds APIlock until the function returns. */
        struct ApiGuard
        {
            ApiGuard()  { pthread_mutex_lock(&APIlock); }
            ~ApiGuard() { pthread_mutex_unlock(&APIlock); }
        } api_guard;
#endif

        if(!AVI) return;

        fprintf(stderr, "Closing AVI (next will be started)\n");
        delete AVI;
        AVI = 0;
        ++videonumber;
    }

    /* Blends one pixel of `source` over `target`, three channels each.
     * alpha=0 leaves target unchanged, alpha=255 replaces it with source. */
    static void Overlay32With32(unsigned char* target, const unsigned char* source, int alpha)
    {
        for(int channel = 0; channel < 3; ++channel)
        {
            const int difference = (int)(source[channel] - target[channel]);
            target[channel] += difference * alpha / 255;
        }
    }
    
    /* Loads the PNG file `fn` and blends it, using its alpha channel, over
     * `data`, which holds a 24-bit frame of LogoInfo::width x LogoInfo::height
     * pixels (3 bytes per pixel, blue first).
     *
     * Nothing is changed if the file cannot be opened or decoded, or if it
     * is not a true color image. If the image is smaller than the frame,
     * only the area covered by the image is blended.
     */
    static void OverlayLogoFrom(const char* fn, std::vector<unsigned char>& data)
    {
        FILE*fp = fopen(fn, "rb");
        if(!fp)
        {
            /* A missing frame is reported but otherwise ignored */
            perror(fn);
            return;
        }
        
        gdImagePtr im = gdImageCreateFromPng(fp);
        if(!im)
        {
            /* The decoder returns NULL for files it cannot read. */
            fprintf(stderr, "'%s': Could not load image\n", fn);
            fclose(fp);
            return;
        }
        if(!gdImageTrueColor(im))
        {
            fprintf(stderr, "'%s': Only true color images are supported\n", fn);
            gdImageDestroy(im);
            fclose(fp);
            return;
        }
        
        const unsigned new_width = gdImageSX(im);
        const unsigned new_height= gdImageSY(im);
        
        if(new_width != LogoInfo::width
        || new_height != LogoInfo::height)
        {
            if(new_height < LogoInfo::height || new_height > LogoInfo::height+20)
            fprintf(stderr, "'%s': ERROR, expected %ux%u, got %ux%u\n", fn,
                LogoInfo::width, LogoInfo::height,
                new_width, new_height);
        }

        /* The pixel accessor below does no bounds checking,
         * so never go beyond the actual size of the image. */
        const unsigned use_width  = new_width  < LogoInfo::width  ? new_width  : LogoInfo::width;
        const unsigned use_height = new_height < LogoInfo::height ? new_height : LogoInfo::height;

        for(unsigned y=0; y<use_height; ++y)
        {
            unsigned char pixbuf[4] = {0,0,0,0};
            for(unsigned x = 0; x < use_width; ++x)
            {
                int color = gdImageTrueColorPixel(im, x,y);
                /* The image alpha is 0 for opaque; convert to 255 for opaque. */
                int alpha = 255-gdTrueColorGetAlpha(color)*256/128;
                pixbuf[2] = gdTrueColorGetRed(color);
                pixbuf[1] = gdTrueColorGetGreen(color);
                pixbuf[0] = gdTrueColorGetBlue(color);
                Overlay32With32(&data[(y*LogoInfo::width+x)*3], pixbuf, alpha);
            }
        }

        gdImageDestroy(im);
        fclose(fp);
    }
    
    /* Returns the name of the logo animation image for frame `frameno`.
     *
     * The preferred file is logo_<width>_<height>_f<frameno>.png in the
     * logo directory, using the current LogoInfo dimensions. If that file
     * is not readable, the directory is searched for an image of the same
     * frame number that is at least as large as the current frame in both
     * directions, and the one with the least excess size is chosen.
     * If nothing suitable is found, the preferred name is returned anyway.
     */
    static const std::string GetLogoFileName(unsigned frameno)
    {
        std::string avdir = "/home/bisqwit/povray/nesvlogov5/";
        //std::string avdir = "/home/bisqwit/povray/nesvlogov6/cv2/";
        //std::string avdir = "/home/bisqwit/povray/nesvlogov6/kuros/";
        
        char AvName[512];
        snprintf(AvName, sizeof(AvName), "logo_%u_%u_f%03u.png",
            LogoInfo::width,
            LogoInfo::height,
            frameno);
        
        std::string want = avdir + AvName;
        int ac = access(want.c_str(), R_OK);
        if(ac != 0)
        {
            /* No correct avatar file? Check if there's an approximate match. */
            static std::map<int, std::vector<std::string> > files;
            /* A separate flag is needed, because the list stays empty when
             * there are no files, and the directory must not be scanned
             * again for each frame in that case. */
            static bool files_listed = false;
            if(!files_listed) /* Cache the list of logo files. */
            {
                files_listed = true;
                
                static const char GlobPat[] = "logo_*_*_f*.png";
                glob_t globdata;
                globdata.gl_offs = 0;
                fprintf(stderr, "Loading list of usable logo animation files in %s...\n", avdir.c_str());
                int globres = glob( (avdir + GlobPat).c_str(), GLOB_NOSORT, NULL, &globdata);
                if(globres == 0)
                {
                    for(size_t n=0; n<globdata.gl_pathc; ++n)
                    {
                        const char* fn = globdata.gl_pathv[n];
                        const char* slash = strrchr(fn, '/');
                        if(slash) fn = slash+1;
                        
                        /* The glob pattern also matches names whose fields
                         * are not numbers; skip those. */
                        int gotw=0, goth=0, gotf=0;
                        if(sscanf(fn, "logo_%d_%d_f%d", &gotw,&goth,&gotf) != 3) continue;
                        files[gotf].push_back(fn);
                    }
                }
                globfree(&globdata);
            }
            
            std::map<int, std::vector<std::string> >::const_iterator
                i = files.find(frameno);
            if(i != files.end())
            {
                std::string best;
                int bestdist = -1;
                
                const std::vector<std::string>& fnames = i->second;
                for(size_t b=fnames.size(), a=0; a<b; ++a)
                {
                    unsigned gotw=0, goth=0;
                    if(sscanf(fnames[a].c_str(), "logo_%u_%u", &gotw,&goth) != 2) continue;
                    /* Images smaller than the frame are not usable */
                    if(gotw < LogoInfo::width || goth < LogoInfo::height) continue;
                    
                    int dist = std::max(gotw - LogoInfo::width,
                                        goth - LogoInfo::height);
                    
                    if(bestdist == -1 || dist < bestdist)
                        { bestdist = dist; best = fnames[a]; }
                }
                
                if(bestdist >= 0) want = avdir + best;
            }
        }
        return want;
    }
    
    /* Converts a 24-bit frame of the current LogoInfo dimensions into
     * a newly allocated 16-bit frame (2 bytes per pixel). */
    static const std::vector<unsigned char> NVConvert24To16Frame
        (const std::vector<unsigned char>& logodata)
    {
        const unsigned npixels = LogoInfo::width * LogoInfo::height;

        std::vector<unsigned char> frame16(npixels * 2);
        Convert24To16Frame(&logodata[0], &frame16[0], npixels, LogoInfo::width);
        return frame16;
    }
    /* Converts a 24-bit frame of the current LogoInfo dimensions into
     * a newly allocated 15-bit frame (stored in 2 bytes per pixel). */
    static const std::vector<unsigned char> NVConvert24To15Frame
        (const std::vector<unsigned char>& logodata)
    {
        const unsigned npixels = LogoInfo::width * LogoInfo::height;

        std::vector<unsigned char> frame15(npixels * 2);
        Convert24To15Frame(&logodata[0], &frame15[0], npixels, LogoInfo::width);
        return frame15;
    }
    
    /* Converts a 24-bit frame of the current LogoInfo dimensions into
     * a newly allocated I420 frame (3 bytes per 2 pixels). */
    static const std::vector<unsigned char> NVConvert24To_I420Frame
        (const std::vector<unsigned char>& logodata)
    {
        const unsigned npixels = LogoInfo::width * LogoInfo::height;

        std::vector<unsigned char> frame_i420(npixels * 3 / 2);
        Convert24To_I420Frame(&logodata[0], &frame_i420[0], npixels, LogoInfo::width);
        return frame_i420;
    }
    
    /* Converts a 24-bit frame of the current LogoInfo dimensions into
     * a newly allocated YUY2 frame.
     *
     * YUY2 takes 2 bytes per pixel, and that is also the amount that
     * AVI::Video() reads from the frame when the input format is YUY2
     * (INPUT_BPP 17 is handled as 16 bits per pixel there), so the
     * buffer must hold width*height*2 bytes.
     */
    static const std::vector<unsigned char> NVConvert24To_YUY2Frame
        (const std::vector<unsigned char>& logodata)
    {
        std::vector<unsigned char> result(LogoInfo::width * LogoInfo::height * 2);
        Convert24To_YUY2Frame(&logodata[0], &result[0], LogoInfo::width * LogoInfo::height, LogoInfo::width);
        return result;
    }
    
    /* Converts npixels pixels of 16-bit video at `data` into
     * a newly allocated 24-bit frame (3 bytes per pixel). */
    static const std::vector<unsigned char> NVConvert16To24Frame
        (const void* data, unsigned npixels)
    {
        /* The vector starts out zero-filled, i.e. black. */
        std::vector<unsigned char> frame24(npixels*3);
        Convert16To24Frame(data, &frame24[0], npixels);
        return frame24;
    }
    
    /* Converts npixels pixels of 15-bit video at `data` into
     * a newly allocated 24-bit frame (3 bytes per pixel). */
    static const std::vector<unsigned char> NVConvert15To24Frame
        (const void* data, unsigned npixels)
    {
        /* The vector starts out zero-filled, i.e. black. */
        std::vector<unsigned char> frame24(npixels*3);
        Convert15To24Frame(data, &frame24[0], npixels);
        return frame24;
    }
    
    /* Converts npixels pixels of I420 video at `data` into a newly
     * allocated 24-bit frame (3 bytes per pixel). The row width is
     * taken from LogoInfo::width. */
    static const std::vector<unsigned char> NVConvert_I420To24Frame
        (const void* data, unsigned npixels)
    {
        /* The vector starts out zero-filled, i.e. black. */
        std::vector<unsigned char> frame24(npixels*3);
        Convert_I420To24Frame(data, &frame24[0], npixels, LogoInfo::width);
        return frame24;
    }
    
    /* Converts npixels pixels of YUY2 video at `data` into a newly
     * allocated 24-bit frame (3 bytes per pixel). The row width is
     * taken from LogoInfo::width. */
    static const std::vector<unsigned char> NVConvert_YUY2To24Frame
        (const void* data, unsigned npixels)
    {
        /* The vector starts out zero-filled, i.e. black. */
        std::vector<unsigned char> frame24(npixels*3);
        Convert_YUY2To24Frame(data, &frame24[0], npixels, LogoInfo::width);
        return frame24;
    }
    
    /* Replaces single-colored leading frames with black ones.
     *
     * As long as every frame so far has consisted of one color only
     * (such as the gray that NES shows at start), `data` is redirected
     * to an all-zero frame of the current LogoInfo dimensions, to avoid
     * ugly backgrounds on logo animations. As soon as a frame with more
     * than one color has been seen, this function frees the black frame
     * and never touches `data` again.
     *
     * The frame is inspected as an array of 16-bit pixels.
     */
    static void SubstituteWithBlackIfNeeded(const void*& data)
    {
        static bool     seen_picture = false; /* a multi-colored frame was seen */
        static short*   black_frame  = 0;     /* the substitute frame */
        static unsigned black_w = 0, black_h = 0; /* its dimensions */

        if(seen_picture)
        {
            /* The substitute is no longer needed. */
            if(black_frame) { delete[] black_frame; black_frame = 0; }
            return;
        }

        const unsigned npixels = LogoInfo::width * LogoInfo::height;
        const short* pixels = (const short*)data;
        for(unsigned n = 1; n < npixels; ++n)
        {
            if(pixels[n] != pixels[0])
            {
                seen_picture = true;
                return;
            }
        }

        /* The frame is of single color. Provide a black one of the same size. */
        const bool size_changed = (black_w != LogoInfo::width
                                || black_h != LogoInfo::height);
        if(black_frame && size_changed)
        {
            delete[] black_frame;
            black_frame = 0;
        }

        black_w = LogoInfo::width;
        black_h = LogoInfo::height;

        if(!black_frame)
        {
            black_frame = new short[npixels](); /* () = zero-filled */
        }
        data = (void*)black_frame;
    }

    /* Helper of NESVideoLoggingVideo: sends one 24-bit frame (logodata)
     * to the current AVI, converting it first into the format indicated
     * by INPUT_BPP if that is not 24-bit. */
    static void NVSendFrameFrom24
        (unsigned width, unsigned height, unsigned fps_scaled,
         const std::vector<unsigned char>& logodata)
    {
        if(INPUT_BPP == 16)
        {
            std::vector<unsigned char> result = NVConvert24To16Frame(logodata);
            GetAVIptr().Video(width,height,fps_scaled, &result[0]);
        }
        else if(INPUT_BPP == 15)
        {
            std::vector<unsigned char> result = NVConvert24To15Frame(logodata);
            GetAVIptr().Video(width,height,fps_scaled, &result[0]);
        }
        else if(INPUT_BPP == 12)
        {
            std::vector<unsigned char> result = NVConvert24To_I420Frame(logodata);
            GetAVIptr().Video(width,height,fps_scaled, &result[0]);
        }
        else if(INPUT_BPP == 17)
        {
            std::vector<unsigned char> result = NVConvert24To_YUY2Frame(logodata);
            GetAVIptr().Video(width,height,fps_scaled, &result[0]);
        }
        else
        {
            GetAVIptr().Video(width,height,fps_scaled, &logodata[0]);
        }
    }

    /* Saves one frame of video. Does nothing if LoggingEnabled<2.
     *
     *   data       = pixels of the frame, width*height of them
     *   fps_scaled = frame rate multiplied by 0x1000000
     *   bpp        = format of data: 32 (converted to 24 here), 24, 16, 15,
     *                12 (I420) or 17 (YUY2); 0 keeps the previous format.
     *
     * On the first call, the header part of the logo animation is sent
     * before the frame. During the overlap part of the logo animation,
     * the logo is blended over each frame. With both logo lengths
     * defined as 0, the frame is passed on as is, except that the
     * single-colored leading frames of 16/15-bit video become black.
     */
    void NESVideoLoggingVideo
        (const void*data, unsigned width,unsigned height,
         unsigned fps_scaled,
         unsigned bpp
        )
    {
        if(LoggingEnabled < 2) return;
        
#ifdef THREAD_SAFETY
        struct ScopedLock
        { ScopedLock() { 
                         pthread_mutex_lock(&APIlock);
                       }
          ~ScopedLock() {
                         pthread_mutex_unlock(&APIlock); }
        } ScopedLock;
#endif

        if(bpp == 32) /* Convert 32 to 24 */
        {
            bpp = 24;
            
            /* Static, so that the storage is reused from frame to frame */
            static std::vector<unsigned char> VideoBuf;
            VideoBuf.resize(width*height * 3);
            
            Convert32To24Frame(data, &VideoBuf[0], width*height);
            data = (void*)&VideoBuf[0];
        }
        
        if(bpp) INPUT_BPP = bpp;
        
        switch(INPUT_BPP)
        {
            case 32: USE_FOURCC = BGR32; break;
            case 24: USE_FOURCC = BGR24; break;
            case 16: USE_FOURCC = BGR16; break;
            case 15: USE_FOURCC = BGR15; break;
            case 12: USE_FOURCC = I420; break;
            case 17: USE_FOURCC = YUY2; break;
        }
        
        /* Lengths of the two parts of the logo animation, in frames */
        const int LogoFramesHeader  = (int)( (LOGO_LENGTH_HEADER  * fps_scaled) / (1 << 24) );
        const int LogoFramesOverlap = (int)( (LOGO_LENGTH_OVERLAP * fps_scaled) / (1 << 24) );
        
        LogoInfo::width  = width;
        LogoInfo::height = height;
        
        if(INPUT_BPP == 16 || INPUT_BPP == 15)
        {
            SubstituteWithBlackIfNeeded(data);
        }
        else if(INPUT_BPP != 24 && INPUT_BPP != 12 && INPUT_BPP != 17)
        {
            fprintf(stderr, "NESVIDEOS_PIECE only supports 16 and 24 bpp, you gave %u bpp\n",
                bpp);
            return;
        }

        static bool LogoHeaderPartSent = false;
        if(!LogoHeaderPartSent)
        {
            /* Send animation frames that do not involve source video? */
            LogoHeaderPartSent=true;

            for(int frame = 0; frame < LogoFramesHeader; ++frame)
            {
                std::vector<unsigned char> logodata(width*height*3); /* filled with black. */
                
                std::string fn = GetLogoFileName(frame);
                OverlayLogoFrom(fn.c_str(), logodata);
                
                NVSendFrameFrom24(width,height,fps_scaled, logodata);
            }
        }
        
        static int LogoOverlapSent = 0;
        if(LogoOverlapSent < LogoFramesOverlap)
        {
            /* Send animation frames that mix source and animation? */

            std::string fn = GetLogoFileName(LogoOverlapSent + LogoFramesHeader);

            /* Get the source frame in 24-bit form for blending */
            std::vector<unsigned char> logodata;
            if(INPUT_BPP == 16)
            {
                logodata = NVConvert16To24Frame(data, width*height);
            }
            else if(INPUT_BPP == 15)
            {
                logodata = NVConvert15To24Frame(data, width*height);
            }
            else if(INPUT_BPP == 17)
            {
                logodata = NVConvert_YUY2To24Frame(data, width*height);
            }
            else if(INPUT_BPP == 12)
            {
                logodata = NVConvert_I420To24Frame(data, width*height);
            }
            else
            {
                logodata.resize(width*height*3); /* filled with black. */
                memcpy(&logodata[0], data, width*height*3);
            }

            OverlayLogoFrom(fn.c_str(), logodata);
            
            NVSendFrameFrom24(width,height,fps_scaled, logodata);

            ++LogoOverlapSent;
            return;
        }
        
        GetAVIptr().Video(width,height,fps_scaled,  (const unsigned char*) data);
    }

    /* Saves nsamples samples of audio from data.
     * Does nothing if LoggingEnabled<2.
     *
     * On the first call, if the logo animation has a header part,
     * silence of the same length is written before the actual audio.
     */
    void NESVideoLoggingAudio
        (const void*data,
         unsigned rate, unsigned bits, unsigned chans,
         unsigned nsamples)
    {
        if(LoggingEnabled < 2) return;
        
#ifdef THREAD_SAFETY
        /* Holds APIlock until the function returns. */
        struct ApiGuard
        {
            ApiGuard()  { pthread_mutex_lock(&APIlock); }
            ~ApiGuard() { pthread_mutex_unlock(&APIlock); }
        } api_guard;
#endif

        static bool LogoHeaderPartSent = false;
        if(!LogoHeaderPartSent && LOGO_LENGTH_HEADER > 0)
        {
            LogoHeaderPartSent=true;
            
            const double header_seconds = LOGO_LENGTH_HEADER; // N64 workaround
            const long silent_samples = (long)(rate * header_seconds);
            
            if(silent_samples > 0)
            {
                const unsigned silent_bytes = silent_samples*chans*(bits/8);
                /* calloc gives zero-filled memory, i.e. silence */
                unsigned char* silence = (unsigned char*)calloc(silent_bytes, 1);
                if(silence)
                {
                    GetAVIptr().Audio(rate,bits,chans, silence, silent_samples);
                    free(silence);
                }
            }
        }
        
        GetAVIptr().Audio(rate,bits,chans, (const unsigned char*) data, nsamples);
    }
} /* extern "C" */

quantize.hh

This include file provides functions for quantizing a single R/G/B value from some bit depth to a lesser bit depth, such as RGB32 to RGB15, with ordered dithering.
/*
 Ordered dithering methods provided for:
   8x8 (Quantize8x8)
   4x4 (Quantize4x4)
   3x3 (Quantize3x3)
   4x2 (Quantize4x2)
   3x2 (Quantize3x2)
   2x2 (Quantize2x2)
 The functions are:
 
   template<int m, int in_max>
   int QuantizeFunc(size_t quant_pos, double value)
   
      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
      - quant_pos tells the coordinate into the dithering matrix

   template<int m, int in_max>
   int QuantizeFunc(size_t quant_pos, unsigned value)

      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
      - quant_pos tells the coordinate into the dithering matrix

 Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
*/

/* OrderedDitherDecl(n): member list shared by every OrderedDither_* struct.
 * Must be expanded inside a template that has an `in_max` parameter.
 *   flts[n], ints[n] - threshold tables (defined separately per struct)
 *   mul              - n+1
 *   maxin            - the enclosing template's in_max
 *   even             - nonzero when maxin is an exact multiple of mul
 *   intmul           - 1 when `even`, otherwise mul
 * Note that `n` is pasted unparenthesized (e.g. 8*8 gives mul = 8*8+1).
 */
#define OrderedDitherDecl(n) \
    static const double flts[n]; \
    static const int ints[n]; \
    enum { mul = n+1, \
           maxin = in_max, \
           even = !(maxin % mul), \
           intmul = even ? 1 : mul };

/* Macros for initializing dither tables.
 *   d(n) - floating-point threshold for rank n: n/mul - 0.5
 *   i(n) - integer threshold for rank n; when `even` is false the value is
 *          kept scaled by mul (compare intmul above)
 * Both expand to unparenthesized expressions and are only meant to be used
 * as elements of the brace initializers below.
 */
#define d(n) (n)/double(mul) - 0.5
#define i(n) even ? (n*in_max/mul - (int)in_max/2) \
                  : (n*in_max - (int)mul*in_max/2)

/* Rescales a value from the range 0..in_max to the range 0..m without any
 * dithering. The result is computed as v * m / in_max, stored as int in
 * `res`, and is also available through the conversion to int. */
template<int m, int in_max = 255>
struct QuantizeNoDither
{
    int res;

    template<typename IntType>
    QuantizeNoDither(IntType v)
    {
        res = v * m / in_max;
    }

    operator int() const
    {
        return res;
    }
};

/* Quantizer core used by the QuantizeNxM wrappers.
 *
 *   m    - largest output value
 *   Base - dither description; must provide flts[], ints[], maxin, intmul
 *
 * The quantized value ends up in `res`. quant_pos indexes directly into
 * Base's threshold tables; no range check is performed here.
 */
template<int m, typename Base>
struct QuantizeFuncBase: private Base
{
    int res;

    /* Floating-point input: scale to 0..m, add the table offset, truncate,
     * and cap at m. Inputs that are not greater than zero give 0. */
    QuantizeFuncBase(size_t quant_pos, double v) : res(0)
    {
        if(!(v > 0.0)) return;

        const double scale  = m / double(Base::maxin);
        const double offset = Base::flts[quant_pos];

        res = (int)(v * scale + offset);
        if(res > m) res = m;
    }

    /* 8-bit integer input. */
    QuantizeFuncBase(size_t quant_pos, unsigned char v) : res(v)
    {
        if(m == Base::maxin)
        {
            /* Input and output ranges coincide: keep the value as it is. */
        }
        else if(m > Base::maxin)
        {
            /* Output range is wider than the input range: no dithering. */
            res = QuantizeNoDither<m, Base::maxin> (res);
        }
        else
        {
            /* Output range is narrower: apply the integer dither offset.
             * Numerator and denominator are both scaled by intmul. */
            const int scale     = Base::intmul;
            const int offset    = Base::ints[quant_pos];
            const int numerator = res * (m * scale) + offset;
            res = numerator / (Base::maxin * scale);
        }
    }
};

/* QuantizeFuncDecl(name, base): declares the user-facing quantizer template
 * `name<m, in_max>` on top of QuantizeFuncBase<m, base<in_max> >.
 * The generated struct forwards its two constructor arguments to the base
 * constructor and converts to int by returning the base's `res`. */
#define QuantizeFuncDecl(name, base) \
  template<int m, int in_max=255> \
  struct name: private QuantizeFuncBase<m, base<in_max> > \
  { \
      typedef QuantizeFuncBase<m, base<in_max> > Base; \
      template<typename A, typename B> name(A a, B b) : Base(a, b) { } \
      operator int() const { return Base::res; } \
  }

/******* Quantizing with 8x8 ordered dithering ********/
/* OrderedDither_8x8<in_max> carries the 64-entry threshold tables:
 * flts[] built with d() and ints[] built with i(). Both list the ranks
 * 1..64 in the same order. Quantize8x8 is the quantizer built on them. */
template<int in_max> struct OrderedDither_8x8 { OrderedDitherDecl(8*8) };
    template<int in_max>
    const double OrderedDither_8x8<in_max>::flts[] /* A table for 8x8 ordered dithering */
    = { d(1 ), d(49), d(13), d(61), d( 4), d(52), d(16), d(64),
        d(33), d(17), d(45), d(29), d(36), d(20), d(48), d(32),
        d(9 ), d(57), d( 5), d(53), d(12), d(60), d( 8), d(56),
        d(41), d(25), d(37), d(21), d(44), d(28), d(40), d(24),
        d(3 ), d(51), d(15), d(63), d( 2), d(50), d(14), d(62),
        d(35), d(19), d(47), d(31), d(34), d(18), d(46), d(30),
        d(11), d(59), d( 7), d(55), d(10), d(58), d( 6), d(54),
        d(43), d(27), d(39), d(23), d(42), d(26), d(38), d(22) };
    template<int in_max>
    const int OrderedDither_8x8<in_max>::ints[] /* Integer counterpart of flts[] */
    = { i(1 ), i(49), i(13), i(61), i( 4), i(52), i(16), i(64),
        i(33), i(17), i(45), i(29), i(36), i(20), i(48), i(32),
        i(9 ), i(57), i( 5), i(53), i(12), i(60), i( 8), i(56),
        i(41), i(25), i(37), i(21), i(44), i(28), i(40), i(24),
        i(3 ), i(51), i(15), i(63), i( 2), i(50), i(14), i(62),
        i(35), i(19), i(47), i(31), i(34), i(18), i(46), i(30),
        i(11), i(59), i( 7), i(55), i(10), i(58), i( 6), i(54),
        i(43), i(27), i(39), i(23), i(42), i(26), i(38), i(22) };
QuantizeFuncDecl(Quantize8x8, OrderedDither_8x8);


/******* Quantizing with 4x4 ordered dithering ********/
/* OrderedDither_4x4<in_max> carries the 16-entry threshold tables:
 * flts[] built with d() and ints[] built with i(). Both list the ranks
 * 1..16 in the same order. Quantize4x4 is the quantizer built on them. */
template<int in_max> struct OrderedDither_4x4 { OrderedDitherDecl(4*4) };
    template<int in_max>
    const double OrderedDither_4x4<in_max>::flts[] /* A table for 4x4 ordered dithering */
    = { d( 1), d( 9), d( 3), d(11),
        d(13), d( 5), d(15), d( 7),
        d( 4), d(12), d( 2), d(10),  
        d(16), d( 8), d(14), d( 6) };
    template<int in_max>
    const int OrderedDither_4x4<in_max>::ints[] /* Integer counterpart of flts[] */
    = { i( 1), i( 9), i( 3), i(11),
        i(13), i( 5), i(15), i( 7),
        i( 4), i(12), i( 2), i(10),
        i(16), i( 8), i(14), i( 6) };
QuantizeFuncDecl(Quantize4x4, OrderedDither_4x4);

/******* Quantizing with 3x3 ordered dithering ********/
/* OrderedDither_3x3<in_max> carries the 9-entry threshold tables:
 * flts[] built with d() and ints[] built with i(). Both list the ranks
 * 1..9 in the same order. Quantize3x3 is the quantizer built on them. */
template<int in_max> struct OrderedDither_3x3 { OrderedDitherDecl(3*3) };
    template<int in_max>
    const double OrderedDither_3x3<in_max>::flts[] /* A table for 3x3 ordered dithering */
    = { d(1), d(7), d(3),
        d(6), d(4), d(9),
        d(8), d(2), d(5) };
    template<int in_max>
    const int OrderedDither_3x3<in_max>::ints[] /* Integer counterpart of flts[] */
    = { i(1), i(7), i(3),
        i(6), i(4), i(9),  
        i(8), i(2), i(5) };
QuantizeFuncDecl(Quantize3x3, OrderedDither_3x3);

/******* Quantizing with 4x2 ordered dithering ********/
/* OrderedDither_4x2<in_max> carries the 8-entry threshold tables:
 * flts[] built with d() and ints[] built with i(). Both list the ranks
 * 1..8 in the same order. Quantize4x2 is the quantizer built on them. */
template<int in_max> struct OrderedDither_4x2 { OrderedDitherDecl(4*2) };
    template<int in_max>
    const double OrderedDither_4x2<in_max>::flts[] /* A table for 4x2 ordered dithering */
    = { d(1), d(5), d(2), d(6),
        d(7), d(3), d(8), d(4) };
    template<int in_max>
    const int OrderedDither_4x2<in_max>::ints[] /* Integer counterpart of flts[] */
    = { i(1), i(5), i(2), i(6),
        i(7), i(3), i(8), i(4) };
QuantizeFuncDecl(Quantize4x2, OrderedDither_4x2);

/******* Quantizing with 3x2 ordered dithering ********/
/* OrderedDither_3x2<in_max> carries the 6-entry threshold tables:
 * flts[] built with d() and ints[] built with i(). Both list the ranks
 * 1..6 in the same order. Quantize3x2 is the quantizer built on them. */
template<int in_max> struct OrderedDither_3x2 { OrderedDitherDecl(3*2) };
    template<int in_max>
    const double OrderedDither_3x2<in_max>::flts[] /* A table for 3x2 ordered dithering */
    = { d(1), d(5), d(3),
        d(4), d(2), d(6) };
    template<int in_max>
    const int OrderedDither_3x2<in_max>::ints[] /* Integer counterpart of flts[] */
    = { i(1), i(5), i(3),
        i(4), i(2), i(6) };
QuantizeFuncDecl(Quantize3x2, OrderedDither_3x2);

/******* Quantizing with 2x2 ordered dithering ********/
/* OrderedDither_2x2<in_max> carries the 4-entry threshold tables:
 * flts[] built with d() and ints[] built with i(). Both list the ranks
 * 1..4 in the same order. Quantize2x2 is the quantizer built on them. */
template<int in_max> struct OrderedDither_2x2 { OrderedDitherDecl(2*2) };
    template<int in_max>
    const double OrderedDither_2x2<in_max>::flts[] /* A table for 2x2 ordered dithering */
    = { d(1), d(4),
        d(3), d(2) };
    template<int in_max>
    const int OrderedDither_2x2<in_max>::ints[] /* Integer counterpart of flts[] */
    = { i(1), i(4),
        i(3), i(2) };
QuantizeFuncDecl(Quantize2x2, OrderedDither_2x2);


#undef OrderedDitherDecl
#undef QuantizeFuncDecl
#undef i
#undef d

simd.hh

#if defined(__MMX__) && !defined(__x86_64)
#define USE_MMX
#endif
#if defined(__SSE__)
#define USE_SSE
#endif

/* SIMD interface (MMX) written by Bisqwit
 * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
 */

#ifdef __3dNOW__
# include <mm3dnow.h> /* Note: not available on ICC */ 
#elif defined(__MMX__)
# include <mmintrin.h>
#endif
#ifdef __SSE__
#include <xmmintrin.h>
 #ifdef __ICC
 typedef __m128 __v4sf;
 #endif
#endif

/* Helpers shared by both c64 flavours: saturating narrowing conversions
 * and bit-spreading routines. */
struct c64_common
{
    /* Saturates v to the range of a signed 8-bit value. */
    static signed char clamp_s8(int_fast64_t v)
    {
        if(v < -128) return -128;
        if(v >  127) return  127;
        return (signed char)v;
    }

    /* Saturates v to the range of an unsigned 8-bit value. */
    static unsigned char clamp_u8(int_fast64_t v)
    {
        if(v <   0) return   0;
        if(v > 255) return 255;
        return (unsigned char)v;
    }

    /* Saturates v to the range of a signed 16-bit value. */
    static short clamp_s16(int_fast64_t v)
    {
        if(v < -32768) return -32768;
        if(v >  32767) return  32767;
        return (short)v;
    }

    /* Spreads the four low bytes of a so that each one occupies the low
     * half of its own 16-bit slot: 0000abcd -> 0a0b0c0d */
    static inline uint_fast64_t expand32_8(uint_fast32_t a)
    {
        const uint_fast64_t wide = a;
        const uint_fast64_t byte0 =  wide & 0xFFU;
        const uint_fast64_t byte1 = (wide & 0xFF00U)      <<  8; // lands at bit 16
        const uint_fast64_t byte2 = (wide & 0xFF0000U)    << 16; // lands at bit 32
        const uint_fast64_t byte3 = (wide & 0xFF000000UL) << 24; // lands at bit 48
        return byte0 | byte1 | byte2 | byte3;
    }

    /* Spreads the two low 16-bit words of a so that each one occupies the
     * low half of its own 32-bit slot: 0000abcd -> 00ab00cd */
    static inline uint_fast64_t expand32_16(uint_fast32_t a)
    {
        const uint_fast64_t wide = a;
        const uint_fast64_t word0 =  wide & 0xFFFFU;
        const uint_fast64_t word1 = (wide & 0xFFFF0000UL) << 16; // lands at bit 32
        return word0 | word1;
    }
};

#ifdef __MMX__
/* 64-bit integers that use MMX / 3Dnow operations where relevant.
 *
 * Thin wrapper around one __m64 register value. Every operation maps to a
 * single MMX intrinsic (or to plain 64-bit arithmetic where noted).
 * Callers are responsible for issuing MMX_clear() after a run of these
 * operations; nothing in this struct does it.
 */
struct c64_MMX: public c64_common
{
    typedef c64_MMX c64;

    __m64 value;
    
    /* The default constructor leaves `value` uninitialized. */
    inline c64_MMX() { }
    inline c64_MMX(__m64 v) : value(v) { }
    /* Reinterprets the bits of a 64-bit integer as an __m64. */
    inline c64_MMX(const uint64_t& v) : value( *(const __m64*)& v) { }

    /* Whole-register shifts; a negative count shifts in the opposite direction. */
    inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
    inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
    /* The compound forms do not apply the negative-count handling. */
    c64& operator<<= (int n) { return *this = shl64(n); }
    c64& operator>>= (int n) { return *this = shr64(n); }

    /* Single-operand packs: the register is packed together with itself. */
    c64 conv_s16_u8() const { return conv_s16_u8(*this); }
    c64 conv_s16_s8() const { return conv_s16_s8(*this); }

    /* Load / store 8 bytes through an __m64 pointer cast. */
    void Get(const unsigned char* p)      { value = *(const __m64*)p; }
    void Put(      unsigned char* p)const { *(__m64*)p =  value; }
    
    /* Sets the four 16-bit lanes; `a` goes into the lowest lane. */
    void Init16(short a,short b,short c, short d)
        { value = _mm_setr_pi16(a,b,c,d); }
    /* Sets all four 16-bit lanes to `a`. */
    void Init16(short a)
        { value = _mm_set1_pi16(a); }

    /* Same body as Get(): loads a full 8 bytes. */
    void GetD(const unsigned char* p)      { value = *(const __m64*)p; }
    
    /* Read lane n of the register through a pointer cast. */
    template<int n>
    short Extract16() const { return ((const short*)&value)[n]; }
    template<int n>
    int Extract32() const { return ((const int*)&value)[n]; }
    
    /* Combines bytes 0..2 of the register: the 16-bit read at data+1
     * supplies byte 1 (low half) and byte 2 (high half), and byte 0 is
     * OR-ed into the low half.
     * NOTE(review): presumably callers guarantee that byte 1 is zero, so
     * that the result is byte 0 | byte 2 << 8 as the diagram suggests. */
    short Extract88_from_1616lo() const
    {
        const unsigned char* data = (const unsigned char*)&value;
        // bytes:  76543210
        // shorts: 33221100
        // take:        H L
        return data[0] | *(short*)(data+1);
        //return data[0] | ((*(const unsigned int*)data) >> 8);
    }
    /* Same as Extract88_from_1616lo, but starting at byte 4 of the register. */
    short Extract88_from_1616hi() const
    {
        const unsigned char* data = 4+(const unsigned char*)&value;
        // bytes:  76543210
        // shorts: 33221100
        // take:    H L
        return data[0] | *(short*)(data+1);
        //return data[0] | ((*(const unsigned int*)data) >> 8);
    }
    

    c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; }
    c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; }
    c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; }
    c64& operator+= (const c64& b) { return *this = *this + b; }
    c64& operator-= (const c64& b) { return *this = *this - b; }
    
            /* psllqi: p = packed
                       s = shift
                       r = right, l = left
                       l = shift in zero, a = shift in sign bit
                       q = 64-bit, d = 32-bit, w = 16-bit
                      [i = immed amount]
             */
    c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); }
    c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); }
    c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); }
    
    /* 64-bit subtraction / addition. Without SSE2 the registers are
     * reinterpreted as uint64_t and combined with ordinary arithmetic. */
    c64 operator- (const c64& b) const
    {
        #ifdef __SSE2__
        return _mm_sub_si64(value, b.value);
        #else
        return (const uint64_t&)value - (const uint64_t&)b.value;
        #endif
    }
    c64 operator+ (const c64& b) const
    {
        #ifdef __SSE2__
        return _mm_add_si64(value, b.value);
        #else
        return (const uint64_t&)value + (const uint64_t&)b.value;
        #endif
    }
    

    /* Naming: shl/shr = logical shift, sar = arithmetic right shift;
     * the number is the lane width in bits. */
    c64 shl64(int b) const { return _mm_slli_si64(value, b); }
    c64 shr64(int b) const { return _mm_srli_si64(value, b); }
    c64 shl16(int b) const { return _mm_slli_pi16(value, b); }
    c64 shr16(int b) const { return _mm_srli_pi16(value, b); }
    c64 sar32(int b) const { return _mm_srai_pi32(value, b); }
    c64 sar16(int b) const { return _mm_srai_pi16(value, b); }
    /* Lane-wise arithmetic; mul16 keeps the low and mul16hi the high
     * 16 bits of each signed 16x16 product. */
    c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); }
    c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); }
    c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); }
    c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); }
    c64 mul16(const c64& b) const   { return _mm_mullo_pi16(value, b.value); }
    c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); }
    //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
    c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); }
    c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); }
    
    /* Interleaves. Note the operand order: the ARGUMENT b is passed as the
     * first operand of the intrinsic and *this as the second. */
    c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); }
    c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); }
    c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); }
    c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); }
    c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); }
    c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); }

    /* AND with a 64-bit integer mask. Unlike the c64 overload above, this
     * one is not declared const. */
    c64 operator& (const uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); }
    
    /* Saturating packs; *this is the first operand and b the second. */
    c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); }
    c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); }
    c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); }
};
#endif

struct c64_nonMMX: public c64_common
{
    typedef c64_nonMMX c64;
    
    uint_least64_t value;
    
    inline c64_nonMMX() { }
    inline c64_nonMMX(uint64_t v) : value(v) { }

    c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
    c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
    c64& operator<<= (int n) { return *this = shl64(n); }
    c64& operator>>= (int n) { return *this = shr64(n); }

    c64 conv_s16_u8() const { return conv_s16_u8(*this); }
    c64 conv_s16_s8() const { return conv_s16_s8(*this); }

    void Init16(short a,short b,short c, short d)
        { uint_fast64_t aa = (unsigned short)a,
                        bb = (unsigned short)b,
                        cc = (unsigned short)c,
                        dd = (unsigned short)d;
          value = aa | (bb << 16) | (cc << 32) | (dd << 48); }
    void Init16(short a)
        { Init16(a,a,a,a); }
    void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d,
               unsigned char e,unsigned char f,unsigned char g,unsigned char h)
    {
        value = ((uint_fast64_t)(a | (b << 8) | (c << 16) | (d << 24)))
              | (((uint_fast64_t)e) << 32)
              | (((uint_fast64_t)f) << 40)
              | (((uint_fast64_t)g) << 48)
              | (((uint_fast64_t)h) << 56);
    }

    void Get(const unsigned char* p)      { value = *(const uint_least64_t*)p; }
    void Put(      unsigned char* p)const { *(uint_least64_t*)p =  value; }
    
    c64& operator&= (const c64& b) { value&=b.value; return *this; }
    c64& operator|= (const c64& b) { value|=b.value; return *this; }
    c64& operator^= (const c64& b) { value^=b.value; return *this; }
    c64& operator+= (const c64& b) { value+=b.value; return *this; }
    c64& operator-= (const c64& b) { value-=b.value; return *this; }
    c64 operator& (const c64& b) const { return value & b.value; }
    c64 operator| (const c64& b) const { return value | b.value; }
    c64 operator^ (const c64& b) const { return value ^ b.value; }
    c64 operator- (const c64& b) const { return value - b.value; }
    c64 operator+ (const c64& b) const { return value + b.value; }

    c64 operator& (uint_fast64_t b) const { return value & b; }
    
    #define usimdsim(type, count, op) \
        type* p = (type*)&res.value; \
        for(int n=0; n<count; ++n) p[n] = (p[n] op b)

    #define simdsim(type, count, op) \
        type* p = (type*)&res.value; \
        const type* o = (const type*)&b.value; \
        for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
    
    c64 shl64(int b) const { return value << b; }
    c64 shr64(int b) const { return value >> b; }
    c64 shl16(int b) const { c64 res = *this; usimdsim(short, 2, <<); return res; }
    c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 2, >>); return res; }
    c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; }
    c64 sar16(int b) const { c64 res = *this; usimdsim(short, 2, >>); return res; }

    c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; }
    c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; }
    c64 add32(const c64& b) const { c64 res = *this; simdsim(int,   2, +); return res; }
    c64 sub32(const c64& b) const { c64 res = *this; simdsim(int,   2, -); return res; }
    c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; }
    c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; }
    c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; }
    c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; }
    
    #undef simdsim
    #undef usimdsim
    
    c64 conv_s32_s16(const c64& b) const
    {
        c64 res; res.
        Init16(clamp_s16(value & 0xFFFFFFFFU),
               clamp_s16(value >> 32),
               clamp_s16(b.value & 0xFFFFFFFFU),
               clamp_s16(b.value >> 32));
        return res;
    }
    c64 conv_s16_u8(const c64& b) const
    {
        c64 res; res.
        Init8(clamp_u8(value & 0xFFFF),
              clamp_u8((value >> 16) & 0xFFFF),
              clamp_u8((value >> 32) & 0xFFFF),
              clamp_u8((value >> 48) & 0xFFFF),
              clamp_u8(b.value & 0xFFFF),
              clamp_u8((b.value >> 16) & 0xFFFF),
              clamp_u8((b.value >> 32) & 0xFFFF),
              clamp_u8((b.value >> 48) & 0xFFFF));
        return res;
    }
    c64 conv_s16_s8(const c64& b) const
    {
        c64 res; res.
        Init8(clamp_s8(value & 0xFFFF),
              clamp_s8((value >> 16) & 0xFFFF),
              clamp_s8((value >> 32) & 0xFFFF),
              clamp_s8((value >> 48) & 0xFFFF),
              clamp_s8(b.value & 0xFFFF),
              clamp_s8((b.value >> 16) & 0xFFFF),
              clamp_s8((b.value >> 32) & 0xFFFF),
              clamp_s8((b.value >> 48) & 0xFFFF));
        return res;
    }

    /* TODO: Verify that these are correct (though they should never be used anyway) */
    c64 unpacklbw(const c64& p) const
    {
    #if defined(__MMX__) && !defined(__ICC)
        /* ICC says [error: type of cast must be integral or enum]
         * on the return value cast,
         * so we cannot use this code on ICC. Fine for GCC. */
        return (uint_least64_t)_m_punpcklbw(*(__m64*)&p.value, *(__m64*)&value);
    #else
        uint_fast64_t a=value, b=p.value;
        return expand32_8(a) | (expand32_8(b) << 8);
    #endif
    }
    c64 unpackhbw(const c64& p) const
    {
    #if defined(__MMX__) && !defined(__ICC)
        return (uint_least64_t)_m_punpckhbw(*(__m64*)&p.value, *(__m64*)&value);
    #else
        uint_fast64_t a=value, b=p.value;
        return expand32_8(a>>32) | (expand32_8(b>>32) << 8);
    #endif
    }
    c64 unpacklwd(const c64& p) const
    {
    #if defined(__MMX__) && !defined(__ICC)
        return (uint_least64_t)_m_punpcklwd(*(__m64*)&p.value, *(__m64*)&value);
    #else
        uint_fast64_t a=value, b=p.value;
        return expand32_16(a) | (expand32_16(b) << 16);
    #endif
    }
    c64 unpackhwd(const c64& p) const
    {
    #if defined(__MMX__) && !defined(__ICC)
        return (uint_least64_t)_m_punpckhwd(*(__m64*)&p.value, *(__m64*)&value);
    #else
        uint_fast64_t a=value, b=p.value;
        return expand32_16(a>>32) | (expand32_16(b>>32) << 16);
    #endif
    }
    c64 unpackldq() const { return unpackldq(*this); }
    c64 unpackldq(const c64& p) const
    {
    #if defined(__MMX__) && !defined(__ICC)
        return (uint_least64_t)_m_punpckldq(*(__m64*)&p.value, *(__m64*)&value);
    #else
        return value | (p.value << 32);
    #endif
    }
};

/* c64 is the type the conversion routines work with: the MMX-backed
 * implementation when USE_MMX is defined, the portable one otherwise. */
#ifdef USE_MMX
typedef c64_MMX c64;
#else
typedef c64_nonMMX c64;
#endif

/* Issues the "leave MMX state" instruction: femms on 3DNow! builds,
 * emms on plain MMX builds. Compiles to nothing when neither is available. */
static inline void MMX_clear()
{
#if defined(__3dNOW__)
    /* Note: _m_femms is not available on ICC */
    _m_femms();
#elif defined(__MMX__)
    _mm_empty();
#endif
}

rgbtorgb.hh

This include file provides functions for converting images of a certain bit depth to another bit depth. The functions use MMX optimizations where possible. Note: If your compiler does not support `__attribute__((noinline))`, you can remove that expression with no ill effects. It is there simply for my debugging.
/* The declarations below are usable from both C and C++.
 * `defaulttrue` expands to a "=true" default argument in C++ and to nothing
 * in C; in C, `bool` is temporarily defined as int. Both helper macros are
 * removed again at the end of the header. */
#ifdef __cplusplus
extern "C" {
  #define defaulttrue =true
#else
  #define defaulttrue
  #define bool       int
#endif

/* RGB to RGB and RGB from/to YCbCr (YUV) conversions written by Bisqwit
 * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
 *
 * Concepts:
 *   15 = RGB15 or BGR15
 *   16 = RGB16 or BGR16
 *   24 = RGB24 or BGR24
 *   32 = RGB32 or BGR32
 * I420 = YCbCr where Y is issued for each pixel,
 *                    followed by Cr for 2x2 pixels,
 *                    followed by Cb for 2x2 pixels
 * YUY2 = YCbCr where for each pixel, Y is issued,
 *                    followed by Cr for 2x1 pixels (if even pixel)
 *                             or Cb for 2x1 pixels (if odd pixel)
 *
 * Note: Not all functions honor the swap_red_blue setting.
 *
 * Common parameters:
 *   data    - source frame
 *   dest    - destination buffer, provided by the caller
 *   npixels - number of pixels in the frame
 *   width   - frame width in pixels (only for the conversions that take it)
 */

void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
    __attribute__((noinline));

void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
    __attribute__((noinline));

void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
    __attribute__((noinline));

void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
    __attribute__((noinline));

void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
    __attribute__((noinline));

void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);

void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);

void Convert_I420To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
    __attribute__((noinline));

void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);

void Convert_YUY2To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
    __attribute__((noinline));

void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);

#ifdef __cplusplus
}
  #undef defaulttrue
#else
  #undef defaulttrue
  #undef bool
#endif

rgbtorgb.cc

This is the source code of the functionality behind rgbtorgb.hh.
#include <stdint.h>
#include <stdlib.h> // for size_t
#include <vector>
#include <cmath>

/* RGB to RGB and RGB from/to I420 conversions written by Bisqwit
 * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
 */

typedef uint_least64_t uint64_t;

#include "quantize.hh"
#include "rgbtorgb.hh"
#include "simd.hh"

/* For BPP conversions */

/* Byte-range masks used when repacking 32-bit pixels into 24 bits:
 *   mask24l    - bytes 0..2      mask24h    - bytes 3..5
 *   mask24hh   - bytes 6..7      mask24hhh  - bytes 4..7
 *   mask24hhhh - bytes 2..7
 */
static const uint64_t zero64         __attribute__((aligned(8))) = 0ULL;
static const uint64_t mask24l        __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h        __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh       __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh      __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh     __attribute__((aligned(8))) = 0xffffffffffff0000ULL;

/* Lane-selection masks: h/l pick the high/low byte of every 16-bit word,
 * hw/lw the high/low word of every 32-bit dword, hd/ld the high/low dword. */
static const uint64_t mask64h        __attribute__((aligned(8))) = 0xFF00FF00FF00FF00ULL;
static const uint64_t mask64l        __attribute__((aligned(8))) = 0x00FF00FF00FF00FFULL;
static const uint64_t mask64hw       __attribute__((aligned(8))) = 0xFFFF0000FFFF0000ULL;
static const uint64_t mask64lw       __attribute__((aligned(8))) = 0x0000FFFF0000FFFFULL;
static const uint64_t mask64hd       __attribute__((aligned(8))) = 0xFFFFFFFF00000000ULL;
static const uint64_t mask64ld       __attribute__((aligned(8))) = 0x00000000FFFFFFFFULL;

/* For RGB2YUV: */

/* Fixed-point coefficients, scaled by 1<<RGB2YUV_SHIFT. Naming: the first
 * letter is the input component (R/G/B), the second the output component
 * (Y/U/V). The trailing comment on each line is the formula the literal
 * was precomputed from. */
static const int RGB2YUV_SHIFT = 15; /* highest value where [RGB][YUV] fit in signed short */

static const int RY = 8414;  //  ((int)(( 65.738/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int RV = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int RU = -4856; //  ((int)((-37.945/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

static const int GY = 16519; //  ((int)((129.057/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int GV = -12051;//  ((int)((-94.154/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int GU = -9534; //  ((int)((-74.494/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

static const int BY = 3208;  //  ((int)(( 25.064/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int BV = -2339; //  ((int)((-18.285/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int BU = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

/* Offsets that go with the Y, U and V components. */
static const int Y_ADD = 16;
static const int U_ADD = 128;
static const int V_ADD = 128;

/* For YUV2RGB: */

/* Fixed-point coefficients for the reverse direction, scaled by
 * 1<<YUV2RGB_SHIFT. Naming: input component (U/V) followed by the output
 * component (R/G/B); Y_REV is the luma scale factor. */
static const int YUV2RGB_SHIFT = 13; /* highest value where UB still fits in signed short */

static const int Y_REV = 9539; // ((int)( (  255 / 219.0 )     * (1<<YUV2RGB_SHIFT)+0.5));
static const int VR = 14688;   // ((int)( ( 117504 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int VG = -6659;   // ((int)( ( -53279 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int UG = -3208;   // ((int)( ( -25675 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int UB = 16525;   // ((int)( ( 132201 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));

/****************/

/* Repacks eight 32-bit pixels into 24 bytes.
 *
 *   w0..w3 - the eight source pixels, two per register
 *   dest   - receives 24 bytes, written as three 8-byte stores
 *
 * The fourth byte of every source pixel is dropped.
 */
template<typename c64>
static inline void Convert32To24_32bytes(c64 w0, c64 w1, c64 w2, c64 w3, unsigned char* dest)
{
    /* Step 1: close the gap inside each register, so that the six payload
     * bytes of its two pixels sit in the low 48 bits. */
    c64 pair_ab = (w0 & mask24l) | ((w0 >> 8) & mask24h); /* bbbaaa */
    c64 pair_cd = (w1 & mask24l) | ((w1 >> 8) & mask24h); /* dddccc */
    c64 pair_ef = (w2 & mask24l) | ((w2 >> 8) & mask24h); /* fffeee */
    c64 pair_gh = (w3 & mask24l) | ((w3 >> 8) & mask24h); /* hhhggg */

    /* Step 2: stitch the four 6-byte groups into three 8-byte words. */
    c64 out_lo  = (pair_ab      ) | ((pair_cd << 48) & mask24hh);   /* ccbbbaaa */
    c64 out_mid = (pair_cd >> 16) | ((pair_ef << 32) & mask24hhh);  /* feeedddc */
    c64 out_hi  = (pair_ef >> 32) | ((pair_gh << 16) & mask24hhhh); /* hhhgggff */

    out_lo .Put(dest+0);
    out_mid.Put(dest+8);
    out_hi .Put(dest+16);
}

#if defined(__x86_64) || defined(USE_MMX)
static void Convert32To24_32bytes(const unsigned char* src,
                                  unsigned char* dest)
{
    c64 w0; w0.Get(src+0);
    c64 w1; w1.Get(src+8);
    c64 w2; w2.Get(src+16);
    c64 w3; w3.Get(src+24);
    Convert32To24_32bytes(w0,w1,w2,w3, dest);
}
#endif

/* Converts a frame of 32-bit pixels into 24-bit pixels by copying the first
 * three bytes of every pixel and dropping the fourth.
 *
 *   data    - source frame, 4 bytes per pixel
 *   dest    - destination buffer, receives 3 bytes per pixel
 *   npixels - number of pixels to convert
 */
void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
{
    const unsigned char* in  = (const unsigned char*)data;
    unsigned char*       out = dest;
    unsigned remaining = npixels;

    #if defined(__x86_64) || defined(USE_MMX)
    /* Fast path: eight pixels (32 bytes in, 24 bytes out) per step. */
    for(; remaining >= 8; remaining -= 8, in += 4*8, out += 3*8)
        Convert32To24_32bytes(in, out);
     #ifdef USE_MMX
     MMX_clear();
     #endif
    #endif

    /* Whatever is left over is copied one pixel at a time. */
    for(; remaining > 0; --remaining, in += 4, out += 3)
    {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
    }
}

/* Expands one 16-bit pixel (5 bits at bit 11, 6 bits at bit 5, 5 bits at
 * bit 0) into three 8-bit components.
 *
 *   target - receives three bytes: the bit-11 field, the bit-5 field and
 *            the bit-0 field, in that order
 *   rgb16  - the packed pixel; bits above bit 15 are ignored
 *
 * Each field is widened by replicating its top bits into the freshly
 * created low bits, which is the same scaling the SIMD path
 * (Convert_2byte_helper) applies. A plain multiplication by 8 (or 4) would
 * top out at 248 (or 252) instead of 255 and make the pixels handled by
 * this scalar routine darker than those handled by the SIMD routine.
 */
static void Unbuild16(unsigned char* target, unsigned rgb16)
{
    const unsigned b5 =  rgb16        & 0x1F;
    const unsigned g6 = (rgb16 >>  5) & 0x3F;
    const unsigned r5 = (rgb16 >> 11) & 0x1F;

    target[0] = (unsigned char)((r5 << 3) | (r5 >> 2));
    target[1] = (unsigned char)((g6 << 2) | (g6 >> 4));
    target[2] = (unsigned char)((b5 << 3) | (b5 >> 2));
}

/* Expands one 15-bit pixel (three 5-bit fields at bits 10, 5 and 0) into
 * three 8-bit components.
 *
 *   target - receives three bytes: the bit-10 field, the bit-5 field and
 *            the bit-0 field, in that order
 *   rgb16  - the packed pixel; bits above bit 14 are ignored
 *
 * Each field is widened by replicating its top bits into the freshly
 * created low bits, which is the same scaling the SIMD path
 * (Convert_2byte_helper) applies. A plain multiplication by 8 would top
 * out at 248 instead of 255 and make the pixels handled by this scalar
 * routine darker than those handled by the SIMD routine.
 */
static void Unbuild15(unsigned char* target, unsigned rgb16)
{
    const unsigned b5 =  rgb16        & 0x1F;
    const unsigned g5 = (rgb16 >>  5) & 0x1F;
    const unsigned r5 = (rgb16 >> 10) & 0x1F;

    target[0] = (unsigned char)((r5 << 3) | (r5 >> 2));
    target[1] = (unsigned char)((g5 << 3) | (g5 >> 2));
    target[2] = (unsigned char)((b5 << 3) | (b5 >> 2));
}

/* 64-bit constant made of four 16-bit lanes that alternate between
 * basevalue_lo (lanes 0 and 2) and basevalue_hi (lanes 1 and 3).
 * Each base value is truncated to 16 bits first. */
template<int basevalue_lo, int basevalue_hi>
struct Bits16const
{
    static const uint64_t value;
};
template<int basevalue_lo, int basevalue_hi>
const uint64_t Bits16const<basevalue_lo, basevalue_hi>::value =
    /* Build the 32-bit lo/hi pair once, then place a copy in each half.
     * The pair is below 2^32, so multiplying by 2^32+1 cannot carry. */
    (   (uint64_t)(unsigned short) basevalue_lo
     | ((uint64_t)(unsigned short) basevalue_hi << 16) )
    * 0x0000000100000001ULL;

/* 64-bit constant whose low 32 bits hold basevalue_lo and whose high
 * 32 bits hold basevalue_hi (both taken as unsigned int). */
template<int basevalue_lo, int basevalue_hi>
struct Bits32const
{
    static const uint64_t value;
};
template<int basevalue_lo, int basevalue_hi>
const uint64_t Bits32const<basevalue_lo, basevalue_hi>::value =
    ((uint64_t)(unsigned int) basevalue_hi << 32)
  |  (uint64_t)(unsigned int) basevalue_lo;

/* 64-bit constant made of eight bytes that alternate between basevalue_lo
 * (even bytes) and basevalue_hi (odd bytes). The base values are not
 * masked; they are expected to fit in one byte each. */
template<uint64_t basevalue_lo, uint64_t basevalue_hi>
struct Bits8const
{
private:
    /* lo/hi in bytes 0..1, then doubled up into bytes 0..3. */
    static const uint64_t pair = basevalue_lo | (basevalue_hi << 8);
    static const uint64_t quad = pair | (pair << 16);
public:
    static const uint64_t value = quad | (quad << 32);
};


/* Per-byte bit mask replicated over a 64-bit word.
 *   lowbitcount  - number of low bits set in every even byte
 *   highbitcount - number of low bits set in every odd byte
 *   leftshift    - the whole 64-bit pattern is shifted left by this amount
 */
template<int lowbitcount, int highbitcount, int leftshift>
struct MaskBconst
{
    static const uint64_t basevalue_lo = (1 <<  lowbitcount) - 1;
    static const uint64_t basevalue_hi = (1 << highbitcount) - 1;
    static const uint64_t value = Bits8const<basevalue_lo,basevalue_hi>::value << leftshift;
};

/* Masks used by Convert_2byte_helper for a colour component that is
 * `bits` wide:
 *   mask_lo   - the low `bits` bits of every even byte
 *   mask_hi   - the same pattern shifted left by 8, i.e. in every odd byte
 *   mask_frac - the low (8-bits) bits of every byte
 * The members are defined outside the struct so that they can be bound to
 * references.
 */
template<int bits>
struct Convert_2byte_consts
{
    static const uint64_t mask_lo;//   = MaskBconst<bits,0, 0>::value;
    static const uint64_t mask_hi;//   = MaskBconst<bits,0, 8>::value;
    static const uint64_t mask_frac;// = MaskBconst<8-bits,8-bits, 0>::value;
};
template<int bits>
const uint64_t Convert_2byte_consts<bits>::mask_lo   = MaskBconst<bits, 0, 0>::value;
template<int bits>
const uint64_t Convert_2byte_consts<bits>::mask_hi   = MaskBconst<bits, 0, 8>::value;
template<int bits>
const uint64_t Convert_2byte_consts<bits>::mask_frac = MaskBconst<8-bits, 8-bits, 0>::value;

/* Extracts one colour component from eight 16-bit pixels and widens it
 * to 8 bits.
 *
 *   offs - bit position of the component inside each 16-bit pixel
 *   bits - width of the component in bits
 *   p4a  - four packed pixels
 *   p4b  - four more packed pixels
 *
 * Result: `lo` holds the four components taken from p4a and `hi` the four
 * taken from p4b, each in the low byte of its own 16-bit lane.
 */
template<int offs, int bits>
struct Convert_2byte_helper
{
    c64 lo, hi;
    
    Convert_2byte_helper(c64 p4a, c64 p4b)
    {
        const uint64_t& mask_lo   = Convert_2byte_consts<bits>::mask_lo;
        const uint64_t& mask_hi   = Convert_2byte_consts<bits>::mask_hi;
        const uint64_t& mask_frac = Convert_2byte_consts<bits>::mask_frac;
        
        /* STEP 1: SEPARATE THE PIXELS INTO RED, GREEN AND BLUE COMPONENTS */

        /* p4a's components are moved down to bit 0 of each 16-bit lane
         * (even bytes), p4b's components to bit 8 (odd bytes). The c64
         * shift operators turn a negative count into a shift the other way. */
        /* 000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb */
        c64 s5 = ((p4a >> offs) & mask_lo) | ((p4b << (8-offs)) & mask_hi);

        /* STEP 2: SCALE THE COLOR COMPONENTS TO 256 RANGE */
        
        /* The component is moved to the top of its byte and its own top
         * bits are copied into the vacated low bits, so that the maximum
         * component value maps to 255. */
        /* BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000 */
        /* 00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb */
        c64 v8 = (s5 << (8-bits)) | ((s5 >> (bits-(8-bits))) & mask_frac);
        /* v8:
         *
         * BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb *
         */
        
        /* STEP 3: DEINTERLACE THE PIXELS */
        /* Even bytes (from p4a) go to lo, odd bytes (from p4b) go to hi. */
        lo = (v8     ) & mask64l;
        hi = (v8 >> 8) & mask64l;
    }
};

/*
template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
static void Convert_2byte_to_24Common(const unsigned char* src, unsigned char* dest)
    __attribute((noinline));
*/
/* Converts eight packed 16-bit pixels (16 bytes read from src) into
 * eight output pixels written to dest.
 *
 * Template parameters:
 *   roffs/rbits, goffs/gbits, boffs/bbits
 *          - bit offset and bit width of each colour field inside a
 *            16-bit pixel; forwarded to Convert_2byte_helper.
 *   rgb24  - true:  the four 64-bit words are handed to
 *                   Convert32To24_32bytes() for packing;
 *            false: the four 64-bit words (32 bytes) are stored as-is.
 *
 * In the 32-bit form each pixel is, lowest byte first: the "b"
 * component, the "g" component, the "r" component and a zero byte
 * (see the lane diagrams below).
 */
template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits, bool rgb24>
static void Convert_2byte_to_24or32Common(const unsigned char* src, unsigned char* dest)
{
    c64 p4a; p4a.Get(src+0); // four pixels
    c64 p4b; p4b.Get(src+8); // another four pixels
    
    /* Input: each of the two registers holds four 16-bit pixels. */
    
    /* STEPS 1-3 (done inside the helper): extract, scale and de-interleave
     * each colour component. */
    Convert_2byte_helper<roffs,rbits> r(p4a,p4b);
    Convert_2byte_helper<boffs,bbits> b(p4a,p4b);
    Convert_2byte_helper<goffs,gbits> g(p4a,p4b);

    /* STEP 4: CONVERT PIXELS INTO RGB32 */
    
    /* Now we have:
     *               b.lo =  0j0g0d0a
     *               g.lo =  0k0h0e0b
     *               r.lo =  0l0i0f0c
     *               b.hi =  0J0G0D0A
     *               g.hi =  0K0H0E0B
     *               r.hi =  0L0I0F0C
     * We want:
     *                 w1 =  0fed0cba
     *                 w2 =  0lkj0ihg
     *                 w3 =  0FED0CBA
     *                 w4 =  0LKJ0IHG
     */
   
#if 0 && defined(__MMX__) /* FIXME why is this 0&&? */
    /* Disabled variant built on the unpack operations; kept for reference. */
    // punpcklbw  0k0h0e0b, 0j0g0d0a -> 00ed00ba
    // punpcklwd  0l0i0f0c, ________ -> 0f__0c__
    c64 w1 = r.lo.unpacklwd(zero64) | g.lo.unpacklbw(b.lo); // pix 0,1
    // punpckhbw  0k0h0e0b, 0j0g0d0a -> 00kj00hg
    // punpckhwd  0l0i0f0c, ________ -> 0l__0i__
    c64 w2 = r.lo.unpackhwd(zero64) | g.lo.unpackhbw(b.lo); // pix 2,3
    
    c64 w3 = r.hi.unpacklwd(zero64) | g.hi.unpacklbw(b.hi); // pix 4,5
    c64 w4 = r.hi.unpackhwd(zero64) | g.hi.unpackhbw(b.hi); // pix 6,7
    #ifndef USE_MMX
     MMX_clear();
    #endif
#else
    /* With 64-bit registers, this code is greatly simpler than
     * the emulation of unpack opcodes. However, when the
     * unpack opcodes are available, using them is shorter.
     * Which way is faster? FIXME: Find out
     */

    //        mask64lw:  00**00**
    //        mask64hw:  **00**00
    // b.lo & mask64lw:  000g000a
    // g.lo & mask64lw:  000h000b
    // r.lo & mask64lw:  000i000c
    // b.lo & mask64hw:  0j000d00
    // g.lo & mask64hw:  0k000e00
    // r.lo & mask64hw:  0l000f00
    
    /* Merge the three components of pixels 0,2 (tXX1) and 1,3 (tXX2)
     * into complete 32-bit pixels. */
    c64 tlo1 = ((b.lo & mask64lw)     ) | ((g.lo & mask64lw) << 8) | ((r.lo & mask64lw) << 16);
    c64 tlo2 = ((b.lo & mask64hw) >>16) | ((g.lo & mask64hw) >> 8) | ((r.lo & mask64hw)      );

    c64 thi1 = ((b.hi & mask64lw)     ) | ((g.hi & mask64lw) << 8) | ((r.hi & mask64lw) << 16);
    c64 thi2 = ((b.hi & mask64hw) >>16) | ((g.hi & mask64hw) >> 8) | ((r.hi & mask64hw)      );
    /*
     *                tlo1 =  0ihg0cba
     *                tlo2 =  0lkj0fed
     *                thi1 =  0IHG0CBA
     *                thi2 =  0LKJ0FED
     *            mask64ld =  0000****
     *            mask64hd =  ****0000
     */
     
    /* Re-pair the 32-bit halves so that consecutive pixels share a word. */
    c64 w1 = (tlo1 & mask64ld) | ((tlo2 & mask64ld) << 32); // 00000cba | 0fed0000 = 0fed0cba
    c64 w2 = (tlo2 & mask64hd) | ((tlo1 & mask64hd) >> 32); // 0lkj0000 | 00000ihg = 0lkj0ihg

    c64 w3 = (thi1 & mask64ld) | ((thi2 & mask64ld) << 32);
    c64 w4 = (thi2 & mask64hd) | ((thi1 & mask64hd) >> 32);
#endif
    
    if(rgb24)
    {
        /* STEP 5A: CONVERT PIXELS INTO RGB24 */
        Convert32To24_32bytes(w1,w2,w3,w4, dest);
    }
    else
    {
        /* STEP 5B: STORE RGB32 */
        w1.Put(dest+0);
        w2.Put(dest+8);
        w3.Put(dest+16);
        w4.Put(dest+24);
    }
     
    /* Reference: operand/result lane layout of the unpack opcodes.
     punpcklbw    ____ABCD, ____abcd = AaBbCcDd
     punpcklwd    ____ABCD, ____abcd = ABabCDcd
     punpckldq    ____ABCD, ____abcd = ABCDabcd
     
     punpckhbw    ABCD____, abcd____ = AaBbCcDd
     punpckhwd    ABCD____, abcd____ = ABabCDcd
     punpckhdq    ABCD____, abcd____ = ABCDabcd
    */
}

/* Converts `npixels` 15-bit (5-5-5) pixels at `data` into 3-byte pixels
 * at `dest`.
 *
 * Whole groups of eight pixels go through Convert_2byte_to_24or32Common;
 * swap_red_blue selects which field offsets are treated as "r" and "b".
 * The remaining 0..7 pixels are converted one at a time by Unbuild15().
 * NOTE(review): the tail path does not receive swap_red_blue - confirm
 * that Unbuild15() produces the intended channel order in both cases.
 */
void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
{
    const unsigned char* in = static_cast<const unsigned char*>(data);

    /* Bulk part: 8 pixels = 16 input bytes -> 24 output bytes per step. */
    if(swap_red_blue)
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, true> (in, dest);
            in += 8*2; dest += 8*3; npixels -= 8;
        }
    }
    else
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, true> (in, dest);
            in += 8*2; dest += 8*3; npixels -= 8;
        }
    }

    #ifdef USE_MMX
     MMX_clear();
    #endif

    /* Tail: whatever did not fill a group of eight. */
    const unsigned short* tail = reinterpret_cast<const unsigned short*>(in);
    for(unsigned left = npixels; left > 0; --left, ++tail, dest += 3)
    {
        unsigned short v = *tail;
        Unbuild15(dest, v);
    }
}

/* Converts `npixels` 16-bit (5-6-5) pixels at `data` into 3-byte pixels
 * at `dest`.
 *
 * Whole groups of eight pixels go through Convert_2byte_to_24or32Common;
 * swap_red_blue selects which field offsets are treated as "r" and "b".
 * The remaining 0..7 pixels are converted one at a time by Unbuild16().
 * NOTE(review): the tail path does not receive swap_red_blue - confirm
 * that Unbuild16() produces the intended channel order in both cases.
 */
void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
{
    const unsigned char* in = static_cast<const unsigned char*>(data);

    /* Bulk part: 8 pixels = 16 input bytes -> 24 output bytes per step. */
    if(swap_red_blue)
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, true> (in, dest);
            in += 8*2; dest += 8*3; npixels -= 8;
        }
    }
    else
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, true> (in, dest);
            in += 8*2; dest += 8*3; npixels -= 8;
        }
    }

    #ifdef USE_MMX
     MMX_clear();
    #endif

    /* Tail: whatever did not fill a group of eight. */
    const unsigned short* tail = reinterpret_cast<const unsigned short*>(in);
    for(unsigned left = npixels; left > 0; --left, ++tail, dest += 3)
    {
        unsigned short v = *tail;
        Unbuild16(dest, v);
    }
}

/* Converts `npixels` 15-bit (5-5-5) pixels at `data` into 4-byte pixels
 * at `dest`.
 *
 * Whole groups of eight pixels go through Convert_2byte_to_24or32Common;
 * swap_red_blue selects which field offsets are treated as "r" and "b".
 * The remaining 0..7 pixels are converted one at a time by Unbuild15(),
 * each at a 4-byte stride.
 * NOTE(review): the tail path does not receive swap_red_blue, and whether
 * Unbuild15() writes the fourth byte of each pixel is not visible here -
 * confirm both.
 */
void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
{
    const unsigned char* in = static_cast<const unsigned char*>(data);

    /* Bulk part: 8 pixels = 16 input bytes -> 32 output bytes per step. */
    if(swap_red_blue)
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, false> (in, dest);
            in += 8*2; dest += 8*4; npixels -= 8;
        }
    }
    else
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, false> (in, dest);
            in += 8*2; dest += 8*4; npixels -= 8;
        }
    }

    #ifdef USE_MMX
     MMX_clear();
    #endif

    /* Tail: whatever did not fill a group of eight. */
    const unsigned short* tail = reinterpret_cast<const unsigned short*>(in);
    for(unsigned left = npixels; left > 0; --left, ++tail, dest += 4)
    {
        unsigned short v = *tail;
        Unbuild15(dest, v);
    }
}

/* Converts `npixels` 16-bit (5-6-5) pixels at `data` into 4-byte pixels
 * at `dest`.
 *
 * Whole groups of eight pixels go through Convert_2byte_to_24or32Common;
 * swap_red_blue selects which field offsets are treated as "r" and "b".
 * The remaining 0..7 pixels are converted one at a time by Unbuild16(),
 * each at a 4-byte stride.
 * NOTE(review): the tail path does not receive swap_red_blue, and whether
 * Unbuild16() writes the fourth byte of each pixel is not visible here -
 * confirm both.
 */
void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
{
    const unsigned char* in = static_cast<const unsigned char*>(data);

    /* Bulk part: 8 pixels = 16 input bytes -> 32 output bytes per step. */
    if(swap_red_blue)
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, false> (in, dest);
            in += 8*2; dest += 8*4; npixels -= 8;
        }
    }
    else
    {
        while(npixels >= 8)
        {
            Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, false> (in, dest);
            in += 8*2; dest += 8*4; npixels -= 8;
        }
    }

    #ifdef USE_MMX
     MMX_clear();
    #endif

    /* Tail: whatever did not fill a group of eight. */
    const unsigned short* tail = reinterpret_cast<const unsigned short*>(in);
    for(unsigned left = npixels; left > 0; --left, ++tail, dest += 4)
    {
        unsigned short v = *tail;
        Unbuild16(dest, v);
    }
}

/* Packs one 3-byte pixel into a 16-bit 5-6-5 value.
 * (x,y) selects one of 16 cells ((x + 4*y) mod 16) that is handed to
 * Quantize4x4 together with each component. rgbdata[2] goes to bits 0-4,
 * rgbdata[1] to bits 5-10 and rgbdata[0] to bits 11-15.
 */
static inline unsigned Build16(unsigned x,unsigned y, const unsigned char* rgbdata)
{
    unsigned cell = (x + 4*y) & 15u;

    unsigned field_lo  = Quantize4x4<31>(cell, rgbdata[2]);
    unsigned field_mid = Quantize4x4<63>(cell, rgbdata[1]);
    unsigned field_hi  = Quantize4x4<31>(cell, rgbdata[0]);

    return field_lo | (field_mid << 5) | (field_hi << 11);
}
/* Packs one 3-byte pixel into a 15-bit 5-5-5 value.
 * (x,y) selects one of 16 cells ((x + 4*y) mod 16) that is handed to
 * Quantize4x4 together with each component. rgbdata[2] goes to bits 0-4,
 * rgbdata[1] to bits 5-9 and rgbdata[0] to bits 10-14.
 */
static inline unsigned Build15(unsigned x,unsigned y, const unsigned char* rgbdata)
{
    unsigned cell = (x + 4*y) & 15u;

    unsigned field_lo  = Quantize4x4<31>(cell, rgbdata[2]);
    unsigned field_mid = Quantize4x4<31>(cell, rgbdata[1]);
    unsigned field_hi  = Quantize4x4<31>(cell, rgbdata[0]);

    return field_lo | (field_mid << 5) | (field_hi << 10);
}

/* Converts `npixels` 3-byte pixels at `data` into 16-bit pixels at
 * `dest` using Build16(). `width` is only used to track the (x,y)
 * coordinate that Build16() receives for each pixel.
 */
void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
{
    const unsigned char* rgb = static_cast<const unsigned char*>(data);
    unsigned short*      out = reinterpret_cast<unsigned short*>(dest);

    unsigned col = 0, row = 0;
    for(unsigned left = npixels; left > 0; --left, rgb += 3, ++out)
    {
        *out = Build16(col, row, rgb);

        /* Advance the coordinate, wrapping to the next row at `width`. */
        ++col;
        if(col >= width) { col = 0; ++row; }
    }
}

/* Converts `npixels` 3-byte pixels at `data` into 15-bit pixels (stored
 * as 16-bit words) at `dest` using Build15(). `width` is only used to
 * track the (x,y) coordinate that Build15() receives for each pixel.
 */
void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
{
    const unsigned char* rgb = static_cast<const unsigned char*>(data);
    unsigned short*      out = reinterpret_cast<unsigned short*>(dest);

    unsigned col = 0, row = 0;
    for(unsigned left = npixels; left > 0; --left, rgb += 3, ++out)
    {
        *out = Build15(col, row, rgb);

        /* Advance the coordinate, wrapping to the next row at `width`. */
        ++col;
        if(col >= width) { col = 0; ++row; }
    }
}

#ifdef __MMX__
/* MMX kernel shared by the I420 encoders: converts a 2x2 block of
 * 32-bit pixels into two pairs of luma bytes plus one U and one V byte.
 *
 *   p0_1    - two adjacent 32-bit pixels (callers pass the upper scanline)
 *   p2_3    - two adjacent 32-bit pixels (callers pass the scanline below)
 *   dest_y0 - receives the two luma bytes computed from p0_1
 *   dest_y1 - receives the two luma bytes computed from p2_3
 *   dest_u  - receives one chroma byte (RU/GU/BU weights)
 *   dest_v  - receives one chroma byte (RV/GV/BV weights)
 *
 * The function does not call MMX_clear(); the callers do that once after
 * their loops.
 */
static inline void Convert_I420_MMX_Common
    (c64_MMX p0_1, c64_MMX p2_3,
     unsigned char* dest_y0,
     unsigned char* dest_y1,
     unsigned char* dest_u,
     unsigned char* dest_v)
{
    /* Widen every 8-bit component to a 16-bit lane: one pixel per register. */
    c64_MMX p0 = c64_MMX(zero64).unpacklbw(p0_1); // expand to 64-bit (4*16)
    c64_MMX p1 = c64_MMX(zero64).unpackhbw(p0_1);
    c64_MMX p2 = c64_MMX(zero64).unpacklbw(p2_3);
    c64_MMX p3 = c64_MMX(zero64).unpackhbw(p2_3);
    
    /* Per-component weights for Y, U and V; the fourth lane is unused (0). */
    c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
    c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);
    c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);

    /* Component-wise sum of the four pixels; used for the shared chroma. */
    c64_MMX ctotal = p0.add16(
                     p2.add16(
                     p1.add16(
                     p3)));
  
    /* Multiply-add leaves two 32-bit partial sums per pixel; they are
     * added together below to form the weighted luma sum. */
    p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
    p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
    p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
    p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
    
    c64_MMX yy;
    yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
               ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
               ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
               ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
    yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
    
    // Because we're writing to adjacent pixels, we optimize this by
    // writing two 8-bit values at once in both cases.
    // NOTE(review): these are type-punned 16-bit stores through an
    // unsigned char pointer; presumably relies on the target tolerating
    // unaligned access - confirm.
    *(short*)dest_y0 = yy.Extract88_from_1616lo();
    *(short*)dest_y1 = yy.Extract88_from_1616hi();
    
    c64_MMX u_total32 = _mm_madd_pi16(rgb_u.value, ctotal.value);
    c64_MMX v_total32 = _mm_madd_pi16(rgb_v.value, ctotal.value);
    
    /* The extra +2 in the shift divides by 4, because ctotal is the sum
     * of four pixels. */
    *dest_u = U_ADD + ((u_total32.Extract32<0>() + u_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
    *dest_v = V_ADD + ((v_total32.Extract32<0>() + v_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
}

/* MMX kernel shared by the YUY2 encoders: converts four 32-bit pixels
 * into eight output bytes (one luma byte per pixel, one chroma pair per
 * two pixels).
 *
 *   p0_1      - two adjacent 32-bit pixels
 *   p2_3      - the next two 32-bit pixels (callers pass the same scanline)
 *   dest_yvyu - receives 8 bytes via Put(): luma in the low byte of each
 *               16-bit lane, chroma in the high byte. The chroma lanes
 *               are initialised in the order V, U, V, U.
 *
 * The function does not call MMX_clear(); the callers do that once after
 * their loops.
 */
static inline void Convert_YUY2_MMX_Common
    (c64_MMX p0_1, c64_MMX p2_3,
     unsigned char* dest_yvyu)
{
    /* Widen every 8-bit component to a 16-bit lane: one pixel per register. */
    c64_MMX p0 = c64_MMX(zero64).unpacklbw(p0_1); // expand to 64-bit (4*16)
    c64_MMX p1 = c64_MMX(zero64).unpackhbw(p0_1);
    c64_MMX p2 = c64_MMX(zero64).unpacklbw(p2_3); // expand to 64-bit (4*16)
    c64_MMX p3 = c64_MMX(zero64).unpackhbw(p2_3);
    
    /* Per-component weights for Y, U and V; the fourth lane is unused (0). */
    c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
    c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);
    c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);

    /* Component-wise sums of each pixel pair; each pair shares one U and one V. */
    c64_MMX ctotal0 = p0.add16(p1);
    c64_MMX ctotal2 = p2.add16(p3);
  
    /* Multiply-add leaves two 32-bit partial sums per pixel; they are
     * added together below to form the weighted luma sum. */
    p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
    p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
    p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
    p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
    
    c64_MMX yy;
    yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
               ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
               ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
               ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );

    yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
    
    c64_MMX u_total32_0 = _mm_madd_pi16(rgb_u.value, ctotal0.value);
    c64_MMX v_total32_0 = _mm_madd_pi16(rgb_v.value, ctotal0.value);
    c64_MMX u_total32_2 = _mm_madd_pi16(rgb_u.value, ctotal2.value);
    c64_MMX v_total32_2 = _mm_madd_pi16(rgb_v.value, ctotal2.value);
    
    c64_MMX quadword = yy; // four y values: at 0, 2, 4 and 6
    
    /* The extra +1 in the shift divides by 2, because each ctotal is the
     * sum of two pixels. */
    c64_MMX uv; uv.Init16(
        ((v_total32_0.Extract32<0>() + v_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
        ((u_total32_0.Extract32<0>() + u_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
        ((v_total32_2.Extract32<0>() + v_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
        ((u_total32_2.Extract32<0>() + u_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)) );
    c64_MMX uv_adds; uv_adds.Init16(V_ADD, U_ADD, V_ADD, U_ADD);
    uv = uv.add16(uv_adds);
    
    quadword |= uv << 8;     // two u and v values: at 1, 3, 5 and 7.
    quadword.Put(dest_yvyu); // write all eight bytes (y at 0, 2, 4, 6)
}
#endif

/*template<int PixStride>
void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
    __attribute__((noinline));*/

/* Converts a frame of PixStride-byte pixels into planar 4:2:0 output.
 * Despite the "4byte" name, the file instantiates it with PixStride 3
 * and 4.
 *
 *   data    - source pixels, `width` pixels per scanline
 *   dest    - output buffer: the luma plane starts at offset 0, the plane
 *             indexed by vpos at offset npixels, and the plane indexed by
 *             upos at offset npixels + npixels/4
 *   npixels - total number of pixels (height = npixels / width)
 *   width   - pixels per scanline
 *
 * The loops step two pixels and two scanlines at a time, so the code
 * presumably expects even width and height - TODO confirm with callers.
 */
template<int PixStride>
void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
{
    const unsigned char* src = (const unsigned char*) data;
    unsigned height = npixels / width;
    unsigned pos = 0;                    // read offset into src (upper scanline of the pair)
    unsigned ypos = 0;                   // write offset into the luma plane
    unsigned vpos = npixels;             // write offset of the first chroma plane
    unsigned upos = vpos + npixels / 4;  // write offset of the second chroma plane
    unsigned stride = width*PixStride;   // bytes per source scanline

    /* This function is based on code from x264 svn version 711 */
    /* TODO: Apply MMX optimization for 24-bit pixels */
    
    for(unsigned y=0; y<height; y += 2)
    {
        for(unsigned x=0; x<width; x += 2)
        {
        #ifdef __MMX__
          if(PixStride == 4)
          {
            c64_MMX p0_1; p0_1.Get(&src[pos]);        // two 32-bit pixels (4*8)
            c64_MMX p2_3; p2_3.Get(&src[pos+stride]); // two 32-bit pixels

            pos += PixStride*2;
            
            Convert_I420_MMX_Common(p0_1, p2_3,
                dest+ypos,
                dest+ypos+width,
                dest+upos++,
                dest+vpos++);
          }
          else
        #endif
          {
            /* c[n]: sum of component n over the 2x2 block (for chroma);
             * rgb[n][k]: component n of pixel k, where k = 0,1 is the left
             * column (upper, lower) and k = 2,3 the right column. */
            int c[3], rgb[3][4];
            
            /* luma */
            for(int n=0; n<3; ++n) c[n]  = rgb[n][0] = src[pos + n];
            for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n + stride];
            pos += PixStride;
            
            for(int n=0; n<3; ++n) c[n] += rgb[n][2] = src[pos + n];
            for(int n=0; n<3; ++n) c[n] += rgb[n][3] = src[pos + n + stride];
            pos += PixStride;

            /* Output positions in the same order as rgb[][k] above. */
            unsigned destpos[4] = { ypos, ypos+width, ypos+1, ypos+width+1 };
            for(int n=0; n<4; ++n)
            {
                dest[destpos[n]]
                    = Y_ADD + ((RY * rgb[0][n]
                              + GY * rgb[1][n]
                              + BY * rgb[2][n]
                               ) >> RGB2YUV_SHIFT);  // y
            }
            
            /* +2 in the shift divides by 4: c[] sums four pixels. */
            dest[upos++] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2)) );
            dest[vpos++] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)) ); 
          }
            
            ypos += 2;
        }
        /* The lower scanline of the pair was consumed together with the
         * upper one, so skip over it in both source and luma plane. */
        pos += stride;
        ypos += width;
    }
    #ifdef __MMX__
     MMX_clear();
    #endif
}

/* Converts a frame of PixStride-byte pixels into a packed 4:2:2 stream
 * (two bytes per pixel). Despite the "4byte" name, the file instantiates
 * it with PixStride 3 and 4.
 *
 *   data    - source pixels, `width` pixels per scanline
 *   dest    - output buffer; per pixel pair the scalar path writes luma
 *             at byte 0 and 2, the V-weighted value at byte 1 and the
 *             U-weighted value at byte 3
 *   npixels - total number of pixels (height = npixels / width)
 *   width   - pixels per scanline
 *
 * Source and destination offsets run continuously across scanlines.
 * The scalar path consumes 2 pixels per step and the MMX path 4, so the
 * code presumably expects width to be a multiple of 2 (4 with MMX and
 * PixStride 4) - TODO confirm with callers.
 */
template<int PixStride>
void Convert_4byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
{
    const unsigned char* src = (const unsigned char*) data;
    unsigned height = npixels / width;
    unsigned pos = 0;                   // read offset into src
    unsigned ypos = 0;                  // write offset into dest
    unsigned stride = width*PixStride;  // bytes per source scanline (not used below)

    /* This function is based on code from x264 svn version 711 */
    /* TODO: Apply MMX optimization for 24-bit pixels */
    
    for(unsigned y=0; y<height; ++y)
    {
        for(unsigned x=0; x<width; x += 2)
        {
        #ifdef __MMX__
          if(PixStride == 4)
          {
            c64_MMX p0_1; p0_1.Get(&src[pos]);        // two 32-bit pixels (4*8)
            pos += PixStride*2;
            
            c64_MMX p2_3; p2_3.Get(&src[pos]);        // two 32-bit pixels (4*8)
            pos += PixStride*2;
            x += 2; // this path handles 4 pixels; the loop header adds the other 2
            
            Convert_YUY2_MMX_Common(p0_1, p2_3,
                dest+ypos);
          
            ypos += 4; // plus the shared += 4 below: 8 output bytes in total
          }
          else
        #endif
          {
            /* c[n]: sum of component n over the pixel pair (for chroma);
             * rgb[n][k]: component n of pixel k. */
            int c[3], rgb[3][2];
            
            /* luma */
            for(int n=0; n<3; ++n) c[n]  = rgb[n][0] = src[pos + n];
            pos += PixStride;
            
            for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n];
            pos += PixStride;

            for(int n=0; n<2; ++n)
            {
                dest[ypos + n*2]
                    = Y_ADD + ((RY * rgb[0][n]
                              + GY * rgb[1][n]
                              + BY * rgb[2][n]
                               ) >> RGB2YUV_SHIFT);  // y
            }
            
            /* +1 in the shift divides by 2: c[] sums two pixels. */
            dest[ypos+3] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1)) );
            dest[ypos+1] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1)) ); 
          }
            ypos += 4;
        }
    }
    #ifdef __MMX__
    MMX_clear();
    #endif
}

/*template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
    __attribute__((noinline));*/
    
/* Converts a frame of 16-bit pixels into planar 4:2:0 output.
 *
 * Template parameters give the bit offset and width of each colour
 * field in a 16-bit pixel; they are forwarded to
 * Convert_2byte_to_24or32Common.
 *
 *   data    - source pixels, 2 bytes each, `width` pixels per scanline
 *   dest    - output buffer: the luma plane starts at offset 0, the plane
 *             indexed by vpos at offset npixels, and the plane indexed by
 *             upos at offset npixels + npixels/4
 *   npixels - total number of pixels (height = npixels / width)
 *   width   - pixels per scanline
 *
 * Pixels are processed in groups of 8 on two scanlines at once, so the
 * code presumably expects width to be a multiple of 8 and height to be
 * even - TODO confirm with callers.
 */
template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
{
    const unsigned PixStride = 2;
    const unsigned char* src = (const unsigned char*) data;
    unsigned height = npixels / width;
    unsigned pos = 0;                    // read offset into src (upper scanline of the pair)
    unsigned ypos = 0;                   // write offset into the luma plane
    unsigned vpos = npixels;             // write offset of the first chroma plane
    unsigned upos = vpos + npixels / 4;  // write offset of the second chroma plane
    unsigned stride = width*PixStride;   // bytes per source scanline

    /* This function is based on code from x264 svn version 711 */
    
    for(unsigned y=0; y<height; y += 2)
    {
        for(unsigned x=0; x<width; x += 8)
        {
            /* [scanline][pixel][component byte] */
            unsigned char Rgb2byteBuf[2][8][4];
            
            /* Convert 8 pixels from two scanlines (16 in total)
             * from RGB15 / RGB16 to RGB32
             * (Not RGB24, because RGB32 conversion is faster)
             */
            Convert_2byte_to_24or32Common
                <roffs,rbits, goffs,gbits, boffs,bbits, false>
                (src+pos,        Rgb2byteBuf[0][0]);

            Convert_2byte_to_24or32Common
                <roffs,rbits, goffs,gbits, boffs,bbits, false>
                (src+pos+stride, Rgb2byteBuf[1][0]);

            pos += 16;
            
            /* Walk the 8 converted pixels as four 2x2 blocks. */
            for(int x8 = 0; x8 < 8; x8 += 2)
            {
              #ifdef __MMX__
                c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[0][x8][0]); // two 32-bit pixels (4*8)
                c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[1][x8][0]); // two 32-bit pixels

                Convert_I420_MMX_Common(p0_1, p2_3,
                    dest+ypos,
                    dest+ypos+width,
                    dest+upos++,
                    dest+vpos++);
              #else
                int c[3];
                /* TODO: Some faster means than using pointers */
                /* The four pixels of the 2x2 block: upper-left, upper-right,
                 * lower-left, lower-right. */
                unsigned char* rgb[4] =
                {
                    Rgb2byteBuf[0][x8+0],
                    Rgb2byteBuf[0][x8+1],
                    Rgb2byteBuf[1][x8+0],
                    Rgb2byteBuf[1][x8+1]
                };
                
                /* c[m]: sum of component m over the block (for chroma). */
                for(int m=0; m<3; ++m) c[m] = 0;
                for(int n=0; n<4; ++n)
                    for(int m=0; m<3; ++m)
                        c[m] += rgb[n][m];
                
                /* Output positions in the same order as rgb[] above. */
                unsigned destpos[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
                for(int n=0; n<4; ++n)
                {
                    dest[destpos[n]]
                        = Y_ADD + ((RY * rgb[n][0]
                                  + GY * rgb[n][1]
                                  + BY * rgb[n][2]
                                   ) >> RGB2YUV_SHIFT);  // y
                }
                
                /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
                // Note: +2 is because c[] contains 4 values
                dest[upos++] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2));
                dest[vpos++] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)); 
              #endif
                ypos += 2;
            }
        }
        /* The lower scanline of the pair was consumed together with the
         * upper one, so skip over it in both source and luma plane. */
        pos += stride;
        ypos += width;
    }

    #ifdef __MMX__
    MMX_clear();
    #endif
}

/* Converts a frame of 16-bit pixels into a packed 4:2:2 stream
 * (two bytes per pixel).
 *
 * Template parameters give the bit offset and width of each colour
 * field in a 16-bit pixel; they are forwarded to
 * Convert_2byte_to_24or32Common.
 *
 *   data    - source pixels, 2 bytes each, `width` pixels per scanline
 *   dest    - output buffer; per pixel pair the scalar path writes luma
 *             at byte 0 and 2, the V-weighted value at byte 1 and the
 *             U-weighted value at byte 3
 *   npixels - total number of pixels (height = npixels / width)
 *   width   - pixels per scanline
 *
 * Source and destination offsets run continuously across scanlines, and
 * pixels are consumed in groups of 8, so the code presumably expects
 * width to be a multiple of 8 - TODO confirm with callers.
 */
template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
void Convert_2byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
{
    const unsigned PixStride = 2;
    const unsigned char* src = (const unsigned char*) data;
    unsigned height = npixels / width;
    unsigned pos = 0;                   // read offset into src
    unsigned ypos = 0;                  // write offset into dest
    unsigned stride = width*PixStride;  // bytes per source scanline (not used below)

    for(unsigned y=0; y<height; ++y)
    {
        for(unsigned x=0; x<width; x += 8)
        {
            /* [pixel][component byte] */
            unsigned char Rgb2byteBuf[8][4];
            
            /* Convert 8 pixels from a scanline
             * from RGB15 / RGB16 to RGB32
             * (Not RGB24, because RGB32 conversion is faster)
             */
            Convert_2byte_to_24or32Common
                <roffs,rbits, goffs,gbits, boffs,bbits, false>
                (src+pos, Rgb2byteBuf[0]);

            pos += 16;
            
            /* x8 is advanced inside the body: by 4 in the MMX path,
             * by 2 in the scalar path. */
            for(int x8 = 0; x8 < 8; )
            {
              #ifdef __MMX__
                c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[x8  ][0]); // two 32-bit pixels (4*8)
                c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[x8+2][0]); // two 32-bit pixels (4*8)
                Convert_YUY2_MMX_Common(p0_1, p2_3, dest+ypos);
                x8   += 4;
                ypos += 8;
              #else
                int c[3];
                /* TODO: Some faster means than using pointers */
                /* The two pixels of the pair. */
                unsigned char* rgb[2] =
                {
                    Rgb2byteBuf[x8+0],
                    Rgb2byteBuf[x8+1],
                };
                
                /* c[m]: sum of component m over the pair (for chroma). */
                for(int m=0; m<3; ++m) c[m] = 0;
                for(int n=0; n<2; ++n)
                    for(int m=0; m<3; ++m)
                        c[m] += rgb[n][m];
                
                for(int n=0; n<2; ++n)
                {
                    dest[ypos + n*2]
                        = Y_ADD + ((RY * rgb[n][0]
                                  + GY * rgb[n][1]
                                  + BY * rgb[n][2]
                                   ) >> RGB2YUV_SHIFT);  // y
                }
                
                // Note: +1 in the shift divides by 2, because c[] is the
                // sum of 2 pixels here.
                dest[ypos+3] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1));
                dest[ypos+1] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1)); 
                x8   += 2;
                ypos += 4;
              #endif
            }
        }
    }

    #ifdef __MMX__
    MMX_clear();
    #endif
}


/***/

/* Converts a planar 4:2:0 frame into 3-byte-per-pixel output.
 *
 *   data          - input: luma plane at offset 0, the plane indexed by
 *                   vpos at offset npixels, the plane indexed by upos at
 *                   offset npixels + npixels/4
 *   dest          - output buffer, 3 bytes per pixel
 *   npixels       - total number of pixels (height = npixels / width)
 *   width         - pixels per scanline
 *   swap_red_blue - not referenced anywhere in this function
 *
 * Each step handles a 2x2 pixel block (scalar path) or an 8x2 block
 * (MMX path) that shares chroma samples.
 *
 * NOTE(review): pos, ypos, upos and vpos (and, with MMX, rgb[] and yy[])
 * are declared outside the "omp parallel for" loop and advanced inside
 * it, so the iterations are not independent. If OpenMP is actually
 * enabled for this file that looks unsafe - confirm.
 */
void Convert_I420To24Frame(const void* data, unsigned char* dest,
                           unsigned npixels, unsigned width, bool swap_red_blue)
{
    const unsigned char* src = (const unsigned char*) data;
    unsigned height = npixels / width;
    unsigned pos = 0;                    // write offset into dest
    unsigned ypos = 0;                   // read offset into the luma plane
    unsigned vpos = npixels;             // read offset into the first chroma plane
    unsigned upos = vpos + npixels / 4;  // read offset into the second chroma plane
    
    #ifdef __MMX__
    c64_MMX rgb[4], yy[4];
    #endif
    
    /*
        Y input: 16..235
        U input: 16..240
        V input: 16..240
        
    */
    
  #pragma omp parallel for
    for(unsigned y=0; y<height; y += 2)
    {
        /* x is advanced inside the body (by 8 with MMX, by 2 otherwise). */
        for(unsigned x=0; x<width; )
        {
        #ifdef __MMX__
            /* Load 4 U and V values and subtract U_ADD and V_ADD from them. */
            uint64_t tmp_u = *(uint32_t*)&src[upos];
            uint64_t tmp_v = *(uint32_t*)&src[vpos];
            c64_MMX uuq = c64_MMX(zero64)
                     .unpacklbw(tmp_u) // 8-bit to 16-bit
                     .sub16(Bits16const<U_ADD,U_ADD>::value)
                     .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
            c64_MMX vvq = c64_MMX(zero64)
                     .unpacklbw(tmp_v)
                     .sub16(Bits16const<V_ADD,V_ADD>::value)
                     .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
            
            /* Per-lane access to the four prepared U and V values. */
            const short* uu = (const short*)&uuq;
            const short* vv = (const short*)&vvq;
          #if 1
            c64_MMX vmul; vmul.Init16(VR, VG, 0, 0);  // R,G,B,0 * vmul = V
            c64_MMX umul; umul.Init16(0, UG, UB, 0);  // R,G,B,0 * umul = U
          #else
            // pmaddw does: A,B,C,D and E,F,G,H,    A*E + B*F,  C*G + D*H
            
            // we do:                           R=   VR*v +  0*u, G= VG*v + UG*u
            //                                  B=    0*v + UB*u,     0*0 + 0*0
            c64_MMX vumul1; vumul1.Init16(VR, 0, VG, UG);
            c64_MMX vumul2; vumul2.Init16(0, UB,  0,  0);
          #endif
            
            /* c64_MMX rgb[4]; // four sets of 4*int16, each representing 1 rgb value */
            for(int n=0; n<4; ++n)
            {
              #if 1
                /* vv is shifted by 3 bits, vmul is shifted by 13 bits
                 * 16 bits in total, so mul16hi gets the 16-bit downscaled part */
                c64_MMX v; v.Init16(vv[n]);
                c64_MMX u; u.Init16(uu[n]);
                rgb[n] = v.mul16hi(vmul).add16(
                         u.mul16hi(umul)      );
              #else
                c64_MMX vuvu; vuvu.Init16(vv[n], uu[n], vv[n], uu[n]);
                c64_MMX madd1 = _mm_madd_pi16(vumul1.value, vuvu.value);
                c64_MMX madd2 = _mm_madd_pi16(vumul2.value, vuvu.value);
                rgb[n] = madd1.sar32(YUV2RGB_SHIFT)
                              .conv_s32_s16(
                         madd2.sar32(YUV2RGB_SHIFT));
              #endif
            }
            
            /* rgb[0] : U,V increment of RGB32 for x0,y0 - x1,y1
             * rgb[1] : U,V increment of RGB32 for x2,y0 - x3,y1
             * rgb[2] : U,V increment of RGB32 for x4,y0 - x5,y1
             * rgb[3] : U,V increment of RGB32 for x6,y0 - x7,y1
             */
            
            unsigned yyoffs[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
            /* c64_MMX yy[4]; // four sets of 4*int16, each representing four Y values */
            for(int n=0; n<4; ++n)
            {
                c64_MMX luma; luma.Init16(
                    src[yyoffs[0]+n*2],
                    src[yyoffs[1]+n*2],
                    src[yyoffs[2]+n*2],
                    src[yyoffs[3]+n*2]
                );
                luma = luma.sub16(Bits16const<Y_ADD,Y_ADD>::value);
                luma = luma.shl16(16 - YUV2RGB_SHIFT);
                yy[n] = luma.mul16hi(Bits16const<Y_REV,Y_REV>::value);
            }
            /* Flat view of yy[0..3] as 16 consecutive 16-bit values. */
            const short* const yyval = (const short*) &yy[0].value;
            /*
                values in order:
                   x0y0 x1y0 x0y1 x1y1
                   x2y0 x3y0 x2y1 x3y1
                   x4y0 x5y0 x4y1 x5y1
                   x6y0 x7y0 x6y1 x7y1
            */
            int tmppos = pos;
            /* ny = 0 produces the upper scanline, ny = 2 the lower one. */
            for(int ny = 0; ny < 4; ny += 2)
            {
                /* Note: We must use 16-bit pixels here instead of 8-bit,
                 * because the rgb+Y addition can overflow. conv_s16_u8()
                 * does the necessary clamping, which would not be done
                 * if the values were 8-bit.
                 */
                // 8 pixels for one scanline, repeated twice
                /* Note: C++ has no named constructors, so we
                 * use statement blocks here as substitutes.
                 */
                c64_MMX r0
                    = rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+0]); tmp; }) )
                           .conv_s16_u8(
                      rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+1]); tmp; }) ));
                c64_MMX r1
                    = rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+4]); tmp; }) )
                           .conv_s16_u8(
                      rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+5]); tmp; }) ));
                c64_MMX r2
                    = rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+8]); tmp; }) )
                           .conv_s16_u8(
                      rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+9]); tmp; }) ));
                c64_MMX r3
                    = rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+12]); tmp; }) )
                           .conv_s16_u8(
                      rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+13]); tmp; }) ));

                Convert32To24_32bytes(r0,r1,r2,r3, &dest[tmppos]);
                tmppos += width*3; // next line
            }
            upos += 4;
            vpos += 4;
            ypos += 8; // eight bytes for this line (and eight from next too)
            pos  += 8*3; // eight triplets generated on this line
            x    += 8; // eight yy values used on this line
        #else /* non-MMX */
            int u = src[upos] - U_ADD;
            int v = src[vpos] - V_ADD;

            /* Chroma contribution to each of the three output bytes;
             * the luma term is added per pixel below. */
            int rgb[3] =
                {
                   (VR * v         ) >> (YUV2RGB_SHIFT),
                   (VG * v + UG * u) >> (YUV2RGB_SHIFT),
                   (       + UB * u) >> (YUV2RGB_SHIFT)
                };
            
            /* Pixel offsets of the 2x2 block relative to its top-left pixel. */
            unsigned incr[4] = {0,1,width,width+1};

            /* For each pixel r of the block: doffs/yoffs/yy are set up once
             * in the for-initialiser, then n walks the three output bytes. */
            for(unsigned r=0; r<4; ++r)
                for(unsigned doffs=pos + incr[r]*3, yoffs=ypos + incr[r],
                        yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
                        n=0; n<3; ++n)
                    dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);

            upos += 1;
            vpos += 1;
            ypos += 2; // two bytes for this line (two from next line)
            pos  += 2*3; // two triplets generated on this line
            x    += 2; // two yy values used on this line
        #endif
        }
        /* Skip the lower scanline of the pair; it was handled together
         * with the upper one. */
        ypos += width;
        pos += 3*width;
    }
    #ifdef __MMX__
    MMX_clear();
    #endif
}

/* Converts a packed 4:2:2 frame (4 bytes per pixel pair) into
 * 3-byte-per-pixel output.
 *
 *   data          - input; for every pixel pair: luma at byte 0 and 2,
 *                   the value treated as U at byte 1, as V at byte 3
 *   dest          - output buffer, 3 bytes per pixel
 *   npixels       - total number of pixels (height = npixels / width)
 *   width         - pixels per scanline; 0 produces no output
 *   swap_red_blue - accepted for interface compatibility; it is not used
 *                   (it was not used before this revision either)
 *
 * Fixes relative to the previous revision:
 *  - x was advanced both in the loop header and in the loop body, so only
 *    every second pixel pair of a scanline was converted and the rest of
 *    the output was left unwritten. Each scanline now converts all of
 *    its pixel pairs.
 *  - The read/write offsets were carried from one scanline to the next
 *    inside an "omp parallel for" loop. They are now derived from the
 *    scanline index, so iterations are independent.
 *  - width == 0 no longer divides by zero.
 *
 * NOTE(review): Convert_4byte_To_YUY2Frame / Convert_2byte_To_YUY2Frame
 * store the V-weighted value at byte 1 and the U-weighted value at
 * byte 3, which is the opposite of what is read here - confirm which
 * side is intended.
 *
 * TODO: MMX optimization
 */
void Convert_YUY2To24Frame(const void* data, unsigned char* dest,
                           unsigned npixels, unsigned width, bool swap_red_blue)
{
    (void) swap_red_blue;
    if(width == 0) return;

    const unsigned char* src = (const unsigned char*) data;
    const unsigned height = npixels / width;

    /* An odd width still consumes a whole pixel pair at the end of the
     * scanline, exactly as a running offset would. */
    const unsigned pairs_per_row = width/2 + width%2;

    /*
        Y input: 16..235
        U input: 16..240
        V input: 16..240
    */
  #pragma omp parallel for
    for(unsigned y=0; y<height; ++y)
    {
        unsigned ypos = y * pairs_per_row * 4;     // read offset:  4 bytes per pair
        unsigned pos  = y * pairs_per_row * (2*3); // write offset: 2 triplets per pair

        for(unsigned pair=0; pair<pairs_per_row; ++pair)
        {
            int u = src[ypos+1] - U_ADD;
            int v = src[ypos+3] - V_ADD;

            /* Chroma contribution to each of the three output bytes;
             * shared by both pixels of the pair. */
            int rgb[3] =
                {
                   (VR * v         ) >> (YUV2RGB_SHIFT),
                   (VG * v + UG * u) >> (YUV2RGB_SHIFT),
                   (       + UB * u) >> (YUV2RGB_SHIFT)
                };

            for(unsigned r=0; r<2; ++r)
            {
                const int yy = (Y_REV * (src[ypos + r*2] - Y_ADD)) >> YUV2RGB_SHIFT;
                for(unsigned n=0; n<3; ++n)
                    dest[pos + r*3 + n] = c64::clamp_u8(rgb[n] + yy);
            }

            ypos += 4;   // four bytes consumed (y,u,y,v)
            pos  += 2*3; // two triplets generated
        }
    }
}

/***/
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_4byte_To_I420Frame instantiated with the template argument 3.
 * NOTE(review): 3 is presumably the source pixel size in bytes (24 bits,
 * per this function's name) -- confirm against the template definition.
 */
void Convert24To_I420Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_4byte_To_I420Frame<3>(data, dest, npixels, width);
}
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_4byte_To_I420Frame instantiated with the template argument 4.
 * NOTE(review): 4 is presumably the source pixel size in bytes (32 bits,
 * per this function's name) -- confirm against the template definition.
 */
void Convert32To_I420Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_4byte_To_I420Frame<4>(data, dest, npixels, width);
}
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_2byte_To_I420Frame instantiated with <10,5, 5,5, 0,5>.
 * NOTE(review): the six template arguments look like (bit offset, bit count)
 * pairs for three colour fields of a 15-bit pixel -- confirm against the
 * template definition.
 */
void Convert15To_I420Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_2byte_To_I420Frame<10,5, 5,5, 0,5>(data, dest, npixels, width);
}
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_2byte_To_I420Frame instantiated with <11,5, 5,6, 0,5>.
 * NOTE(review): the six template arguments look like (bit offset, bit count)
 * pairs for three colour fields of a 16-bit pixel -- confirm against the
 * template definition.
 */
void Convert16To_I420Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_2byte_To_I420Frame<11,5, 5,6, 0,5>(data, dest, npixels, width);
}
/***/
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_4byte_To_YUY2Frame instantiated with the template argument 3.
 * NOTE(review): 3 is presumably the source pixel size in bytes (24 bits,
 * per this function's name) -- confirm against the template definition.
 */
void Convert24To_YUY2Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_4byte_To_YUY2Frame<3>(data, dest, npixels, width);
}
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_4byte_To_YUY2Frame instantiated with the template argument 4.
 * NOTE(review): 4 is presumably the source pixel size in bytes (32 bits,
 * per this function's name) -- confirm against the template definition.
 */
void Convert32To_YUY2Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_4byte_To_YUY2Frame<4>(data, dest, npixels, width);
}
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_2byte_To_YUY2Frame instantiated with <10,5, 5,5, 0,5>.
 * NOTE(review): the six template arguments look like (bit offset, bit count)
 * pairs for three colour fields of a 15-bit pixel -- confirm against the
 * template definition.
 */
void Convert15To_YUY2Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_2byte_To_YUY2Frame<10,5, 5,5, 0,5>(data, dest, npixels, width);
}
/**
 * Thin wrapper: forwards all four arguments, unchanged, to
 * Convert_2byte_To_YUY2Frame instantiated with <11,5, 5,6, 0,5>.
 * NOTE(review): the six template arguments look like (bit offset, bit count)
 * pairs for three colour fields of a 16-bit pixel -- confirm against the
 * template definition.
 */
void Convert16To_YUY2Frame(
    const void* data,
    unsigned char* dest,
    unsigned npixels,
    unsigned width)
{
    Convert_2byte_To_YUY2Frame<11,5, 5,6, 0,5>(data, dest, npixels, width);
}

How to use the modified emulator:

An example of a working value for NESVideoSetVideoCmd():
mencoder - -o testVIDEONUMBER.avi -mc 0 \
  -ovc x264 -x264encopts crf=0:me=dia:turbo=2:frameref=4 \
  -oac mp3lame -lameopts mode=3:preset=60:aq=1 \
  -vf decimate=10:0:0:1 -nocache \
  NESVSETTINGS &> mencoder.log
nesvideos-piece will replace NESVSETTINGS with the relevant settings required to interpret the raw video and audio data properly. It will also replace VIDEONUMBER with a successive number starting from 0 each time a new AVI is started.
If your emulator reads the encoding command from the VIDEOLOG environment variable, you can start the emulator like this, for example:
 
# Output file in the current directory; VIDEONUMBER is a placeholder that
# nesvideos-piece replaces with the number of the AVI being written.
RESULTFILE="`pwd`/testVIDEONUMBER.avi"
# Video codec options handed to mencoder.
VIDEO="-ovc x264 -x264encopts crf=0:me=dia:turbo=2:frameref=4"
# Audio codec options handed to mencoder.
AUDIO="-oac mp3lame -lameopts mode=3:preset=60:aq=1"
# Remaining mencoder options.
OPT="-vf decimate=10:0:0:1 -nocache -mc 0"
# Complete encoding command; NESVSETTINGS is a placeholder that
# nesvideos-piece replaces with the raw video/audio settings.
VIDEOCMD="mencoder - -o $RESULTFILE $OPT $VIDEO $AUDIO NESVSETTINGS"
# Start the emulator with the encoding command in its VIDEOLOG environment variable.
VIDEOLOG="$VIDEOCMD" ./emulator --autodemo moviefile.mmv romfile.sms

Using a remote computer for running mencoder

In my setup, I always encode the AVI on a remote computer, separate from the one where I run the emulator. This has two benefits:
To accomplish this, I modify the encoding script as follows:
# Remote case only: command prefix used below to run a command on the
# encoding host (PASSWORD and HOSTNAME are presumably placeholders to fill in).
REXEC="rexec -pPASSWORD HOSTNAME"
# Marker file; the wrapped VIDEOCMD below creates it with "touch" when it is run.
FLAGFILE="`pwd`/s.status"
# These commands are common for both local and remote cases:
RESULTFILE="`pwd`/testVIDEONUMBER.avi"
VIDEO="-ovc x264 -x264encopts crf=0:me=dia:turbo=2:frameref=4"
AUDIO="-oac mp3lame -lameopts mode=3:preset=60:aq=1"
OPT="-vf decimate=10:0:0:1 -nocache -mc 0"
VIDEOCMD="mencoder - -o $RESULTFILE $OPT $VIDEO $AUDIO NESVSETTINGS"
# This is again for the remote case:
# COM is run on the local end of the pipe and DECOM on the remote end.
COM="lzop -F4"
DECOM="lzop -Fd"
# Remove any existing flag file and s.log before starting.
rm -f "$FLAGFILE" s.log
# Wrap the encoding command: create the flag file, then pipe the stream
# through $COM locally and through $REXEC to $DECOM and mencoder remotely.
VIDEOCMD="touch '$FLAGFILE';$COM | $REXEC $DECOM \| $VIDEOCMD"
# In the background: wait until the flag file exists, then feed the local
# s.log into a file of the same path on the remote host.
(while [ ! -f "$FLAGFILE" ];do sleep 0.25;done
 $REXEC "cat > `pwd`/s.log" < s.log) &
# This is again common for both local and remote cases:
VIDEOLOG="$VIDEOCMD" ./emulator --autodemo moviefile.mmv romfile.sms
# lastly, cleanup (remote only)
rm -f "$FLAGFILE" s.log
The s.log file is a FIFO through which nesvideos-piece.cc passes the audio to mencoder. Making a FIFO work through a remote connection requires some extra work:
Optimizations:

HomePages/Bisqwit/Source/NesvideosPiece last edited by Flygon on 4/2/2010 10:43 AM
Page History Latest diff List referrers View Source