/* automatically generated by fb-xlat-auto.sh, do not edit! */

/*
 * Copyright (c) 2003, 2005 Matt Fredette
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Matt Fredette.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* record the RCS identification string of the generator script
   (fb-xlat-auto.sh) that produced this file.  NOTE(review):
   _TME_RCSID is defined outside this file; presumably it embeds the
   string in the compiled object - confirm against its definition: */
_TME_RCSID("$Id: fb-xlat-auto.sh,v 1.12 2009/08/30 21:51:53 fredette Exp $");

/* the central feature of these translation functions is the "bit
   FIFO", a first-in, first-out stream of bits.  source bit FIFOs are
   used to read pixel bits out of source images, and destination bit
   FIFOs are used to write pixel bits into destination images.

   a bit FIFO has a visible part and an invisible part.

   the visible part of a bit FIFO is 32 bits wide, meaning the
   translation code can read (for a source bit FIFO) or write (for a
   destination bit FIFO) up to 32 bits of pixel data before a bit
   FIFO shift.

   the invisible part of a source bit FIFO contains pixel bits that
   have already been read from the source image memory, but that have
   yet to be shifted in to the visible part.

   the invisible part of a destination bit FIFO contains the pixel
   bits that have been shifted out of the visible part, but that have
   yet to be written to the destination image memory.

   depending on various attributes of an image format, it may be
   possible for the translation code to always read or write the
   entire 32 visible bits of a bit FIFO at a time, and as a result
   always shift the bit FIFO 32 bits at a time.  this is desirable
   because it minimizes bit FIFO shifts.

   when this does happen, it may also be the case that the visible
   part of the bit FIFO always perfectly corresponds to an aligned
   32-bit word in the image buffer.

   this is the optimal situation, as it makes it unnecessary to track
   the invisible part in C local variables at all - each 32-bit FIFO
   shift of a source bit FIFO just reads the next 32-bit word directly
   into the visible part, and each 32-bit FIFO shift of a destination
   bit FIFO just writes the next 32-bit word directly out of the
   visible part.

   within the visible part of a bit FIFO, which bits belong to which
   pixels depends on the "order" of the image format.  most-significant
   (big-endian) images have earlier (more to the left on the screen)
   pixels in more-significant bit positions.  least-significant
   (little-endian) images have earlier pixels in less-significant bit
   positions.

   bit significance *within* a pixel never depends on image order.
   once the bits belonging to a pixel have been identified according
   to image order, the least significant bit in that pixel's value is
   always the bit with the least absolute value.

   some pictures may help explain this better.  each picture shows the
   32 bit visible part of a bit FIFO, with the bits marked from 0
   (least significant) to 31 (most significant).  all of these
   examples are for a two-bit-deep/two-bits-per-pixel image, and in
   the visible part sixteen pixels are numbered.  lesser numbered
   pixels are earlier (more to the left on the screen).  some of the
   invisible part is also shown, along with the direction that the bit
   FIFO shifts in:

   a source bit FIFO for a little-endian source image with two bits
   per pixel looks like:

                        |
                        | 31 30 29 28     7  6  5  4  3  2  1  0
      --+--+--+--+--+--+|+--+--+--+--+-  --+--+--+--+--+--+--+--+
    .. p18 | p17 | p16 ||| p15 | p14 | .. p3  | p2  | p1  | p0  |
      --+--+--+--+--+--+|+--+--+--+--+-  --+--+--+--+--+--+--+--+
                        |
   invisible part       |             visible part

          shift -> shift -> shift -> shift -> shift ->

   a source bit FIFO for a big-endian source image with two bits per
   pixel looks like:

                                            |
     31 30 29 28 27 26 25 24  4  3  2  1  0 |
    +--+--+--+--+--+--+--+--  -+--+--+--+--+|+--+--+--+--+--+--
    | p0  | p1  | p2  | p3  .. | p14 | p15 ||| p16 | p17 | p18 ..
    +--+--+--+--+--+--+--+--  -+--+--+--+--+|+--+--+--+--+--+--
                                            |
               visible part                 |    invisible part

        <- shift <- shift <- shift <- shift <- shift

   a destination bit FIFO for a little-endian destination image with
   two bits per pixel looks like:

                                            |
     31 30 29 28 27 26 25 24  4  3  2  1  0 |
    +--+--+--+--+--+--+--+--  -+--+--+--+--+|+--+--+--+--+--+--
    | p23 | p22 | p21 | p20 .. | p9  | p8  ||| p7  | p6  | p5  ..
    +--+--+--+--+--+--+--+--  -+--+--+--+--+|+--+--+--+--+--+--
                                            |
               visible part                 |    invisible part

          shift -> shift -> shift -> shift -> shift ->

   a destination bit FIFO for a big-endian destination image with
   two bits per pixel looks like:

                        |
                        | 31 30 29 28     7  6  5  4  3  2  1  0
      --+--+--+--+--+--+|+--+--+--+--+-  --+--+--+--+--+--+--+--+
    .. p5  | p6  | p7  ||| p8  | p9  | .. p20 | p21 | p22 | p23 |
      --+--+--+--+--+--+|+--+--+--+--+-  --+--+--+--+--+--+--+--+
                        |
   invisible part       |             visible part

        <- shift <- shift <- shift <- shift <- shift

  each translation function has at least one source bit FIFO and at
  least one destination bit FIFO.  the general idea is to read source
  pixel value(s) from the source image, map those source pixel
  value(s) somehow into destination pixel value(s), and write those
  destination pixel value(s) to the destination image.

  translation functions that do not scale the image have exactly one
  source bit FIFO and exactly one destination bit FIFO.  one pixel
  read from the source bit FIFO becomes one pixel written to the
  destination bit FIFO.

  translation functions that halve the image size have two source bit
  FIFOs and one destination bit FIFO.  one source bit FIFO sources
  bits from an even-numbered scanline, and the other sources bits from
  an odd-numbered scanline.  four pixels read from the source bit
  FIFOs - two from each source bit FIFO, making a 2x2 square - become
  one pixel written to the destination bit FIFO.

  translation functions that double the image size have one source
  bit FIFO and two destination bit FIFOs.  one destination bit FIFO
  sinks bits for an even-numbered scanline, and the other sinks bits
  for an odd-numbered scanline.  one pixel read from the source bit
  FIFO becomes four pixels written to the destination bit FIFOs -
  two to each destination bit FIFO, making a 2x2 square.

  translation functions don't necessarily always translate the entire
  source image into the entire destination image.  instead, the buffer
  holding the current source image is expected to be twice as large as
  necessary (plus some overhead), with the second half of the buffer
  holding a copy of the last translated ("old") source image.

  before setting up and translating pixels using bit FIFOs, a
  translation function treats the current and old source images as
  an array of aligned 32-bit words and compares them.  if it finds no
  32-bit word that has changed, no translation is done and the
  function returns.

  otherwise, initial source pixel x and y coordinates are derived from
  the offset of the 32-bit word that failed comparison, and the source
  primary bit FIFO is primed.  if this translation function halves the
  image size, the source secondary bit FIFO is primed from the same x
  coordinate on the "other" (think y ^ 1) scanline.

  then, initial destination pixel x and y coordinates are derived from
  the initial source x and y coordinates, and the destination primary
  bit FIFO is primed.  if this translation function doubles the image
  size, the destination secondary bit FIFO is primed from the same x
  coordinate on the "other" (think y ^ 1) scanline.

  as mentioned previously, the buffer holding the current source image
  is expected to be twice as large as necessary (plus some overhead),
  with the second half of the buffer holding a copy of the last
  translated ("old") source image.

  this "overhead" is approximately two extra scanlines worth of data,
  that is initialized to all-bits-zero and must always remain zero.
  this extra data is present at the end of both the current ("new")
  and last translated ("old") source images.

  these extra, always-blank scanlines guarantee that the pixel
  translation loop terminates.  the pixel translation loop *never*
  checks for the end of the image buffer.  instead, it terminates only
  after it has read in TME_FB_XLAT_RUN consecutive aligned 32-bit
  words that have *not* changed between the new and old source images.
  this small amount of extra memory overhead simplifies the pixel
  translation loop, because it doesn't have to worry about going past
  the end of the actual image.
*/


/* macros: */

/* map a 32-bit aligned pointer into the current source image to the
   matching 32-bit aligned pointer in the last translated ("old")
   source image.  the old source image is stored src_bypb bytes after
   the current one, so the result is simply the address src_bypb
   bytes past raw, as a pointer to a writable 32-bit word.  src_bypb
   must be visible wherever this macro is used: */
#define TME_FB_XLAT_SRC_OLD(raw)				\
  ((tme_uint32_t *) &((tme_uint8_t *) (raw))[src_bypb])

/* TME_FB_XLAT_RUN is the number of consecutive unchanged 32-bit
   source words that ends a burst of pixel translation.

   when the fast, aligned 32-bit word comparison loop finds a word
   that has changed in the source image, pixels are translated until
   TME_FB_XLAT_RUN consecutive aligned 32-bit words are processed that
   have *not* changed in the source image, at which point the fast
   comparison loop resumes.

   the idea is that after you've started translating pixels, once the
   FIFO shift operation has read TME_FB_XLAT_RUN consecutive raw,
   unchanged 32-bit words, all of the pixels from previous, changed
   32-bit words have been translated and shifted out of the source bit
   FIFO(s), and all bits remaining in the source bit FIFO(s) are for
   pixels in those unchanged 32-bit words.  since the pixels are
   unchanged, pixel translation can stop, and the entire state of the
   source bit FIFO(s) can be discarded.

   so, each time a raw, changed 32-bit word is read (this is done by
   TME_FB_XLAT_SHIFT_SRC), xlat_run is reloaded with TME_FB_XLAT_RUN,
   and each time 32 bits worth of source image pixels are processed,
   it is decremented.  when it reaches zero, the source bit FIFO(s)
   are discarded, the destination bit FIFO(s) are flushed, the pixel
   translation loop breaks and the fast comparison loop continues: */
#define TME_FB_XLAT_RUN (2)

/* this shifts a source bit FIFO by shift bits, refilling it from the
   source image as needed.

   unaligned is nonzero iff the FIFO may not be 32-bit aligned.  fifo
   is the visible part of the FIFO, next is the invisible part, and
   bits is the total number of bits in both parts (next and bits are
   only touched when unaligned is nonzero).  raw points to the next
   32-bit word to read from the source image, and order is the image
   order, either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE.

   every word read from the source image is compared to the
   corresponding word of the old source image; a word that differs is
   copied into the old source image and reloads xlat_run with
   TME_FB_XLAT_RUN.  fifo, next, bits and raw must be lvalues, and
   xlat_run must be in scope: */
#define TME_FB_XLAT_SHIFT_SRC(unaligned, fifo, next, bits, shift, raw, order)\
do {								\
								\
  /* if the source FIFO is always 32-bit aligned: */		\
  if (!(unaligned)) {						\
								\
    /* only whole-word shifts are possible: */			\
    assert((shift) == 32);					\
								\
    /* the next raw word becomes the entire visible part.	\
       note whether it changed since the last translation: */	\
    fifo = *raw;						\
    if (*TME_FB_XLAT_SRC_OLD(raw) != fifo) {			\
      *TME_FB_XLAT_SRC_OLD(raw) = fifo;				\
      xlat_run = TME_FB_XLAT_RUN;				\
    }								\
    raw++;							\
								\
    /* convert the word from image order to host order: */	\
    if (order == TME_ENDIAN_BIG) {				\
      fifo = tme_betoh_u32(fifo);				\
    }								\
    else {							\
      fifo = tme_letoh_u32(fifo);				\
    }								\
  }								\
								\
  /* otherwise, the source FIFO may not be 32-bit aligned: */	\
  else {							\
								\
    /* a shift moves between 1 and 32 bits, and the FIFO	\
       must already hold more than 32 bits: */			\
    assert ((shift) >= 1 && (shift) <= 32);			\
    assert (bits > 32);						\
								\
    /* a partial shift drops bits from the visible part and	\
       pulls replacements in from the invisible part: */	\
    if ((shift) != 32) {					\
      if (order == TME_ENDIAN_BIG) {				\
        fifo = (fifo << ((shift) & 31)) | (next >> (32 - (shift)));\
        next <<= ((shift) & 31);				\
      }								\
      else {							\
        fifo = (fifo >> ((shift) & 31)) | (next << (32 - (shift)));\
        next >>= ((shift) & 31);				\
      }								\
    }								\
								\
    /* a whole-word shift replaces the visible part with	\
       the invisible part: */					\
    else {							\
      fifo = next;						\
    }								\
    bits -= (shift);						\
								\
    /* if the invisible part has been used up, read a new	\
       32-bit word into it: */					\
    if (bits <= 32) {						\
      next = *raw;						\
      if (*TME_FB_XLAT_SRC_OLD(raw) != next) {			\
        *TME_FB_XLAT_SRC_OLD(raw) = next;			\
        xlat_run = TME_FB_XLAT_RUN;				\
      }								\
      raw++;							\
								\
      /* convert the word to host order and, if the visible	\
         part was left with fewer than 32 bits, top it up	\
         from the word just loaded: */				\
      if (order == TME_ENDIAN_BIG) {				\
        next = tme_betoh_u32(next);				\
        if (bits < 32) {					\
          fifo |= (next >> bits);				\
          next <<= (32 - bits);					\
        }							\
      }								\
      else {							\
        next = tme_letoh_u32(next);				\
        if (bits < 32) {					\
          fifo |= (next << bits);				\
          next >>= (32 - bits);					\
        }							\
      }								\
								\
      /* the FIFO now holds 32 more bits: */			\
      bits += 32;						\
    }								\
  }								\
} while (/* CONSTCOND */ 0)

/* this shifts a destination bit FIFO by shift bits, writing each
   completed 32-bit word to the destination image.

   unaligned is nonzero iff the FIFO may not be 32-bit aligned.  fifo
   is the visible part of the FIFO, next is the invisible part, and
   bits is the number of bits in the invisible part (next and bits
   are only touched when unaligned is nonzero).  raw points to the
   next 32-bit word to write in the destination image, and order is
   the image order, either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE.

   the visible part is always cleared afterwards, ready for the
   translation code to write more pixel bits.  fifo, next, bits and
   raw must be lvalues.

   when the invisible part is empty (bits is zero) the visible part
   is moved over whole.  this case is handled explicitly because
   shifting a 32-bit value by 32 is undefined behavior in C: */
#define TME_FB_XLAT_SHIFT_DST(unaligned, fifo, next, bits, shift, raw, order)\
do {								\
								\
  /* if the destination FIFO may not be 32-bit aligned: */      \
  if (unaligned) {						\
								\
    /* we must be shifting between 1 and 32 bits: */		\
    assert ((shift) >= 1 && (shift) <= 32);			\
								\
    /* the FIFO must have fewer than 32 bits in it: */	        \
    assert (bits < 32);						\
								\
    /* shift the FIFO.  if the invisible part is empty, the	\
       whole visible part fits into it and nothing is left	\
       over: */							\
    if (bits == 0) {						\
      next |= fifo;						\
      fifo = 0;							\
    }								\
								\
    /* otherwise, append as much of the visible part as fits	\
       after the bits already in the invisible part, and keep	\
       the bits that did not fit in fifo: */			\
    else if (order == TME_ENDIAN_BIG) {				\
      next |= (fifo >> bits);					\
      fifo <<= (32 - bits);					\
    }								\
    else {							\
      next |= (fifo << bits);					\
      fifo >>= (32 - bits);					\
    }								\
    bits += (shift);						\
								\
    /* if we have a completed 32-bit word to write: */		\
    if (bits >= 32) {						\
      *(raw++) = (order == TME_ENDIAN_BIG			\
                  ? tme_htobe_u32(next)				\
                  : tme_htole_u32(next));			\
      bits -= 32;						\
      assert(bits != 0 || fifo == 0);				\
								\
      /* the bits that did not fit start the next word: */	\
      next = fifo;						\
    }								\
  }								\
								\
  /* the destination FIFO is always 32-bit aligned: */		\
  else {							\
								\
    /* we must be shifting exactly 32 bits: */                  \
    assert((shift) == 32);                                      \
                                                                \
    /* store the next 32-bit word: */                           \
    *(raw++) = (order == TME_ENDIAN_BIG                         \
                ? tme_htobe_u32(fifo)                           \
                : tme_htole_u32(fifo));                         \
                                                                \
  }								\
                                                                \
  /* clear the writable part of the FIFO: */			\
  fifo = 0;							\
} while (/* CONSTCOND */ 0)

/* TME_FB_XLAT_MAP_LINEAR_SCALE gives the factor needed to scale a
   masked value up or down to a given size in bits.  for example, if
   a value's mask is 0xf800 (a five bit mask), and the value needs to
   be scaled up to seven bits, this gives a factor of four.  if a
   value's mask is 0x7e0 (a six bit mask), and the value needs to be
   scaled down to three bits, this gives a factor of eight.  when
   both base masks are the same, the factor is one.

   the helper _TME_FB_XLAT_MAP_LINEAR_SCALE gives the bits that are
   set in exactly one of the two base masks, which is zero iff the
   base masks are equal: */
#define _TME_FB_XLAT_MAP_LINEAR_SCALE(mask_in, mask_out)	\
  (TME_FB_XLAT_MAP_BASE_MASK(mask_out)				\
   ^ TME_FB_XLAT_MAP_BASE_MASK(mask_in))
#define TME_FB_XLAT_MAP_LINEAR_SCALE(mask_in, mask_out)	\
  ((_TME_FB_XLAT_MAP_LINEAR_SCALE(mask_in, mask_out) == 0)	\
   ? 1								\
   : (1								\
      + TME_FB_XLAT_MAP_BASE_MASK(_TME_FB_XLAT_MAP_LINEAR_SCALE(mask_in, mask_out))))

/* this linearly maps the subfield of value selected by mask_in into
   the subfield selected by mask_out.  the subfield is extracted,
   scaled by TME_FB_XLAT_MAP_LINEAR_SCALE (divided when scaling down
   or keeping the size, multiplied when scaling up), and then shifted
   into the position given by mask_out.

   value is parenthesized wherever it is used directly, so that an
   expression argument (for example a | b) is treated as a single
   operand: */
#define _TME_FB_XLAT_MAP_LINEAR(value, mask_in, mask_out)	\
								\
  /* if the value does not need to be scaled up: */		\
  (((TME_FB_XLAT_MAP_BASE_MASK(mask_out)			\
     <= TME_FB_XLAT_MAP_BASE_MASK(mask_in))			\
    ?								\
								\
    /* extract the value and scale it down: */			\
    (TME_FIELD_MASK_EXTRACTU(value, mask_in)			\
     / TME_FB_XLAT_MAP_LINEAR_SCALE(mask_in, mask_out))		\
								\
    /* otherwise, the value needs to be scaled up: */		\
    :								\
								\
    /* extract the value: */					\
    ((TME_FIELD_MASK_EXTRACTU(value, mask_in)			\
								\
      /* scale it up: */					\
      * TME_FB_XLAT_MAP_LINEAR_SCALE(mask_in, mask_out))	\
								\
     /* if the least significant bit of the value is set, add	\
        in the scale minus one.  this makes the linear mapping	\
        at least cover the entire range: */			\
     + ((((value)						\
          / _TME_FIELD_MASK_FACTOR(mask_in))			\
         & 1)							\
        * (TME_FB_XLAT_MAP_LINEAR_SCALE(mask_in, mask_out)	\
           - 1))))						\
								\
   /* finally, shift the value into position: */		\
   * _TME_FIELD_MASK_FACTOR(mask_out))

/* this looks up value in the intensity table index and shifts the
   result into the position given by mask_out.  the table holds
   16-bit entries when the base mask of mask_out is wider than eight
   bits, and 8-bit entries otherwise: */
#define _TME_FB_XLAT_MAP_INDEX(value, mask_out, index)		\
								\
  /* the factor that shifts the intensity into position: */	\
  (_TME_FIELD_MASK_FACTOR(mask_out)				\
								\
   /* intensities are either stored as 16 or 8 bits: */		\
   * ((TME_FB_XLAT_MAP_BASE_MASK(mask_out) > 0xff)		\
      ? ((const tme_uint16_t *) (index))[(value)]		\
      : ((const tme_uint8_t *) (index))[(value)]))

/* this maps one subfield or intensity value into another subfield or
   intensity value.  when indexed is nonzero, the subfield of value
   selected by mask_in is extracted and looked up in index; otherwise
   value is mapped linearly from mask_in to mask_out: */
#define TME_FB_XLAT_MAP(value, mask_in, mask_out, indexed, index)\
								\
  /* do the index mapping or the linear mapping: */		\
  ((indexed)							\
   ? _TME_FB_XLAT_MAP_INDEX(TME_FIELD_MASK_EXTRACTU(value, mask_in), mask_out, index)\
   : _TME_FB_XLAT_MAP_LINEAR(value, mask_in, mask_out))

/* this translates frame buffer contents from this source format:
     1152x900
     1 bit deep, 1 bit per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     24 bits deep, 32 bits per pixel, 0 pixels skipped, 32-bit scanline padding, LSB-first, linearly mapped pixels, a g mask of 0xff00, a r mask of 0xff0000, a b mask of 0xff
*/
static int
tme_fb_xlat0(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (1)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (1)
#define dst_bipp (32)

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
#define dst_skipx (0)

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
#define dst_pad (32)

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
#define dst_bypl (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8)

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
#define dst_packed ((dst_width * dst_bipp) == (dst_bypl * 8))

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.  (the macro below has no term for this case;
       presumably it is handled when this code is generated --
       TODO confirm.)

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if the destination bytes-per-line value is not a multiple of
       four (the (dst_bypl % 4) term).  in this case, a scanline need
       not begin on an aligned 32-bit word in the destination buffer.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 for every Z iff
     (dst_bipp / src_bipp) is an integer, or, equivalently:

       (dst_bipp % src_bipp) == 0

     the last two terms of the macro test these two requirements:
     ((dst_skipx * dst_bipp) % 32) and (dst_bipp % src_bipp).
  */
#define dst_fifo0_may_be_unaligned (src_fifo0_may_be_unaligned || !dst_packed || (dst_bipp == 24) || (dst_bypl % 4) || ((dst_skipx * dst_bipp) % 32) || (dst_bipp % src_bipp))

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24, calculate the number of
           bytes of any split pixel that lie *before* the original
           src_raw0, and subtract this from src_off, to leave src_off as
           the byte offset into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

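      /* prime the primary destination FIFO with the bits already in
         the destination word that precede the first pixel we write.
         NOTE(review): the selector below tests src_order, while the
         rest of the destination FIFO handling here uses dst_order;
         confirm whether dst_order was intended: */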
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 1, so 32 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* since dst_bipp is known at code-generation time, the pixel
       translation loop is unrolled to translate all destination
       pixels in the 32-bit visible part of the destination bit
       FIFO(s) before shifting.

       in this case, dst_bipp is known to be 32, so 1 pixel will
       be written into the destination bit FIFO(s) before shifting,
       and when the destination bit FIFO(s) are shifted, they are
       shifted 32 bits at a time: */

    /* src_unroll = 32, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #4 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (4 * src_bipp)))
              : (4 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #5 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (5 * src_bipp)))
              : (5 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #6 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (6 * src_bipp)))
              : (6 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #7 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (7 * src_bipp)))
              : (7 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #8 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (8 * src_bipp)))
              : (8 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #9 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (9 * src_bipp)))
              : (9 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #10 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (10 * src_bipp)))
              : (10 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #11 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (11 * src_bipp)))
              : (11 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #12 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (12 * src_bipp)))
              : (12 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #13 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (13 * src_bipp)))
              : (13 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #14 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (14 * src_bipp)))
              : (14 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #15 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (15 * src_bipp)))
              : (15 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #16 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (16 * src_bipp)))
              : (16 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #17 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (17 * src_bipp)))
              : (17 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #18 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (18 * src_bipp)))
              : (18 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #19 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (19 * src_bipp)))
              : (19 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #20 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (20 * src_bipp)))
              : (20 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #21 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (21 * src_bipp)))
              : (21 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #22 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (22 * src_bipp)))
              : (22 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #23 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (23 * src_bipp)))
              : (23 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #24 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (24 * src_bipp)))
              : (24 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #25 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (25 * src_bipp)))
              : (25 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #26 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (26 * src_bipp)))
              : (26 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #27 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (27 * src_bipp)))
              : (27 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #28 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (28 * src_bipp)))
              : (28 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #29 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (29 * src_bipp)))
              : (29 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #30 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (30 * src_bipp)))
              : (30 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #31 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (31 * src_bipp)))
              : (31 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there may be
       bits left in the FIFO that we need to flush.  those bits
       cover only part of the 32-bit word at dst_raw0, so we read
       that word, keep the part of it that the pending FIFO bits
       do not cover, merge the two, and write the merged word
       back in the destination byte order: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {

      /* read the word currently in the destination buffer: */
      dst_fifo0 = *dst_raw0;

      /* merge the bits of that word that must be preserved into
         dst_fifo0_next, and convert the result from host order
         to the destination order: */
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }

      /* store the merged word.  storing dst_fifo0 here would
         only write back the word we just read and silently
         drop the pending FIFO bits: */
      *dst_raw0 = dst_fifo0_next;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef dst_bipp
#undef src_skipx
#undef dst_skipx
#undef src_pad
#undef dst_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef dst_bypl
#undef src_packed
#undef dst_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     1 bit deep, 1 bit per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     32 bits deep, 32 bits per pixel, 0 pixels skipped, 32-bit scanline padding, LSB-first, linearly mapped pixels, a g mask of 0xff00ff00, a r mask of 0xffff0000, a b mask of 0xff0000ff
*/
static int
tme_fb_xlat1(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (1)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (1)
#define dst_bipp (32)

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
#define dst_skipx (0)

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
#define dst_pad (32)

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
#define dst_bypl (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8)

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
#define dst_packed ((dst_width * dst_bipp) == (dst_bypl * 8))

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned (src_fifo0_may_be_unaligned || !dst_packed || (dst_bipp == 24) || (dst_bypl % 4) || ((dst_skipx * dst_bipp) % 32) || (dst_bipp % src_bipp))

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

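      /* prime the primary destination FIFO.  NOTE(review): the byte
         order test below uses src_order, while the pixel stores and
         TME_FB_XLAT_SHIFT_DST calls that follow use dst_order -
         confirm that this is intended: */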
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 1, so 32 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* since dst_bipp is known at code-generation time, the pixel
       translation loop is unrolled to translate all destination
       pixels in the 32-bit visible part of the destination bit
       FIFO(s) before shifting.

       in this case, dst_bipp is known to be 32, so 1 pixel will
       be written into the destination bit FIFO(s) before shifting,
       and when the destination bit FIFO(s) are shifted, they are
       shifted 32 bits at a time: */

    /* src_unroll = 32, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #4 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (4 * src_bipp)))
              : (4 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #5 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (5 * src_bipp)))
              : (5 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #6 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (6 * src_bipp)))
              : (6 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #7: translate one pixel, read from slot 7 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (8 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (7 * src_bipp)))
              : (7 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #8: translate one pixel, read from slot 8 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (9 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (8 * src_bipp)))
              : (8 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #9: translate one pixel, read from slot 9 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (10 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (9 * src_bipp)))
              : (9 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #10: translate one pixel, read from slot 10 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (11 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (10 * src_bipp)))
              : (10 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #11: translate one pixel, read from slot 11 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (12 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (11 * src_bipp)))
              : (11 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #12: translate one pixel, read from slot 12 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (13 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (12 * src_bipp)))
              : (12 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #13: translate one pixel, read from slot 13 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (14 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (13 * src_bipp)))
              : (13 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #14: translate one pixel, read from slot 14 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (15 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (14 * src_bipp)))
              : (14 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #15: translate one pixel, read from slot 15 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (16 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (15 * src_bipp)))
              : (15 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #16: translate one pixel, read from slot 16 of the
         source primary FIFO, into the destination primary FIFO.
         nothing in this iteration shifts the source FIFO on a
         per-pixel basis; the slot is selected purely by the
         shift count below: */

      /* get a pixel from the source primary FIFO.  for a
         big-endian source the shift count puts slot 0 at bit
         (32 - src_bipp) and later slots below it; for a
         little-endian source slot 0 is at bit 0 and later slots
         are above it.  any bits above the pixel are still
         present here, and are only masked off at the lookup
         below.
         NOTE(review): the big-endian shift count is negative
         unless (17 * src_bipp) <= 32; presumably this iteration
         is only generated for such source depths - confirm: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (16 * src_bipp)))
              : (16 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the pixel is masked with src_mask
         and used as an index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because
         && short-circuits, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts at
           most 32 bits, and the loop update subtracts the same
           amount that was just shifted: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         slot index in the shift count is zero for both byte
         orders, so the pixel always goes into slot 0: the bits
         starting at (32 - dst_bipp) for a big-endian
         destination, or at bit 0 for a little-endian one: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.
         as with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift, at most 32 bits per
           pass: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by a full 32 bits,
         now that this iteration's pixel has been put into it.
         NOTE(review): one 32-bit shift per pixel suggests that
         dst_bipp is 32 in this function - confirm: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #17 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (17 * src_bipp)))
              : (17 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #18 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (18 * src_bipp)))
              : (18 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #19 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (19 * src_bipp)))
              : (19 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #20 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (20 * src_bipp)))
              : (20 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #21 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (21 * src_bipp)))
              : (21 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #22 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (22 * src_bipp)))
              : (22 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #23 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (23 * src_bipp)))
              : (23 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #24 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (24 * src_bipp)))
              : (24 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #25 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (25 * src_bipp)))
              : (25 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #26 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (26 * src_bipp)))
              : (26 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #27 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (27 * src_bipp)))
              : (27 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #28 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (28 * src_bipp)))
              : (28 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #29 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (29 * src_bipp)))
              : (29 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #30 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (30 * src_bipp)))
              : (30 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #31 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (31 * src_bipp)))
              : (31 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef dst_bipp
#undef src_skipx
#undef dst_skipx
#undef src_pad
#undef dst_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef dst_bypl
#undef src_packed
#undef dst_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     1 bit deep, 1 bit per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     any depth, any bits per pixel, any number of pixels skipped, any scanline padding, MSB-first, any pixel mapping, any g mask, any r mask, any b mask
*/
static int
tme_fb_xlat2(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (1)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (1)
  const unsigned int dst_bipp = dst->tme_fb_connection_bits_per_pixel;

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
  const unsigned int dst_skipx = dst->tme_fb_connection_skipx;

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
  const unsigned int dst_pad = dst->tme_fb_connection_scanline_pad;

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_BIG)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
  const unsigned int dst_bypl = (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8);

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
  const unsigned int dst_packed = ((dst_width * dst_bipp) == (dst_bypl * 8));

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned TRUE

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

      /* prime the primary destination FIFO: */
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 1, so 32 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* src_unroll = 32, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #4 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (4 * src_bipp)))
              : (4 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #5 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (5 * src_bipp)))
              : (5 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #6 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (6 * src_bipp)))
              : (6 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #7 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (7 * src_bipp)))
              : (7 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #8 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (8 * src_bipp)))
              : (8 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #9 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (9 * src_bipp)))
              : (9 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #10 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (10 * src_bipp)))
              : (10 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #11 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (11 * src_bipp)))
              : (11 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #12 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (12 * src_bipp)))
              : (12 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #13 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (13 * src_bipp)))
              : (13 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #14 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (14 * src_bipp)))
              : (14 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #15 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (15 * src_bipp)))
              : (15 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #16 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (16 * src_bipp)))
              : (16 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #17 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (17 * src_bipp)))
              : (17 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #18 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (18 * src_bipp)))
              : (18 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #19 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (19 * src_bipp)))
              : (19 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #20 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (20 * src_bipp)))
              : (20 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #21 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (21 * src_bipp)))
              : (21 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #22 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (22 * src_bipp)))
              : (22 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #23 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (23 * src_bipp)))
              : (23 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #24 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (24 * src_bipp)))
              : (24 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #25 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (25 * src_bipp)))
              : (25 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #26 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (26 * src_bipp)))
              : (26 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #27 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (27 * src_bipp)))
              : (27 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #28 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (28 * src_bipp)))
              : (28 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #29 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (29 * src_bipp)))
              : (29 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #30 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (30 * src_bipp)))
              : (30 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #31 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (31 * src_bipp)))
              : (31 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef src_skipx
#undef src_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef src_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     1 bit deep, 1 bit per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     any depth, any bits per pixel, any number of pixels skipped, any scanline padding, LSB-first, any pixel mapping, any g mask, any r mask, any b mask
*/
static int
tme_fb_xlat3(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (1)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (1)
  const unsigned int dst_bipp = dst->tme_fb_connection_bits_per_pixel;

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
  const unsigned int dst_skipx = dst->tme_fb_connection_skipx;

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
  const unsigned int dst_pad = dst->tme_fb_connection_scanline_pad;

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
  const unsigned int dst_bypl = (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8);

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
  const unsigned int dst_packed = ((dst_width * dst_bipp) == (dst_bypl * 8));

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned TRUE

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO.  the visible part
       starts out empty; translated pixels are ORed into it by the
       pixel translation loop: */
    /* NOTE(review): dst_x and dst_y are read below, but nothing in the
       visible part of this loop assigns them after src_x and src_y
       are calculated.  the comment on dst_fifo0_may_be_unaligned says
       that the initial dst_x = src_x -- confirm that they really do
       track the source coordinates here: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO.
         this is the number of bits in the containing aligned 32-bit
         word that come before the destination pixel: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0 to the aligned 32-bit word that contains the
         destination pixel: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

      /* prime the primary destination FIFO with the bits already in
         the destination buffer ahead of the destination pixel
         (presumably so that they survive when the word is written
         back).  this word comes from the destination buffer, so it
         must be byte-swapped and masked according to dst_order, the
         same order used when pixels are put into the FIFO, and *not*
         src_order.  in big-endian order the leading bits are the
         most-significant dst_fifo0_bits bits of the word, in
         little-endian order they are the least-significant ones.
         the test for a nonzero dst_fifo0_bits also keeps the shift
         count below 32: */
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (dst_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned, and dst_off
       is a byte offset (not a bit offset) into the destination
       buffer: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 1, so 32 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* src_unroll = 32, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #4 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (4 * src_bipp)))
              : (4 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #5 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (5 * src_bipp)))
              : (5 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #6 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (6 * src_bipp)))
              : (6 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #7 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (7 * src_bipp)))
              : (7 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #8 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (8 * src_bipp)))
              : (8 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #9 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (9 * src_bipp)))
              : (9 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #10 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (10 * src_bipp)))
              : (10 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #11 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (11 * src_bipp)))
              : (11 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #12 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (12 * src_bipp)))
              : (12 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #13 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (13 * src_bipp)))
              : (13 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #14 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (14 * src_bipp)))
              : (14 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #15 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (15 * src_bipp)))
              : (15 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #16 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (16 * src_bipp)))
              : (16 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #17 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (17 * src_bipp)))
              : (17 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #18 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (18 * src_bipp)))
              : (18 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #19 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (19 * src_bipp)))
              : (19 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #20 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (20 * src_bipp)))
              : (20 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #21 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (21 * src_bipp)))
              : (21 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #22 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (22 * src_bipp)))
              : (22 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #23 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (23 * src_bipp)))
              : (23 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #24 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (24 * src_bipp)))
              : (24 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #25 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (25 * src_bipp)))
              : (25 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #26 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (26 * src_bipp)))
              : (26 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #27 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (27 * src_bipp)))
              : (27 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #28 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (28 * src_bipp)))
              : (28 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #29 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (29 * src_bipp)))
              : (29 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #30 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (30 * src_bipp)))
              : (30 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #31 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (31 * src_bipp)))
              : (31 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef src_skipx
#undef src_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef src_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     8 bits deep, 8 bits per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     24 bits deep, 32 bits per pixel, 0 pixels skipped, 32-bit scanline padding, LSB-first, linearly mapped pixels, a g mask of 0xff00, an r mask of 0xff0000, a b mask of 0xff
*/
static int
tme_fb_xlat4(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (8)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (8)
#define dst_bipp (32)

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
#define dst_skipx (0)

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
#define dst_pad (32)

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
#define dst_bypl (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8)

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
#define dst_packed ((dst_width * dst_bipp) == (dst_bypl * 8))

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned (src_fifo0_may_be_unaligned || !dst_packed || (dst_bipp == 24) || (dst_bypl % 4) || ((dst_skipx * dst_bipp) % 32) || (dst_bipp % src_bipp))

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

      /* prime the primary destination FIFO: */
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 8, so 4 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* since dst_bipp is known at code-generation time, the pixel
       translation loop is unrolled to translate all destination
       pixels in the 32-bit visible part of the destination bit
       FIFO(s) before shifting.

       in this case, dst_bipp is known to be 32, so 1 pixel will
       be written into the destination bit FIFO(s) before shifting,
       and when the destination bit FIFO(s) are shifted, they are
       shifted 32 bits at a time: */

    /* src_unroll = 4, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef dst_bipp
#undef src_skipx
#undef dst_skipx
#undef src_pad
#undef dst_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef dst_bypl
#undef src_packed
#undef dst_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     8 bits deep, 8 bits per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     32 bits deep, 32 bits per pixel, 0 pixels skipped, 32-bit scanline padding, LSB-first, linearly mapped pixels, a g mask of 0xff00ff00, a r mask of 0xffff0000, a b mask of 0xff0000ff
*/
static int
tme_fb_xlat5(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (8)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (8)
#define dst_bipp (32)

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
#define dst_skipx (0)

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
#define dst_pad (32)

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
#define dst_bypl (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8)

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
#define dst_packed ((dst_width * dst_bipp) == (dst_bypl * 8))

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

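     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if the destination bytes-per-scanline value is not a multiple
       of four.  in this case, a scanline may not begin on a 32-bit
       boundary in the destination buffer.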

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned (src_fifo0_may_be_unaligned || !dst_packed || (dst_bipp == 24) || (dst_bypl % 4) || ((dst_skipx * dst_bipp) % 32) || (dst_bipp % src_bipp))

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

      /* prime the primary destination FIFO: */
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 8, so 4 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* since dst_bipp is known at code-generation time, the pixel
       translation loop is unrolled to translate all destination
       pixels in the 32-bit visible part of the destination bit
       FIFO(s) before shifting.

       in this case, dst_bipp is known to be 32, so 1 pixel will
       be written into the destination bit FIFO(s) before shifting,
       and when the destination bit FIFO(s) are shifted, they are
       shifted 32 bits at a time: */

    /* src_unroll = 4, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef dst_bipp
#undef src_skipx
#undef dst_skipx
#undef src_pad
#undef dst_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef dst_bypl
#undef src_packed
#undef dst_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     8 bits deep, 8 bits per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     any depth, any bits per pixel, any number of pixels skipped, any scanline padding, MSB-first, any pixel mapping, any g mask, any r mask, any b mask
*/
static int
tme_fb_xlat6(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (8)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (8)
  const unsigned int dst_bipp = dst->tme_fb_connection_bits_per_pixel;

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
  const unsigned int dst_skipx = dst->tme_fb_connection_skipx;

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
  const unsigned int dst_pad = dst->tme_fb_connection_scanline_pad;

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_BIG)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
  const unsigned int dst_bypl = (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8);

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
  const unsigned int dst_packed = ((dst_width * dst_bipp) == (dst_bypl * 8));

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned TRUE

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

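      /* prime the primary destination FIFO.

         NOTE(review): the word read back from the destination buffer
         is decoded according to src_order here, while the rest of the
         destination FIFO handling keys off dst_order - confirm that
         this is intended when the two orders differ: */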
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 8, so 4 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* src_unroll = 4, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

    }

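    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush.

       NOTE(review): the merged word is assembled in dst_fifo0_next,
       but the final store below writes back dst_fifo0, which still
       holds the value just read from the buffer - so the leftover
       bits appear never to reach the destination image.  confirm
       whether storing dst_fifo0_next was intended; since this file
       is generated, any fix belongs in fb-xlat-auto.sh: */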
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef src_skipx
#undef src_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef src_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     8 bits deep, 8 bits per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, color, index mapped pixels, 8 bits per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     any depth, any bits per pixel, any number of pixels skipped, any scanline padding, LSB-first, any pixel mapping, any g mask, any r mask, any b mask
*/
static int
tme_fb_xlat7(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (8)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (8)
  const unsigned int dst_bipp = dst->tme_fb_connection_bits_per_pixel;

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
  const unsigned int dst_skipx = dst->tme_fb_connection_skipx;

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
  const unsigned int dst_pad = dst->tme_fb_connection_scanline_pad;

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
  const unsigned int dst_bypl = (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8);

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
  const unsigned int dst_packed = ((dst_width * dst_bipp) == (dst_bypl * 8));

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned TRUE

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

      /* prime the primary destination FIFO: */
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 8, so 4 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* src_unroll = 4, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef src_skipx
#undef src_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef src_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     1 bit deep, 1 bit per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, monochrome, linearly mapped pixels, 1 bit per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     24 bits deep, 32 bits per pixel, 0 pixels skipped, 32-bit scanline padding, LSB-first, linearly mapped pixels, a g mask of 0xff00, an r mask of 0xff0000, a b mask of 0xff
*/
static int
tme_fb_xlat8(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (1)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (1)
#define dst_bipp (32)

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
#define dst_skipx (0)

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
#define dst_pad (32)

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
#define dst_bypl (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8)

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
#define dst_packed ((dst_width * dst_bipp) == (dst_bypl * 8))

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned (src_fifo0_may_be_unaligned || !dst_packed || (dst_bipp == 24) || (dst_bypl % 4) || ((dst_skipx * dst_bipp) % 32) || (dst_bipp % src_bipp))

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

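      /* prime the primary destination FIFO, keeping the bits already
         in the destination word that precede the destination pixel.
         NOTE(review): the byte order test below uses src_order, while
         the destination FIFO operations in the translation loop use
         dst_order; this looks wrong when the two orders differ -
         TODO confirm: */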
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 1, so 32 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* since dst_bipp is known at code-generation time, the pixel
       translation loop is unrolled to translate all destination
       pixels in the 32-bit visible part of the destination bit
       FIFO(s) before shifting.

       in this case, dst_bipp is known to be 32, so 1 pixel will
       be written into the destination bit FIFO(s) before shifting,
       and when the destination bit FIFO(s) are shifted, they are
       shifted 32 bits at a time: */

    /* src_unroll = 32, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #4 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (4 * src_bipp)))
              : (4 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #5 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (5 * src_bipp)))
              : (5 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #6 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (6 * src_bipp)))
              : (6 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #7 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (7 * src_bipp)))
              : (7 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #8 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (8 * src_bipp)))
              : (8 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #9 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (9 * src_bipp)))
              : (9 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #10 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (10 * src_bipp)))
              : (10 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #11 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (11 * src_bipp)))
              : (11 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #12 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (12 * src_bipp)))
              : (12 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #13 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (13 * src_bipp)))
              : (13 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #14 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (14 * src_bipp)))
              : (14 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #15 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (15 * src_bipp)))
              : (15 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #16 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (16 * src_bipp)))
              : (16 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #17 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (17 * src_bipp)))
              : (17 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #18 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (18 * src_bipp)))
              : (18 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #19 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (19 * src_bipp)))
              : (19 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #20 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (20 * src_bipp)))
              : (20 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #21 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (21 * src_bipp)))
              : (21 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #22 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (22 * src_bipp)))
              : (22 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #23 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (23 * src_bipp)))
              : (23 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #24 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (24 * src_bipp)))
              : (24 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #25 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (25 * src_bipp)))
              : (25 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #26 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (26 * src_bipp)))
              : (26 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #27 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (27 * src_bipp)))
              : (27 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #28 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (28 * src_bipp)))
              : (28 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #29 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (29 * src_bipp)))
              : (29 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #30 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (30 * src_bipp)))
              : (30 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #31 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (31 * src_bipp)))
              : (31 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {
      dst_fifo0 = *dst_raw0;
      if (dst_order == TME_ENDIAN_BIG) {
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }
      *dst_raw0 = dst_fifo0;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef dst_bipp
#undef src_skipx
#undef dst_skipx
#undef src_pad
#undef dst_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef dst_bypl
#undef src_packed
#undef dst_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     1 bit deep, 1 bit per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, monochrome, linearly mapped pixels, 1 bit per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     32 bits deep, 32 bits per pixel, 0 pixels skipped, 32-bit scanline padding, LSB-first, linearly mapped pixels, a g mask of 0xff00ff00, an r mask of 0xffff0000, a b mask of 0xff0000ff
*/
static int
tme_fb_xlat9(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (1)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (1)
#define dst_bipp (32)

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
#define dst_skipx (0)

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
#define dst_pad (32)

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_LITTLE)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
#define dst_bypl (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8)

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
#define dst_packed ((dst_width * dst_bipp) == (dst_bypl * 8))

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned (src_fifo0_may_be_unaligned || !dst_packed || (dst_bipp == 24) || (dst_bypl % 4) || ((dst_skipx * dst_bipp) % 32) || (dst_bipp % src_bipp))

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
             bit FIFO starting with the word *before* the original src_raw0.
             we do not have to copy that earlier word into the old copy of
             the source image; it already passed the fast comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

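      /* prime the primary destination FIFO with the bits that are
         already in the destination word before the first pixel we
         will write.
         NOTE(review): the test below selects the byte swap and mask
         using src_order, although the word is read from the
         destination buffer and every other destination FIFO operation
         uses dst_order.  this looks like it should be dst_order -
         confirm against fb-xlat-auto.sh: */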
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 1, so 32 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* since dst_bipp is known at code-generation time, the pixel
       translation loop is unrolled to translate all destination
       pixels in the 32-bit visible part of the destination bit
       FIFO(s) before shifting.

       in this case, dst_bipp is known to be 32, so 1 pixel will
       be written into the destination bit FIFO(s) before shifting,
       and when the destination bit FIFO(s) are shifted, they are
       shifted 32 bits at a time: */

    /* src_unroll = 32, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #4 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (4 * src_bipp)))
              : (4 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #5 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (5 * src_bipp)))
              : (5 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #6 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (6 * src_bipp)))
              : (6 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #7 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (7 * src_bipp)))
              : (7 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #8 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (8 * src_bipp)))
              : (8 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #9 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (9 * src_bipp)))
              : (9 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #10 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (10 * src_bipp)))
              : (10 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #11 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (11 * src_bipp)))
              : (11 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #12 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (12 * src_bipp)))
              : (12 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #13 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (13 * src_bipp)))
              : (13 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #14 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (14 * src_bipp)))
              : (14 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #15 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (15 * src_bipp)))
              : (15 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #16 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (16 * src_bipp)))
              : (16 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #17 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (17 * src_bipp)))
              : (17 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #18 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (18 * src_bipp)))
              : (18 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #19 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (19 * src_bipp)))
              : (19 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #20 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (20 * src_bipp)))
              : (20 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #21 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (21 * src_bipp)))
              : (21 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #22 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (22 * src_bipp)))
              : (22 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #23 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (23 * src_bipp)))
              : (23 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #24 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (24 * src_bipp)))
              : (24 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #25 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (25 * src_bipp)))
              : (25 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #26 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (26 * src_bipp)))
              : (26 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #27 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (27 * src_bipp)))
              : (27 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #28 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (28 * src_bipp)))
              : (28 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #29 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (29 * src_bipp)))
              : (29 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #30 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (30 * src_bipp)))
              : (30 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

      /* iter #31 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (31 * src_bipp)))
              : (31 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* we've just translated another 32-bit word of the
         source image, so decrement xlat_run: */
      xlat_run--;

      /* shift the source primary FIFO: */
      TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                            src_fifo0,
                            src_fifo0_next,
                            src_fifo0_bits,
                            32,
                            src_raw0,
                            src_order);

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            32,
                            dst_raw0,
                            dst_order);

    }

    /* if the destination FIFOs may be unaligned, there
       may be bits left in the FIFO that we need to flush.
       those pending bits are in dst_fifo0_next; they are merged
       with the bits of the current destination word selected by
       the mask below, and the merged word is stored: */
    if (dst_fifo0_may_be_unaligned
        && dst_fifo0_bits > 0) {

      /* read the destination word that is about to be
         partially overwritten: */
      dst_fifo0 = *dst_raw0;

      if (dst_order == TME_ENDIAN_BIG) {

        /* in host order, keep the existing bits below the top
           dst_fifo0_bits bits, then convert the merged word
           back to big-endian: */
        dst_fifo0_next |= (tme_betoh_u32(dst_fifo0) & (0xffffffff >> dst_fifo0_bits));
        dst_fifo0_next = tme_htobe_u32(dst_fifo0_next);
      }
      else {

        /* in host order, keep the existing bits above the bottom
           dst_fifo0_bits bits, then convert the merged word
           back to little-endian: */
        dst_fifo0_next |= (tme_letoh_u32(dst_fifo0) & (0xffffffff << dst_fifo0_bits));
        dst_fifo0_next = tme_htole_u32(dst_fifo0_next);
      }

      /* store the merged word.  storing dst_fifo0 here would
         only write back the word that was just read, and the
         pending FIFO bits would be lost: */
      *dst_raw0 = dst_fifo0_next;
    }

    /* loop back to compare more 32-bit words: */
    src_raw0--;
  }

  /* return nonzero iff we did some translating: */
  return (xlat_run >= 0);

#undef dst_x
#undef dst_y
#undef src_width
#undef dst_width
#undef src_depth
#undef src_mask
#undef src_bipp
#undef dst_bipp
#undef src_skipx
#undef dst_skipx
#undef src_pad
#undef dst_pad
#undef src_order
#undef dst_order
#undef src_bypl
#undef dst_bypl
#undef src_packed
#undef dst_packed
#undef src_bypb_real
#undef src_bypb
#undef src_fifo0_may_be_unaligned
#undef dst_fifo0_may_be_unaligned
}

/* this translates frame buffer contents from this source format:
     1152x900
     1 bit deep, 1 bit per pixel, 0 pixels skipped, 32-bit scanline padding, MSB-first, monochrome, linearly mapped pixels, 1 bit per mapped intensity, no g mask, no r mask, no b mask
   to this destination format:
     any depth, any bits per pixel, any number of pixels skipped, any scanline padding, MSB-first, any pixel mapping, any g mask, any r mask, any b mask
*/
static int
tme_fb_xlat10(struct tme_fb_connection *src,
             struct tme_fb_connection *dst)
{

  /* whenever possible we define macros instead of declaring
     variables, for optimization: */

  /* declare src_x and src_y.  these are the current translation
     coordinates in the source image: */
  unsigned int src_x, src_y;

  /* declare dst_x and dst_y.  these are the current translation
     coordinates in the destination image.  since this function
     does not scale the image, these coordinates are always
     the same as the coordinates in the source image: */
#define dst_x (src_x)
#define dst_y (src_y)

  /* declare pixel.  this holds a single pixel value being translated
     for the destination image: */
  tme_uint32_t pixel;

  /* declare src_width and dst_width.  these are in terms of pixels: */
#define src_width (1152)
#define dst_width (src_width)

  /* declare src_depth, the source pixel depth, which is in
     terms of bits.  declare src_mask, which is the corresponding
     mask of one bits: */
#define src_depth (1)
#define src_mask (0xffffffff >> (32 - src_depth))

  /* declare src_bipp and dst_bipp.  these are the bits-per-pixel
     values for the source and destination images: */
#define src_bipp (1)
  const unsigned int dst_bipp = dst->tme_fb_connection_bits_per_pixel;

  /* declare src_skipx and dst_skipx.  these are the counts of
     undisplayed pixels at the beginning of each scanline in the
     source and destination images: */
#define src_skipx (0)
  const unsigned int dst_skipx = dst->tme_fb_connection_skipx;

  /* declare src_pad and dst_pad.  these are the paddings, in bits,
     of each scanline in the source and destination images: */
#define src_pad (32)
  const unsigned int dst_pad = dst->tme_fb_connection_scanline_pad;

  /* declare src_order and dst_order.  these are the bit and byte
     orders (either TME_ENDIAN_BIG or TME_ENDIAN_LITTLE) of the
     source and destination images.  since these values profoundly
     affect optimization, they are always constant: */
#define src_order (TME_ENDIAN_BIG)
#define dst_order (TME_ENDIAN_BIG)

  /* declare src_bypl and dst_bypl.  these are the bytes per scanline
     in the source and destination images.  these values are calculated
     from the count of undisplayed and displayed pixels per scanline,
     the number of bits per pixel, and the scanline padding: */
#define src_bypl (((((src_skipx + src_width) * src_bipp) + (src_pad - 1)) & -src_pad) / 8)
  const unsigned int dst_bypl = (((((dst_skipx + dst_width) * dst_bipp) + (dst_pad - 1)) & -dst_pad) / 8);

  /* declare src_packed and dst_packed.  these are nonzero iff
     every last bit in a scanline belongs to a displayed pixel.
     put another way, this is zero iff a scanline has undisplayed
     pixels at its beginning or padding bits at its end.  when
     a source image or destination image is packed, translation
     doesn't have to worry about skipping FIFO bits to get to
     bits belonging to displayed pixels: */
#define src_packed ((src_width * src_bipp) == (src_bypl * 8))
  const unsigned int dst_packed = ((dst_width * dst_bipp) == (dst_bypl * 8));

  /* declare src_bypb and src_bypb_real.  src_bypb is the bytes
     per source image buffer with the "translation termination
     overhead" of approximately two extra scanlines.  src_bypb_real
     is the real bytes per source image buffer with no overhead.
     both values are padded to a multiple of 4 bytes (32 bits): */
#define src_bypb_real (((900 * src_bypl) + 3) & -4)
#define src_bypb ((src_bypb_real + (src_bypl * 2)) & -4)

  /* declare the source primary bit FIFO:

     src_raw0 points to the next aligned 32-bit word to be
     read from the image buffer.

     src_fifo0 is the visible part of the bit FIFO.

     src_fifo0_next and src_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     src_fifo0_next is the invisible part of the bit FIFO,
     and src_fifo0_bits tracks the total number of bits in the
     visible and invisible parts of the FIFO. */
  const tme_uint32_t *src_raw0;
  tme_uint32_t src_fifo0, src_fifo0_next;
  unsigned int src_fifo0_bits;

  /* declare the destination primary bit FIFO:

     dst_raw0 points to the next aligned 32-bit word to be
     written into the image buffer.

     dst_fifo0 is the visible part of the bit FIFO.

     dst_fifo0_next and dst_fifo0_bits are only used when the
     visible part of the bit FIFO is not guaranteed to always
     correspond to an aligned 32-bit word in the image buffer.
     dst_fifo0_next is the invisible part of the bit FIFO,
     and dst_fifo0_bits tracks the total number of bits in the
     invisible part of the FIFO. */
  tme_uint32_t *dst_raw0;
  tme_uint32_t dst_fifo0, dst_fifo0_next;
  unsigned int dst_fifo0_bits;

  /* declare src_off and dst_off.  these are used when priming a
     source or destination bit FIFO, to identify an initial aligned
     32-bit word in the source or destination image buffer, and an
     initial bit offset within that word: */
  unsigned int src_off, dst_off;

  /* declare src_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the source buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the source bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for source pixels, and are forced to shift the FIFO after
       each one.

     - if the source image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per source pixel.  in this case, a
       source pixel may cross a 32-bit boundary: */
#define src_fifo0_may_be_unaligned (!src_packed || (src_bipp == 24))

  /* declare dst_fifo0_may_be_unaligned.  this is zero iff all
     aligned 32-bit words in the destination buffer contain a whole
     number of displayed pixels, and at *all times during the
     translation* the visible part of the bit FIFO is guaranteed
     to correspond to an aligned 32-bit word in the image buffer.

     this is *not* so if any of the following are true:

     - the destination bits-per-pixel value is not known at compile
       time.  in this case, we can't unroll the translation loop
       for destination pixels, and are forced to shift the FIFO
       after each one.

     - if src_fifo0_may_be_unaligned is true.  in this case, we
       definitely can't guarantee that any initial dst_x will
       correspond to an aligned 32-bit word in the destination buffer.

     - if the destination image is not packed.  in this case, there may
       be undisplayed pixels in the FIFO, which we will need to
       shift out.

     - if there are 24 bits per destination pixel.  in this case,
       a destination pixel may cross a 32-bit boundary.

     - if a possible initial dst_x doesn't correspond to an aligned
       32-bit word in the destination buffer.  for this last one:

     since we require that src_fifo0_may_be_unaligned is zero, we
     know that the initial src_x = (Z * 32) / src_bipp for 
     some Z.  we also have the initial dst_x = src_x.
     the initial destination bit offset will then be:

     (dst_skipx + dst_x) * dst_bipp
     = (dst_skipx * dst_bipp) + (dst_x * dst_bipp)

     if we additionally require that (dst_skipx * dst_bipp)
     be 32-bit aligned, this reduces things to:

     dst_x * dst_bipp
     = (src_x) * dst_bipp
     = (((Z * 32) / src_bipp)) * dst_bipp

     which will be a multiple of 32 iff:

      ((1 / src_bipp)) * dst_bipp >= 1 and integral

     or, equivalently:

       (dst_bipp % src_bipp) == 0
  */
#define dst_fifo0_may_be_unaligned TRUE

  /* declare src_offset_updated_first and src_offset_updated_last,
     which hold the offsets of the first and last updated bytes in
     the source image: */
  tme_uint32_t src_offset_updated_first;
  tme_uint32_t src_offset_updated_last;

  /* declare src_raw0_end.  when treating the source image as
     an array of aligned 32-bit words, this variable holds the
     address of the first word after the real source image.
     if the fast, aligned 32-bit word comparison loop passes
     this point, the entire source image has been processed and
     the function terminates: */
  const tme_uint32_t *src_raw0_end;

  /* declare xlat_run.  see the comment for the TME_FB_XLAT_RUN
     macro for an explanation of what this variable does: */
  int xlat_run;

  /* this silences gcc -Wuninitialized: */
  src_fifo0_next = 0;
  src_fifo0_bits = 0;
  dst_fifo0_next = 0;
  dst_fifo0_bits = 0;

  /* initialize src_raw0 and src_raw0_end for the fast aligned 32-bit
     word comparison loop.  on entry to (and when continuing) that loop,
     src_raw0 always points to the aligned 32-bit word *before* the
     next word to check.  src_raw0_end always points after the last
     word to check.

     src_raw0 is actually part of the source primary bit FIFO, which
     is good, because when the fast comparison fails on a word, src_raw0
     is already primed and ready to work for that bit FIFO: */
  src_offset_updated_first = src->tme_fb_connection_offset_updated_first;
  src_offset_updated_last = TME_MIN(src->tme_fb_connection_offset_updated_last, src_bypb_real - 1);
  src->tme_fb_connection_offset_updated_first = 0;
  src->tme_fb_connection_offset_updated_last = src_bypb_real - 1;
  if (src_offset_updated_first > src_offset_updated_last) {
    return (FALSE);
  }
  src_raw0
    = (((const tme_uint32_t *)
        (src->tme_fb_connection_buffer
         + (src_offset_updated_first
            & (0 - (tme_uint32_t) sizeof(tme_uint32_t)))))
       -1);
  src_raw0_end
    = ((const tme_uint32_t *)
       (src->tme_fb_connection_buffer
        + src_offset_updated_last
        + 1));

  /* initialize xlat_run to -1.  it can never go negative inside the
     pixel translation loop, so if xlat_run stays negative for the
     entire translation, it means that the source image hasn't changed
     since the last translation.  this information is returned to the
     caller to hopefully save more work in updating the display: */
  xlat_run = -1;

  /* this is the main translation loop, which contains the fast aligned
     32-bit word comparison loop, and the pixel translation loop: */
  for (;;) {

    /* this is the fast aligned 32-bit word comparison loop.  it
       terminates either when a word fails comparison, or when the
       entire source image has been compared.  the if test that
       follows checks for the latter case and breaks the main
       translation loop: */
    for (; (++src_raw0 < src_raw0_end
            && *src_raw0 == *TME_FB_XLAT_SRC_OLD(src_raw0)); );
    if (src_raw0 >= src_raw0_end) {
      break;
    }

    /* calculate the byte offset into the source buffer of the
       32-bit word that failed comparison: */
    src_off = ((tme_uint8_t *) src_raw0) - src->tme_fb_connection_buffer;

    /* calculate the source y pixel coordinate, and reduce
       src_off from the byte offset into the buffer to the
       byte offset into that scanline: */
    src_y = src_off / src_bypl;
    src_off = src_off % src_bypl;

    /* while translating pixels, we use one or more "bit FIFOs",
       each composed of one or more 32-bit integers.  we load these
       FIFOs 32 bits at a time. */

    /* prime the visible part of the source primary bit FIFO: */
    src_fifo0 = *src_raw0;
    *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0;
    src_raw0++;
    src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                 ? tme_betoh_u32(src_fifo0)
                 : tme_letoh_u32(src_fifo0));

    /* if the source primary bit FIFO may be unaligned: */
    if (src_fifo0_may_be_unaligned) {

      /* prime the invisible part of the source primary bit FIFO and
         assume that we will not have to shift it to finish: */
      src_fifo0_next = *src_raw0;
      *TME_FB_XLAT_SRC_OLD(src_raw0) = src_fifo0_next;
      src_raw0++;
      src_fifo0_next = ((src_order == TME_ENDIAN_BIG)
                        ? tme_betoh_u32(src_fifo0_next)
                        : tme_letoh_u32(src_fifo0_next));
      src_fifo0_bits = 0;

      /* if there are pixels that need to be skipped, the first 32 bits
         we loaded into the FIFO may have first bits that belong to
         those undisplayed (skipped) pixels.  it is *not* possible for
         it to have first bits that belong to the scanline pad; there
         might be pad bits in the *middle* of the first 32 bits, but any
         first bits *must* belong to pixels, displayed or not: */
      if (src_skipx > 0
          && (src_off * 8) < (src_skipx * src_bipp)) {

        /* see how many bits we will need to skip: */
        src_fifo0_bits = (src_skipx * src_bipp) - (src_off * 8);

        /* if it is more than 31 bits, this is an entire 32 bits of
           undisplayed pixels.  just advance: */
        if (src_fifo0_bits > 31) {
          src_raw0--;
          continue;
        }

        /* set the source x coordinate to zero: */
        src_x = 0;
      }

      /* otherwise, the first 32 bits we load will have first bits for
         a displayable pixel: */
      else {

        /* if the source bits per pixel is 24,  calculate the number of
           bytes *before* the original src_raw0 of any split pixel, and
           subtract this from src_off, to leave src_off as the byte offset
           into the scanline of the beginning of a pixel: */
        if (src_bipp == 24) {
          src_fifo0_bits = (src_off % 3);
          src_off -= src_fifo0_bits;

          /* if this is a split pixel, we need to prime the source primary
              bit FIFO starting with the part *before* the original src_raw0.
              we do not have to copy to the old; it passed comparison: */
          if (src_fifo0_bits) {
            src_raw0--;
            src_fifo0_next = src_fifo0;
            src_fifo0 = ((src_order == TME_ENDIAN_BIG)
                         ? tme_betoh_u32(*(src_raw0 - 2))
                         : tme_letoh_u32(*(src_raw0 - 2)));
          }
        }

        /* calculate the source x coordinate: */
        src_x = ((src_off * 8) / src_bipp) - src_skipx;
      }

      /* do any shifting to finish priming the source primary FIFO: */
      if (src_fifo0_bits) {
        if (src_order == TME_ENDIAN_BIG) {
          src_fifo0 = (src_fifo0 << src_fifo0_bits) | (src_fifo0_next >> (32 - src_fifo0_bits));
          src_fifo0_next <<= src_fifo0_bits;
        }
        else {
          src_fifo0 = (src_fifo0 >> src_fifo0_bits) | (src_fifo0_next << (32 - src_fifo0_bits));
          src_fifo0_next >>= src_fifo0_bits;
        }
      }
      src_fifo0_bits = 64 - src_fifo0_bits;
    }

    /* otherwise, the source primary FIFO is aligned: */
    else {
      src_x = ((src_off * 8) / src_bipp) - src_skipx;
    }

    /* prime the destination primary bit FIFO: */
    dst_fifo0 = 0;
    if (dst_fifo0_may_be_unaligned) {

      /* calculate the bit offset into the destination buffer of
         the destination pixel: */
      dst_off = (dst_y * dst_bypl * 8) + ((dst_skipx + dst_x) * dst_bipp);

      /* calculate the number of bits that will be in the primed FIFO: */
      dst_fifo0_bits = dst_off % 32;

      /* set dst_raw0: */
      dst_raw0 = (tme_uint32_t *)
        (dst->tme_fb_connection_buffer
         + ((dst_off - dst_fifo0_bits) / 8));

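      /* prime the primary destination FIFO with any bits that precede
         the first destination pixel in its 32-bit word.
         NOTE(review): the byte order test below uses src_order, while
         the other destination FIFO operations here use dst_order -
         confirm that this is intended: */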
      dst_fifo0_next = 0;
      if (dst_fifo0_bits) {
        dst_fifo0_next = (src_order == TME_ENDIAN_BIG
                          ? (tme_betoh_u32(*dst_raw0) & (0xffffffffUL << (32 - dst_fifo0_bits)))
                          : (tme_letoh_u32(*dst_raw0) & (0xffffffffUL >> (32 - dst_fifo0_bits))));
      }
    }

    /* otherwise the destination primary FIFO is aligned: */
    else {
      dst_off = (dst_y * dst_bypl) + (((dst_skipx + dst_x) * dst_bipp) / 8);
      dst_raw0 = (tme_uint32_t *) (dst->tme_fb_connection_buffer + dst_off);
    }

    /* since src_bipp is known at code-generation time, the
       pixel translation loop is unrolled to translate all
       source pixels in the 32-bit visible part of the source
       bit FIFO(s) before shifting.

       in this case, src_bipp is known to be 1, so 32 pixels will
       be read out of the source bit FIFO(s) before shifting, and
       when the source bit FIFO(s) are shifted, they are shifted
       32 bits at a time: */

    /* src_unroll = 32, src_iter_scale = 1
       dst_unroll = 1, dst_iter_scale = 1 */
    for (xlat_run = TME_FB_XLAT_RUN;
         xlat_run > 0; ) {

      /* iter #0 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (0 * src_bipp)))
              : (0 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #1 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (1 * src_bipp)))
              : (1 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #2 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (2 * src_bipp)))
              : (2 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #3 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (3 * src_bipp)))
              : (3 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #4 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (4 * src_bipp)))
              : (4 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #5 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (5 * src_bipp)))
              : (5 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #6 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (6 * src_bipp)))
              : (6 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #7 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (7 * src_bipp)))
              : (7 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #8 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (8 * src_bipp)))
              : (8 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #9 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (9 * src_bipp)))
              : (9 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #10 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (10 * src_bipp)))
              : (10 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #11 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (11 * src_bipp)))
              : (11 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #12 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (12 * src_bipp)))
              : (12 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #13 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (13 * src_bipp)))
              : (13 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #14 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (14 * src_bipp)))
              : (14 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #15 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (15 * src_bipp)))
              : (15 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #16 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (16 * src_bipp)))
              : (16 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #17 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (17 * src_bipp)))
              : (17 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #18 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (18 * src_bipp)))
              : (18 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #19 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (19 * src_bipp)))
              : (19 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #20 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (20 * src_bipp)))
              : (20 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #21 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (21 * src_bipp)))
              : (21 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #22 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (22 * src_bipp)))
              : (22 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #23 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (23 * src_bipp)))
              : (23 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #24 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (24 * src_bipp)))
              : (24 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #25 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (25 * src_bipp)))
              : (25 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #26 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (26 * src_bipp)))
              : (26 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #27: translate one pixel.  the pixel is read out of
         slot 27 of the source primary FIFO, mapped through the
         destination's pixel map, ORed into slot 0 of the destination
         primary FIFO, and then the destination FIFO is shifted by one
         pixel.  the order of the statements below matters: the pixel
         is fetched before any end-of-scanline source shifting, and is
         ORed into the destination FIFO before any destination
         shifting: */

      /* get a pixel from the source primary FIFO.  the shift
         arithmetic treats the FIFO as 32 bits wide: for a big-endian
         source, slot 27 is counted down from bit 32, otherwise it is
         counted up from bit 0.  any bits above the pixel are still
         present after this shift; they are removed by the src_mask
         AND in the mapping step below: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (27 * src_bipp)))
              : (27 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel.  the masked source pixel value is used
         directly as the index into the destination connection's
         tme_fb_connection_map_pixel table: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline.  because of
         the short-circuit &&, src_x is only advanced when the
         source buffer is not packed: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels.
           (src_bypl * 8) is presumably the length of a whole
           source scanline in bits - src_bypl is defined outside
           of this iteration: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift.  each pass shifts, and
           then subtracts from src_off, at most 32 bits, so that
           no single shift exceeds the 32 bits used by the FIFO
           arithmetic in this iteration: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO.  the
         (0 * dst_bipp) terms are zero, so the pixel always goes
         into slot 0: at bit (32 - dst_bipp) for a big-endian
         destination, otherwise at bit 0.  this is an OR, so it
         relies on the target bits of dst_fifo0 already being
         clear - NOTE(review): presumably guaranteed by
         TME_FB_XLAT_SHIFT_DST, which is defined elsewhere; confirm: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline.  as
         with the source, dst_x is only advanced when the
         destination buffer is not packed: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift.  as with the source,
           each pass shifts and subtracts at most 32 bits: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                                dst_fifo0,
                                dst_fifo0_next,
                                dst_fifo0_bits,
                                TME_MIN(dst_off, 32),
                                dst_raw0,
                                dst_order);
        }

        /* we are now on the first pixel of the next scanline: */
        dst_x = 0;
      }

      /* shift the destination primary FIFO by one destination
         pixel (dst_bipp bits).  this is done unconditionally, and
         after any end-of-scanline shifting above: */
      TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
                            dst_fifo0,
                            dst_fifo0_next,
                            dst_fifo0_bits,
                            dst_bipp,
                            dst_raw0,
                            dst_order);

      /* iter #28 */

      /* get a pixel from the source primary FIFO: */
      pixel =
        ((src_fifo0
          >> (src_order == TME_ENDIAN_BIG
              ? (32 - (src_bipp + (28 * src_bipp)))
              : (28 * src_bipp))));

      /* since source pixels are known at compile time to
         not have subfields, map the source pixel into the
         destination pixel: */
      pixel = dst->tme_fb_connection_map_pixel[pixel & src_mask];

      /* if the source buffer is not packed, and we just
         read the last pixel on this source scanline: */
      if (!src_packed
          && ++src_x == src_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        src_off = ((src_bypl * 8) - (src_width * src_bipp));

        /* while there are bits to shift: */
        for (; src_off > 0; src_off -= TME_MIN(src_off, 32)) {

          /* shift the source primary FIFO: */
          TME_FB_XLAT_SHIFT_SRC(src_fifo0_may_be_unaligned,
                                src_fifo0,
                                src_fifo0_next,
                                src_fifo0_bits,
                                TME_MIN(src_off, 32),
                                src_raw0,
                                src_order);
        }

        /* we are now on the first pixel of the next scanline: */
        src_x = 0;
      }

      /* put the pixel into the destination primary FIFO: */
      dst_fifo0 |=
        (pixel
         << (dst_order == TME_ENDIAN_BIG
             ? ((32 - dst_bipp) - (0 * dst_bipp))
             : (0 * dst_bipp)));

      /* if the destination buffer is not packed, and we just
         wrote the last pixel on this destination scanline: */
      if (!dst_packed
          && (dst_x += 1) == dst_width) {

        /* calculate the number of bits between the
           last bit of the last pixel and the first bit
           of the first displayed pixel on the next
           scanline.  this is equal to the number of
           pad bits plus bits for undisplayed pixels: */
        dst_off = ((dst_bypl * 8) - (dst_width * dst_bipp));

        /* while there are bits to shift: */
        for (; dst_off > 0; dst_off -= TME_MIN(dst_off, 32)) {

          /* shift the destination primary FIFO: */
          TME_FB_XLAT_SHIFT_DST(dst_fifo0_may_be_unaligned,
            