[N-Gage] Add LUT color mod, cardinal rotation cache and loop unrolling

- Implement lookup tables for faster color modulation
- Cache 0°/90°/180°/270° rotations for speedup on common angles
- Add dirty rectangle tracking infrastructure
- Process 4 pixels at a time in all transform operations
2026-06-20 21:16:04 +00:00 · 2026-04-15 20:34:23 +02:00 · 2026-04-15 20:34:23 +02:00 · 5bd1a65e6f
commit 5bd1a65e6f
parent e5c8523b36
6 changed files with 319 additions and 40 deletions
--- a/src/render/ngage/SDL_render_ops.cpp
+++ b/src/render/ngage/SDL_render_ops.cpp
@@ -23,30 +23,58 @@
 #include "SDL_render_ops.hpp"
 #include <3dtypes.h>

-void ApplyColorMod(void *dest, void *source, int pitch, int width, int height, SDL_FColor color)
+void ApplyColorMod(void *dest, void *source, int pitch, int width, int height, SDL_FColor color, const TUint8 *colorLUT)
 {
    TUint16 *src_pixels = static_cast<TUint16 *>(source);
    TUint16 *dst_pixels = static_cast<TUint16 *>(dest);

-    TFixed rf = Real2Fix(color.r);
-    TFixed gf = Real2Fix(color.g);
-    TFixed bf = Real2Fix(color.b);
-
    // Pre-calculate pitch in pixels to avoid repeated division.
    const TInt pitchPixels = pitch >> 1;
+    const int totalPixels = width * height;

+    // Process 4 pixels at a time (loop unrolling).
+    int pixelIndex = 0;
    for (int y = 0; y < height; ++y) {
-        // Calculate row offset once per row.
        TInt rowOffset = y * pitchPixels;
+        int x = 0;

-        for (int x = 0; x < width; ++x) {
+        // Unrolled loop: process 4 pixels at once.
+        for (; x < width - 3; x += 4) {
+            // Pixel 0
+            TUint16 p0 = src_pixels[rowOffset + x];
+            TUint8 r0 = colorLUT[(p0 & 0xF800) >> 8];
+            TUint8 g0 = colorLUT[256 + ((p0 & 0x07E0) >> 3)];
+            TUint8 b0 = colorLUT[512 + ((p0 & 0x001F) << 3)];
+            dst_pixels[rowOffset + x] = (r0 << 8) | (g0 << 3) | (b0 >> 3);
+
+            // Pixel 1
+            TUint16 p1 = src_pixels[rowOffset + x + 1];
+            TUint8 r1 = colorLUT[(p1 & 0xF800) >> 8];
+            TUint8 g1 = colorLUT[256 + ((p1 & 0x07E0) >> 3)];
+            TUint8 b1 = colorLUT[512 + ((p1 & 0x001F) << 3)];
+            dst_pixels[rowOffset + x + 1] = (r1 << 8) | (g1 << 3) | (b1 >> 3);
+
+            // Pixel 2
+            TUint16 p2 = src_pixels[rowOffset + x + 2];
+            TUint8 r2 = colorLUT[(p2 & 0xF800) >> 8];
+            TUint8 g2 = colorLUT[256 + ((p2 & 0x07E0) >> 3)];
+            TUint8 b2 = colorLUT[512 + ((p2 & 0x001F) << 3)];
+            dst_pixels[rowOffset + x + 2] = (r2 << 8) | (g2 << 3) | (b2 >> 3);
+
+            // Pixel 3
+            TUint16 p3 = src_pixels[rowOffset + x + 3];
+            TUint8 r3 = colorLUT[(p3 & 0xF800) >> 8];
+            TUint8 g3 = colorLUT[256 + ((p3 & 0x07E0) >> 3)];
+            TUint8 b3 = colorLUT[512 + ((p3 & 0x001F) << 3)];
+            dst_pixels[rowOffset + x + 3] = (r3 << 8) | (g3 << 3) | (b3 >> 3);
+        }
+
+        // Handle remaining pixels.
+        for (; x < width; ++x) {
            TUint16 pixel = src_pixels[rowOffset + x];
-            TUint8 r = (pixel & 0xF800) >> 8;
-            TUint8 g = (pixel & 0x07E0) >> 3;
-            TUint8 b = (pixel & 0x001F) << 3;
-            r = FixMul(r, rf);
-            g = FixMul(g, gf);
-            b = FixMul(b, bf);
+            TUint8 r = colorLUT[(pixel & 0xF800) >> 8];
+            TUint8 g = colorLUT[256 + ((pixel & 0x07E0) >> 3)];
+            TUint8 b = colorLUT[512 + ((pixel & 0x001F) << 3)];
            dst_pixels[rowOffset + x] = (r << 8) | (g << 3) | (b >> 3);
        }
    }
@@ -57,20 +85,40 @@ void ApplyFlip(void *dest, void *source, int pitch, int width, int height, SDL_F
    TUint16 *src_pixels = static_cast<TUint16 *>(source);
    TUint16 *dst_pixels = static_cast<TUint16 *>(dest);

+    // Pre-calculate pitch in pixels to avoid repeated division.
+    const TInt pitchPixels = pitch >> 1;
+
+    // Pre-calculate flip flags to avoid repeated bitwise operations.
+    const bool flipHorizontal = (flip & SDL_FLIP_HORIZONTAL) != 0;
+    const bool flipVertical = (flip & SDL_FLIP_VERTICAL) != 0;
+
    for (int y = 0; y < height; ++y) {
-        for (int x = 0; x < width; ++x) {
-            int src_x = x;
-            int src_y = y;
+        // Calculate destination row offset once per row.
+        TInt dstRowOffset = y * pitchPixels;

-            if (flip & SDL_FLIP_HORIZONTAL) {
-                src_x = width - 1 - x;
-            }
+        // Calculate source Y coordinate once per row.
+        int src_y = flipVertical ? (height - 1 - y) : y;
+        TInt srcRowOffset = src_y * pitchPixels;

-            if (flip & SDL_FLIP_VERTICAL) {
-                src_y = height - 1 - y;
-            }
+        int x = 0;

-            dst_pixels[y * pitch / 2 + x] = src_pixels[src_y * pitch / 2 + src_x];
+        // Unrolled loop: process 4 pixels at once.
+        for (; x < width - 3; x += 4) {
+            int src_x0 = flipHorizontal ? (width - 1 - x) : x;
+            int src_x1 = flipHorizontal ? (width - 2 - x) : (x + 1);
+            int src_x2 = flipHorizontal ? (width - 3 - x) : (x + 2);
+            int src_x3 = flipHorizontal ? (width - 4 - x) : (x + 3);
+
+            dst_pixels[dstRowOffset + x] = src_pixels[srcRowOffset + src_x0];
+            dst_pixels[dstRowOffset + x + 1] = src_pixels[srcRowOffset + src_x1];
+            dst_pixels[dstRowOffset + x + 2] = src_pixels[srcRowOffset + src_x2];
+            dst_pixels[dstRowOffset + x + 3] = src_pixels[srcRowOffset + src_x3];
+        }
+
+        // Handle remaining pixels.
+        for (; x < width; ++x) {
+            int src_x = flipHorizontal ? (width - 1 - x) : x;
+            dst_pixels[dstRowOffset + x] = src_pixels[srcRowOffset + src_x];
        }
    }
 }
@@ -132,25 +180,67 @@ void ApplyScale(void *dest, void *source, int pitch, int width, int height, TFix
    TUint16 *src_pixels = static_cast<TUint16 *>(source);
    TUint16 *dst_pixels = static_cast<TUint16 *>(dest);

+    // Pre-calculate pitch in pixels to avoid repeated division.
+    const TInt pitchPixels = pitch >> 1;
+
    for (int y = 0; y < height; ++y) {
-        for (int x = 0; x < width; ++x) {
-            // Translate point to origin.
+        // Calculate destination row offset once per row.
+        TInt dstRowOffset = y * pitchPixels;
+
+        // Pre-calculate translated_y for the entire row.
+        TFixed translated_y = Int2Fix(y) - center_y;
+        TFixed scaled_y = FixDiv(translated_y, scale_y);
+        int final_y = Fix2Int(scaled_y + center_y);
+
+        // Check if this row is within bounds.
+        bool rowInBounds = (final_y >= 0 && final_y < height);
+        TInt srcRowOffset = final_y * pitchPixels;
+
+        int x = 0;
+
+        // Unrolled loop: process 4 pixels at once.
+        for (; x < width - 3; x += 4) {
+            // Pixel 0
+            TFixed translated_x0 = Int2Fix(x) - center_x;
+            TFixed scaled_x0 = FixDiv(translated_x0, scale_x);
+            int final_x0 = Fix2Int(scaled_x0 + center_x);
+
+            // Pixel 1
+            TFixed translated_x1 = Int2Fix(x + 1) - center_x;
+            TFixed scaled_x1 = FixDiv(translated_x1, scale_x);
+            int final_x1 = Fix2Int(scaled_x1 + center_x);
+
+            // Pixel 2
+            TFixed translated_x2 = Int2Fix(x + 2) - center_x;
+            TFixed scaled_x2 = FixDiv(translated_x2, scale_x);
+            int final_x2 = Fix2Int(scaled_x2 + center_x);
+
+            // Pixel 3
+            TFixed translated_x3 = Int2Fix(x + 3) - center_x;
+            TFixed scaled_x3 = FixDiv(translated_x3, scale_x);
+            int final_x3 = Fix2Int(scaled_x3 + center_x);
+
+            // Write all 4 pixels
+            dst_pixels[dstRowOffset + x] = (rowInBounds && final_x0 >= 0 && final_x0 < width) ?
+                src_pixels[srcRowOffset + final_x0] : 0;
+            dst_pixels[dstRowOffset + x + 1] = (rowInBounds && final_x1 >= 0 && final_x1 < width) ?
+                src_pixels[srcRowOffset + final_x1] : 0;
+            dst_pixels[dstRowOffset + x + 2] = (rowInBounds && final_x2 >= 0 && final_x2 < width) ?
+                src_pixels[srcRowOffset + final_x2] : 0;
+            dst_pixels[dstRowOffset + x + 3] = (rowInBounds && final_x3 >= 0 && final_x3 < width) ?
+                src_pixels[srcRowOffset + final_x3] : 0;
+        }
+
+        // Handle remaining pixels.
+        for (; x < width; ++x) {
            TFixed translated_x = Int2Fix(x) - center_x;
-            TFixed translated_y = Int2Fix(y) - center_y;
-
-            // Scale point.
            TFixed scaled_x = FixDiv(translated_x, scale_x);
-            TFixed scaled_y = FixDiv(translated_y, scale_y);
-
-            // Translate point back.
            int final_x = Fix2Int(scaled_x + center_x);
-            int final_y = Fix2Int(scaled_y + center_y);

-            // Check bounds.
-            if (final_x >= 0 && final_x < width && final_y >= 0 && final_y < height) {
-                dst_pixels[y * pitch / 2 + x] = src_pixels[final_y * pitch / 2 + final_x];
+            if (rowInBounds && final_x >= 0 && final_x < width) {
+                dst_pixels[dstRowOffset + x] = src_pixels[srcRowOffset + final_x];
            } else {
-                dst_pixels[y * pitch / 2 + x] = 0;
+                dst_pixels[dstRowOffset + x] = 0;
            }
        }
    }