[N-Gage] Add LUT color mod, cardinal rotation cache and loop unrolling

- Implement lookup tables for faster color modulation (replaces the per-pixel fixed-point multiplies)
- Cache 0°/90°/180°/270° rotations for speedup on common angles
- Add dirty rectangle tracking infrastructure
- Process 4 pixels at a time in all transform operations (unrolled inner loops, with a scalar loop for any leftover pixels)
This commit is contained in:
Michael Fitzmayer 2026-04-15 20:34:23 +02:00
parent e5c8523b36
commit 5bd1a65e6f
6 changed files with 319 additions and 40 deletions

View file

@ -23,30 +23,58 @@
#include "SDL_render_ops.hpp"
#include <3dtypes.h>
void ApplyColorMod(void *dest, void *source, int pitch, int width, int height, SDL_FColor color)
void ApplyColorMod(void *dest, void *source, int pitch, int width, int height, SDL_FColor color, const TUint8 *colorLUT)
{
TUint16 *src_pixels = static_cast<TUint16 *>(source);
TUint16 *dst_pixels = static_cast<TUint16 *>(dest);
TFixed rf = Real2Fix(color.r);
TFixed gf = Real2Fix(color.g);
TFixed bf = Real2Fix(color.b);
// Pre-calculate pitch in pixels to avoid repeated division.
const TInt pitchPixels = pitch >> 1;
const int totalPixels = width * height;
// Process 4 pixels at a time (loop unrolling).
int pixelIndex = 0;
for (int y = 0; y < height; ++y) {
// Calculate row offset once per row.
TInt rowOffset = y * pitchPixels;
int x = 0;
for (int x = 0; x < width; ++x) {
// Unrolled loop: process 4 pixels at once.
for (; x < width - 3; x += 4) {
// Pixel 0
TUint16 p0 = src_pixels[rowOffset + x];
TUint8 r0 = colorLUT[(p0 & 0xF800) >> 8];
TUint8 g0 = colorLUT[256 + ((p0 & 0x07E0) >> 3)];
TUint8 b0 = colorLUT[512 + ((p0 & 0x001F) << 3)];
dst_pixels[rowOffset + x] = (r0 << 8) | (g0 << 3) | (b0 >> 3);
// Pixel 1
TUint16 p1 = src_pixels[rowOffset + x + 1];
TUint8 r1 = colorLUT[(p1 & 0xF800) >> 8];
TUint8 g1 = colorLUT[256 + ((p1 & 0x07E0) >> 3)];
TUint8 b1 = colorLUT[512 + ((p1 & 0x001F) << 3)];
dst_pixels[rowOffset + x + 1] = (r1 << 8) | (g1 << 3) | (b1 >> 3);
// Pixel 2
TUint16 p2 = src_pixels[rowOffset + x + 2];
TUint8 r2 = colorLUT[(p2 & 0xF800) >> 8];
TUint8 g2 = colorLUT[256 + ((p2 & 0x07E0) >> 3)];
TUint8 b2 = colorLUT[512 + ((p2 & 0x001F) << 3)];
dst_pixels[rowOffset + x + 2] = (r2 << 8) | (g2 << 3) | (b2 >> 3);
// Pixel 3
TUint16 p3 = src_pixels[rowOffset + x + 3];
TUint8 r3 = colorLUT[(p3 & 0xF800) >> 8];
TUint8 g3 = colorLUT[256 + ((p3 & 0x07E0) >> 3)];
TUint8 b3 = colorLUT[512 + ((p3 & 0x001F) << 3)];
dst_pixels[rowOffset + x + 3] = (r3 << 8) | (g3 << 3) | (b3 >> 3);
}
// Handle remaining pixels.
for (; x < width; ++x) {
TUint16 pixel = src_pixels[rowOffset + x];
TUint8 r = (pixel & 0xF800) >> 8;
TUint8 g = (pixel & 0x07E0) >> 3;
TUint8 b = (pixel & 0x001F) << 3;
r = FixMul(r, rf);
g = FixMul(g, gf);
b = FixMul(b, bf);
TUint8 r = colorLUT[(pixel & 0xF800) >> 8];
TUint8 g = colorLUT[256 + ((pixel & 0x07E0) >> 3)];
TUint8 b = colorLUT[512 + ((pixel & 0x001F) << 3)];
dst_pixels[rowOffset + x] = (r << 8) | (g << 3) | (b >> 3);
}
}
@ -57,20 +85,40 @@ void ApplyFlip(void *dest, void *source, int pitch, int width, int height, SDL_F
TUint16 *src_pixels = static_cast<TUint16 *>(source);
TUint16 *dst_pixels = static_cast<TUint16 *>(dest);
// Pre-calculate pitch in pixels to avoid repeated division.
const TInt pitchPixels = pitch >> 1;
// Pre-calculate flip flags to avoid repeated bitwise operations.
const bool flipHorizontal = (flip & SDL_FLIP_HORIZONTAL) != 0;
const bool flipVertical = (flip & SDL_FLIP_VERTICAL) != 0;
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
int src_x = x;
int src_y = y;
// Calculate destination row offset once per row.
TInt dstRowOffset = y * pitchPixels;
if (flip & SDL_FLIP_HORIZONTAL) {
src_x = width - 1 - x;
}
// Calculate source Y coordinate once per row.
int src_y = flipVertical ? (height - 1 - y) : y;
TInt srcRowOffset = src_y * pitchPixels;
if (flip & SDL_FLIP_VERTICAL) {
src_y = height - 1 - y;
}
int x = 0;
dst_pixels[y * pitch / 2 + x] = src_pixels[src_y * pitch / 2 + src_x];
// Unrolled loop: process 4 pixels at once.
for (; x < width - 3; x += 4) {
int src_x0 = flipHorizontal ? (width - 1 - x) : x;
int src_x1 = flipHorizontal ? (width - 2 - x) : (x + 1);
int src_x2 = flipHorizontal ? (width - 3 - x) : (x + 2);
int src_x3 = flipHorizontal ? (width - 4 - x) : (x + 3);
dst_pixels[dstRowOffset + x] = src_pixels[srcRowOffset + src_x0];
dst_pixels[dstRowOffset + x + 1] = src_pixels[srcRowOffset + src_x1];
dst_pixels[dstRowOffset + x + 2] = src_pixels[srcRowOffset + src_x2];
dst_pixels[dstRowOffset + x + 3] = src_pixels[srcRowOffset + src_x3];
}
// Handle remaining pixels.
for (; x < width; ++x) {
int src_x = flipHorizontal ? (width - 1 - x) : x;
dst_pixels[dstRowOffset + x] = src_pixels[srcRowOffset + src_x];
}
}
}
@ -132,25 +180,67 @@ void ApplyScale(void *dest, void *source, int pitch, int width, int height, TFix
TUint16 *src_pixels = static_cast<TUint16 *>(source);
TUint16 *dst_pixels = static_cast<TUint16 *>(dest);
// Pre-calculate pitch in pixels to avoid repeated division.
const TInt pitchPixels = pitch >> 1;
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
// Translate point to origin.
// Calculate destination row offset once per row.
TInt dstRowOffset = y * pitchPixels;
// Pre-calculate translated_y for the entire row.
TFixed translated_y = Int2Fix(y) - center_y;
TFixed scaled_y = FixDiv(translated_y, scale_y);
int final_y = Fix2Int(scaled_y + center_y);
// Check if this row is within bounds.
bool rowInBounds = (final_y >= 0 && final_y < height);
TInt srcRowOffset = final_y * pitchPixels;
int x = 0;
// Unrolled loop: process 4 pixels at once.
for (; x < width - 3; x += 4) {
// Pixel 0
TFixed translated_x0 = Int2Fix(x) - center_x;
TFixed scaled_x0 = FixDiv(translated_x0, scale_x);
int final_x0 = Fix2Int(scaled_x0 + center_x);
// Pixel 1
TFixed translated_x1 = Int2Fix(x + 1) - center_x;
TFixed scaled_x1 = FixDiv(translated_x1, scale_x);
int final_x1 = Fix2Int(scaled_x1 + center_x);
// Pixel 2
TFixed translated_x2 = Int2Fix(x + 2) - center_x;
TFixed scaled_x2 = FixDiv(translated_x2, scale_x);
int final_x2 = Fix2Int(scaled_x2 + center_x);
// Pixel 3
TFixed translated_x3 = Int2Fix(x + 3) - center_x;
TFixed scaled_x3 = FixDiv(translated_x3, scale_x);
int final_x3 = Fix2Int(scaled_x3 + center_x);
// Write all 4 pixels
dst_pixels[dstRowOffset + x] = (rowInBounds && final_x0 >= 0 && final_x0 < width) ?
src_pixels[srcRowOffset + final_x0] : 0;
dst_pixels[dstRowOffset + x + 1] = (rowInBounds && final_x1 >= 0 && final_x1 < width) ?
src_pixels[srcRowOffset + final_x1] : 0;
dst_pixels[dstRowOffset + x + 2] = (rowInBounds && final_x2 >= 0 && final_x2 < width) ?
src_pixels[srcRowOffset + final_x2] : 0;
dst_pixels[dstRowOffset + x + 3] = (rowInBounds && final_x3 >= 0 && final_x3 < width) ?
src_pixels[srcRowOffset + final_x3] : 0;
}
// Handle remaining pixels.
for (; x < width; ++x) {
TFixed translated_x = Int2Fix(x) - center_x;
TFixed translated_y = Int2Fix(y) - center_y;
// Scale point.
TFixed scaled_x = FixDiv(translated_x, scale_x);
TFixed scaled_y = FixDiv(translated_y, scale_y);
// Translate point back.
int final_x = Fix2Int(scaled_x + center_x);
int final_y = Fix2Int(scaled_y + center_y);
// Check bounds.
if (final_x >= 0 && final_x < width && final_y >= 0 && final_y < height) {
dst_pixels[y * pitch / 2 + x] = src_pixels[final_y * pitch / 2 + final_x];
if (rowInBounds && final_x >= 0 && final_x < width) {
dst_pixels[dstRowOffset + x] = src_pixels[srcRowOffset + final_x];
} else {
dst_pixels[y * pitch / 2 + x] = 0;
dst_pixels[dstRowOffset + x] = 0;
}
}
}