Add comprehensive encoding fixes for NKJV text corruption

- Detect UTF-8 replacement characters and fall back to Latin-1 encoding
- Add NKJV-specific cleanup to remove replacement chars and normalize line endings
- Strip UTF-8 BOM and handle Windows line endings properly
2025-09-28 13:14:43 -04:00
parent 1f526df0bc
commit 2b3d753275
1 changed files with 29 additions and 3 deletions
--- a/backend/src/index.js
+++ b/backend/src/index.js
@@ -64,9 +64,35 @@ function getDataDir(version) {
 // Helper function to read markdown files with encoding normalization
 async function readMarkdownFile(filePath) {
  try {
-    const content = await fs.readFile(filePath, 'utf-8');
-    // Remove BOM if present and normalize encoding issues
-    return content.replace(/^\uFEFF/, ''); // Remove UTF-8 BOM
+    // Try UTF-8 first
+    let content = await fs.readFile(filePath, 'utf-8');
+
+    // Check if content contains replacement characters (indicates wrong encoding)
+    if (content.includes('\ufffd')) {
+      // Try with ISO-8859-1 (Latin-1) encoding common for Windows files
+      try {
+        const isoContent = await fs.readFile(filePath, 'latin1');
+        // Convert ISO-8859-1 to UTF-8
+        content = isoContent.normalize('NFC');
+      } catch (isoError) {
+        console.warn(`Failed to read with Latin-1 encoding for ${filePath}`);
+      }
+    }
+
+    // Remove BOM if present
+    content = content.replace(/^\uFEFF/, '');
+
+    // Additional cleanup for NKJV files
+    if (filePath.includes('/NKJV/')) {
+      // Replace any remaining encoding issues
+      content = content.replace(/\ufffd/g, ''); // Remove any remaining replacement chars
+
+      // Clean up any anomalous line breaks that might emerge in NKJV
+      content = content.replace(/\r\n/g, '\n'); // Normalize line endings
+      content = content.replace(/\r/g, '\n');   // Handle stray \r characters
+    }
+
+    return content;
  } catch (error) {
    throw new Error(`Failed to read file: ${filePath}`);
  }