Add comprehensive encoding fixes for NKJV text corruption
- Detect UTF-8 replacement characters and fall back to Latin-1 encoding
- Add NKJV-specific cleanup to remove replacement chars and normalize line endings
- Strip UTF-8 BOM and handle Windows line endings properly
This commit is contained in:
@@ -64,9 +64,35 @@ function getDataDir(version) {
|
||||
// Helper function to read markdown files with encoding normalization
|
||||
async function readMarkdownFile(filePath) {
|
||||
try {
|
||||
const content = await fs.readFile(filePath, 'utf-8');
|
||||
// Remove BOM if present and normalize encoding issues
|
||||
return content.replace(/^\uFEFF/, ''); // Remove UTF-8 BOM
|
||||
// Try UTF-8 first
|
||||
let content = await fs.readFile(filePath, 'utf-8');
|
||||
|
||||
// Check if content contains replacement characters (indicates wrong encoding)
|
||||
if (content.includes('\ufffd')) {
|
||||
// Try with ISO-8859-1 (Latin-1) encoding common for Windows files
|
||||
try {
|
||||
const isoContent = await fs.readFile(filePath, 'latin1');
|
||||
// Convert ISO-8859-1 to UTF-8
|
||||
content = isoContent.normalize('NFC');
|
||||
} catch (isoError) {
|
||||
console.warn(`Failed to read with Latin-1 encoding for ${filePath}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Remove BOM if present
|
||||
content = content.replace(/^\uFEFF/, '');
|
||||
|
||||
// Additional cleanup for NKJV files
|
||||
if (filePath.includes('/NKJV/')) {
|
||||
// Replace any remaining encoding issues
|
||||
content = content.replace(/\ufffd/g, ''); // Remove any remaining replacement chars
|
||||
|
||||
// Clean up any anomalous line breaks that might emerge in NKJV
|
||||
content = content.replace(/\r\n/g, '\n'); // Normalize line endings
|
||||
content = content.replace(/\r/g, '\n'); // Handle stray \r characters
|
||||
}
|
||||
|
||||
return content;
|
||||
} catch (error) {
|
||||
throw new Error(`Failed to read file: ${filePath}`);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user