#!/usr/bin/env node import { readFileSync, writeFileSync, existsSync, mkdirSync, unlinkSync } from 'fs'; import { join, dirname, basename } from 'path'; import { fileURLToPath } from 'url'; import { Client } from '@notionhq/client'; import { NotionConverter } from 'notion-to-md'; import { DefaultExporter } from 'notion-to-md/plugins/exporter'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); /** * Ensure directory exists */ function ensureDirectory(dir) { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } } /** * Post-process Notion-generated Markdown for better MDX compatibility * @param {string} content - Raw markdown content from Notion * @param {Client} notionClient - Notion API client (optional) * @param {string} notionToken - Notion API token (optional) * @returns {Promise} - Processed markdown content */ export async function postProcessMarkdown(content, notionClient = null, notionToken = null) { console.log('๐Ÿ”ง Post-processing Notion Markdown for MDX compatibility...'); let processedContent = content; // Apply each transformation step processedContent = removeExcludeTags(processedContent); processedContent = await includeNotionPages(processedContent, notionClient, notionToken); processedContent = cleanNotionArtifacts(processedContent); processedContent = fixImageAltTextWithLinks(processedContent); processedContent = fixNotionLinks(processedContent); processedContent = fixJsxAttributes(processedContent); processedContent = optimizeImages(processedContent); processedContent = shiftHeadingLevels(processedContent); processedContent = cleanEmptyLines(processedContent); processedContent = fixCodeBlocks(processedContent); processedContent = fixCodeBlockEndings(processedContent); processedContent = unwrapHtmlCodeBlocks(processedContent); processedContent = fixPlainTextCodeBlocks(processedContent); processedContent = optimizeTables(processedContent); return processedContent; } /** * Remove tags and their content, plus associated media files * @param {string} content - Markdown content * @returns {string} - Content with exclude tags removed and unused imports cleaned */ function removeExcludeTags(content) { console.log(' ๐Ÿ—‘๏ธ Removing tags and associated media...'); let removedCount = 0; const removedImageVariables = new Set(); const mediaFilesToDelete = new Set(); // First, extract image variable names and media files from exclude blocks before removing them const excludeBlocks = content.match(/[\s\S]*?<\/exclude>/g) || []; excludeBlocks.forEach(match => { // Extract image variables from JSX components const imageMatches = match.match(/src=\{([^}]+)\}/g); if (imageMatches) { imageMatches.forEach(imgMatch => { const varName = imgMatch.match(/src=\{([^}]+)\}/)?.[1]; if (varName) { removedImageVariables.add(varName); } }); } // Extract media file paths from markdown images const markdownImages = match.match(/!\[[^\]]*\]\(([^)]+)\)/g); if (markdownImages) { markdownImages.forEach(imgMatch => { const src = imgMatch.match(/!\[[^\]]*\]\(([^)]+)\)/)?.[1]; if (src) { // Extract filename from path like /media/pageId/filename.png const filename = basename(src); if (filename) { mediaFilesToDelete.add(filename); } } }); } }); // Remove tags and everything between them (including multiline) content = content.replace(/[\s\S]*?<\/exclude>/g, (match) => { removedCount++; return ''; }); // Delete associated media files if (mediaFilesToDelete.size > 0) { console.log(` ๐Ÿ—‘๏ธ Found ${mediaFilesToDelete.size} media file(s) to delete from exclude blocks`); // Try to find and delete media files in common locations const possibleMediaDirs = [ join(__dirname, 'output', 'media'), join(__dirname, '..', '..', 'src', 'content', 'assets', 'image') ]; mediaFilesToDelete.forEach(filename => { let deleted = false; for (const mediaDir of possibleMediaDirs) { if (existsSync(mediaDir)) { const filePath = join(mediaDir, filename); if (existsSync(filePath)) { try { unlinkSync(filePath); console.log(` ๐Ÿ—‘๏ธ Deleted media file: ${filename}`); deleted = true; break; } catch (error) { console.log(` โš ๏ธ Failed to delete ${filename}: ${error.message}`); } } } } if (!deleted) { console.log(` โ„น๏ธ Media file not found: ${filename}`); } }); } // Remove unused image imports that were only used in exclude blocks if (removedImageVariables.size > 0) { console.log(` ๐Ÿ–ผ๏ธ Found ${removedImageVariables.size} unused image import(s) in exclude blocks`); removedImageVariables.forEach(varName => { // Check if the variable is still used elsewhere in the content after removing exclude blocks const remainingUsage = content.includes(`{${varName}}`) || content.includes(`src={${varName}}`); if (!remainingUsage) { // Remove import lines for unused image variables // Pattern: import VarName from './assets/image/filename'; const importPattern = new RegExp(`import\\s+${varName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\s+from\\s+['"][^'"]+['"];?\\s*`, 'g'); content = content.replace(importPattern, ''); console.log(` ๐Ÿ—‘๏ธ Removed unused import: ${varName}`); } }); console.log(` ๐Ÿงน Cleaned up unused image imports`); } if (removedCount > 0) { console.log(` โœ… Removed ${removedCount} tag(s) and their content`); } else { console.log(' โ„น๏ธ No tags found'); } return content; } /** * Replace Notion page links with their actual content * @param {string} content - Markdown content * @param {Client} notionClient - Notion API client * @param {string} notionToken - Notion API token * @returns {Promise} - Content with page links replaced */ async function includeNotionPages(content, notionClient, notionToken) { console.log(' ๐Ÿ“„ Including linked Notion pages...'); if (!notionClient || !notionToken) { console.log(' โ„น๏ธ Skipping page inclusion (no Notion client/token provided)'); return content; } let includedCount = 0; let skippedCount = 0; // First, identify all exclude blocks to avoid processing links within them const excludeBlocks = []; const excludeRegex = /[\s\S]*?<\/exclude>/g; let excludeMatch; while ((excludeMatch = excludeRegex.exec(content)) !== null) { excludeBlocks.push({ start: excludeMatch.index, end: excludeMatch.index + excludeMatch[0].length }); } // Helper function to check if a position is within an exclude block const isWithinExcludeBlock = (position) => { return excludeBlocks.some(block => position >= block.start && position <= block.end); }; // Regex to match links to Notion pages with UUID format // Pattern: [text](uuid-with-dashes) const notionPageLinkRegex = /\[([^\]]+)\]\(([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\)/g; let processedContent = content; let match; // Find all matches const matches = []; while ((match = notionPageLinkRegex.exec(content)) !== null) { const linkStartPos = match.index; // Skip if this link is within an exclude block if (isWithinExcludeBlock(linkStartPos)) { console.log(` โญ๏ธ Skipping page link in exclude block: ${match[1]} (${match[2]})`); skippedCount++; continue; } matches.push({ fullMatch: match[0], linkText: match[1], pageId: match[2], startPos: match.index, endPos: match.index + match[0].length }); } // Process matches in reverse order to maintain correct indices for (let i = matches.length - 1; i >= 0; i--) { const link = matches[i]; try { console.log(` ๐Ÿ”— Fetching content for page: ${link.pageId}`); // Create media directory for this sub-page const outputDir = join(__dirname, 'output'); const mediaDir = join(outputDir, 'media', link.pageId); ensureDirectory(mediaDir); // Configure the DefaultExporter to get content as string const exporter = new DefaultExporter({ outputType: 'string', }); // Create the converter with media downloading strategy (same as convertNotionPage) const converter = new NotionConverter(notionClient) .withExporter(exporter) // Download media to local directory with path transformation .downloadMediaTo({ outputDir: mediaDir, // Transform paths to be web-accessible transformPath: (localPath) => `/media/${link.pageId}/${basename(localPath)}`, }); // Convert the page const result = await converter.convert(link.pageId); console.log(` ๐Ÿ–ผ๏ธ Media saved to: ${mediaDir}`); if (result && result.content) { // Save raw content as .raw.md file const rawFileName = `${link.linkText.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-${link.pageId}`; const rawFilePath = join(outputDir, `${rawFileName}.raw.md`); try { writeFileSync(rawFilePath, result.content); console.log(` ๐Ÿ“„ Saved raw markdown: ${rawFileName}.raw.md`); } catch (error) { console.log(` โš ๏ธ Failed to save raw file: ${error.message}`); } // Clean the content (remove frontmatter, etc.) let pageContent = result.content; // Remove YAML frontmatter if present pageContent = pageContent.replace(/^---[\s\S]*?---\s*\n/, ''); // Remove the first markdown heading (H1, H2, H3, etc.) from the included page pageContent = pageContent.replace(/^#+ .+\n\n?/, ''); // Keep the page content without title const finalContent = '\n\n' + pageContent.trim() + '\n\n'; // Replace the link with the content processedContent = processedContent.substring(0, link.startPos) + finalContent + processedContent.substring(link.endPos); includedCount++; console.log(` โœ… Included page content: ${link.linkText}`); } else { console.log(` โš ๏ธ No content found for page: ${link.pageId}`); } } catch (error) { console.log(` โŒ Failed to fetch page ${link.pageId}: ${error.message}`); // Keep the original link if we can't fetch the content } } if (includedCount > 0) { console.log(` โœ… Included ${includedCount} Notion page(s)`); } else { console.log(' โ„น๏ธ No Notion page links found to include'); } if (skippedCount > 0) { console.log(` โญ๏ธ Skipped ${skippedCount} page link(s) in exclude blocks`); } return processedContent; } /** * Clean Notion-specific artifacts and formatting * @param {string} content - Markdown content * @returns {string} - Cleaned content */ function cleanNotionArtifacts(content) { console.log(' ๐Ÿงน Cleaning Notion artifacts...'); let cleanedCount = 0; // Remove Notion's internal page references that don't convert well content = content.replace(/\[([^\]]+)\]\(https:\/\/www\.notion\.so\/[^)]+\)/g, (match, text) => { cleanedCount++; return text; // Keep just the text, remove the broken link }); // Clean up Notion's callout blocks that might not render properly content = content.replace(/^> \*\*([^*]+)\*\*\s*\n/gm, '> **$1**\n\n'); // Remove Notion's page dividers that don't have markdown equivalents content = content.replace(/^---+\s*$/gm, ''); // Clean up empty blockquotes content = content.replace(/^>\s*$/gm, ''); // Fix corrupted bold/italic formatting from notion-to-md conversion // Pattern: ***text*** **** -> ***text*** content = content.replace(/\*\*\*([^*]+)\*\*\*\s+\*\*\*\*/g, (match, text) => { cleanedCount++; return `***${text.trim()}***`; }); // Fix other corrupted asterisk patterns // Pattern: **text** ** -> **text** content = content.replace(/\*\*([^*]+)\*\*\s+\*\*/g, (match, text) => { cleanedCount++; return `**${text.trim()}**`; }); if (cleanedCount > 0) { console.log(` โœ… Cleaned ${cleanedCount} Notion artifact(s)`); } return content; } /** * Fix image alt text that contains markdown links * notion-to-md v4 sometimes generates: ![alt with [link](url)](image_path) * This breaks MDX parsing. Clean it to: ![alt with @mention](image_path) * @param {string} content - Markdown content * @returns {string} - Content with fixed image alt text */ function fixImageAltTextWithLinks(content) { console.log(' ๐Ÿ–ผ๏ธ Fixing image alt text with embedded links...'); let fixedCount = 0; // Pattern: ![text [link](url) more_text](image_path) // This regex finds images where the alt text contains markdown links const imageWithLinksPattern = /!\[([^\]]*\[[^\]]+\]\([^)]+\)[^\]]*)\]\(([^)]+)\)/g; content = content.replace(imageWithLinksPattern, (match, altText, imagePath) => { fixedCount++; // Remove all markdown links from alt text: [text](url) -> text const cleanedAlt = altText.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'); // Also clean up any remaining brackets const finalAlt = cleanedAlt.replace(/[\[\]]/g, ''); console.log(` ๐Ÿ”ง Fixed: "${altText.substring(0, 50)}..." -> "${finalAlt.substring(0, 50)}..."`); return `![${finalAlt}](${imagePath})`; }); if (fixedCount > 0) { console.log(` โœ… Fixed ${fixedCount} image(s) with embedded links in alt text`); } else { console.log(' โ„น๏ธ No images with embedded links found'); } return content; } /** * Fix Notion internal links to be more MDX-friendly * @param {string} content - Markdown content * @returns {string} - Content with fixed links */ function fixNotionLinks(content) { console.log(' ๐Ÿ”— Fixing Notion internal links...'); let fixedCount = 0; // Convert Notion page links to relative links (assuming they'll be converted to MDX) content = content.replace(/\[([^\]]+)\]\(https:\/\/www\.notion\.so\/[^/]+\/([^?#)]+)\)/g, (match, text, pageId) => { fixedCount++; // Convert to relative link - this will need to be updated based on your routing return `[${text}](#${pageId})`; }); // Fix broken notion.so links that might be malformed content = content.replace(/\[([^\]]+)\]\(https:\/\/www\.notion\.so\/[^)]*\)/g, (match, text) => { fixedCount++; return text; // Remove broken links, keep text }); if (fixedCount > 0) { console.log(` โœ… Fixed ${fixedCount} Notion link(s)`); } return content; } /** * Fix JSX attributes that were corrupted during Notion conversion * @param {string} content - Markdown content * @returns {string} - Content with fixed JSX attributes */ function fixJsxAttributes(content) { console.log(' ๐Ÿ”ง Fixing JSX attributes corrupted by Notion conversion...'); let fixedCount = 0; // Fix the specific issue: โ†’ // Pattern: content = content.replace(/<(\w+)\s+\*\s*([^*\s]+)\s*\*\s*=\s*"([^"]*)"\s*\/?>/g, (match, tagName, attribute, value) => { fixedCount++; return `<${tagName} ${attribute}="${value}" />`; }); // Pattern: content = content.replace(/<(\w+)\s+\*\s*([^*\s]+)\s*\*\s*=\s*([^>\s\/]+)\s*\/?>/g, (match, tagName, attribute, value) => { fixedCount++; return `<${tagName} ${attribute}=${value} />`; }); // Handle cases with **double asterisks** around attribute names content = content.replace(/<(\w+)\s+\*\*\s*([^*\s]+)\s*\*\*\s*=\s*"([^"]*)"\s*\/?>/g, (match, tagName, attribute, value) => { fixedCount++; return `<${tagName} ${attribute}="${value}" />`; }); content = content.replace(/<(\w+)\s+\*\*\s*([^*\s]+)\s*\*\*\s*=\s*([^>\s\/]+)\s*\/?>/g, (match, tagName, attribute, value) => { fixedCount++; return `<${tagName} ${attribute}=${value} />`; }); // Fix HTML tags (like iframe, video, etc.) where URLs were corrupted by markdown conversion // Pattern: src="[url](url)" -> src="url" // Handle both regular quotes and various smart quote characters (", ", ', ', """, etc.) // Handle attributes before and after src // Handle iframe tags with separate opening and closing tags FIRST: content = content.replace(/]*?)\ssrc=[""''""\u201C\u201D\u2018\u2019]\[([^\]]+)\]\([^)]+\)[""''""\u201C\u201D\u2018\u2019]([^>]*?)>\s*<\/iframe>/gi, (match, before, urlText, after) => { fixedCount++; return ``; }); // Handle self-closing iframe tags SECOND: