canvas-website/src/lib/obsidianImporter.ts


/**
* Obsidian Vault Importer
* Handles reading and processing markdown files from a local Obsidian vault
*/
import { GitHubQuartzReader, GitHubQuartzConfig } from './githubQuartzReader'
import { getClientConfig } from './clientConfig'
export interface ObsidianObsNote {
id: string
title: string
content: string
filePath: string
tags: string[]
created: Date | string
modified: Date | string
links: string[]
backlinks: string[]
frontmatter: Record<string, any>
vaultPath?: string
}
export interface FolderNode {
name: string
path: string
children: FolderNode[]
notes: ObsidianObsNote[]
isExpanded: boolean
level: number
}
export interface ObsidianVault {
name: string
path: string
obs_notes: ObsidianObsNote[]
totalObsNotes: number
lastImported: Date
folderTree: FolderNode
}
export interface ObsidianVaultRecord {
id: string
typeName: 'obsidian_vault'
name: string
path: string
obs_notes: ObsidianObsNote[]
totalObsNotes: number
lastImported: Date
folderTree: FolderNode
meta: Record<string, any>
}
export class ObsidianImporter {
private vault: ObsidianVault | null = null
/**
* Import notes from a directory path (currently returns demo data)
* A real implementation would read the directory via the File System Access API
* (see importFromFileSystem below)
*/
async importFromDirectory(directoryPath: string): Promise<ObsidianVault> {
try {
// Placeholder: build a demo vault. A real implementation would read markdown
// files from disk via the File System Access API (see importFromFileSystem).
const mockObsNotes = await this.createMockObsNotes()
this.vault = {
name: this.extractVaultName(directoryPath),
path: directoryPath,
obs_notes: mockObsNotes,
totalObsNotes: mockObsNotes.length,
lastImported: new Date(),
folderTree: this.buildFolderTree(mockObsNotes)
}
return this.vault
} catch (error) {
console.error('Error importing Obsidian vault:', error)
throw new Error('Failed to import Obsidian vault')
}
}
/**
* Import notes from a Quartz URL using GitHub API
*/
async importFromQuartzUrl(quartzUrl: string): Promise<ObsidianVault> {
try {
// Ensure URL has protocol
const url = quartzUrl.startsWith('http') ? quartzUrl : `https://${quartzUrl}`
// Try to get GitHub repository info from environment or URL
const githubConfig = this.getGitHubConfigFromUrl(url)
if (githubConfig) {
const obs_notes = await this.importFromGitHub(githubConfig)
this.vault = {
name: this.extractVaultNameFromUrl(url),
path: url,
obs_notes,
totalObsNotes: obs_notes.length,
lastImported: new Date(),
folderTree: this.buildFolderTree(obs_notes)
}
return this.vault
} else {
// Fall back to scraping the published Quartz site directly
const obs_notes = await this.discoverQuartzContent(url)
this.vault = {
name: this.extractVaultNameFromUrl(url),
path: url,
obs_notes,
totalObsNotes: obs_notes.length,
lastImported: new Date(),
folderTree: this.buildFolderTree(obs_notes)
}
return this.vault
}
} catch (error) {
console.error('Error importing from Quartz URL:', error)
throw new Error('Failed to import from Quartz URL')
}
}
/**
* Import notes using File System Access API (modern browsers)
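*
* A minimal usage sketch (illustrative; assumes a browser that exposes
* window.showDirectoryPicker):
* @example
*   const importer = new ObsidianImporter()
*   const vault = await importer.importFromFileSystem()
*   console.log(`${vault.totalObsNotes} notes imported from "${vault.name}"`)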
*/
async importFromFileSystem(): Promise<ObsidianVault> {
try {
// Check if File System Access API is supported
if (!('showDirectoryPicker' in window)) {
throw new Error('File System Access API not supported in this browser')
}
// Request directory access
const directoryHandle = await (window as any).showDirectoryPicker({
mode: 'read'
})
const obs_notes: ObsidianObsNote[] = []
await this.readDirectoryRecursively(directoryHandle, obs_notes, '')
this.vault = {
name: directoryHandle.name,
path: directoryHandle.name, // File System Access API doesn't expose full path
obs_notes,
totalObsNotes: obs_notes.length,
lastImported: new Date(),
folderTree: this.buildFolderTree(obs_notes)
}
return this.vault
} catch (error) {
console.error('Error importing Obsidian vault via File System Access API:', error)
throw new Error('Failed to import Obsidian vault')
}
}
/**
* Recursively read directory and process markdown files
*/
private async readDirectoryRecursively(
directoryHandle: any,
obs_notes: ObsidianObsNote[],
relativePath: string
): Promise<void> {
for await (const [name, handle] of directoryHandle.entries()) {
const currentPath = relativePath ? `${relativePath}/${name}` : name
if (handle.kind === 'directory') {
// Skip hidden directories (e.g. .obsidian) and node_modules
if (!name.startsWith('.') && name !== 'node_modules') {
await this.readDirectoryRecursively(handle, obs_notes, currentPath)
}
} else if (handle.kind === 'file' && name.endsWith('.md')) {
try {
const file = await handle.getFile()
const content = await file.text()
const obs_note = this.parseMarkdownFile(content, currentPath, file.lastModified)
obs_notes.push(obs_note)
} catch (error) {
console.warn(`Failed to read file ${currentPath}:`, error)
}
}
}
}
/**
* Parse a markdown file and extract metadata
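*
* Rough input/output sketch (values are hypothetical):
* @example
*   // content: '---\ntitle: My Note\ntags: ["a", "b"]\n---\n# My Note\nBody #draft'
*   // -> { title: 'My Note', tags: ['a', 'b', '#draft'], links: [], ... }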
*/
private parseMarkdownFile(content: string, filePath: string, lastModified: number): ObsidianObsNote {
// Extract frontmatter
const frontmatterMatch = content.match(/^---\s*\n([\s\S]*?)\n---\s*\n([\s\S]*)$/)
let frontmatter: Record<string, any> = {}
let body = content
if (frontmatterMatch) {
try {
const frontmatterText = frontmatterMatch[1]
// Simple YAML parsing (in production, use a proper YAML parser)
frontmatter = this.parseSimpleYaml(frontmatterText)
body = frontmatterMatch[2]
} catch (error) {
console.warn('Failed to parse frontmatter:', error)
}
}
// Extract title from frontmatter or first heading
const title = frontmatter.title || this.extractTitle(body) || this.extractFileName(filePath)
// Extract tags
const tags = this.extractTags(body, frontmatter)
// Extract links
const links = this.extractLinks(body, '')
// Generate unique ID
const id = this.generateNoteId(filePath)
return {
id,
title,
content: body,
filePath,
tags,
created: new Date(frontmatter.created || lastModified),
modified: new Date(lastModified),
links,
backlinks: [], // Would need to be calculated by analyzing all notes
frontmatter
}
}
/**
* Extract title from markdown content
*/
private extractTitle(content: string): string | null {
const headingMatch = content.match(/^#\s+(.+)$/m)
return headingMatch ? headingMatch[1].trim() : null
}
/**
* Extract filename without extension
*/
private extractFileName(filePath: string): string {
const fileName = filePath.split('/').pop() || filePath
return fileName.replace(/\.md$/, '')
}
/**
* Extract tags from content and frontmatter
*/
private extractTags(content: string, frontmatter: Record<string, any>): string[] {
const tags = new Set<string>()
// Extract from frontmatter
if (frontmatter.tags) {
if (Array.isArray(frontmatter.tags)) {
frontmatter.tags.forEach((tag: string) => tags.add(tag))
} else if (typeof frontmatter.tags === 'string') {
frontmatter.tags.split(',').forEach((tag: string) => tags.add(tag.trim()))
}
}
// Extract from content (#tag format)
const tagMatches = content.match(/#[a-zA-Z0-9_-]+/g)
if (tagMatches) {
tagMatches.forEach(tag => tags.add(tag))
}
return Array.from(tags)
}
/**
* Generate unique ID for note
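*
* Example of the resulting format:
* @example
*   generateNoteId('Meetings/2024-01-15 Meeting Notes.md')
*   // -> 'note_Meetings_2024_01_15_Meeting_Notes_md'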
*/
private generateNoteId(filePath: string): string {
return `note_${filePath.replace(/[^a-zA-Z0-9]/g, '_')}`
}
/**
* Simple YAML parser for frontmatter
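*
* Illustrative sketch of the supported subset (not a full YAML parser):
* @example
*   parseSimpleYaml('title: "My Note"\ntags: ["a", "b"]')
*   // -> { title: 'My Note', tags: ['a', 'b'] }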
*/
private parseSimpleYaml(yamlText: string): Record<string, any> {
const result: Record<string, any> = {}
const lines = yamlText.split('\n')
for (const line of lines) {
const trimmed = line.trim()
if (trimmed && !trimmed.startsWith('#')) {
const colonIndex = trimmed.indexOf(':')
if (colonIndex > 0) {
const key = trimmed.substring(0, colonIndex).trim()
let value = trimmed.substring(colonIndex + 1).trim()
// Remove quotes if present
if ((value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))) {
value = value.slice(1, -1)
}
// Parse arrays
if (value.startsWith('[') && value.endsWith(']')) {
try {
value = JSON.parse(value)
} catch {
// If JSON parsing fails, treat as string
}
}
result[key] = value
}
}
}
return result
}
/**
* Extract vault name from path
*/
private extractVaultName(path: string): string {
const parts = path.split('/')
return parts[parts.length - 1] || 'Obsidian Vault'
}
/**
* Create mock obs_notes for demonstration
*/
private async createMockObsNotes(): Promise<ObsidianObsNote[]> {
return [
{
id: 'note_1',
title: 'Welcome to Obsidian',
content: `# Welcome to Obsidian
This is a sample note from your Obsidian vault. You can drag this note onto the canvas to create a new rectangle shape.
## Features
- [[Note Linking]]
- #tags
- [External Links](https://obsidian.md)
## Tasks
- [x] Set up vault
- [ ] Import notes
- [ ] Organize content`,
filePath: 'Welcome to Obsidian.md',
tags: ['#welcome', '#getting-started'],
created: new Date('2024-01-01'),
modified: new Date('2024-01-15'),
links: ['Note Linking', 'https://obsidian.md'],
backlinks: [],
frontmatter: {
title: 'Welcome to Obsidian',
tags: ['welcome', 'getting-started'],
created: '2024-01-01'
}
},
{
id: 'note_2',
title: 'Project Ideas',
content: `# Project Ideas
A collection of creative project ideas and concepts.
## Web Development
- Canvas-based drawing app
- Real-time collaboration tools
- AI-powered content generation
## Design
- Interactive data visualizations
- User experience improvements
- Mobile-first design patterns`,
filePath: 'Project Ideas.md',
tags: ['#projects', '#ideas', '#development'],
created: new Date('2024-01-05'),
modified: new Date('2024-01-20'),
links: [],
backlinks: [],
frontmatter: {
title: 'Project Ideas',
tags: ['projects', 'ideas', 'development']
}
},
{
id: 'note_3',
title: 'Meeting Notes',
content: `# Meeting Notes - January 15, 2024
## Attendees
- John Doe
- Jane Smith
- Bob Johnson
## Agenda
1. Project status update
2. Budget review
3. Timeline discussion
## Action Items
- [ ] Complete budget analysis by Friday
- [ ] Schedule follow-up meeting
- [ ] Update project documentation`,
filePath: 'Meetings/2024-01-15 Meeting Notes.md',
tags: ['#meetings', '#2024'],
created: new Date('2024-01-15'),
modified: new Date('2024-01-15'),
links: [],
backlinks: [],
frontmatter: {
title: 'Meeting Notes - January 15, 2024',
date: '2024-01-15',
tags: ['meetings', '2024']
}
}
]
}
/**
* Get the current vault
*/
getVault(): ObsidianVault | null {
return this.vault
}
/**
* Search obs_notes in the vault
*/
searchObsNotes(query: string): ObsidianObsNote[] {
if (!this.vault) return []
const lowercaseQuery = query.toLowerCase()
return this.vault.obs_notes.filter(obs_note =>
obs_note.title.toLowerCase().includes(lowercaseQuery) ||
obs_note.content.toLowerCase().includes(lowercaseQuery) ||
obs_note.tags.some(tag => tag.toLowerCase().includes(lowercaseQuery))
)
}
/**
* Get obs_notes by tag
*/
getObsNotesByTag(tag: string): ObsidianObsNote[] {
if (!this.vault) return []
return this.vault.obs_notes.filter(obs_note =>
obs_note.tags.some(noteTag => noteTag.toLowerCase().includes(tag.toLowerCase()))
)
}
/**
* Get all unique tags
*/
getAllTags(): string[] {
if (!this.vault) return []
const allTags = new Set<string>()
this.vault.obs_notes.forEach(obs_note => {
obs_note.tags.forEach(tag => allTags.add(tag))
})
return Array.from(allTags).sort()
}
/**
* Build folder tree structure from obs_notes
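*
* Rough shape of the result (illustrative):
* @example
*   // Notes at 'Welcome.md' and 'Meetings/2024-01-15.md' produce:
*   // { name: 'Root', path: '', level: 0, notes: [welcomeNote],
*   //   children: [{ name: 'Meetings', path: 'Meetings', level: 1,
*   //                notes: [meetingNote], children: [] }] }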
*/
buildFolderTree(obs_notes: ObsidianObsNote[]): FolderNode {
const root: FolderNode = {
name: 'Root',
path: '',
children: [],
notes: [],
isExpanded: true,
level: 0
}
// Group notes by their folder paths
const folderMap = new Map<string, { folders: string[], notes: ObsidianObsNote[] }>()
obs_notes.forEach(note => {
const pathParts = this.parseFilePath(note.filePath)
const folderKey = pathParts.folders.join('/')
if (!folderMap.has(folderKey)) {
folderMap.set(folderKey, { folders: pathParts.folders, notes: [] })
}
folderMap.get(folderKey)!.notes.push(note)
})
// Build the tree structure
folderMap.forEach(({ folders, notes }) => {
this.addFolderToTree(root, folders, notes)
})
return root
}
/**
* Parse file path into folder structure
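*
* Example:
* @example
*   parseFilePath('Meetings/2024-01-15 Meeting Notes.md')
*   // -> { folders: ['Meetings'], fileName: '2024-01-15 Meeting Notes.md' }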
*/
private parseFilePath(filePath: string): { folders: string[], fileName: string } {
// Handle both local paths and URLs
let pathToParse = filePath
if (filePath.startsWith('http')) {
// Extract pathname from URL
try {
const url = new URL(filePath)
pathToParse = url.pathname.replace(/^\//, '')
} catch (e) {
console.warn('Invalid URL:', filePath)
return { folders: [], fileName: filePath }
}
}
// Split path and filter out empty parts
const parts = pathToParse.split('/').filter(part => part.length > 0)
if (parts.length === 0) {
return { folders: [], fileName: filePath }
}
const fileName = parts[parts.length - 1]
const folders = parts.slice(0, -1)
return { folders, fileName }
}
/**
* Add folder to tree structure
*/
private addFolderToTree(root: FolderNode, folderPath: string[], notes: ObsidianObsNote[]): void {
let current = root
for (let i = 0; i < folderPath.length; i++) {
const folderName = folderPath[i]
let existingFolder = current.children.find(child => child.name === folderName)
if (!existingFolder) {
const currentPath = folderPath.slice(0, i + 1).join('/')
existingFolder = {
name: folderName,
path: currentPath,
children: [],
notes: [],
isExpanded: false,
level: i + 1
}
current.children.push(existingFolder)
}
current = existingFolder
}
// Add notes to the final folder
current.notes.push(...notes)
}
/**
* Get all notes from a folder tree (recursive)
*/
getAllNotesFromTree(folder: FolderNode): ObsidianObsNote[] {
let notes = [...folder.notes]
folder.children.forEach(child => {
notes.push(...this.getAllNotesFromTree(child))
})
return notes
}
/**
* Find folder by path in tree
*/
findFolderByPath(root: FolderNode, path: string): FolderNode | null {
if (root.path === path) {
return root
}
for (const child of root.children) {
const found = this.findFolderByPath(child, path)
if (found) {
return found
}
}
return null
}
/**
* Convert vault to Automerge record format
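*
* Sketch of the produced id (all other fields are copied through):
* @example
*   // vault.name === 'My Vault'  ->  record.id === 'obsidian_vault:My Vault'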
*/
vaultToRecord(vault: ObsidianVault): ObsidianVaultRecord {
return {
id: `obsidian_vault:${vault.name}`,
typeName: 'obsidian_vault',
name: vault.name,
path: vault.path,
obs_notes: vault.obs_notes,
totalObsNotes: vault.totalObsNotes,
lastImported: vault.lastImported,
folderTree: vault.folderTree,
meta: {}
}
}
/**
* Convert Automerge record to vault format
*/
recordToVault(record: ObsidianVaultRecord): ObsidianVault {
return {
name: record.name,
path: record.path,
obs_notes: record.obs_notes,
totalObsNotes: record.totalObsNotes,
lastImported: record.lastImported,
folderTree: record.folderTree
}
}
/**
* Search notes in the current vault
*/
async searchNotes(query: string): Promise<ObsidianObsNote[]> {
if (!this.vault) return []
// If this is a GitHub-based Quartz vault, use GitHub search
if (this.vault.path && (this.vault.path.startsWith('http') || this.vault.path.includes('github'))) {
const githubConfig = this.getGitHubConfigFromUrl(this.vault.path)
if (githubConfig) {
try {
const reader = new GitHubQuartzReader(githubConfig)
const quartzNotes = await reader.searchNotes(query)
// Convert to Obsidian format
return quartzNotes.map(note => ({
id: note.id,
title: note.title,
content: note.content,
filePath: note.filePath,
tags: note.tags,
links: [],
created: new Date().toISOString(),
modified: note.lastModified,
vaultPath: githubConfig.owner + '/' + githubConfig.repo,
backlinks: [],
frontmatter: note.frontmatter
}))
} catch (error) {
console.error('GitHub search failed, falling back to local search:', error)
}
}
}
// Fallback to local search
const searchTerm = query.toLowerCase()
return this.vault.obs_notes.filter(note =>
note.title.toLowerCase().includes(searchTerm) ||
note.content.toLowerCase().includes(searchTerm) ||
note.tags.some(tag => tag.toLowerCase().includes(searchTerm))
)
}
/**
* Get GitHub configuration from client config
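*
* Returns null unless clientConfig carries real values. Illustrative mapping
* (token and repo are hypothetical placeholders):
* @example
*   // { githubToken: 'ghp_xxx', quartzRepo: 'owner/repo', quartzBranch: 'main' }
*   // -> { token: 'ghp_xxx', owner: 'owner', repo: 'repo', branch: 'main', contentPath: 'content' }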
*/
private getGitHubConfigFromUrl(_quartzUrl: string): GitHubQuartzConfig | null {
const config = getClientConfig()
const githubToken = config.githubToken
const githubRepo = config.quartzRepo
if (!githubToken || !githubRepo) {
return null
}
if (githubToken === 'your_github_token_here' || githubRepo === 'your_username/your-quartz-repo') {
return null
}
const [owner, repo] = githubRepo.split('/')
if (!owner || !repo) {
return null
}
return {
token: githubToken,
owner,
repo,
branch: config.quartzBranch || 'main',
contentPath: 'content'
}
}
/**
* Import notes from GitHub repository
*/
private async importFromGitHub(config: GitHubQuartzConfig): Promise<ObsidianObsNote[]> {
try {
const reader = new GitHubQuartzReader(config)
const quartzNotes = await reader.getAllNotes()
// Convert Quartz notes to Obsidian format and deduplicate by ID
const notesMap = new Map<string, ObsidianObsNote>()
quartzNotes
.filter(note => note != null) // Filter out any null/undefined notes
.forEach(note => {
const obsNote: ObsidianObsNote = {
id: note.id || 'unknown',
title: note.title || 'Untitled',
content: note.content || '',
filePath: note.filePath || 'unknown',
tags: note.tags || [],
links: [], // Will be populated if needed
created: new Date(),
modified: new Date(note.lastModified || new Date().toISOString()),
backlinks: [],
frontmatter: note.frontmatter || {},
vaultPath: config.owner + '/' + config.repo,
}
// If we already have a note with this ID, prefer the one whose filename has no
// quotes; if that doesn't decide it, keep the one with the longer content
// (assuming it's more complete)
const existing = notesMap.get(obsNote.id)
if (existing) {
console.warn(`Duplicate note ID found: ${obsNote.id}. File paths: ${existing.filePath} vs ${obsNote.filePath}`)
// Prefer the note without quotes in the filename
const existingHasQuotes = existing.filePath.includes('"')
const currentHasQuotes = obsNote.filePath.includes('"')
if (currentHasQuotes && !existingHasQuotes) {
return // Keep the existing one
} else if (!currentHasQuotes && existingHasQuotes) {
notesMap.set(obsNote.id, obsNote)
} else {
// Both have or don't have quotes, prefer the one with more content
if (obsNote.content.length > existing.content.length) {
notesMap.set(obsNote.id, obsNote)
}
}
} else {
notesMap.set(obsNote.id, obsNote)
}
})
const uniqueNotes = Array.from(notesMap.values())
return uniqueNotes
} catch (error) {
console.error('Failed to import from GitHub:', error)
throw error
}
}
/**
* Discover content from a Quartz site (fallback method)
*/
private async discoverQuartzContent(baseUrl: string): Promise<ObsidianObsNote[]> {
const obs_notes: ObsidianObsNote[] = []
try {
// Try to find content through common Quartz patterns
const contentUrls = await this.findQuartzContentUrls(baseUrl)
if (contentUrls.length === 0) {
return obs_notes
}
for (const contentUrl of contentUrls) {
try {
const response = await fetch(contentUrl)
if (!response.ok) {
continue
}
const content = await response.text()
const obs_note = this.parseQuartzMarkdown(content, contentUrl, baseUrl)
// Add all notes regardless of content length
obs_notes.push(obs_note)
} catch (error) {
// Silently skip failed fetches
}
}
} catch (error) {
console.warn('⚠️ Failed to discover Quartz content:', error)
}
return obs_notes
}
/**
* Find content URLs from a Quartz site
*/
private async findQuartzContentUrls(baseUrl: string): Promise<string[]> {
const urls: string[] = []
try {
// First, try to fetch the main page to discover content
console.log('🔍 Fetching main page to discover content structure...')
const mainPageResponse = await fetch(baseUrl)
if (mainPageResponse.ok) {
const mainPageContent = await mainPageResponse.text()
urls.push(baseUrl) // Always include the main page
// Look for navigation links and content links in the main page
const discoveredUrls = this.extractContentUrlsFromPage(mainPageContent, baseUrl)
urls.push(...discoveredUrls)
}
// Try to find a sitemap
const sitemapUrl = `${baseUrl}/sitemap.xml`
try {
const response = await fetch(sitemapUrl)
if (response.ok) {
const sitemap = await response.text()
const urlMatches = sitemap.match(/<loc>(.*?)<\/loc>/g)
if (urlMatches) {
const sitemapUrls = urlMatches.map(match =>
match.replace(/<\/?loc>/g, '').trim()
).filter(url => url.endsWith('.html') || url.endsWith('.md') || url.includes(baseUrl))
urls.push(...sitemapUrls)
}
}
} catch (error) {
console.warn('Failed to fetch sitemap:', error)
}
// Try to find content through common Quartz patterns
const commonPaths = [
'/', // Root page
'/index.html',
'/about',
'/contact',
'/notes',
'/posts',
'/content',
'/pages',
'/blog',
'/articles'
]
for (const path of commonPaths) {
try {
const url = path === '/' ? baseUrl : `${baseUrl}${path}`
const response = await fetch(url)
if (response.ok) {
urls.push(url)
}
} catch (error) {
// Ignore individual path failures
}
}
} catch (error) {
console.warn('Failed to find Quartz content URLs:', error)
}
// Remove duplicates and limit results
const uniqueUrls = [...new Set(urls)]
return uniqueUrls.slice(0, 50) // Limit to 50 pages to avoid overwhelming the site
}
/**
* Extract content URLs from a page's HTML content
*/
private extractContentUrlsFromPage(content: string, baseUrl: string): string[] {
const urls: string[] = []
try {
// Look for navigation links
const navLinks = content.match(/<nav[^>]*>[\s\S]*?<\/nav>/gi)
if (navLinks) {
navLinks.forEach(nav => {
const links = nav.match(/<a[^>]+href=["']([^"']+)["'][^>]*>/gi)
if (links) {
links.forEach(link => {
const urlMatch = link.match(/href=["']([^"']+)["']/i)
if (urlMatch) {
const url = urlMatch[1]
if (url.startsWith('/') && !url.startsWith('//')) {
urls.push(`${baseUrl}${url}`)
} else if (url.startsWith(baseUrl)) {
urls.push(url)
}
}
})
}
})
}
// Look for any internal links
const allLinks = content.match(/<a[^>]+href=["']([^"']+)["'][^>]*>/gi)
if (allLinks) {
allLinks.forEach(link => {
const urlMatch = link.match(/href=["']([^"']+)["']/i)
if (urlMatch) {
const url = urlMatch[1]
if (url.startsWith('/') && !url.startsWith('//') && !url.includes('#')) {
urls.push(`${baseUrl}${url}`)
} else if (url.startsWith(baseUrl) && !url.includes('#')) {
urls.push(url)
}
}
})
}
} catch (error) {
console.warn('Error extracting URLs from page:', error)
}
return urls
}
/**
* Parse Quartz markdown content
*/
private parseQuartzMarkdown(content: string, url: string, baseUrl: string): ObsidianObsNote {
// Extract title from URL or content
const title = this.extractTitleFromUrl(url) || this.extractTitleFromContent(content)
// Parse frontmatter
const frontmatter = this.parseFrontmatter(content)
// Extract tags
const tags = this.extractTags(content, frontmatter)
// Extract links
const links = this.extractLinks(content, baseUrl)
// Clean content (remove frontmatter and convert HTML to markdown-like text)
let cleanContent = this.removeFrontmatter(content)
// If content is HTML, convert it to a more readable format
if (cleanContent.includes('<html') || cleanContent.includes('<body')) {
cleanContent = this.convertHtmlToMarkdown(cleanContent)
}
return {
id: this.generateId(url),
title,
content: cleanContent,
filePath: url,
tags,
created: new Date(),
modified: new Date(),
links,
backlinks: [],
frontmatter
}
}
/**
* Convert HTML content to markdown-like text
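*
* Illustrative conversion (regex-based, not a full HTML parser):
* @example
*   convertHtmlToMarkdown('<main><h2>Features</h2><p>Hello <strong>world</strong></p></main>')
*   // -> '## Features\n\nHello **world**'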
*/
private convertHtmlToMarkdown(html: string): string {
let text = html
// Remove script, style, and other non-content tags
text = text.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
text = text.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
text = text.replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
text = text.replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
text = text.replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '')
// Try to extract main content area
const mainMatch = text.match(/<main[^>]*>(.*?)<\/main>/is)
if (mainMatch) {
text = mainMatch[1]
} else {
// Try to find article or content div
const articleMatch = text.match(/<article[^>]*>(.*?)<\/article>/is)
if (articleMatch) {
text = articleMatch[1]
} else {
// Try multiple content div patterns
const contentPatterns = [
/<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)<\/div>/is,
/<div[^>]*class="[^"]*main[^"]*"[^>]*>(.*?)<\/div>/is,
/<div[^>]*class="[^"]*post[^"]*"[^>]*>(.*?)<\/div>/is,
/<div[^>]*class="[^"]*article[^"]*"[^>]*>(.*?)<\/div>/is,
/<div[^>]*id="[^"]*content[^"]*"[^>]*>(.*?)<\/div>/is,
/<div[^>]*id="[^"]*main[^"]*"[^>]*>(.*?)<\/div>/is
]
for (const pattern of contentPatterns) {
const match = text.match(pattern)
if (match) {
text = match[1]
break
}
}
}
}
// Convert headers
text = text.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n\n')
text = text.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n\n')
text = text.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n\n')
text = text.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n\n')
text = text.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n\n')
text = text.replace(/<h6[^>]*>(.*?)<\/h6>/gi, '###### $1\n\n')
// Convert paragraphs
text = text.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
// Convert links
text = text.replace(/<a[^>]+href=["']([^"']+)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
// Convert lists
text = text.replace(/<ul[^>]*>/gi, '')
text = text.replace(/<\/ul>/gi, '\n')
text = text.replace(/<ol[^>]*>/gi, '')
text = text.replace(/<\/ol>/gi, '\n')
text = text.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n')
// Convert emphasis
text = text.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
text = text.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
text = text.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
text = text.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
// Convert code
text = text.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`')
text = text.replace(/<pre[^>]*>(.*?)<\/pre>/gi, '```\n$1\n```\n')
// Convert blockquotes
text = text.replace(/<blockquote[^>]*>(.*?)<\/blockquote>/gi, '> $1\n\n')
// Convert line breaks
text = text.replace(/<br[^>]*>/gi, '\n')
// Remove remaining HTML tags
text = text.replace(/<[^>]+>/g, '')
// Decode HTML entities
text = text.replace(/&amp;/g, '&')
text = text.replace(/&lt;/g, '<')
text = text.replace(/&gt;/g, '>')
text = text.replace(/&quot;/g, '"')
text = text.replace(/&#39;/g, "'")
text = text.replace(/&nbsp;/g, ' ')
// Clean up whitespace
text = text.replace(/\n\s*\n\s*\n/g, '\n\n')
text = text.trim() // Trim start and end
// If we still don't have much content, try to extract any text from the original HTML
if (text.length < 50) {
let fallbackText = html
// Remove script, style, and other non-content tags
fallbackText = fallbackText.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
fallbackText = fallbackText.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
fallbackText = fallbackText.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
fallbackText = fallbackText.replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
fallbackText = fallbackText.replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
fallbackText = fallbackText.replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '')
// Convert basic HTML elements
fallbackText = fallbackText.replace(/<h[1-6][^>]*>(.*?)<\/h[1-6]>/gi, '# $1\n\n')
fallbackText = fallbackText.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
fallbackText = fallbackText.replace(/<div[^>]*>(.*?)<\/div>/gi, '$1\n')
fallbackText = fallbackText.replace(/<span[^>]*>(.*?)<\/span>/gi, '$1')
fallbackText = fallbackText.replace(/<[^>]+>/g, '')
fallbackText = fallbackText.replace(/&amp;/g, '&')
fallbackText = fallbackText.replace(/&lt;/g, '<')
fallbackText = fallbackText.replace(/&gt;/g, '>')
fallbackText = fallbackText.replace(/&quot;/g, '"')
fallbackText = fallbackText.replace(/&#39;/g, "'")
fallbackText = fallbackText.replace(/&nbsp;/g, ' ')
fallbackText = fallbackText.replace(/\n\s*\n\s*\n/g, '\n\n')
fallbackText = fallbackText.trim()
if (fallbackText.length > text.length) {
text = fallbackText
}
}
// Final fallback: if we still don't have content, try to extract any text from the body
if (text.length < 20) {
const bodyMatch = html.match(/<body[^>]*>(.*?)<\/body>/is)
if (bodyMatch) {
let bodyText = bodyMatch[1]
// Remove all HTML tags
bodyText = bodyText.replace(/<[^>]+>/g, '')
// Decode HTML entities
bodyText = bodyText.replace(/&amp;/g, '&')
bodyText = bodyText.replace(/&lt;/g, '<')
bodyText = bodyText.replace(/&gt;/g, '>')
bodyText = bodyText.replace(/&quot;/g, '"')
bodyText = bodyText.replace(/&#39;/g, "'")
bodyText = bodyText.replace(/&nbsp;/g, ' ')
bodyText = bodyText.replace(/\s+/g, ' ').trim()
if (bodyText.length > text.length) {
text = bodyText
}
}
}
return text
}
/**
* Extract title from URL
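*
* Example (URL is hypothetical):
* @example
*   extractTitleFromUrl('https://example.com/notes/my-first-note.html')
*   // -> 'My First Note'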
*/
private extractTitleFromUrl(url: string): string {
try {
const urlObj = new URL(url)
const path = urlObj.pathname
const segments = path.split('/').filter(segment => segment)
const lastSegment = segments[segments.length - 1] || 'index'
let title = lastSegment
.replace(/\.(html|md)$/, '')
.replace(/[-_]/g, ' ')
.replace(/\b\w/g, l => l.toUpperCase())
// If title is just "index" or empty, try to use the domain name
if (title === 'Index' || title === '') {
title = urlObj.hostname.replace('www.', '').replace('.com', '').replace('.xyz', '')
}
return title
} catch (error) {
// Fallback if URL parsing fails
return url.split('/').pop() || 'Untitled'
}
}
/**
* Extract title from content
*/
private extractTitleFromContent(content: string): string {
// Look for title tag first
const titleMatch = content.match(/<title[^>]*>(.*?)<\/title>/i)
if (titleMatch) {
let title = titleMatch[1].replace(/<[^>]*>/g, '').trim()
// Clean up common title suffixes
title = title.replace(/\s*-\s*.*$/, '') // Remove " - Site Name" suffix
title = title.replace(/\s*\|\s*.*$/, '') // Remove " | Site Name" suffix
if (title && title !== 'Untitled') {
return title
}
}
// Look for h1 tag
const h1Match = content.match(/<h1[^>]*>(.*?)<\/h1>/i)
if (h1Match) {
return h1Match[1].replace(/<[^>]*>/g, '').trim()
}
// Look for first heading
const headingMatch = content.match(/^#\s+(.+)$/m)
if (headingMatch) {
return headingMatch[1].trim()
}
return 'Untitled'
}
/**
* Extract vault name from URL
*/
private extractVaultNameFromUrl(url: string): string {
try {
const urlObj = new URL(url)
return urlObj.hostname.replace('www.', '')
} catch (error) {
return 'Quartz Vault'
}
}
/**
* Generate ID from URL
*/
private generateId(url: string): string {
return url.replace(/[^a-zA-Z0-9]/g, '_')
}
/**
* Parse frontmatter from content
*/
private parseFrontmatter(content: string): Record<string, any> {
const frontmatterMatch = content.match(/^---\s*\n([\s\S]*?)\n---\s*\n/)
if (frontmatterMatch) {
return this.parseSimpleYaml(frontmatterMatch[1])
}
return {}
}
/**
* Remove frontmatter from content
*/
private removeFrontmatter(content: string): string {
return content.replace(/^---\s*\n[\s\S]*?\n---\s*\n/, '')
}
/**
* Extract wiki, markdown, and HTML links from content, resolving relative URLs against baseUrl
*/
private extractLinks(content: string, baseUrl: string): string[] {
const links: string[] = []
// Extract Obsidian wiki links [[Target]] or [[Target|Alias]], keeping the target
const wikiLinks = content.match(/\[\[([^\]|]+)(\|[^\]]*)?\]\]/g)
if (wikiLinks) {
  wikiLinks.forEach(link => {
    const targetMatch = link.match(/\[\[([^\]|]+)/)
    if (targetMatch) {
      links.push(targetMatch[1].trim())
    }
  })
}
// Extract markdown links [text](url)
const markdownLinks = content.match(/\[([^\]]+)\]\(([^)]+)\)/g)
if (markdownLinks) {
markdownLinks.forEach(link => {
const urlMatch = link.match(/\[([^\]]+)\]\(([^)]+)\)/)
if (urlMatch) {
const url = urlMatch[2]
if (url.startsWith('http') || url.startsWith('/')) {
links.push(url.startsWith('/') ? `${baseUrl}${url}` : url)
}
}
})
}
// Extract HTML links <a href="url">
const htmlLinks = content.match(/<a[^>]+href=["']([^"']+)["'][^>]*>/gi)
if (htmlLinks) {
htmlLinks.forEach(link => {
const urlMatch = link.match(/href=["']([^"']+)["']/i)
if (urlMatch) {
const url = urlMatch[1]
if (url.startsWith('http') || url.startsWith('/')) {
links.push(url.startsWith('/') ? `${baseUrl}${url}` : url)
}
}
})
}
return links
}
}
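
// --- Usage sketch (illustrative only; the URL and query are hypothetical) ---
// Inside an async context:
//   const importer = new ObsidianImporter()
//   const vault = await importer.importFromQuartzUrl('notes.example.com')
//   const hits = await importer.searchNotes('project')
//   const tags = importer.getAllTags()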