// Extracts user-facing siteData from the analyze context for the three generators. // Every field is optional; generators fall back to German `[Bitte ergänzen: ...]` placeholders. // Kept in sync with checks/ai-bots.js. Order matters — used as canonical // ordering for generated robots.txt. export const AI_BOTS = [ 'GPTBot', 'ClaudeBot', 'OAI-SearchBot', 'PerplexityBot', 'Bingbot', 'Google-Extended', 'GoogleOther', 'Applebot-Extended', 'Meta-ExternalAgent', 'CCBot', 'Bytespider', 'DuckAssistBot', 'ChatGPT-User', ] const SEPARATORS = /\s+[–|—\-·•|]\s+/ const PLACEHOLDER_EMAILS = new Set([ 'name@example.com', 'test@test.de', 'test@example.com', 'mail@example.com', 'info@example.com', ]) const PLACEHOLDER_PHONES = new Set(['+49 0', '+49000', '0000000', '1234567']) function cleanTitle(title) { if (!title) return null const parts = title.split(SEPARATORS).map((s) => s.trim()).filter(Boolean) if (!parts.length) return null const longest = parts.reduce((a, b) => (a.length >= b.length ? a : b)) return longest.length >= 3 ? longest : null } function decodeEntities(s) { if (!s) return s return s .replace(/&/g, '&') .replace(/"/g, '"') .replace(/'/g, "'") .replace(/</g, '<') .replace(/>/g, '>') .replace(/ /g, ' ') } function metaContent(headHtml, attr, value) { const re = new RegExp( `]*${attr}=["']${value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}["'][^>]*content=["']([^"']*)["']`, 'i' ) const m = headHtml.match(re) if (m) return decodeEntities(m[1].trim()) // Try attribute order swapped. const re2 = new RegExp( `]*content=["']([^"']*)["'][^>]*${attr}=["']${value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}["']`, 'i' ) const m2 = headHtml.match(re2) return m2 ? decodeEntities(m2[1].trim()) : null } function linkHref(headHtml, rel) { const re = new RegExp(`]*rel=["']${rel}["'][^>]*href=["']([^"']+)["']`, 'i') const m = headHtml.match(re) if (m) return m[1].trim() const re2 = new RegExp(`]*href=["']([^"']+)["'][^>]*rel=["']${rel}["']`, 'i') const m2 = headHtml.match(re2) return m2 ? m2[1].trim() : null } function parseJsonLdBlocks(blocks) { const parsed = [] for (const block of blocks || []) { try { const v = JSON.parse(block) if (Array.isArray(v)) parsed.push(...v) else parsed.push(v) } catch { // skip malformed } } // Flatten @graph members so consumers can iterate flat list. const flat = [] for (const node of parsed) { if (node && typeof node === 'object' && Array.isArray(node['@graph'])) { flat.push(...node['@graph']) } else if (node) { flat.push(node) } } return flat } function pickType(node) { const t = node?.['@type'] if (Array.isArray(t)) return t[0] return t } function findNode(nodes, types) { const set = new Set(types) return nodes.find((n) => set.has(pickType(n))) || null } function firstEmail(html) { const m = (html || '').match(/mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/i) if (!m) return null const email = m[1].toLowerCase() return PLACEHOLDER_EMAILS.has(email) ? null : email } function firstPhone(html) { const m = (html || '').match(/tel:(\+?[0-9 \-()]{6,})/i) if (!m) return null const phone = m[1].trim() return PLACEHOLDER_PHONES.has(phone) ? null : phone } function detectExistingAiBots(robotsTxt) { if (!robotsTxt) return [] return AI_BOTS.filter((bot) => robotsTxt.includes(bot)) } export function extractSiteData(context) { const { headHtml = '', html = '', jsonLdBlocks = [], robotsTxt = '', llmsTxt = '', baseUrl = '' } = context const nodes = parseJsonLdBlocks(jsonLdBlocks) const org = findNode(nodes, ['Organization', 'LocalBusiness', 'Corporation', 'NewsMediaOrganization']) const website = findNode(nodes, ['WebSite']) const ogSiteName = metaContent(headHtml, 'property', 'og:site_name') const ogTitle = metaContent(headHtml, 'property', 'og:title') const ogDesc = metaContent(headHtml, 'property', 'og:description') const ogLocale = metaContent(headHtml, 'property', 'og:locale') const metaDesc = metaContent(headHtml, 'name', 'description') const titleRaw = (headHtml.match(/