Container-ready via docker/ compose (frontend nginx + backend Node). Compose adjusted for Coolify on the prod server: frontend uses expose:80 (no host binding — host 8080 is taken by the Coolify proxy; Traefik routes visigine.de), backend ALLOWED_ORIGINS=https://visigine.de. Secrets stay in server/.env (git-ignored); see server/.env.example. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
193 lines
5.9 KiB
JavaScript
193 lines
5.9 KiB
JavaScript
// Extracts user-facing siteData from the analyze context for the three generators.
// Every field is optional; generators fall back to German `[Bitte ergänzen: ...]` placeholders.
|
||
// Kept in sync with checks/ai-bots.js. Order matters — used as canonical
// ordering for generated robots.txt.
export const AI_BOTS = [
  'GPTBot', 'ClaudeBot', 'OAI-SearchBot', 'PerplexityBot', 'Bingbot',
  'Google-Extended', 'GoogleOther', 'Applebot-Extended', 'Meta-ExternalAgent',
  'CCBot', 'Bytespider', 'DuckAssistBot', 'ChatGPT-User',
]
|
||
|
||
// Title separator: an en dash, em dash, hyphen, middle dot, bullet or pipe
// with whitespace on both sides, e.g. "Home | Acme" or "Acme – Start".
// Hyphens inside words ("well-known") are not separators.
const SEPARATORS = /\s+[–—\-·•|]\s+/

// Addresses rejected by firstEmail (compared after lower-casing).
const PLACEHOLDER_EMAILS = new Set([
  'name@example.com', 'test@test.de', 'test@example.com',
  'mail@example.com', 'info@example.com',
])

// Numbers rejected by firstPhone (compared after trimming, exact match).
const PLACEHOLDER_PHONES = new Set(['+49 0', '+49000', '0000000', '1234567'])
|
||
|
||
/**
 * Reduces a page title to its longest separator-delimited segment.
 *
 * The title is split on SEPARATORS, segments are trimmed and empty ones
 * dropped. The longest remaining segment is returned.
 *
 * @param {string|null|undefined} title - Raw (already entity-decoded) title text.
 * @returns {string|null} The longest segment, or null when the title is
 *   missing, not a string, or its longest segment is shorter than 3 characters.
 */
function cleanTitle(title) {
  // Non-string input cannot be split; treat it like a missing title.
  if (typeof title !== 'string' || !title) return null

  const parts = title.split(SEPARATORS).map((s) => s.trim()).filter(Boolean)
  if (!parts.length) return null

  // `>=` keeps the earlier segment when two segments have the same length.
  const longest = parts.reduce((a, b) => (a.length >= b.length ? a : b))
  return longest.length >= 3 ? longest : null
}
|
||
|
||
// Replacement text for each supported entity, keyed by the text between `&` and `;`.
const HTML_ENTITIES = {
  amp: '&',
  quot: '"',
  apos: "'",
  '#39': "'",
  '#x27': "'",
  lt: '<',
  gt: '>',
  nbsp: ' ',
}

/**
 * Decodes the small set of HTML entities that show up in titles and meta
 * content: `&amp;`, `&quot;`, `&apos;`, `&#39;`, `&#x27;`, `&lt;`, `&gt;`
 * and `&nbsp;` (decoded to a plain space).
 *
 * All entities are replaced in one pass, so already-escaped text such as
 * `&amp;lt;` decodes to the literal `&lt;` instead of being decoded twice.
 * Unknown entities are left untouched.
 *
 * @param {string|null|undefined} s - Text to decode.
 * @returns {string|null|undefined} Decoded text; falsy input is returned as-is.
 */
function decodeEntities(s) {
  if (!s) return s
  return s.replace(
    /&(amp|quot|apos|lt|gt|nbsp|#39|#[xX]27);/g,
    // Lower-casing only matters for the hex form (`&#X27;`).
    (match, key) => HTML_ENTITIES[key.toLowerCase()] ?? match
  )
}
|
||
|
||
/**
 * Reads the `content` of the first `<meta>` tag whose `attr` equals `value`,
 * e.g. `metaContent(head, 'property', 'og:title')`.
 *
 * Both attribute orders are supported (`name … content` and `content … name`).
 * Matching is case-insensitive. The content is captured up to the quote
 * character that opened it, so a double-quoted value may contain apostrophes
 * and vice versa.
 *
 * @param {string} headHtml - Markup to search.
 * @param {string} attr - Attribute name identifying the tag (`name`, `property`, …).
 * @param {string} value - Required value of that attribute; matched literally.
 * @returns {string|null} Trimmed, entity-decoded content, or null when no tag matches.
 */
function metaContent(headHtml, attr, value) {
  const escaped = value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

  // The lookbehind requires whitespace or a closing quote before the attribute
  // name so that e.g. `data-name=` or `data-content=` is not mistaken for it.
  const keyAttr = `(?<=[\\s"'])${attr}\\s*=\\s*["']${escaped}["']`
  // Group 1 = double-quoted content, group 2 = single-quoted content.
  const contentAttr = `(?<=[\\s"'])content\\s*=\\s*(?:"([^"]*)"|'([^']*)')`

  const patterns = [
    new RegExp(`<meta\\b[^>]*?${keyAttr}[^>]*?${contentAttr}`, 'i'),
    // Attribute order swapped.
    new RegExp(`<meta\\b[^>]*?${contentAttr}[^>]*?${keyAttr}`, 'i'),
  ]

  for (const re of patterns) {
    const m = headHtml.match(re)
    if (m) return decodeEntities((m[1] ?? m[2]).trim())
  }
  return null
}
|
||
|
||
/**
 * Reads the `href` of the first `<link>` tag whose `rel` equals `rel`,
 * e.g. `linkHref(head, 'canonical')`.
 *
 * Both attribute orders are supported and matching is case-insensitive.
 * `rel` is matched literally (regex metacharacters are escaped), and the href
 * is captured up to the quote character that opened it.
 *
 * @param {string} headHtml - Markup to search.
 * @param {string} rel - Required value of the `rel` attribute.
 * @returns {string|null} Trimmed href, or null when no tag matches or the href is empty.
 */
function linkHref(headHtml, rel) {
  const escaped = rel.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

  // The lookbehind keeps `data-rel=` / `data-href=` from being matched.
  const relAttr = `(?<=[\\s"'])rel\\s*=\\s*["']${escaped}["']`
  // Group 1 = double-quoted href, group 2 = single-quoted href.
  const hrefAttr = `(?<=[\\s"'])href\\s*=\\s*(?:"([^"]+)"|'([^']+)')`

  const patterns = [
    new RegExp(`<link\\b[^>]*?${relAttr}[^>]*?${hrefAttr}`, 'i'),
    // Attribute order swapped.
    new RegExp(`<link\\b[^>]*?${hrefAttr}[^>]*?${relAttr}`, 'i'),
  ]

  for (const re of patterns) {
    const m = headHtml.match(re)
    if (m) return (m[1] ?? m[2]).trim()
  }
  return null
}
|
||
|
||
/**
 * Parses raw JSON-LD script contents into one flat list of nodes.
 *
 * Each block is parsed with JSON.parse; blocks that fail to parse are skipped.
 * A top-level array contributes its members, and a node carrying an `@graph`
 * array is replaced by that array's members. Falsy top-level values (e.g. a
 * block that is just `null`) are dropped.
 *
 * @param {Iterable<string>|null|undefined} blocks - Raw JSON-LD strings.
 * @returns {Array<*>} Flat list of parsed nodes (empty when nothing parses).
 */
function parseJsonLdBlocks(blocks) {
  const parsed = []
  for (const block of blocks || []) {
    let value
    try {
      value = JSON.parse(block)
    } catch {
      // Best effort: skip the malformed block and keep the rest.
      continue
    }
    if (Array.isArray(value)) {
      // Push one by one instead of spreading, so a very large array cannot
      // exceed the engine's argument limit.
      for (const item of value) parsed.push(item)
    } else {
      parsed.push(value)
    }
  }

  // Flatten @graph members so consumers can iterate a flat list.
  const flat = []
  for (const node of parsed) {
    if (!node) continue
    if (typeof node === 'object' && Array.isArray(node['@graph'])) {
      for (const member of node['@graph']) flat.push(member)
    } else {
      flat.push(node)
    }
  }
  return flat
}
|
||
|
||
/**
 * Returns the `@type` of a JSON-LD node; for an array-valued `@type` only the
 * first entry is returned.
 *
 * @param {*} node - Parsed JSON-LD node (may be null/undefined).
 * @returns {*} The type, or undefined when the node has no `@type`.
 */
function pickType(node) {
  const type = node?.['@type']
  return Array.isArray(type) ? type[0] : type
}
|
||
|
||
/**
 * Finds the first node whose `@type` is one of `types`.
 *
 * A node with an array-valued `@type` matches when ANY of its types is
 * requested, so `{"@type": ["Thing", "Organization"]}` is found when looking
 * for `Organization`.
 *
 * @param {Array<*>} nodes - Flat list of parsed JSON-LD nodes.
 * @param {Iterable<string>} types - Accepted `@type` values.
 * @returns {*} The first matching node, or null when none matches.
 */
function findNode(nodes, types) {
  const wanted = new Set(types)
  const matches = (node) => {
    const declared = node?.['@type']
    return Array.isArray(declared)
      ? declared.some((type) => wanted.has(type))
      : wanted.has(declared)
  }
  return nodes.find(matches) || null
}
|
||
|
||
/**
 * Returns the first e-mail address found in a `mailto:` link that is not a
 * known placeholder.
 *
 * Placeholder addresses (PLACEHOLDER_EMAILS) are skipped rather than ending
 * the search, so a leftover template link does not hide a real address that
 * appears later in the markup.
 *
 * @param {string|null|undefined} html - Markup to search.
 * @returns {string|null} Lower-cased address, or null when none is found.
 */
function firstEmail(html) {
  // Fresh regex per call: a shared /g regex would carry lastIndex state.
  const pattern = /mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/gi
  for (const [, address] of (html || '').matchAll(pattern)) {
    const email = address.toLowerCase()
    if (!PLACEHOLDER_EMAILS.has(email)) return email
  }
  return null
}
|
||
|
||
/**
 * Returns the first phone number found in a `tel:` link that is not a known
 * placeholder.
 *
 * A number is an optional `+` followed by at least six digits, spaces,
 * hyphens or parentheses. Placeholder numbers (PLACEHOLDER_PHONES) are
 * skipped rather than ending the search, so a dummy link does not hide a real
 * number that appears later in the markup.
 *
 * @param {string|null|undefined} html - Markup to search.
 * @returns {string|null} Trimmed number as written in the link, or null.
 */
function firstPhone(html) {
  // Fresh regex per call: a shared /g regex would carry lastIndex state.
  const pattern = /tel:(\+?[0-9 \-()]{6,})/gi
  for (const [, number] of (html || '').matchAll(pattern)) {
    const phone = number.trim()
    if (!PLACEHOLDER_PHONES.has(phone)) return phone
  }
  return null
}
|
||
|
||
/**
 * Lists the AI_BOTS entries that have a `User-agent:` line in a robots.txt.
 *
 * Only `User-agent` lines are inspected, so a bot that is merely mentioned in
 * a comment or a path is not reported. Names are compared case-insensitively
 * (`User-agent: bingbot` counts as `Bingbot`); trailing `#` comments on the
 * line are ignored.
 *
 * @param {string|null|undefined} robotsTxt - Raw robots.txt content.
 * @returns {string[]} Matching bot names, spelled and ordered as in AI_BOTS.
 */
function detectExistingAiBots(robotsTxt) {
  if (!robotsTxt || typeof robotsTxt !== 'string') return []

  const declared = new Set()
  for (const line of robotsTxt.split(/\r\n|\r|\n/)) {
    // Capture the value up to an optional trailing comment.
    const m = line.match(/^\s*user-agent\s*:\s*([^#]*)/i)
    if (m) declared.add(m[1].trim().toLowerCase())
  }
  return AI_BOTS.filter((bot) => declared.has(bot.toLowerCase()))
}
|
||
|
||
// Returns `candidate` unchanged when it is already an absolute URL, otherwise
// resolves it against `base`; falls back to `candidate` when that fails too.
function resolveSiteUrl(candidate, base) {
  if (!candidate) return base || ''
  try {
    new URL(candidate)
    return candidate
  } catch {
    // Not absolute on its own (e.g. "/start" or "//host/path"): resolve below.
  }
  try {
    return new URL(candidate, base).href
  } catch {
    return candidate
  }
}

/**
 * Builds the siteData object from the analyze context.
 *
 * Sources, in order of precedence:
 * - name: og:site_name, Organization-like JSON-LD name, WebSite JSON-LD name,
 *   cleaned `<title>`, hostname.
 * - description: meta description, og:description, organization description.
 * - url: canonical link (resolved against `baseUrl` when relative), else `baseUrl`.
 * - language: `<html lang>`, else og:locale, else `de`; reduced to the part
 *   before `-`/`_` and lower-cased.
 * - email / phone: first usable `mailto:` / `tel:` link, else the organization's
 *   `email` / `telephone`.
 *
 * @param {object} [context] - Analyze context; every property is optional.
 * @param {string} [context.headHtml] - Markup of the document head.
 * @param {string} [context.html] - Full page markup.
 * @param {string[]} [context.jsonLdBlocks] - Raw JSON-LD script contents.
 * @param {string} [context.robotsTxt] - Existing robots.txt content.
 * @param {string} [context.llmsTxt] - Existing llms.txt content.
 * @param {string} [context.baseUrl] - URL the page was fetched from.
 * @returns {{
 *   name: string|null, description: string|null, url: string, language: string,
 *   hostname: string, email: string|null, phone: string|null,
 *   address: object|null, sameAs: string[], existingRobots: string,
 *   existingAiBots: string[], hasLlmsTxt: boolean, hasOrgJsonLd: boolean
 * }} Extracted site data; missing values are null, '' or [] as typed above.
 */
export function extractSiteData(context) {
  // Tolerate a missing context instead of throwing while destructuring.
  const {
    headHtml = '',
    html = '',
    jsonLdBlocks = [],
    robotsTxt = '',
    llmsTxt = '',
    baseUrl = '',
  } = context ?? {}

  const nodes = parseJsonLdBlocks(jsonLdBlocks)
  const org = findNode(nodes, ['Organization', 'LocalBusiness', 'Corporation', 'NewsMediaOrganization'])
  const website = findNode(nodes, ['WebSite'])

  const ogSiteName = metaContent(headHtml, 'property', 'og:site_name')
  const ogTitle = metaContent(headHtml, 'property', 'og:title')
  const ogDesc = metaContent(headHtml, 'property', 'og:description')
  const ogLocale = metaContent(headHtml, 'property', 'og:locale')
  const metaDesc = metaContent(headHtml, 'name', 'description')

  const titleRaw = (headHtml.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1] || '').trim()
  const titleClean = cleanTitle(decodeEntities(titleRaw))

  const langMatch = html.match(/<html[^>]*\slang=["']([^"']+)["']/i)
  const language = (langMatch?.[1] || ogLocale || 'de').split(/[-_]/)[0].toLowerCase()

  // A relative canonical ("/start") is resolved against baseUrl so that both
  // `url` and `hostname` stay usable.
  const canonical = linkHref(headHtml, 'canonical')
  const url = resolveSiteUrl(canonical, baseUrl)
  const hostname = (() => {
    try { return new URL(url).hostname } catch { return '' }
  })()

  const name =
    ogSiteName ||
    (typeof org?.name === 'string' ? org.name : null) ||
    (typeof website?.name === 'string' ? website.name : null) ||
    titleClean ||
    hostname ||
    null

  const description =
    metaDesc ||
    ogDesc ||
    (typeof org?.description === 'string' ? org.description : null) ||
    null

  const email = firstEmail(html) || (typeof org?.email === 'string' ? org.email : null) || null

  const phone =
    firstPhone(html) ||
    (typeof org?.telephone === 'string' ? org.telephone : null) ||
    null

  let address = null
  const addrRaw = org?.address
  if (addrRaw && typeof addrRaw === 'object') {
    address = {
      streetAddress: addrRaw.streetAddress || null,
      postalCode: addrRaw.postalCode || null,
      addressLocality: addrRaw.addressLocality || null,
      addressCountry: addrRaw.addressCountry || null,
    }
  }

  // `sameAs` may be a single URL string or a list; keep http(s) URLs only.
  const sameAsRaw = typeof org?.sameAs === 'string' ? [org.sameAs] : org?.sameAs
  const sameAs = Array.isArray(sameAsRaw)
    ? sameAsRaw.filter((s) => typeof s === 'string' && /^https?:\/\//.test(s))
    : []

  return {
    name,
    description,
    url,
    language,
    hostname,
    email,
    phone,
    address,
    sameAs,
    existingRobots: robotsTxt || '',
    existingAiBots: detectExistingAiBots(robotsTxt),
    hasLlmsTxt: Boolean(llmsTxt && llmsTxt.length > 0),
    hasOrgJsonLd: Boolean(org),
  }
}
|