'use strict' /** * Network Intelligence — AI-powered log analysis via InfluxDB + Grafana * * Queries InfluxDB (telegraf) for real-time network metrics: * - Interface status (up/down) across all routers, switches, OLTs * - Traffic patterns (ingress/egress) for anomaly detection * - Error counters (CRC, drops, collisions) for fiber/cable issues * - SNMP trap history for correlating events * - LLDP/CDP neighbor discovery for topology links * * Then feeds these to Gemini for root cause analysis instead of making * humans stare at 20 Grafana dashboards trying to correlate mentally. * * InfluxDB: 10.5.2.65:8086 (no auth, InfluxQL v1) * Databases: telegraf (live SNMP), OLT_Logs (syslog), graphite (FastNetMon) */ const cfg = require('./config') const { log, json, parseBody } = require('./helpers') const http = require('http') const https = require('https') const INFLUX = cfg.INFLUXDB_URL || 'http://10.5.2.65:8086' const KUMA_METRICS_URL = cfg.KUMA_METRICS_URL || 'https://kuma.targo.ca/metrics' // ── InfluxDB query helper ────────────────────────────────────────────────── async function influxQuery (db, query) { const url = `${INFLUX}/query?db=${encodeURIComponent(db)}&q=${encodeURIComponent(query)}` return new Promise((resolve, reject) => { const proto = url.startsWith('https') ? 
require('https') : http proto.get(url, { timeout: 15000 }, (res) => { let data = '' res.on('data', c => data += c) res.on('end', () => { try { const parsed = JSON.parse(data) resolve(parsed) } catch (e) { reject(new Error(`InfluxDB parse error: ${e.message}`)) } }) }).on('error', reject) }) } function extractSeries (result) { const rows = [] for (const r of (result.results || [])) { for (const s of (r.series || [])) { const cols = s.columns || [] for (const v of (s.values || [])) { const row = {} cols.forEach((c, i) => { row[c] = v[i] }) row._measurement = s.name if (s.tags) Object.assign(row, s.tags) rows.push(row) } } } return rows } // ── Kuma Prometheus metrics scraper ──────────────────────────────────────── let _kumaCache = null let _kumaCacheTs = 0 const KUMA_CACHE_TTL = 60000 // 1 min async function fetchKumaMonitors () { const now = Date.now() if (_kumaCache && (now - _kumaCacheTs) < KUMA_CACHE_TTL) return _kumaCache return new Promise((resolve, reject) => { const proto = KUMA_METRICS_URL.startsWith('https') ? https : http proto.get(KUMA_METRICS_URL, { timeout: 10000, rejectUnauthorized: false }, (res) => { let data = '' res.on('data', c => data += c) res.on('end', () => { const monitors = [] const lines = data.split('\n') // Parse Prometheus format: monitor_status{labels} value const statusLines = lines.filter(l => l.startsWith('monitor_status{')) const rtLines = lines.filter(l => l.startsWith('monitor_response_time{')) const rtMap = {} for (const line of rtLines) { const m = line.match(/monitor_id="(\d+)".*}\s+([\d.]+)/) if (m) rtMap[m[1]] = parseFloat(m[2]) } for (const line of statusLines) { const idM = line.match(/monitor_id="(\d+)"/) const nameM = line.match(/monitor_name="([^"]*)"/) const typeM = line.match(/monitor_type="([^"]*)"/) const hostM = line.match(/monitor_hostname="([^"]*)"/) const valM = line.match(/}\s+([\d.]+)$/) if (!idM || !nameM) continue const id = idM[1] const hostname = hostM ? 
hostM[1] : null monitors.push({ id, name: nameM[1], type: typeM ? typeM[1] : 'unknown', hostname: hostname === 'null' ? null : hostname, status: valM ? parseInt(valM[1]) : -1, // 1=up, 0=down, 2=pending responseTime: rtMap[id] || null, }) } log(`Kuma: scraped ${monitors.length} monitors, ${monitors.filter(m => m.status === 0).length} down`) _kumaCache = monitors _kumaCacheTs = now resolve(monitors) }) }).on('error', (e) => { log(`Kuma metrics fetch failed: ${e.message}`) resolve(_kumaCache || []) // Return stale cache on error }) }) } // ── Network topology discovery ───────────────────────────────────────────── let _topologyCache = null let _topologyCacheTs = 0 const TOPOLOGY_CACHE_TTL = 300000 // 5 min // Known network hierarchy (based on Targo/Gigafibre architecture) // Core → Edge Router → OLT Gateway → OLTs → ONUs const DEVICE_HIERARCHY = { 'core-switch': { level: 0, label: 'Core Switch', icon: 'hub' }, 'edge-router': { level: 1, label: 'Edge Router', icon: 'router' }, 'olt-gateway': { level: 2, label: 'OLT Gateway', icon: 'device_hub' }, 'public-infra': { level: 1, label: 'Public Infra', icon: 'public' }, 'server': { level: 3, label: 'Server', icon: 'dns' }, 'unknown': { level: 4, label: 'Unknown', icon: 'help' }, } function classifyDevice (host) { if (host.startsWith('172.17.')) return 'olt-gateway' if (/^10\.10[1-6]\./.test(host)) return 'edge-router' if (host.startsWith('10.255.')) return 'core-switch' if (host.startsWith('10.100.')) return 'server' if (host.startsWith('23.159.') || host.startsWith('100.64.') || host.startsWith('96.125.')) return 'public-infra' return 'unknown' } // Subnet-based parent inference: which core switch likely serves which edge router // Based on network architecture: core switches on 10.255.x.x connect to edge routers 10.101-106.x.x function inferLinks (devices) { const links = [] const byType = {} for (const d of devices) { if (!byType[d.type]) byType[d.type] = [] byType[d.type].push(d) } const coreDevices = 
byType['core-switch'] || [] const edgeDevices = byType['edge-router'] || [] const oltGateways = byType['olt-gateway'] || [] // Infer Core → Edge links by analyzing interface descriptions // Core switches (10.255.x.x) have uplinks to edge routers with descriptions // mentioning the router hostname or IP. Fallback: subnet-based matching. for (const edge of edgeDevices) { // Find the core switch most likely connected (by matching 3rd octet patterns) // Example: edge 10.101.0.1 → core 10.255.254.1 or 10.255.254.2 // For now, link to all core switches (the AI analysis will refine) let linked = false for (const core of coreDevices) { // Check if any core interface description mentions the edge IP or subnet for (const iface of core.interfaces) { const d = iface.descr || '' // Common patterns: "GigaEthernet0/1 - to-router-101", "ae0.101" const edgeOctet = edge.host.split('.')[1] // "101", "102", etc. if (d.includes(edgeOctet) || d.toLowerCase().includes(edge.host)) { links.push({ from: core.host, to: edge.host, type: 'core-to-edge', via: d, status: iface.status }) linked = true break } } } // Fallback: distribute edge routers across core switches based on subnet patterns // 10.101.x → first core, 10.102.x → second core, etc. if (!linked && coreDevices.length > 0) { const edgeSubnet = parseInt(edge.host.split('.')[1]) // 101, 102, 103... 
const sectorIdx = (edgeSubnet - 101) % coreDevices.length const assignedCore = coreDevices[Math.max(0, Math.min(sectorIdx, coreDevices.length - 1))] links.push({ from: assignedCore.host, to: edge.host, type: 'core-to-edge', via: `inferred (subnet .${edgeSubnet})`, status: 'unknown' }) } } // Infer Edge → OLT Gateway links by IP range proximity // OLT gateways (172.17.x.x) connect through edge routers for (const gw of oltGateways) { // The gateway's management interface often routes through a specific edge router // For now we infer by matching OLT registered names to edge subnet areas if (edgeDevices.length > 0) { links.push({ from: edgeDevices[0].host, to: gw.host, type: 'edge-to-olt', via: 'inferred', status: 'unknown' }) } else if (coreDevices.length > 0) { links.push({ from: coreDevices[0].host, to: gw.host, type: 'core-to-olt', via: 'inferred', status: 'unknown' }) } } // OLT Gateway → OLTs (from olt-snmp module) try { const oltSnmp = require('./olt-snmp') const oltStats = oltSnmp.getOltStats() for (const olt of (oltStats || [])) { // Match OLT host to nearest gateway const oltIp = olt.host if (!oltIp) continue let bestGw = null for (const gw of oltGateways) { // Same /16 subnet? const gwParts = gw.host.split('.') const oltParts = oltIp.split('.') if (gwParts[0] === oltParts[0] && gwParts[1] === oltParts[1]) { bestGw = gw break } } if (bestGw) { links.push({ from: bestGw.host, to: oltIp, type: 'gateway-to-olt', via: olt.name || oltIp, status: olt.reachable ? 'up' : 'down', oltName: olt.name, onuCount: olt.onuCount || 0, onlineCount: olt.onlineCount || 0, }) } } } catch (e) { log(`OLT link inference failed: ${e.message}`) } return links } async function discoverTopology () { const now = Date.now() if (_topologyCache && (now - _topologyCacheTs) < TOPOLOGY_CACHE_TTL) return _topologyCache log('Network topology discovery starting...') const devices = new Map() // agent_host → { interfaces: [], ... 
} // Step 1: Get list of agent_hosts from BOTH 'interface' and 'ifTable' measurements // Edge routers may only appear in ifTable (different telegraf collector) const measurements = ['interface', 'ifTable'] for (const m of measurements) { try { const hostResult = await influxQuery('telegraf', `SHOW TAG VALUES FROM "${m}" WITH KEY = "agent_host"` ) for (const r of (hostResult.results || [])) { for (const s of (r.series || [])) { for (const v of (s.values || [])) { const host = v[1] if (host && !devices.has(host)) { devices.set(host, { host, interfaces: [], type: 'unknown', source: m }) } } } } } catch (e) { log(`Topology host discovery from ${m} failed: ${e.message}`) } } // Step 2: Get interface summary from 'interface' measurement try { const ifResult = await influxQuery('telegraf', `SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed" FROM "interface" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"` ) for (const r of (ifResult.results || [])) { for (const s of (r.series || [])) { const agentHost = s.tags?.agent_host if (!agentHost || !devices.has(agentHost)) continue const dev = devices.get(agentHost) for (const v of (s.values || [])) { dev.interfaces.push({ descr: s.tags?.ifDescr || '', status: v[1] === 1 ? 'up' : v[1] === 2 ? 
'down' : 'unknown', speed: v[2], source: 'interface', }) } } } } catch (e) { log(`Topology interface query failed: ${e.message}`) } // Step 3: Get interface summary from 'ifTable' for devices not yet covered // (edge routers, additional switches that only report via ifTable) try { const ifResult = await influxQuery('telegraf', `SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed" FROM "ifTable" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"` ) for (const r of (ifResult.results || [])) { for (const s of (r.series || [])) { const agentHost = s.tags?.agent_host if (!agentHost) continue if (!devices.has(agentHost)) { devices.set(agentHost, { host: agentHost, interfaces: [], type: 'unknown', source: 'ifTable' }) } const dev = devices.get(agentHost) // Only add if we don't already have interfaces from 'interface' measurement const hasIfData = dev.interfaces.some(i => i.source === 'interface') for (const v of (s.values || [])) { const ifDescr = s.tags?.ifDescr || '' // Skip if already have this interface from 'interface' measurement if (hasIfData && dev.interfaces.some(i => i.descr === ifDescr && i.source === 'interface')) continue dev.interfaces.push({ descr: ifDescr, status: v[1] === 1 ? 'up' : v[1] === 2 ? 
'down' : 'unknown', speed: v[2], source: 'ifTable', }) } } } } catch (e) { log(`Topology ifTable query failed: ${e.message}`) } // Step 4: Try LLDP neighbor data for real link discovery let lldpLinks = [] try { const lldpResult = await influxQuery('telegraf', `SELECT last("lldpRemSysName") AS "remoteName", last("lldpRemPortDesc") AS "remotePort" FROM "lldpRemTable" WHERE time > now() - 1h GROUP BY "agent_host", "lldpRemIndex"` ) for (const r of (lldpResult.results || [])) { for (const s of (r.series || [])) { const localHost = s.tags?.agent_host for (const v of (s.values || [])) { if (v[1]) { lldpLinks.push({ localHost, remoteName: v[1], remotePort: v[2] || '' }) } } } } } catch (e) { // LLDP not available — fine, we'll infer links } // Step 5: Get traffic data for bandwidth utilization (top talkers) const trafficMap = new Map() try { const trResult = await influxQuery('telegraf', `SELECT last("ifHCInOctets") AS "inOctets", last("ifHCOutOctets") AS "outOctets" FROM "ifXTable" WHERE time > now() - 10m GROUP BY "agent_host", "ifDescr"` ) for (const r of (trResult.results || [])) { for (const s of (r.series || [])) { const host = s.tags?.agent_host if (!host) continue if (!trafficMap.has(host)) trafficMap.set(host, { totalIn: 0, totalOut: 0, topIf: [] }) const t = trafficMap.get(host) for (const v of (s.values || [])) { const inO = v[1] || 0 const outO = v[2] || 0 t.totalIn += inO t.totalOut += outO if (inO + outO > 0) { t.topIf.push({ descr: s.tags?.ifDescr, inOctets: inO, outOctets: outO }) } } } } } catch (e) { // Traffic data optional } // Step 6: Fetch Kuma monitor data for real reachability let kumaMonitors = [] try { kumaMonitors = await fetchKumaMonitors() } catch (e) { log(`Topology Kuma fetch failed: ${e.message}`) } // Build lookup: IP/hostname → Kuma monitor const kumaByHost = new Map() for (const m of kumaMonitors) { if (m.hostname) { // Strip protocol and port if present (e.g. 
"http://10.5.2.65:8086" → "10.5.2.65") const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '') kumaByHost.set(clean, m) } // Also index by monitor name in case it contains an IP const ipInName = m.name.match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/) if (ipInName && !kumaByHost.has(ipInName[1])) { kumaByHost.set(ipInName[1], m) } } // Classify devices by their host tag patterns for (const [host, dev] of devices) { dev.type = classifyDevice(host) dev.level = DEVICE_HIERARCHY[dev.type]?.level ?? 4 dev.label = DEVICE_HIERARCHY[dev.type]?.label ?? 'Unknown' dev.icon = DEVICE_HIERARCHY[dev.type]?.icon ?? 'help' // Interface summary stats — only count "active" interfaces (those with speed > 0 or status up) // Unused switch ports (down + speed 0) are not problems, just empty ports const activeIfs = dev.interfaces.filter(i => i.status === 'up' || (i.speed && i.speed > 0)) const up = dev.interfaces.filter(i => i.status === 'up').length const down = activeIfs.filter(i => i.status === 'down').length // only active-but-down ports matter const total = dev.interfaces.length const activeTotal = activeIfs.length const ifHealth = activeTotal > 0 ? Math.round((up / activeTotal) * 100) : (total > 0 ? 0 : null) // Kuma reachability — source of truth for device being up/down const kuma = kumaByHost.get(host) const kumaUp = kuma ? kuma.status === 1 : null // null = not monitored in Kuma const kumaRt = kuma ? kuma.responseTime : null // Final health: Kuma UP + has active interfaces → use interface health // Kuma UP + no active interfaces → 100% (device reachable, just no SNMP polling) // Kuma DOWN → 0% // No Kuma data → fall back to interface health let health if (kumaUp === true) { health = activeTotal > 0 ? Math.max(ifHealth, 80) : 100 // reachable = at least 80% } else if (kumaUp === false) { health = 0 } else { health = ifHealth ?? 0 } dev.ifSummary = { up, down, total, activeTotal, health } dev.kuma = kuma ? 
{ status: kuma.status, name: kuma.name, responseTime: kumaRt } : null // Traffic data if available const traffic = trafficMap.get(host) if (traffic) { dev.traffic = { totalInGb: Math.round(traffic.totalIn / 1073741824 * 100) / 100, totalOutGb: Math.round(traffic.totalOut / 1073741824 * 100) / 100, } } } // Build links (inferred + LLDP-based) const links = inferLinks([...devices.values()]) // Add LLDP-discovered links for (const l of lldpLinks) { // Try to resolve remoteName to an IP in our device list const remoteDevice = [...devices.values()].find(d => d.host === l.remoteName || d.host.includes(l.remoteName) ) if (remoteDevice) { // Check if link already exists const exists = links.some(lk => (lk.from === l.localHost && lk.to === remoteDevice.host) || (lk.to === l.localHost && lk.from === remoteDevice.host) ) if (!exists) { links.push({ from: l.localHost, to: remoteDevice.host, type: 'lldp', via: `LLDP: ${l.remotePort}`, status: 'up', }) } } } const topology = { devices: [...devices.values()], links, deviceCount: devices.size, linkCount: links.length, ts: new Date().toISOString(), summary: { oltGateways: [...devices.values()].filter(d => d.type === 'olt-gateway').length, edgeRouters: [...devices.values()].filter(d => d.type === 'edge-router').length, coreSwitches: [...devices.values()].filter(d => d.type === 'core-switch').length, servers: [...devices.values()].filter(d => d.type === 'server').length, publicInfra: [...devices.values()].filter(d => d.type === 'public-infra').length, unknown: [...devices.values()].filter(d => d.type === 'unknown').length, }, } log(`Topology discovered: ${topology.deviceCount} devices, ${topology.linkCount} links (${topology.summary.edgeRouters} routers, ${topology.summary.coreSwitches} core, ${topology.summary.oltGateways} OLT gateways)`) _topologyCache = topology _topologyCacheTs = now return topology } // ── Interface health snapshot ────────────────────────────────────────────── async function getInterfaceHealth (agentHost) { // 
Query both interface and ifTable measurements const results = [] for (const m of ['ifTable', 'interface']) { try { const result = await influxQuery('telegraf', `SELECT last("ifOperStatus") AS "status", last("ifInErrors") AS "inErrors", last("ifOutErrors") AS "outErrors", last("ifInDiscards") AS "inDiscards", last("ifOutDiscards") AS "outDiscards", last("ifSpeed") AS "speed" FROM "${m}" WHERE "agent_host" = '${agentHost}' AND time > now() - 30m GROUP BY "ifIndex", "ifDescr"` ) const rows = extractSeries(result) if (rows.length > 0) return rows // Return first measurement that has data results.push(...rows) } catch (e) { // Try next measurement } } return results } // ── Recent error spikes ──────────────────────────────────────────────────── async function getErrorSpikes (windowMinutes = 60) { const result = await influxQuery('telegraf', `SELECT non_negative_derivative(last("ifInErrors"), 1m) AS "errRate" FROM "ifTable" WHERE time > now() - ${windowMinutes}m GROUP BY time(5m), "agent_host", "ifIndex" FILL(0)` ) const spikes = [] for (const r of (result.results || [])) { for (const s of (r.series || [])) { for (const v of (s.values || [])) { if (v[1] && v[1] > 10) { // >10 errors/min = spike spikes.push({ time: v[0], errRate: v[1], host: s.tags?.agent_host, ifIndex: s.tags?.ifIndex, }) } } } } return spikes } // ── Dependency chain builder ────────────────────────────────────────────── async function buildDependencyMap () { const topology = await discoverTopology() // Build adjacency from links const adj = new Map() // host → [{ target, type, status }] for (const link of topology.links) { if (!adj.has(link.from)) adj.set(link.from, []) adj.get(link.from).push({ target: link.to, ...link }) // Bidirectional if (!adj.has(link.to)) adj.set(link.to, []) adj.get(link.to).push({ target: link.from, from: link.to, to: link.from, type: link.type, status: link.status }) } // Build dependency chains: for each leaf device, trace path to core const chains = [] const deviceMap = 
new Map(topology.devices.map(d => [d.host, d])) for (const dev of topology.devices) { if (dev.type === 'olt-gateway' || dev.type === 'server') { // Trace upward to core const chain = [dev.host] const visited = new Set([dev.host]) let current = dev.host for (let i = 0; i < 5; i++) { // max depth const neighbors = adj.get(current) || [] // Pick the neighbor with lowest level (closest to core) let bestNext = null let bestLevel = 999 for (const n of neighbors) { const nDev = deviceMap.get(n.target) if (!nDev || visited.has(n.target)) continue if (nDev.level < bestLevel) { bestLevel = nDev.level bestNext = n.target } } if (!bestNext) break chain.push(bestNext) visited.add(bestNext) current = bestNext } chain.reverse() // core → ... → leaf chains.push({ path: chain.map(h => { const d = deviceMap.get(h) return { host: h, type: d?.type, label: d?.label, health: d?.ifSummary?.health } }), leaf: dev.host, leafType: dev.type, }) } } // OLT-specific chains (from olt-snmp module) try { const oltSnmp = require('./olt-snmp') const stats = oltSnmp.getOltStats() for (const olt of (stats || [])) { // Find which gateway this OLT connects through const gwLink = topology.links.find(l => l.to === olt.host && l.type === 'gateway-to-olt') if (gwLink) { // Find the chain for this gateway and extend it const gwChain = chains.find(c => c.leaf === gwLink.from) if (gwChain) { chains.push({ path: [ ...gwChain.path, { host: olt.host, type: 'olt', label: olt.name || 'OLT', health: olt.reachable ? 
100 : 0, onuCount: olt.onuCount, onlineCount: olt.onlineCount }, ], leaf: olt.host, leafType: 'olt', oltName: olt.name, onuCount: olt.onuCount || 0, onlineCount: olt.onlineCount || 0, }) } } } } catch (e) { // OLT data optional } return { topology: { devices: topology.devices.map(d => ({ host: d.host, type: d.type, level: d.level, label: d.label, icon: d.icon, ifSummary: d.ifSummary, traffic: d.traffic, })), links: topology.links, summary: topology.summary, }, chains, ts: new Date().toISOString(), } } // ── SNMP Trap log reader ────────────────────────────────────────────────── // Raisecom OLT trap OID → human-readable event type const RAISECOM_TRAP_MAP = { 'enterprises.8886.18.3.1.4.1': 'onu-register', // ONU registered 'enterprises.8886.18.3.1.4.2': 'onu-deregister', // ONU deregistered (offline) 'enterprises.8886.18.3.1.4.3': 'onu-state-change', // ONU state change 'enterprises.8886.1.27.8.2': 'olt-alarm-power', // Power alarm 'enterprises.8886.1.27.8.6': 'olt-alarm-fan', // Fan alarm 'enterprises.8886.1.27.8.10': 'olt-alarm-temperature',// Temperature alarm 'enterprises.8886.1.27.8.11': 'olt-alarm-los', // Loss of signal 'enterprises.8886.1.27.8.12': 'olt-alarm-dying-gasp',// Dying gasp 'enterprises.8886.1.27.8.17': 'olt-alarm-link-down', // Link down 'enterprises.8886.1.27.8.18': 'olt-alarm-link-up', // Link up 'enterprises.8886.1.27.8.20': 'olt-alarm-onu-los', // ONU loss of signal 'enterprises.8886.1.27.8.21': 'olt-alarm-onu-lof', // ONU loss of frame 'enterprises.8886.1.27.8.28': 'olt-alarm-onu-dying-gasp', // ONU dying gasp 'enterprises.8886.1.27.8.30': 'olt-alarm-onu-power-fail', // ONU power fail 'enterprises.8886.1.27.8.31': 'olt-alarm-onu-offline', // ONU offline 'enterprises.8886.1.27.8.37': 'olt-alarm-fiber-cut', // Fiber cut detected 'enterprises.8886.1.27.8.38': 'olt-alarm-fiber-restored', // Fiber restored 'enterprises.8886.18.3.6.3.11.2': 'gpon-onu-offline', 'enterprises.8886.18.3.6.3.11.3': 'gpon-onu-online', 'enterprises.8886.18.3.6.3.11.4': 
'gpon-onu-dying-gasp', 'enterprises.8886.18.3.6.3.11.5': 'gpon-onu-los', 'enterprises.8886.18.3.6.3.11.8': 'gpon-onu-power-fail', 'enterprises.8886.18.3.6.3.11.9': 'gpon-onu-deactivated', 'enterprises.8886.18.3.6.3.11.10': 'gpon-onu-fiber-cut', 'enterprises.8886.18.3.6.3.11.11': 'gpon-onu-fiber-restored', 'enterprises.8886.18.3.6.3.11.12': 'gpon-onu-signal-degraded', 'enterprises.8886.18.3.6.3.11.13': 'gpon-onu-signal-fail', 'enterprises.8886.18.3.6.3.11.14': 'gpon-onu-drift-exceed', 'enterprises.8886.18.3.6.3.11.15': 'gpon-onu-range-exceed', 'enterprises.8886.1.2.1.5': 'raisecom-cold-start', 'enterprises.8886.1.26.1.2.1': 'raisecom-config-change', 'enterprises.8886.1.26.4.4.1': 'raisecom-firmware-event', 'enterprises.17409.2.2.11.1.1.1': 'tplink-olt-alarm', 'linkDown': 'link-down', 'linkUp': 'link-up', } async function getRecentTraps (windowMinutes = 60, limit = 200) { const result = await influxQuery('telegraf', `SELECT * FROM snmp_trap WHERE time > now() - ${windowMinutes}m ORDER BY time DESC LIMIT ${limit}` ) const events = [] for (const r of (result.results || [])) { for (const s of (r.series || [])) { const cols = s.columns || [] for (const v of (s.values || [])) { const row = {} cols.forEach((c, i) => { row[c] = v[i] }) // Extract meaningful fields const trapName = row.name || '' const eventType = RAISECOM_TRAP_MAP[trapName] || trapName const source = row.source || '' // Extract ONU serial from trap varbinds (search for RCMG/TPLG pattern) let onuSerial = null for (const [k, val] of Object.entries(row)) { if (typeof val === 'string' && /^(RCMG|TPLG|GPON)[A-F0-9]{8,}$/i.test(val)) { onuSerial = val break } } // Extract ONU index (numeric field in varbinds) let onuIndex = null for (const [k, val] of Object.entries(row)) { if (k.startsWith('enterprises.') && typeof val === 'number' && k.includes('.1.1.')) { onuIndex = val break } } events.push({ time: row.time, source, // OLT IP event: eventType, trapOid: trapName, onuSerial, onuIndex, sysUptime: 
row.sysUpTimeInstance, }) } } } return events } // ── Build summary from logs ────────────────────────────────────────────── async function buildLogSummary (windowMinutes = 60) { const traps = await getRecentTraps(windowMinutes, 500) // Group by OLT source const byOlt = {} for (const t of traps) { if (!byOlt[t.source]) byOlt[t.source] = { events: [], offline: [], online: [], alarms: [] } byOlt[t.source].events.push(t) if (t.event.includes('offline') || t.event === 'onu-deregister' || t.event.includes('dying-gasp') || t.event.includes('los') || t.event.includes('power-fail')) { byOlt[t.source].offline.push(t) } else if (t.event.includes('online') || t.event === 'onu-register' || t.event === 'link-up') { byOlt[t.source].online.push(t) } if (t.event.startsWith('olt-alarm') || t.event.startsWith('gpon-onu-fiber') || t.event.startsWith('gpon-onu-signal')) { byOlt[t.source].alarms.push(t) } } // Get OLT SNMP live status let oltStats = [] try { const oltSnmp = require('./olt-snmp') oltStats = oltSnmp.getOltStats() } catch (e) {} // Get outage monitor events let incidents = [] let eventLog = [] try { const monitor = require('./outage-monitor') incidents = monitor.getActiveIncidents() eventLog = monitor.getEventLog(new Date(Date.now() - windowMinutes * 60000).toISOString(), 100) } catch (e) {} // ONU-level: find ONUs that went offline and didn't come back const onuEvents = {} for (const t of traps) { if (!t.onuSerial) continue if (!onuEvents[t.onuSerial]) onuEvents[t.onuSerial] = { serial: t.onuSerial, source: t.source, events: [] } onuEvents[t.onuSerial].events.push({ time: t.time, event: t.event }) } const stuckOffline = Object.values(onuEvents).filter(o => { const last = o.events[0] // already sorted desc return last && (last.event.includes('offline') || last.event.includes('deregister') || last.event.includes('dying-gasp') || last.event.includes('los')) }) // Get Kuma real-time monitor status (source of truth for device reachability) let kumaMonitors = [] try { 
kumaMonitors = await fetchKumaMonitors() } catch (e) { log(`Kuma fetch failed: ${e.message}`) } const kumaDown = kumaMonitors.filter(m => m.status === 0 && m.type !== 'group') const kumaUp = kumaMonitors.filter(m => m.status === 1 && m.type !== 'group') const kumaSlow = kumaMonitors.filter(m => m.responseTime > 100 && m.type !== 'group') return { window: `${windowMinutes}m`, totalTraps: traps.length, byOlt: Object.entries(byOlt).map(([ip, data]) => { const olt = oltStats.find(o => o.host === ip) return { oltIp: ip, oltName: olt?.name || ip, totalEvents: data.events.length, offlineEvents: data.offline.length, onlineEvents: data.online.length, alarms: data.alarms.length, recentEvents: data.events.slice(0, 10).map(e => ({ time: e.time, event: e.event, serial: e.onuSerial, })), } }).sort((a, b) => b.offlineEvents - a.offlineEvents), oltStats: oltStats.map(o => ({ name: o.name, host: o.host, onuCount: o.onuCount, onlineCount: o.onlineCount, offlineCount: (o.onuCount || 0) - (o.onlineCount || 0), lastPoll: o.lastPoll, lastError: o.lastError || null, })), stuckOfflineOnus: stuckOffline.length, stuckOfflineSample: stuckOffline.slice(0, 20).map(o => ({ serial: o.serial, olt: o.source, lastEvent: o.events[0]?.event, lastTime: o.events[0]?.time, })), activeIncidents: incidents, monitorEvents: eventLog.length, // Kuma — real device reachability (source of truth) kuma: { totalMonitors: kumaMonitors.filter(m => m.type !== 'group').length, up: kumaUp.length, down: kumaDown.map(m => ({ name: m.name, host: m.hostname, type: m.type })), slow: kumaSlow.map(m => ({ name: m.name, host: m.hostname, type: m.type, ms: m.responseTime })), }, ts: new Date().toISOString(), } } // ── AI-powered root cause analysis (LOG-BASED) ─────────────────────────── async function analyzeNetworkIssue (context) { const { aiCall } = require('./ai') // Gather real data from logs, not interface status const logSummary = await buildLogSummary(60) const systemPrompt = `Tu es un ingénieur réseau senior NOC pour 
Targo/Gigafibre, un ISP fibre optique (GPON) au Québec. Tu analyses les LOGS et événements SNMP en temps réel pour diagnostiquer les problèmes réseau. Architecture réseau: - OLTs Raisecom et TP-Link desservent les clients via GPON - Chaque OLT a plusieurs ports, chaque port un splitter, chaque splitter dessert 32-128 ONUs (modems clients) - Les SNMP traps sont la source de vérité pour les événements réseau Types d'événements SNMP importants: - onu-deregister / gpon-onu-offline: ONU déconnecté - onu-register / gpon-onu-online: ONU reconnecté - gpon-onu-dying-gasp / olt-alarm-onu-dying-gasp: Perte de courant chez le client - gpon-onu-los / olt-alarm-onu-los: Perte de signal optique (fibre coupée?) - gpon-onu-fiber-cut: Coupure fibre détectée - gpon-onu-power-fail: Panne d'alimentation ONU - gpon-onu-signal-degraded: Signal optique dégradé (fibre sale, connecteur usé) - link-down / link-up: Interface réseau up/down - olt-alarm-power / olt-alarm-temperature: Alarmes OLT Règles de diagnostic: 1. Beaucoup de dying-gasp sur même OLT en même temps = panne de courant dans le secteur 2. Beaucoup de LOS sur même port = coupure fibre en amont du splitter 3. Dying-gasp isolé = client a débranché ou panne électrique locale 4. OLT unreachable + tous ONUs offline = panne OLT ou lien amont coupé 5. Serials qui vont offline/online en boucle = équipement instable, remplacement nécessaire 6. 
Signal dégradé = maintenance préventive nécessaire (nettoyage connecteurs) Réponds en JSON valide: { "summary_fr": "Résumé en 2-3 phrases de l'état du réseau", "severity": "critical|high|medium|low|normal", "network_health_score": 0-100, "active_issues": [ { "type": "power_outage|fiber_cut|olt_failure|unstable_onu|signal_degradation|mass_offline|normal_churn", "olt": "nom ou IP de l'OLT", "description_fr": "description du problème", "affected_customers": number, "evidence": ["preuve 1", "preuve 2"], "action_fr": "action recommandée", "urgency": "immédiat|court-terme|surveillance|aucun" } ], "olt_health": [ {"name": "...", "status": "ok|warning|critical|unreachable", "online": number, "total": number, "issues_fr": "..."} ], "trending": "improving|stable|degrading", "next_check_fr": "Recommandation pour le prochain contrôle" }` const userPrompt = `Analyse les logs réseau de la dernière heure: CONTEXTE: ${context || 'Rapport de situation réseau — résume les problèmes actifs et identifie les causes'} UPTIME-KUMA — SOURCE DE VÉRITÉ POUR LA JOIGNABILITÉ (${logSummary.kuma.totalMonitors} moniteurs): - En ligne: ${logSummary.kuma.up} moniteurs - HORS LIGNE: ${JSON.stringify(logSummary.kuma.down)} - Lents (>100ms): ${JSON.stringify(logSummary.kuma.slow)} NOTE: Si Kuma dit qu'un équipement est UP, il est UP. Ne pas contredire Kuma. ÉVÉNEMENTS SNMP TRAP (${logSummary.totalTraps} événements en ${logSummary.window}): ${JSON.stringify(logSummary.byOlt, null, 1)} STATUT OLTs — COMPTAGE ONUs (via SNMP polling targo-hub): ${JSON.stringify(logSummary.oltStats, null, 1)} NOTE: Si onuCount=0 pour un OLT mais que Kuma dit qu'il est UP, c'est que le polling n'est pas configuré, PAS que l'OLT est down. 
ONUs BLOQUÉS OFFLINE (${logSummary.stuckOfflineOnus} ONUs dont le dernier événement est offline): ${JSON.stringify(logSummary.stuckOfflineSample, null, 1)} INCIDENTS ACTIFS (outage monitor): ${JSON.stringify(logSummary.activeIncidents, null, 1)} Analyse les patterns, identifie les VRAIS problèmes (pas les faux positifs), et donne un rapport synthétique pour l'équipe. Si tout est normal (Kuma OK, pas de clusters d'ONUs offline, pas d'alarmes), dis-le clairement avec severity "normal".` try { const result = await aiCall(systemPrompt, userPrompt, { maxTokens: 16384 }) result._logSummary = { totalTraps: logSummary.totalTraps, oltCount: logSummary.oltStats.length, stuckOffline: logSummary.stuckOfflineOnus, incidentCount: logSummary.activeIncidents.length, } return result } catch (e) { log(`Network AI analysis failed: ${e.message}`) return { summary_fr: `Analyse IA indisponible: ${e.message}`, severity: 'unknown', network_health_score: null, active_issues: [], olt_health: logSummary.oltStats.map(o => ({ name: o.name, status: o.reachable ? 
'ok' : 'unreachable',
online: o.onlineCount,
total: o.onuCount,
issues_fr: o.lastError || 'OK',
})),
// Mirror the metadata fields of the success path so callers can rely on them either way.
_logSummary: {
totalTraps: logSummary.totalTraps,
oltCount: logSummary.oltStats.length,
stuckOffline: logSummary.stuckOfflineOnus,
},
_error: e.message,
}
}
}

// ── Network Map (static topology + live enrichment) ───────────────────────

// Memoized static topology (sites/links/rings) read once from data/network-topology.json.
let _staticTopo = null

// Load and cache the static topology file.
// Never throws: on a missing or unparseable file it logs and returns an empty
// topology ({ sites: {}, links: [], rings: [] }) so callers need no error handling.
function loadStaticTopology () {
  if (_staticTopo) return _staticTopo
  try {
    const fs = require('fs')
    const path = require('path')
    const raw = fs.readFileSync(path.join(__dirname, '..', 'data', 'network-topology.json'), 'utf8')
    _staticTopo = JSON.parse(raw)
    log(`Static topology loaded: ${Object.keys(_staticTopo.sites).length} sites, ${_staticTopo.links.length} links`)
    return _staticTopo
  } catch (e) {
    log(`Failed to load static topology: ${e.message}`)
    return { sites: {}, links: [], rings: [] }
  }
}

// 1-minute cache for the enriched map (Kuma scrape + OLT SNMP stats are not free).
let _mapCache = null
let _mapCacheTs = 0
const MAP_CACHE_TTL = 60000 // 1 min

// Build the live network map: static topology enriched with Uptime-Kuma
// reachability and OLT ONU counters. Result is cached for MAP_CACHE_TTL.
async function getNetworkMap () {
  const now = Date.now()
  if (_mapCache && (now - _mapCacheTs) < MAP_CACHE_TTL) return _mapCache
  const topo = loadStaticTopology()
  // Kuma is best-effort here: if the scrape fails we enrich with nothing.
  const kumaMonitors = await fetchKumaMonitors().catch(() => [])
  // Build Kuma lookup by IP
  const kumaByIp = new Map()
  for (const m of kumaMonitors) {
    if (m.hostname) {
      // Strip scheme, port and path so URL-style hostnames key by bare host/IP.
      const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '')
      kumaByIp.set(clean, m)
    }
    // Fallback: an IPv4 address embedded in the monitor name (first match wins,
    // does not overwrite a hostname-derived entry).
    const ipInName = m.name.match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/)
    if (ipInName && !kumaByIp.has(ipInName[1])) kumaByIp.set(ipInName[1], m)
  }
  // Get OLT stats
  let oltStats = []
  try {
    const oltSnmp = require('./olt-snmp')
    oltStats = oltSnmp.getOltStats() || []
  } catch (e) { /* ok */ }
  const oltByIp = new Map()
  for (const o of oltStats) { if (o.host) oltByIp.set(o.host, o) }
  // Enrich each site with live data
  const sites = {}
  const allDeviceIps = new Set()
  for (const [siteId, site] of Object.entries(topo.sites)) {
    const enrichedDevices = (site.devices || []).map(dev => {
      allDeviceIps.add(dev.ip)
      const
kuma = kumaByIp.get(dev.ip) const olt = oltByIp.get(dev.ip) return { ...dev, kuma: kuma ? { status: kuma.status, name: kuma.name, responseTime: kuma.responseTime } : null, olt: olt ? { name: olt.name, onuCount: olt.onuCount, onlineCount: olt.onlineCount, reachable: olt.reachable } : null, } }) // Site-level status: aggregate device statuses const deviceStatuses = enrichedDevices.map(d => { if (d.kuma) return d.kuma.status === 1 ? 'up' : d.kuma.status === 0 ? 'down' : 'pending' if (d.olt) return d.olt.reachable ? 'up' : 'down' return 'unknown' }) const hasDown = deviceStatuses.includes('down') const hasUp = deviceStatuses.includes('up') const siteStatus = hasDown ? 'degraded' : hasUp ? 'up' : 'unknown' // ONU totals for the site const onuTotal = enrichedDevices.reduce((s, d) => s + (d.olt?.onuCount || 0), 0) const onuOnline = enrichedDevices.reduce((s, d) => s + (d.olt?.onlineCount || 0), 0) sites[siteId] = { ...site, devices: enrichedDevices, status: siteStatus, onuTotal, onuOnline, } } // Enrich links with status (both endpoints up = link up) const links = (topo.links || []).map(link => { const fromSite = sites[link.from] const toSite = sites[link.to] const fromUp = fromSite?.status === 'up' const toUp = toSite?.status === 'up' return { ...link, status: (fromUp && toUp) ? 'up' : (!fromUp || !toUp) ? 
'degraded' : 'unknown', } }) const result = { sites, links, rings: topo.rings || [], ts: new Date().toISOString(), summary: { totalSites: Object.keys(sites).length, totalLinks: links.length, totalOlts: Object.values(sites).flatMap(s => s.devices).filter(d => d.type === 'olt').length, totalOnus: Object.values(sites).reduce((s, site) => s + site.onuTotal, 0), totalOnusOnline: Object.values(sites).reduce((s, site) => s + site.onuOnline, 0), sitesUp: Object.values(sites).filter(s => s.status === 'up').length, sitesDown: Object.values(sites).filter(s => s.status === 'degraded').length, }, } _mapCache = result _mapCacheTs = now return result } // ── HTTP handler ─────────────────────────────────────────────────────────── async function handle (req, res, method, urlPath) { const sub = urlPath.replace('/network/', '') // GET /network/topology — discover network devices and links if (sub === 'topology' && method === 'GET') { try { const topology = await discoverTopology() return json(res, 200, topology) } catch (e) { return json(res, 500, { error: e.message }) } } // GET /network/dependency-map — full dependency chain with OLT data if (sub === 'dependency-map' && method === 'GET') { try { const depMap = await buildDependencyMap() return json(res, 200, depMap) } catch (e) { return json(res, 500, { error: e.message }) } } // GET /network/health/:host — interface health for a specific device if (sub.startsWith('health/') && method === 'GET') { try { const host = decodeURIComponent(sub.replace('health/', '')) const health = await getInterfaceHealth(host) return json(res, 200, { host, interfaces: health }) } catch (e) { return json(res, 500, { error: e.message }) } } // GET /network/errors — recent error spikes across the network if (sub === 'errors' && method === 'GET') { try { const url = new URL(req.url, 'http://localhost') const minutes = parseInt(url.searchParams.get('minutes')) || 60 const spikes = await getErrorSpikes(minutes) return json(res, 200, { spikes, count: 
spikes.length })
} catch (e) { return json(res, 500, { error: e.message }) }
}

// GET /network/kuma — Uptime Kuma monitor status (source of truth)
if (sub === 'kuma' && method === 'GET') {
  try {
    const monitors = await fetchKumaMonitors()
    // Kuma "group" entries are containers, not real checks — excluded from counts.
    const down = monitors.filter(m => m.status === 0 && m.type !== 'group')
    const up = monitors.filter(m => m.status === 1 && m.type !== 'group')
    return json(res, 200, {
      total: monitors.filter(m => m.type !== 'group').length,
      up: up.length,
      down: down.length,
      downList: down,
      groups: monitors.filter(m => m.type === 'group').map(m => ({ name: m.name, status: m.status })),
      monitors: monitors.filter(m => m.type !== 'group'),
    })
  } catch (e) { return json(res, 500, { error: e.message }) }
}

// GET /network/logs — recent SNMP trap events (parsed)
if (sub === 'logs' && method === 'GET') {
  try {
    const url = new URL(req.url, 'http://localhost')
    // Missing/invalid ?minutes falls back to the last hour.
    const minutes = parseInt(url.searchParams.get('minutes')) || 60
    const summary = await buildLogSummary(minutes)
    return json(res, 200, summary)
  } catch (e) { return json(res, 500, { error: e.message }) }
}

// GET /network/traps — raw recent SNMP traps (parsed into events)
if (sub === 'traps' && method === 'GET') {
  try {
    const url = new URL(req.url, 'http://localhost')
    const minutes = parseInt(url.searchParams.get('minutes')) || 60
    const limit = parseInt(url.searchParams.get('limit')) || 200
    const traps = await getRecentTraps(minutes, limit)
    return json(res, 200, { events: traps, count: traps.length })
  } catch (e) { return json(res, 500, { error: e.message }) }
}

// POST /network/analyze — AI root cause analysis
if (sub === 'analyze' && method === 'POST') {
  try {
    const body = await parseBody(req)
    // Optional free-text context from the caller steers the analysis prompt.
    const analysis = await analyzeNetworkIssue(body.context || '')
    return json(res, 200, analysis)
  } catch (e) { return json(res, 500, { error: e.message }) }
}

// GET /network/map — static topology from Excel + live Kuma/OLT enrichment
if (sub === 'map' && method === 'GET') {
  try {
    const map = await getNetworkMap()
return json(res, 200, map) } catch (e) { return json(res, 500, { error: e.message }) } } // GET /network/influx?db=telegraf&q=SELECT... — raw InfluxDB proxy (for ops dashboard) if (sub === 'influx' && method === 'GET') { try { const url = new URL(req.url, 'http://localhost') const db = url.searchParams.get('db') || 'telegraf' const q = url.searchParams.get('q') if (!q) return json(res, 400, { error: 'Missing q parameter' }) // Safety: only allow SELECT and SHOW queries const ql = q.trim().toUpperCase() if (!ql.startsWith('SELECT') && !ql.startsWith('SHOW')) { return json(res, 403, { error: 'Only SELECT and SHOW queries allowed' }) } const result = await influxQuery(db, q) return json(res, 200, result) } catch (e) { return json(res, 500, { error: e.message }) } } return json(res, 404, { error: 'Network endpoint not found' }) } module.exports = { handle, discoverTopology, buildDependencyMap, buildLogSummary, getRecentTraps, fetchKumaMonitors, getInterfaceHealth, getNetworkMap, getErrorSpikes, analyzeNetworkIssue, influxQuery, extractSeries, }