Backend services: - targo-hub: extract deepGetValue to helpers.js, DRY disconnect reasons lookup map, compact CAPABILITIES, consolidate vision.js prompts/schemas, extract dispatch scoring weights, trim section dividers across 9 files - modem-bridge: extract getSession() helper (6 occurrences), resetIdleTimer(), consolidate DM query factory, fix duplicate username fill bug, trim headers (server.js -36%, tplink-session.js -47%, docker-compose.yml -57%) Frontend: - useWifiDiagnostic: extract THRESHOLDS const, split processDiagnostic into 6 focused helpers (processOnlineStatus, processWanIPs, processRadios, processMeshNodes, processClients, checkRadioIssues) - EquipmentDetail: merge duplicate ROLE_LABELS, remove verbose comments Documentation (17 → 13 files, -1,400 lines): - New consolidated README.md (architecture, services, dependencies, auth) - Merge ECOSYSTEM-OVERVIEW into ARCHITECTURE.md - Merge MIGRATION-PLAN + ARCHITECTURE-COMPARE + FIELD-GAP + CHANGELOG → MIGRATION.md - Merge COMPETITIVE-ANALYSIS into PLATFORM-STRATEGY.md - Update ROADMAP.md with current phase status - Delete CONTEXT.md (absorbed into README) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1222 lines
46 KiB
JavaScript
1222 lines
46 KiB
JavaScript
'use strict'
|
|
/**
|
|
* Network Intelligence — AI-powered log analysis via InfluxDB + Grafana
|
|
*
|
|
* Queries InfluxDB (telegraf) for real-time network metrics:
|
|
* - Interface status (up/down) across all routers, switches, OLTs
|
|
* - Traffic patterns (ingress/egress) for anomaly detection
|
|
* - Error counters (CRC, drops, collisions) for fiber/cable issues
|
|
* - SNMP trap history for correlating events
|
|
* - LLDP/CDP neighbor discovery for topology links
|
|
*
|
|
* Then feeds these to Gemini for root cause analysis instead of making
|
|
* humans stare at 20 Grafana dashboards trying to correlate mentally.
|
|
*
|
|
* InfluxDB: 10.5.2.65:8086 (no auth, InfluxQL v1)
|
|
* Databases: telegraf (live SNMP), OLT_Logs (syslog), graphite (FastNetMon)
|
|
*/
|
|
|
|
const cfg = require('./config')
|
|
const { log, json, parseBody } = require('./helpers')
|
|
const http = require('http')
|
|
const https = require('https')
|
|
|
|
const INFLUX = cfg.INFLUXDB_URL || 'http://10.5.2.65:8086'
|
|
const KUMA_METRICS_URL = cfg.KUMA_METRICS_URL || 'https://kuma.targo.ca/metrics'
|
|
|
|
// ── InfluxDB query helper ──────────────────────────────────────────────────
|
|
|
|
/**
 * Run an InfluxQL query against the InfluxDB v1 /query endpoint.
 *
 * @param {string} db    target database (e.g. 'telegraf', 'OLT_Logs')
 * @param {string} query InfluxQL statement (URL-encoded here)
 * @returns {Promise<object>} raw parsed response body ({ results: [...] })
 * @throws on socket error, 15s timeout, or an unparseable response body
 */
async function influxQuery (db, query) {
  const url = `${INFLUX}/query?db=${encodeURIComponent(db)}&q=${encodeURIComponent(query)}`
  return new Promise((resolve, reject) => {
    // Fix: reuse the module-level `https` import instead of re-requiring it
    const proto = url.startsWith('https') ? https : http
    const req = proto.get(url, { timeout: 15000 }, (res) => {
      let data = ''
      res.on('data', c => { data += c })
      res.on('end', () => {
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          reject(new Error(`InfluxDB parse error: ${e.message}`))
        }
      })
    })
    // Fix: the `timeout` option only emits an event — without this handler a
    // stalled connection would never settle the promise. destroy() with an
    // error fires the 'error' handler below, which rejects.
    req.on('timeout', () => req.destroy(new Error('InfluxDB query timeout (15s)')))
    req.on('error', reject)
  })
}
|
|
|
|
/**
 * Flatten an InfluxDB v1 JSON response into an array of plain row objects.
 * Each value array is zipped with its series' column names; the series name
 * is attached as `_measurement` and any GROUP BY tags are merged in (tags
 * overwrite same-named columns, matching the original behavior).
 *
 * @param {object} result raw InfluxDB response ({ results: [{ series: [...] }] })
 * @returns {object[]} one object per data point; [] when nothing matched
 */
function extractSeries (result) {
  const flat = []
  const resultSets = result.results || []
  for (const set of resultSets) {
    for (const series of (set.series || [])) {
      const columns = series.columns || []
      for (const values of (series.values || [])) {
        const record = Object.fromEntries(columns.map((col, idx) => [col, values[idx]]))
        record._measurement = series.name
        if (series.tags) Object.assign(record, series.tags)
        flat.push(record)
      }
    }
  }
  return flat
}
|
|
|
|
// ── Kuma Prometheus metrics scraper ────────────────────────────────────────
|
|
|
|
// Module-level cache for the last successful Kuma scrape.
let _kumaCache = null
let _kumaCacheTs = 0
const KUMA_CACHE_TTL = 60000 // 1 min

/**
 * Scrape Uptime Kuma's Prometheus /metrics endpoint and return the monitor
 * list. Results are cached for KUMA_CACHE_TTL; on fetch error the stale
 * cache (or []) is returned rather than rejecting, so callers never fail
 * just because Kuma is briefly unreachable.
 *
 * @returns {Promise<Array<{id, name, type, hostname, status, responseTime}>>}
 *          status: 1=up, 0=down, 2=pending, -1=unparsed
 */
async function fetchKumaMonitors () {
  const now = Date.now()
  if (_kumaCache && (now - _kumaCacheTs) < KUMA_CACHE_TTL) return _kumaCache

  return new Promise((resolve) => {
    const proto = KUMA_METRICS_URL.startsWith('https') ? https : http
    // rejectUnauthorized:false — Kuma may serve a self-signed certificate
    const req = proto.get(KUMA_METRICS_URL, { timeout: 10000, rejectUnauthorized: false }, (res) => {
      let data = ''
      res.on('data', c => { data += c })
      res.on('end', () => {
        const monitors = []
        const lines = data.split('\n')
        // Parse Prometheus format: monitor_status{labels} value
        const statusLines = lines.filter(l => l.startsWith('monitor_status{'))
        const rtLines = lines.filter(l => l.startsWith('monitor_response_time{'))

        // monitor_id → last response time (ms)
        const rtMap = {}
        for (const line of rtLines) {
          const m = line.match(/monitor_id="(\d+)".*}\s+([\d.]+)/)
          if (m) rtMap[m[1]] = parseFloat(m[2])
        }

        for (const line of statusLines) {
          const idM = line.match(/monitor_id="(\d+)"/)
          const nameM = line.match(/monitor_name="([^"]*)"/)
          const typeM = line.match(/monitor_type="([^"]*)"/)
          const hostM = line.match(/monitor_hostname="([^"]*)"/)
          const valM = line.match(/}\s+([\d.]+)$/)
          if (!idM || !nameM) continue

          const id = idM[1]
          const hostname = hostM ? hostM[1] : null
          monitors.push({
            id,
            name: nameM[1],
            type: typeM ? typeM[1] : 'unknown',
            // Kuma emits the literal string "null" for hostless monitors
            hostname: hostname === 'null' ? null : hostname,
            status: valM ? parseInt(valM[1], 10) : -1, // 1=up, 0=down, 2=pending
            responseTime: rtMap[id] || null,
          })
        }

        log(`Kuma: scraped ${monitors.length} monitors, ${monitors.filter(m => m.status === 0).length} down`)
        _kumaCache = monitors
        _kumaCacheTs = now
        resolve(monitors)
      })
    })
    // Fix: the `timeout` option only emits an event; without this handler a
    // stalled scrape would leave the promise pending forever. destroy() with
    // an error routes through the 'error' handler → stale-cache fallback.
    req.on('timeout', () => req.destroy(new Error('Kuma metrics timeout (10s)')))
    req.on('error', (e) => {
      log(`Kuma metrics fetch failed: ${e.message}`)
      resolve(_kumaCache || []) // Return stale cache on error
    })
  })
}
|
|
|
|
// ── Network topology discovery ─────────────────────────────────────────────
|
|
|
|
// Module-level cache for the last discoverTopology() result.
let _topologyCache = null
let _topologyCacheTs = 0 // epoch ms of last successful discovery
const TOPOLOGY_CACHE_TTL = 300000 // 5 min

// Known network hierarchy (based on Targo/Gigafibre architecture)
// Core → Edge Router → OLT Gateway → OLTs → ONUs
// `level` orders device classes from the core (0) outward; used by
// buildDependencyMap() to greedily trace chains toward the core.
// `icon` appears to be a Material-style icon name for the frontend —
// NOTE(review): confirm against the consuming UI.
const DEVICE_HIERARCHY = {
  'core-switch': { level: 0, label: 'Core Switch', icon: 'hub' },
  'edge-router': { level: 1, label: 'Edge Router', icon: 'router' },
  'olt-gateway': { level: 2, label: 'OLT Gateway', icon: 'device_hub' },
  'public-infra': { level: 1, label: 'Public Infra', icon: 'public' },
  'server': { level: 3, label: 'Server', icon: 'dns' },
  'unknown': { level: 4, label: 'Unknown', icon: 'help' },
}
|
|
|
|
/**
 * Classify a device by its management-IP subnet convention.
 * First matching rule wins; unmatched hosts fall through to 'unknown'.
 *
 * @param {string} host agent IP address (dotted quad)
 * @returns {string} one of the DEVICE_HIERARCHY keys
 */
function classifyDevice (host) {
  // Ordered [type, predicate] rules — evaluation order matters.
  const rules = [
    ['olt-gateway', h => h.startsWith('172.17.')],
    ['edge-router', h => /^10\.10[1-6]\./.test(h)],
    ['core-switch', h => h.startsWith('10.255.')],
    ['server', h => h.startsWith('10.100.')],
    ['public-infra', h => ['23.159.', '100.64.', '96.125.'].some(p => h.startsWith(p))],
  ]
  const match = rules.find(([, test]) => test(host))
  return match ? match[0] : 'unknown'
}
|
|
|
|
// Subnet-based parent inference: which core switch likely serves which edge router
// Based on network architecture: core switches on 10.255.x.x connect to edge routers 10.101-106.x.x
/**
 * Infer topology links between classified devices when no hard link data
 * (LLDP) is available: Core→Edge from interface descriptions (with a
 * subnet-based fallback), Edge→OLT-gateway heuristically, and
 * Gateway→OLT from the olt-snmp module's live stats.
 *
 * @param {Array<object>} devices classified device objects (host, type, interfaces[])
 * @returns {Array<object>} links [{ from, to, type, via, status, ... }]
 */
function inferLinks (devices) {
  const links = []

  // Bucket devices by classified type.
  const byType = {}
  for (const d of devices) {
    if (!byType[d.type]) byType[d.type] = []
    byType[d.type].push(d)
  }

  const coreDevices = byType['core-switch'] || []
  const edgeDevices = byType['edge-router'] || []
  const oltGateways = byType['olt-gateway'] || []

  // Infer Core → Edge links by analyzing interface descriptions.
  // Core switches (10.255.x.x) have uplinks to edge routers with descriptions
  // mentioning the router hostname or IP. Fallback: subnet-based matching.
  for (const edge of edgeDevices) {
    // Fix: hoisted out of the inner loop — the edge's 2nd octet ("101",
    // "102", …) is loop-invariant.
    const edgeOctet = edge.host.split('.')[1]
    let linked = false
    for (const core of coreDevices) {
      // Check if any core interface description mentions the edge IP or subnet
      for (const iface of core.interfaces) {
        const d = iface.descr || ''
        // Common patterns: "GigaEthernet0/1 - to-router-101", "ae0.101"
        if (d.includes(edgeOctet) || d.toLowerCase().includes(edge.host)) {
          links.push({ from: core.host, to: edge.host, type: 'core-to-edge', via: d, status: iface.status })
          linked = true
          break
        }
      }
    }
    // Fallback: distribute edge routers across core switches based on subnet
    // patterns — 10.101.x → first core, 10.102.x → second core, etc.
    if (!linked && coreDevices.length > 0) {
      const edgeSubnet = parseInt(edge.host.split('.')[1], 10) // fix: explicit radix
      const sectorIdx = (edgeSubnet - 101) % coreDevices.length
      const assignedCore = coreDevices[Math.max(0, Math.min(sectorIdx, coreDevices.length - 1))]
      links.push({ from: assignedCore.host, to: edge.host, type: 'core-to-edge', via: `inferred (subnet .${edgeSubnet})`, status: 'unknown' })
    }
  }

  // Infer Edge → OLT Gateway links by IP range proximity.
  // OLT gateways (172.17.x.x) connect through edge routers; for now we link
  // each gateway to the first edge router (or first core as a fallback).
  for (const gw of oltGateways) {
    if (edgeDevices.length > 0) {
      links.push({ from: edgeDevices[0].host, to: gw.host, type: 'edge-to-olt', via: 'inferred', status: 'unknown' })
    } else if (coreDevices.length > 0) {
      links.push({ from: coreDevices[0].host, to: gw.host, type: 'core-to-olt', via: 'inferred', status: 'unknown' })
    }
  }

  // OLT Gateway → OLTs (from olt-snmp module); best-effort — the module may
  // be unavailable, in which case we just log and continue.
  try {
    const oltSnmp = require('./olt-snmp')
    const oltStats = oltSnmp.getOltStats()
    for (const olt of (oltStats || [])) {
      const oltIp = olt.host
      if (!oltIp) continue
      // Match OLT host to the first gateway in the same /16.
      let bestGw = null
      for (const gw of oltGateways) {
        const gwParts = gw.host.split('.')
        const oltParts = oltIp.split('.')
        if (gwParts[0] === oltParts[0] && gwParts[1] === oltParts[1]) {
          bestGw = gw
          break
        }
      }
      if (bestGw) {
        links.push({
          from: bestGw.host, to: oltIp, type: 'gateway-to-olt',
          via: olt.name || oltIp,
          status: olt.reachable ? 'up' : 'down',
          oltName: olt.name,
          onuCount: olt.onuCount || 0,
          onlineCount: olt.onlineCount || 0,
        })
      }
    }
  } catch (e) {
    log(`OLT link inference failed: ${e.message}`)
  }

  return links
}
|
|
|
|
/**
 * Discover the network topology from InfluxDB SNMP data plus Kuma monitors.
 *
 * Pipeline (each step is best-effort; failures are logged and skipped):
 *   1. Enumerate agent hosts from the 'interface' and 'ifTable' measurements.
 *   2-3. Attach per-interface status/speed from both measurements
 *        ('interface' takes precedence over 'ifTable' duplicates).
 *   4. Collect LLDP neighbor rows when the lldpRemTable measurement exists.
 *   5. Collect raw ifHCIn/OutOctets counters per host for traffic totals.
 *   6. Fetch Kuma monitors as the reachability source of truth.
 * Then classifies each device, computes a health score, and builds links
 * (inferred + LLDP). Result is cached for TOPOLOGY_CACHE_TTL (5 min).
 *
 * @returns {Promise<object>} { devices, links, deviceCount, linkCount, ts, summary }
 */
async function discoverTopology () {
  const now = Date.now()
  if (_topologyCache && (now - _topologyCacheTs) < TOPOLOGY_CACHE_TTL) return _topologyCache

  log('Network topology discovery starting...')
  const devices = new Map() // agent_host → { host, interfaces: [], type, source }

  // Step 1: Get list of agent_hosts from BOTH 'interface' and 'ifTable' measurements
  // Edge routers may only appear in ifTable (different telegraf collector)
  const measurements = ['interface', 'ifTable']
  for (const m of measurements) {
    try {
      const hostResult = await influxQuery('telegraf',
        `SHOW TAG VALUES FROM "${m}" WITH KEY = "agent_host"`
      )
      for (const r of (hostResult.results || [])) {
        for (const s of (r.series || [])) {
          for (const v of (s.values || [])) {
            // SHOW TAG VALUES rows are [key, value]; the host is v[1]
            const host = v[1]
            if (host && !devices.has(host)) {
              devices.set(host, { host, interfaces: [], type: 'unknown', source: m })
            }
          }
        }
      }
    } catch (e) {
      log(`Topology host discovery from ${m} failed: ${e.message}`)
    }
  }

  // Step 2: Get interface summary from 'interface' measurement
  try {
    const ifResult = await influxQuery('telegraf',
      `SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed"
       FROM "interface" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"`
    )
    for (const r of (ifResult.results || [])) {
      for (const s of (r.series || [])) {
        const agentHost = s.tags?.agent_host
        // Only attach to hosts already discovered in Step 1
        if (!agentHost || !devices.has(agentHost)) continue
        const dev = devices.get(agentHost)
        for (const v of (s.values || [])) {
          dev.interfaces.push({
            descr: s.tags?.ifDescr || '',
            // ifOperStatus: 1=up, 2=down per the IF-MIB convention used here
            status: v[1] === 1 ? 'up' : v[1] === 2 ? 'down' : 'unknown',
            speed: v[2],
            source: 'interface',
          })
        }
      }
    }
  } catch (e) {
    log(`Topology interface query failed: ${e.message}`)
  }

  // Step 3: Get interface summary from 'ifTable' for devices not yet covered
  // (edge routers, additional switches that only report via ifTable)
  try {
    const ifResult = await influxQuery('telegraf',
      `SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed"
       FROM "ifTable" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"`
    )
    for (const r of (ifResult.results || [])) {
      for (const s of (r.series || [])) {
        const agentHost = s.tags?.agent_host
        if (!agentHost) continue
        // Unlike Step 2, unseen hosts ARE added here
        if (!devices.has(agentHost)) {
          devices.set(agentHost, { host: agentHost, interfaces: [], type: 'unknown', source: 'ifTable' })
        }
        const dev = devices.get(agentHost)
        // Only add if we don't already have interfaces from 'interface' measurement
        const hasIfData = dev.interfaces.some(i => i.source === 'interface')
        for (const v of (s.values || [])) {
          const ifDescr = s.tags?.ifDescr || ''
          // Skip if already have this interface from 'interface' measurement
          if (hasIfData && dev.interfaces.some(i => i.descr === ifDescr && i.source === 'interface')) continue
          dev.interfaces.push({
            descr: ifDescr,
            status: v[1] === 1 ? 'up' : v[1] === 2 ? 'down' : 'unknown',
            speed: v[2],
            source: 'ifTable',
          })
        }
      }
    }
  } catch (e) {
    log(`Topology ifTable query failed: ${e.message}`)
  }

  // Step 4: Try LLDP neighbor data for real link discovery
  let lldpLinks = []
  try {
    const lldpResult = await influxQuery('telegraf',
      `SELECT last("lldpRemSysName") AS "remoteName", last("lldpRemPortDesc") AS "remotePort"
       FROM "lldpRemTable" WHERE time > now() - 1h GROUP BY "agent_host", "lldpRemIndex"`
    )
    for (const r of (lldpResult.results || [])) {
      for (const s of (r.series || [])) {
        const localHost = s.tags?.agent_host
        for (const v of (s.values || [])) {
          if (v[1]) {
            lldpLinks.push({ localHost, remoteName: v[1], remotePort: v[2] || '' })
          }
        }
      }
    }
  } catch (e) {
    // LLDP not available — fine, we'll infer links
  }

  // Step 5: Get traffic data for bandwidth utilization (top talkers)
  // NOTE(review): ifHCIn/OutOctets are cumulative SNMP counters, so the sums
  // below are lifetime byte totals, not rates — confirm this is intended.
  const trafficMap = new Map()
  try {
    const trResult = await influxQuery('telegraf',
      `SELECT last("ifHCInOctets") AS "inOctets", last("ifHCOutOctets") AS "outOctets"
       FROM "ifXTable" WHERE time > now() - 10m GROUP BY "agent_host", "ifDescr"`
    )
    for (const r of (trResult.results || [])) {
      for (const s of (r.series || [])) {
        const host = s.tags?.agent_host
        if (!host) continue
        if (!trafficMap.has(host)) trafficMap.set(host, { totalIn: 0, totalOut: 0, topIf: [] })
        const t = trafficMap.get(host)
        for (const v of (s.values || [])) {
          const inO = v[1] || 0
          const outO = v[2] || 0
          t.totalIn += inO
          t.totalOut += outO
          if (inO + outO > 0) {
            t.topIf.push({ descr: s.tags?.ifDescr, inOctets: inO, outOctets: outO })
          }
        }
      }
    }
  } catch (e) {
    // Traffic data optional
  }

  // Step 6: Fetch Kuma monitor data for real reachability
  let kumaMonitors = []
  try {
    kumaMonitors = await fetchKumaMonitors()
  } catch (e) {
    log(`Topology Kuma fetch failed: ${e.message}`)
  }

  // Build lookup: IP/hostname → Kuma monitor
  const kumaByHost = new Map()
  for (const m of kumaMonitors) {
    if (m.hostname) {
      // Strip protocol and port if present (e.g. "http://10.5.2.65:8086" → "10.5.2.65")
      const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '')
      kumaByHost.set(clean, m)
    }
    // Also index by monitor name in case it contains an IP
    const ipInName = m.name.match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/)
    if (ipInName && !kumaByHost.has(ipInName[1])) {
      kumaByHost.set(ipInName[1], m)
    }
  }

  // Classify devices by their host tag patterns and score their health
  for (const [host, dev] of devices) {
    dev.type = classifyDevice(host)
    dev.level = DEVICE_HIERARCHY[dev.type]?.level ?? 4
    dev.label = DEVICE_HIERARCHY[dev.type]?.label ?? 'Unknown'
    dev.icon = DEVICE_HIERARCHY[dev.type]?.icon ?? 'help'

    // Interface summary stats — only count "active" interfaces (those with speed > 0 or status up)
    // Unused switch ports (down + speed 0) are not problems, just empty ports
    const activeIfs = dev.interfaces.filter(i => i.status === 'up' || (i.speed && i.speed > 0))
    const up = dev.interfaces.filter(i => i.status === 'up').length
    const down = activeIfs.filter(i => i.status === 'down').length // only active-but-down ports matter
    const total = dev.interfaces.length
    const activeTotal = activeIfs.length
    // null when the device has no interface data at all
    const ifHealth = activeTotal > 0 ? Math.round((up / activeTotal) * 100) : (total > 0 ? 0 : null)

    // Kuma reachability — source of truth for device being up/down
    const kuma = kumaByHost.get(host)
    const kumaUp = kuma ? kuma.status === 1 : null // null = not monitored in Kuma
    const kumaRt = kuma ? kuma.responseTime : null

    // Final health: Kuma UP + has active interfaces → use interface health
    // Kuma UP + no active interfaces → 100% (device reachable, just no SNMP polling)
    // Kuma DOWN → 0%
    // No Kuma data → fall back to interface health
    let health
    if (kumaUp === true) {
      health = activeTotal > 0 ? Math.max(ifHealth, 80) : 100 // reachable = at least 80%
    } else if (kumaUp === false) {
      health = 0
    } else {
      health = ifHealth ?? 0
    }

    dev.ifSummary = { up, down, total, activeTotal, health }
    dev.kuma = kuma ? { status: kuma.status, name: kuma.name, responseTime: kumaRt } : null

    // Traffic data if available (GiB, rounded to 2 decimals; 1073741824 = 2^30)
    const traffic = trafficMap.get(host)
    if (traffic) {
      dev.traffic = {
        totalInGb: Math.round(traffic.totalIn / 1073741824 * 100) / 100,
        totalOutGb: Math.round(traffic.totalOut / 1073741824 * 100) / 100,
      }
    }
  }

  // Build links (inferred + LLDP-based)
  const links = inferLinks([...devices.values()])

  // Add LLDP-discovered links
  for (const l of lldpLinks) {
    // Try to resolve remoteName to an IP in our device list
    const remoteDevice = [...devices.values()].find(d =>
      d.host === l.remoteName || d.host.includes(l.remoteName)
    )
    if (remoteDevice) {
      // Check if link already exists (either direction)
      const exists = links.some(lk =>
        (lk.from === l.localHost && lk.to === remoteDevice.host) ||
        (lk.to === l.localHost && lk.from === remoteDevice.host)
      )
      if (!exists) {
        links.push({
          from: l.localHost, to: remoteDevice.host, type: 'lldp',
          via: `LLDP: ${l.remotePort}`, status: 'up',
        })
      }
    }
  }

  const topology = {
    devices: [...devices.values()],
    links,
    deviceCount: devices.size,
    linkCount: links.length,
    ts: new Date().toISOString(),
    summary: {
      oltGateways: [...devices.values()].filter(d => d.type === 'olt-gateway').length,
      edgeRouters: [...devices.values()].filter(d => d.type === 'edge-router').length,
      coreSwitches: [...devices.values()].filter(d => d.type === 'core-switch').length,
      servers: [...devices.values()].filter(d => d.type === 'server').length,
      publicInfra: [...devices.values()].filter(d => d.type === 'public-infra').length,
      unknown: [...devices.values()].filter(d => d.type === 'unknown').length,
    },
  }

  log(`Topology discovered: ${topology.deviceCount} devices, ${topology.linkCount} links (${topology.summary.edgeRouters} routers, ${topology.summary.coreSwitches} core, ${topology.summary.oltGateways} OLT gateways)`)

  _topologyCache = topology
  _topologyCacheTs = now
  return topology
}
|
|
|
|
// ── Interface health snapshot ──────────────────────────────────────────────
|
|
|
|
/**
 * Per-interface health snapshot (oper status, error/discard counters, speed)
 * for a single SNMP agent over the last 30 minutes.
 *
 * Tries the 'ifTable' measurement first, then 'interface', and returns rows
 * from the first measurement that has data.
 *
 * @param {string} agentHost agent IP as tagged by telegraf (agent_host tag)
 * @returns {Promise<object[]>} extractSeries() rows; [] when neither
 *          measurement returned data (or both queries failed)
 */
async function getInterfaceHealth (agentHost) {
  // Fix: escape single quotes so the interpolated value cannot break out of
  // the InfluxQL string literal (defensive — hosts are normally plain IPs).
  const safeHost = String(agentHost).replace(/'/g, "\\'")
  for (const m of ['ifTable', 'interface']) {
    try {
      const result = await influxQuery('telegraf',
        `SELECT last("ifOperStatus") AS "status", last("ifInErrors") AS "inErrors", last("ifOutErrors") AS "outErrors",
         last("ifInDiscards") AS "inDiscards", last("ifOutDiscards") AS "outDiscards", last("ifSpeed") AS "speed"
         FROM "${m}" WHERE "agent_host" = '${safeHost}' AND time > now() - 30m GROUP BY "ifIndex", "ifDescr"`
      )
      const rows = extractSeries(result)
      if (rows.length > 0) return rows // first measurement with data wins
    } catch (e) {
      // Measurement may not exist on this instance — try the next one
    }
  }
  // Fix: removed the dead `results.push(...rows)` accumulator — it was
  // unreachable with non-empty rows, so the fallback was always empty anyway.
  return []
}
|
|
|
|
// ── Recent error spikes ────────────────────────────────────────────────────
|
|
|
|
/**
 * Find interfaces whose inbound error rate spiked inside the lookback window.
 * Computes a per-minute non-negative derivative of ifInErrors, bucketed in
 * 5-minute windows per host/interface, and keeps buckets above 10 errors/min.
 *
 * @param {number} [windowMinutes=60] lookback window in minutes
 * @returns {Promise<Array<{time, errRate, host, ifIndex}>>}
 */
async function getErrorSpikes (windowMinutes = 60) {
  const result = await influxQuery('telegraf',
    `SELECT non_negative_derivative(last("ifInErrors"), 1m) AS "errRate"
     FROM "ifTable" WHERE time > now() - ${windowMinutes}m
     GROUP BY time(5m), "agent_host", "ifIndex" FILL(0)`
  )

  const spikes = []
  const resultSets = result.results || []
  for (const set of resultSets) {
    for (const series of (set.series || [])) {
      // GROUP BY tags are constant for the whole series — read them once
      const host = series.tags?.agent_host
      const ifIndex = series.tags?.ifIndex
      for (const [time, errRate] of (series.values || [])) {
        if (errRate && errRate > 10) { // >10 errors/min = spike
          spikes.push({ time, errRate, host, ifIndex })
        }
      }
    }
  }
  return spikes
}
|
|
|
|
// ── Dependency chain builder ──────────────────────────────────────────────
|
|
|
|
/**
 * Build the device dependency map: the discovered topology plus "chains" —
 * for every leaf device (OLT gateway, server, and OLTs from olt-snmp), the
 * path from the core down to that leaf.
 *
 * Chain tracing is a greedy walk: from the leaf, repeatedly step to the
 * unvisited neighbor with the lowest hierarchy level (closest to core),
 * capped at 5 hops, then the path is reversed to read core → leaf.
 *
 * @returns {Promise<object>} { topology: {devices, links, summary}, chains, ts }
 */
async function buildDependencyMap () {
  const topology = await discoverTopology()

  // Build adjacency from links (stored in both directions)
  const adj = new Map() // host → [{ target, type, status }]
  for (const link of topology.links) {
    if (!adj.has(link.from)) adj.set(link.from, [])
    adj.get(link.from).push({ target: link.to, ...link })
    // Bidirectional
    if (!adj.has(link.to)) adj.set(link.to, [])
    adj.get(link.to).push({ target: link.from, from: link.to, to: link.from, type: link.type, status: link.status })
  }

  // Build dependency chains: for each leaf device, trace path to core
  const chains = []
  const deviceMap = new Map(topology.devices.map(d => [d.host, d]))

  for (const dev of topology.devices) {
    if (dev.type === 'olt-gateway' || dev.type === 'server') {
      // Trace upward to core
      const chain = [dev.host]
      const visited = new Set([dev.host]) // prevents cycles in the greedy walk
      let current = dev.host

      for (let i = 0; i < 5; i++) { // max depth
        const neighbors = adj.get(current) || []
        // Pick the neighbor with lowest level (closest to core)
        let bestNext = null
        let bestLevel = 999
        for (const n of neighbors) {
          const nDev = deviceMap.get(n.target)
          if (!nDev || visited.has(n.target)) continue
          if (nDev.level < bestLevel) {
            bestLevel = nDev.level
            bestNext = n.target
          }
        }
        if (!bestNext) break // dead end — no unvisited neighbor
        chain.push(bestNext)
        visited.add(bestNext)
        current = bestNext
      }

      chain.reverse() // core → ... → leaf
      chains.push({
        path: chain.map(h => {
          const d = deviceMap.get(h)
          return { host: h, type: d?.type, label: d?.label, health: d?.ifSummary?.health }
        }),
        leaf: dev.host,
        leafType: dev.type,
      })
    }
  }

  // OLT-specific chains (from olt-snmp module): extend each gateway's chain
  // with one extra hop per OLT behind it. Best-effort — module may be absent.
  try {
    const oltSnmp = require('./olt-snmp')
    const stats = oltSnmp.getOltStats()
    for (const olt of (stats || [])) {
      // Find which gateway this OLT connects through
      const gwLink = topology.links.find(l => l.to === olt.host && l.type === 'gateway-to-olt')
      if (gwLink) {
        // Find the chain for this gateway and extend it
        const gwChain = chains.find(c => c.leaf === gwLink.from)
        if (gwChain) {
          chains.push({
            path: [
              ...gwChain.path,
              // health here is binary: reachable OLT = 100, unreachable = 0
              { host: olt.host, type: 'olt', label: olt.name || 'OLT', health: olt.reachable ? 100 : 0, onuCount: olt.onuCount, onlineCount: olt.onlineCount },
            ],
            leaf: olt.host,
            leafType: 'olt',
            oltName: olt.name,
            onuCount: olt.onuCount || 0,
            onlineCount: olt.onlineCount || 0,
          })
        }
      }
    }
  } catch (e) {
    // OLT data optional
  }

  return {
    topology: {
      // Slim projection of devices — drops raw interface lists for the wire
      devices: topology.devices.map(d => ({
        host: d.host, type: d.type, level: d.level, label: d.label, icon: d.icon,
        ifSummary: d.ifSummary, traffic: d.traffic,
      })),
      links: topology.links,
      summary: topology.summary,
    },
    chains,
    ts: new Date().toISOString(),
  }
}
|
|
|
|
// ── SNMP Trap log reader ──────────────────────────────────────────────────
|
|
|
|
// Raisecom OLT trap OID → human-readable event type.
// Keys are the OID names as telegraf's snmp_trap input records them (the
// 'name' field of a trap row); getRecentTraps() falls back to the raw name
// when an OID is not listed here. Covers Raisecom EPON (8886.1.27.*),
// Raisecom GPON (8886.18.*), TP-Link (17409.*) and generic link traps.
const RAISECOM_TRAP_MAP = {
  'enterprises.8886.18.3.1.4.1': 'onu-register', // ONU registered
  'enterprises.8886.18.3.1.4.2': 'onu-deregister', // ONU deregistered (offline)
  'enterprises.8886.18.3.1.4.3': 'onu-state-change', // ONU state change
  'enterprises.8886.1.27.8.2': 'olt-alarm-power', // Power alarm
  'enterprises.8886.1.27.8.6': 'olt-alarm-fan', // Fan alarm
  'enterprises.8886.1.27.8.10': 'olt-alarm-temperature',// Temperature alarm
  'enterprises.8886.1.27.8.11': 'olt-alarm-los', // Loss of signal
  'enterprises.8886.1.27.8.12': 'olt-alarm-dying-gasp',// Dying gasp
  'enterprises.8886.1.27.8.17': 'olt-alarm-link-down', // Link down
  'enterprises.8886.1.27.8.18': 'olt-alarm-link-up', // Link up
  'enterprises.8886.1.27.8.20': 'olt-alarm-onu-los', // ONU loss of signal
  'enterprises.8886.1.27.8.21': 'olt-alarm-onu-lof', // ONU loss of frame
  'enterprises.8886.1.27.8.28': 'olt-alarm-onu-dying-gasp', // ONU dying gasp
  'enterprises.8886.1.27.8.30': 'olt-alarm-onu-power-fail', // ONU power fail
  'enterprises.8886.1.27.8.31': 'olt-alarm-onu-offline', // ONU offline
  'enterprises.8886.1.27.8.37': 'olt-alarm-fiber-cut', // Fiber cut detected
  'enterprises.8886.1.27.8.38': 'olt-alarm-fiber-restored', // Fiber restored
  // GPON ONU state traps (8886.18.3.6.3.11.*)
  'enterprises.8886.18.3.6.3.11.2': 'gpon-onu-offline',
  'enterprises.8886.18.3.6.3.11.3': 'gpon-onu-online',
  'enterprises.8886.18.3.6.3.11.4': 'gpon-onu-dying-gasp',
  'enterprises.8886.18.3.6.3.11.5': 'gpon-onu-los',
  'enterprises.8886.18.3.6.3.11.8': 'gpon-onu-power-fail',
  'enterprises.8886.18.3.6.3.11.9': 'gpon-onu-deactivated',
  'enterprises.8886.18.3.6.3.11.10': 'gpon-onu-fiber-cut',
  'enterprises.8886.18.3.6.3.11.11': 'gpon-onu-fiber-restored',
  'enterprises.8886.18.3.6.3.11.12': 'gpon-onu-signal-degraded',
  'enterprises.8886.18.3.6.3.11.13': 'gpon-onu-signal-fail',
  'enterprises.8886.18.3.6.3.11.14': 'gpon-onu-drift-exceed',
  'enterprises.8886.18.3.6.3.11.15': 'gpon-onu-range-exceed',
  // Raisecom chassis/system events
  'enterprises.8886.1.2.1.5': 'raisecom-cold-start',
  'enterprises.8886.1.26.1.2.1': 'raisecom-config-change',
  'enterprises.8886.1.26.4.4.1': 'raisecom-firmware-event',
  // TP-Link OLT
  'enterprises.17409.2.2.11.1.1.1': 'tplink-olt-alarm',
  // Generic SNMP link traps (names arrive un-prefixed)
  'linkDown': 'link-down',
  'linkUp': 'link-up',
}
|
|
|
|
/**
 * Fetch recent SNMP traps from telegraf's snmp_trap measurement and normalize
 * them into events: trap OID mapped through RAISECOM_TRAP_MAP, plus ONU
 * serial / index extracted from varbinds when present.
 *
 * @param {number} [windowMinutes=60] lookback window in minutes
 * @param {number} [limit=200] maximum rows to fetch (newest first)
 * @returns {Promise<Array<{time, source, event, trapOid, onuSerial, onuIndex, sysUptime}>>}
 */
async function getRecentTraps (windowMinutes = 60, limit = 200) {
  const result = await influxQuery('telegraf',
    `SELECT * FROM snmp_trap WHERE time > now() - ${windowMinutes}m ORDER BY time DESC LIMIT ${limit}`
  )

  // DRY fix: reuse extractSeries() instead of re-implementing the same
  // results/series/values flattening loop inline.
  const events = []
  for (const row of extractSeries(result)) {
    const trapName = row.name || ''
    const eventType = RAISECOM_TRAP_MAP[trapName] || trapName
    const source = row.source || ''

    // Extract ONU serial from trap varbinds (RCMG/TPLG/GPON vendor prefixes)
    let onuSerial = null
    for (const val of Object.values(row)) {
      if (typeof val === 'string' && /^(RCMG|TPLG|GPON)[A-F0-9]{8,}$/i.test(val)) {
        onuSerial = val
        break
      }
    }

    // Extract ONU index (first numeric varbind under an enterprises.… OID)
    let onuIndex = null
    for (const [k, val] of Object.entries(row)) {
      if (k.startsWith('enterprises.') && typeof val === 'number' && k.includes('.1.1.')) {
        onuIndex = val
        break
      }
    }

    events.push({
      time: row.time,
      source, // OLT IP
      event: eventType,
      trapOid: trapName,
      onuSerial,
      onuIndex,
      sysUptime: row.sysUpTimeInstance,
    })
  }
  return events
}
|
|
|
|
// ── Build summary from logs ──────────────────────────────────────────────
|
|
|
|
/**
 * Aggregate the last `windowMinutes` of network signals into one summary
 * object (consumed by the AI root-cause analysis): SNMP trap counts grouped
 * per OLT, live OLT poll stats, outage-monitor incidents, ONUs stuck
 * offline, and Kuma reachability.
 *
 * All secondary sources (olt-snmp, outage-monitor, Kuma) are best-effort —
 * failures leave their section empty rather than failing the summary.
 *
 * @param {number} [windowMinutes=60] lookback window in minutes
 * @returns {Promise<object>} summary — see the return literal for shape
 */
async function buildLogSummary (windowMinutes = 60) {
  const traps = await getRecentTraps(windowMinutes, 500)

  // Group by OLT source, splitting events into offline / online / alarm buckets
  const byOlt = {}
  for (const t of traps) {
    if (!byOlt[t.source]) byOlt[t.source] = { events: [], offline: [], online: [], alarms: [] }
    byOlt[t.source].events.push(t)
    if (t.event.includes('offline') || t.event === 'onu-deregister' || t.event.includes('dying-gasp') || t.event.includes('los') || t.event.includes('power-fail')) {
      byOlt[t.source].offline.push(t)
    } else if (t.event.includes('online') || t.event === 'onu-register' || t.event === 'link-up') {
      byOlt[t.source].online.push(t)
    }
    // Alarms are counted independently of the offline/online split
    if (t.event.startsWith('olt-alarm') || t.event.startsWith('gpon-onu-fiber') || t.event.startsWith('gpon-onu-signal')) {
      byOlt[t.source].alarms.push(t)
    }
  }

  // Get OLT SNMP live status (optional project module)
  let oltStats = []
  try {
    const oltSnmp = require('./olt-snmp')
    oltStats = oltSnmp.getOltStats()
  } catch (e) {}

  // Get outage monitor events (optional project module)
  let incidents = []
  let eventLog = []
  try {
    const monitor = require('./outage-monitor')
    incidents = monitor.getActiveIncidents()
    eventLog = monitor.getEventLog(new Date(Date.now() - windowMinutes * 60000).toISOString(), 100)
  } catch (e) {}

  // ONU-level: find ONUs that went offline and didn't come back
  const onuEvents = {}
  for (const t of traps) {
    if (!t.onuSerial) continue
    if (!onuEvents[t.onuSerial]) onuEvents[t.onuSerial] = { serial: t.onuSerial, source: t.source, events: [] }
    onuEvents[t.onuSerial].events.push({ time: t.time, event: t.event })
  }
  // events[0] is the most recent event — traps are queried ORDER BY time DESC
  const stuckOffline = Object.values(onuEvents).filter(o => {
    const last = o.events[0] // already sorted desc
    return last && (last.event.includes('offline') || last.event.includes('deregister') || last.event.includes('dying-gasp') || last.event.includes('los'))
  })

  // Get Kuma real-time monitor status (source of truth for device reachability)
  let kumaMonitors = []
  try {
    kumaMonitors = await fetchKumaMonitors()
  } catch (e) {
    log(`Kuma fetch failed: ${e.message}`)
  }

  // 'group' monitors are Kuma folders, not real endpoints — excluded everywhere
  const kumaDown = kumaMonitors.filter(m => m.status === 0 && m.type !== 'group')
  const kumaUp = kumaMonitors.filter(m => m.status === 1 && m.type !== 'group')
  const kumaSlow = kumaMonitors.filter(m => m.responseTime > 100 && m.type !== 'group')

  return {
    window: `${windowMinutes}m`,
    totalTraps: traps.length,
    // Per-OLT event counts, worst (most offline events) first
    byOlt: Object.entries(byOlt).map(([ip, data]) => {
      const olt = oltStats.find(o => o.host === ip)
      return {
        oltIp: ip,
        oltName: olt?.name || ip,
        totalEvents: data.events.length,
        offlineEvents: data.offline.length,
        onlineEvents: data.online.length,
        alarms: data.alarms.length,
        recentEvents: data.events.slice(0, 10).map(e => ({
          time: e.time, event: e.event, serial: e.onuSerial,
        })),
      }
    }).sort((a, b) => b.offlineEvents - a.offlineEvents),
    oltStats: oltStats.map(o => ({
      name: o.name, host: o.host,
      onuCount: o.onuCount, onlineCount: o.onlineCount,
      offlineCount: (o.onuCount || 0) - (o.onlineCount || 0),
      lastPoll: o.lastPoll, lastError: o.lastError || null,
    })),
    stuckOfflineOnus: stuckOffline.length,
    stuckOfflineSample: stuckOffline.slice(0, 20).map(o => ({
      serial: o.serial, olt: o.source, lastEvent: o.events[0]?.event, lastTime: o.events[0]?.time,
    })),
    activeIncidents: incidents,
    monitorEvents: eventLog.length,
    // Kuma — real device reachability (source of truth)
    kuma: {
      totalMonitors: kumaMonitors.filter(m => m.type !== 'group').length,
      up: kumaUp.length,
      down: kumaDown.map(m => ({ name: m.name, host: m.hostname, type: m.type })),
      slow: kumaSlow.map(m => ({ name: m.name, host: m.hostname, type: m.type, ms: m.responseTime })),
    },
    ts: new Date().toISOString(),
  }
}
|
|
|
|
// ── AI-powered root cause analysis (LOG-BASED) ───────────────────────────
|
|
|
|
/**
 * AI-powered root-cause analysis of the network from the last hour of logs.
 *
 * Feeds the model (via aiCall) a structured snapshot built by
 * buildLogSummary(60): Kuma reachability, SNMP trap events per OLT, ONU
 * polling counts, stuck-offline ONUs and active incidents, then asks for a
 * French-language NOC situation report in a fixed JSON schema.
 *
 * @param {string} context - Optional operator question/context injected into
 *   the user prompt; empty string falls back to a generic situation report.
 * @returns {Promise<object>} Model JSON (summary_fr, severity,
 *   network_health_score, active_issues, olt_health, trending, next_check_fr)
 *   augmented with a `_logSummary` stats object. If the AI call throws, a
 *   degraded report is returned instead (severity 'unknown', olt_health
 *   derived directly from polling stats, `_error` set) — callers always get
 *   a renderable object, never an exception from the AI layer.
 * @throws Propagates errors from buildLogSummary (it runs OUTSIDE the
 *   try/catch below).
 */
async function analyzeNetworkIssue (context) {
  const { aiCall } = require('./ai')

  // Gather real data from logs, not interface status
  const logSummary = await buildLogSummary(60)

  // System prompt (French, runtime string — do not translate): NOC persona,
  // SNMP event taxonomy, diagnostic rules, and the exact JSON schema the
  // model must emit.
  const systemPrompt = `Tu es un ingénieur réseau senior NOC pour Targo/Gigafibre, un ISP fibre optique (GPON) au Québec.
Tu analyses les LOGS et événements SNMP en temps réel pour diagnostiquer les problèmes réseau.

Architecture réseau:
- OLTs Raisecom et TP-Link desservent les clients via GPON
- Chaque OLT a plusieurs ports, chaque port un splitter, chaque splitter dessert 32-128 ONUs (modems clients)
- Les SNMP traps sont la source de vérité pour les événements réseau

Types d'événements SNMP importants:
- onu-deregister / gpon-onu-offline: ONU déconnecté
- onu-register / gpon-onu-online: ONU reconnecté
- gpon-onu-dying-gasp / olt-alarm-onu-dying-gasp: Perte de courant chez le client
- gpon-onu-los / olt-alarm-onu-los: Perte de signal optique (fibre coupée?)
- gpon-onu-fiber-cut: Coupure fibre détectée
- gpon-onu-power-fail: Panne d'alimentation ONU
- gpon-onu-signal-degraded: Signal optique dégradé (fibre sale, connecteur usé)
- link-down / link-up: Interface réseau up/down
- olt-alarm-power / olt-alarm-temperature: Alarmes OLT

Règles de diagnostic:
1. Beaucoup de dying-gasp sur même OLT en même temps = panne de courant dans le secteur
2. Beaucoup de LOS sur même port = coupure fibre en amont du splitter
3. Dying-gasp isolé = client a débranché ou panne électrique locale
4. OLT unreachable + tous ONUs offline = panne OLT ou lien amont coupé
5. Serials qui vont offline/online en boucle = équipement instable, remplacement nécessaire
6. Signal dégradé = maintenance préventive nécessaire (nettoyage connecteurs)

Réponds en JSON valide:
{
  "summary_fr": "Résumé en 2-3 phrases de l'état du réseau",
  "severity": "critical|high|medium|low|normal",
  "network_health_score": 0-100,
  "active_issues": [
    {
      "type": "power_outage|fiber_cut|olt_failure|unstable_onu|signal_degradation|mass_offline|normal_churn",
      "olt": "nom ou IP de l'OLT",
      "description_fr": "description du problème",
      "affected_customers": number,
      "evidence": ["preuve 1", "preuve 2"],
      "action_fr": "action recommandée",
      "urgency": "immédiat|court-terme|surveillance|aucun"
    }
  ],
  "olt_health": [
    {"name": "...", "status": "ok|warning|critical|unreachable", "online": number, "total": number, "issues_fr": "..."}
  ],
  "trending": "improving|stable|degrading",
  "next_check_fr": "Recommandation pour le prochain contrôle"
}`

  // User prompt (French, runtime string): the actual data snapshot. The
  // NOTE lines steer the model away from known false positives (Kuma is
  // authoritative for reachability; onuCount=0 means polling unconfigured).
  const userPrompt = `Analyse les logs réseau de la dernière heure:

CONTEXTE: ${context || 'Rapport de situation réseau — résume les problèmes actifs et identifie les causes'}

UPTIME-KUMA — SOURCE DE VÉRITÉ POUR LA JOIGNABILITÉ (${logSummary.kuma.totalMonitors} moniteurs):
- En ligne: ${logSummary.kuma.up} moniteurs
- HORS LIGNE: ${JSON.stringify(logSummary.kuma.down)}
- Lents (>100ms): ${JSON.stringify(logSummary.kuma.slow)}
NOTE: Si Kuma dit qu'un équipement est UP, il est UP. Ne pas contredire Kuma.

ÉVÉNEMENTS SNMP TRAP (${logSummary.totalTraps} événements en ${logSummary.window}):
${JSON.stringify(logSummary.byOlt, null, 1)}

STATUT OLTs — COMPTAGE ONUs (via SNMP polling targo-hub):
${JSON.stringify(logSummary.oltStats, null, 1)}
NOTE: Si onuCount=0 pour un OLT mais que Kuma dit qu'il est UP, c'est que le polling n'est pas configuré, PAS que l'OLT est down.

ONUs BLOQUÉS OFFLINE (${logSummary.stuckOfflineOnus} ONUs dont le dernier événement est offline):
${JSON.stringify(logSummary.stuckOfflineSample, null, 1)}

INCIDENTS ACTIFS (outage monitor):
${JSON.stringify(logSummary.activeIncidents, null, 1)}

Analyse les patterns, identifie les VRAIS problèmes (pas les faux positifs), et donne un rapport synthétique pour l'équipe.
Si tout est normal (Kuma OK, pas de clusters d'ONUs offline, pas d'alarmes), dis-le clairement avec severity "normal".`

  try {
    const result = await aiCall(systemPrompt, userPrompt, { maxTokens: 16384 })
    // Attach raw counts so callers can sanity-check the AI verdict.
    result._logSummary = {
      totalTraps: logSummary.totalTraps,
      oltCount: logSummary.oltStats.length,
      stuckOffline: logSummary.stuckOfflineOnus,
      incidentCount: logSummary.activeIncidents.length,
    }
    return result
  } catch (e) {
    log(`Network AI analysis failed: ${e.message}`)
    // Degraded fallback: same response shape, built from polling stats only,
    // so dashboards keep rendering when the AI backend is down.
    return {
      summary_fr: `Analyse IA indisponible: ${e.message}`,
      severity: 'unknown',
      network_health_score: null,
      active_issues: [],
      olt_health: logSummary.oltStats.map(o => ({
        name: o.name, status: o.reachable ? 'ok' : 'unreachable',
        online: o.onlineCount, total: o.onuCount,
        issues_fr: o.lastError || 'OK',
      })),
      _logSummary: {
        totalTraps: logSummary.totalTraps,
        oltCount: logSummary.oltStats.length,
        stuckOffline: logSummary.stuckOfflineOnus,
      },
      _error: e.message,
    }
  }
}
|
|
|
|
// ── Network Map (static topology + live enrichment) ───────────────────────
|
|
|
|
// Memoized static topology — loaded at most once per process.
let _staticTopo = null

/**
 * Load data/network-topology.json and memoize the parsed result.
 *
 * On any failure (missing file, bad JSON) the error is logged and an empty
 * topology `{ sites: {}, links: [], rings: [] }` is returned; the failure
 * is NOT cached, so the next call retries the read.
 *
 * @returns {{sites: object, links: Array, rings: Array}}
 */
function loadStaticTopology () {
  if (_staticTopo) return _staticTopo
  try {
    const fs = require('fs')
    const path = require('path')
    const topoFile = path.join(__dirname, '..', 'data', 'network-topology.json')
    const topo = JSON.parse(fs.readFileSync(topoFile, 'utf8'))
    _staticTopo = topo
    log(`Static topology loaded: ${Object.keys(topo.sites).length} sites, ${topo.links.length} links`)
    return topo
  } catch (e) {
    log(`Failed to load static topology: ${e.message}`)
    return { sites: {}, links: [], rings: [] }
  }
}
|
|
|
|
// Memoized result of getNetworkMap() — rebuilt at most once per TTL so the
// dashboard can poll freely without hammering Kuma/SNMP.
let _mapCache = null
let _mapCacheTs = 0
const MAP_CACHE_TTL = 60000 // 1 min
|
|
|
|
/**
 * Build the live network map: static topology (sites/links/rings from
 * data/network-topology.json) enriched with Uptime-Kuma reachability and
 * OLT SNMP polling stats. Memoized for MAP_CACHE_TTL ms.
 *
 * Fixes vs previous version:
 * - The link-status ternary `(fromUp && toUp) ? 'up' : (!fromUp || !toUp)
 *   ? 'degraded' : 'unknown'` made the 'unknown' arm unreachable (the second
 *   condition is always true when the first is false). Links to missing or
 *   unknown-status sites are now reported as 'unknown'.
 * - Removed the unused `allDeviceIps` accumulator.
 * - Guard against Kuma monitors without a `name` when extracting an IP.
 *
 * @returns {Promise<object>} { sites, links, rings, ts, summary }
 */
async function getNetworkMap () {
  const now = Date.now()
  if (_mapCache && (now - _mapCacheTs) < MAP_CACHE_TTL) return _mapCache

  const topo = loadStaticTopology()
  // Kuma is best-effort: the map still renders without reachability data.
  const kumaMonitors = await fetchKumaMonitors().catch(() => [])

  // Build Kuma lookup by IP: strip scheme/port/path from hostname, and fall
  // back to an IPv4 embedded in the monitor name (never overriding a
  // hostname-based entry).
  const kumaByIp = new Map()
  for (const m of kumaMonitors) {
    if (m.hostname) {
      const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '')
      kumaByIp.set(clean, m)
    }
    const ipInName = (m.name || '').match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/)
    if (ipInName && !kumaByIp.has(ipInName[1])) kumaByIp.set(ipInName[1], m)
  }

  // OLT stats are best-effort too (module may be absent or unconfigured).
  let oltStats = []
  try {
    const oltSnmp = require('./olt-snmp')
    oltStats = oltSnmp.getOltStats() || []
  } catch (e) { /* ok */ }
  const oltByIp = new Map()
  for (const o of oltStats) { if (o.host) oltByIp.set(o.host, o) }

  // Enrich each site with live data
  const sites = {}

  for (const [siteId, site] of Object.entries(topo.sites)) {
    const enrichedDevices = (site.devices || []).map(dev => {
      const kuma = kumaByIp.get(dev.ip)
      const olt = oltByIp.get(dev.ip)
      return {
        ...dev,
        kuma: kuma ? { status: kuma.status, name: kuma.name, responseTime: kuma.responseTime } : null,
        olt: olt ? { name: olt.name, onuCount: olt.onuCount, onlineCount: olt.onlineCount, reachable: olt.reachable } : null,
      }
    })

    // Site-level status: Kuma wins over OLT polling; any down device makes
    // the whole site 'degraded'.
    const deviceStatuses = enrichedDevices.map(d => {
      if (d.kuma) return d.kuma.status === 1 ? 'up' : d.kuma.status === 0 ? 'down' : 'pending'
      if (d.olt) return d.olt.reachable ? 'up' : 'down'
      return 'unknown'
    })
    const hasDown = deviceStatuses.includes('down')
    const hasUp = deviceStatuses.includes('up')
    const siteStatus = hasDown ? 'degraded' : hasUp ? 'up' : 'unknown'

    // ONU totals for the site
    const onuTotal = enrichedDevices.reduce((s, d) => s + (d.olt?.onuCount || 0), 0)
    const onuOnline = enrichedDevices.reduce((s, d) => s + (d.olt?.onlineCount || 0), 0)

    sites[siteId] = {
      ...site,
      devices: enrichedDevices,
      status: siteStatus,
      onuTotal,
      onuOnline,
    }
  }

  // Link status: both endpoints up => 'up'; either endpoint with a known
  // problem (degraded) => 'degraded'; missing/unknown endpoint => 'unknown'.
  const links = (topo.links || []).map(link => {
    const fromStatus = sites[link.from]?.status
    const toStatus = sites[link.to]?.status
    let status = 'unknown'
    if (fromStatus === 'up' && toStatus === 'up') status = 'up'
    else if (fromStatus === 'degraded' || toStatus === 'degraded') status = 'degraded'
    return { ...link, status }
  })

  const result = {
    sites,
    links,
    rings: topo.rings || [],
    ts: new Date().toISOString(),
    summary: {
      totalSites: Object.keys(sites).length,
      totalLinks: links.length,
      totalOlts: Object.values(sites).flatMap(s => s.devices).filter(d => d.type === 'olt').length,
      totalOnus: Object.values(sites).reduce((s, site) => s + site.onuTotal, 0),
      totalOnusOnline: Object.values(sites).reduce((s, site) => s + site.onuOnline, 0),
      sitesUp: Object.values(sites).filter(s => s.status === 'up').length,
      sitesDown: Object.values(sites).filter(s => s.status === 'degraded').length,
    },
  }

  _mapCache = result
  _mapCacheTs = now
  return result
}
|
|
|
|
// ── HTTP handler ───────────────────────────────────────────────────────────
|
|
|
|
/**
 * HTTP dispatcher for all /network/* endpoints.
 *
 * Routes:
 *   GET  /network/topology        — discovered devices and links
 *   GET  /network/dependency-map  — dependency chain with OLT data
 *   GET  /network/health/:host    — interface health for one device
 *   GET  /network/errors          — error spikes (?minutes=60)
 *   GET  /network/kuma            — Uptime Kuma monitor status
 *   GET  /network/logs            — parsed SNMP trap summary (?minutes=60)
 *   GET  /network/traps           — raw parsed traps (?minutes=60&limit=200)
 *   POST /network/analyze         — AI root cause analysis ({ context })
 *   GET  /network/map             — static topology + live enrichment
 *   GET  /network/influx          — read-only InfluxDB proxy (?db=&q=)
 *
 * Every route responds JSON; handler errors surface as { error } with 500.
 *
 * Security fix on /network/influx: the old check only validated that the
 * query STARTED with SELECT/SHOW, which still allowed writes via
 * multi-statement queries ("SHOW ...; DROP DATABASE x" — InfluxQL accepts
 * semicolon-delimited statements) and via "SELECT ... INTO" (which writes
 * to a measurement). Both are now rejected.
 *
 * @param {http.IncomingMessage} req
 * @param {http.ServerResponse} res
 * @param {string} method - HTTP verb
 * @param {string} urlPath - request path (expected to start with /network/)
 */
async function handle (req, res, method, urlPath) {
  const sub = urlPath.replace('/network/', '')

  // Integer query-param helper: explicit radix 10; missing/zero/NaN values
  // fall back (matches the previous `parseInt(...) || fallback` behavior).
  const intParam = (name, fallback) => {
    const url = new URL(req.url, 'http://localhost')
    return parseInt(url.searchParams.get(name), 10) || fallback
  }

  // GET /network/topology — discover network devices and links
  if (sub === 'topology' && method === 'GET') {
    try {
      const topology = await discoverTopology()
      return json(res, 200, topology)
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/dependency-map — full dependency chain with OLT data
  if (sub === 'dependency-map' && method === 'GET') {
    try {
      const depMap = await buildDependencyMap()
      return json(res, 200, depMap)
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/health/:host — interface health for a specific device
  if (sub.startsWith('health/') && method === 'GET') {
    try {
      const host = decodeURIComponent(sub.replace('health/', ''))
      const health = await getInterfaceHealth(host)
      return json(res, 200, { host, interfaces: health })
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/errors — recent error spikes across the network
  if (sub === 'errors' && method === 'GET') {
    try {
      const spikes = await getErrorSpikes(intParam('minutes', 60))
      return json(res, 200, { spikes, count: spikes.length })
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/kuma — Uptime Kuma monitor status (source of truth)
  if (sub === 'kuma' && method === 'GET') {
    try {
      const monitors = await fetchKumaMonitors()
      const down = monitors.filter(m => m.status === 0 && m.type !== 'group')
      const up = monitors.filter(m => m.status === 1 && m.type !== 'group')
      return json(res, 200, {
        total: monitors.filter(m => m.type !== 'group').length,
        up: up.length,
        down: down.length,
        downList: down,
        groups: monitors.filter(m => m.type === 'group').map(m => ({ name: m.name, status: m.status })),
        monitors: monitors.filter(m => m.type !== 'group'),
      })
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/logs — recent SNMP trap events (parsed)
  if (sub === 'logs' && method === 'GET') {
    try {
      const summary = await buildLogSummary(intParam('minutes', 60))
      return json(res, 200, summary)
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/traps — raw recent SNMP traps (parsed into events)
  if (sub === 'traps' && method === 'GET') {
    try {
      const traps = await getRecentTraps(intParam('minutes', 60), intParam('limit', 200))
      return json(res, 200, { events: traps, count: traps.length })
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // POST /network/analyze — AI root cause analysis
  if (sub === 'analyze' && method === 'POST') {
    try {
      const body = await parseBody(req)
      const analysis = await analyzeNetworkIssue(body.context || '')
      return json(res, 200, analysis)
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/map — static topology from Excel + live Kuma/OLT enrichment
  if (sub === 'map' && method === 'GET') {
    try {
      const map = await getNetworkMap()
      return json(res, 200, map)
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  // GET /network/influx?db=telegraf&q=SELECT... — raw InfluxDB proxy (for ops dashboard)
  if (sub === 'influx' && method === 'GET') {
    try {
      const url = new URL(req.url, 'http://localhost')
      const db = url.searchParams.get('db') || 'telegraf'
      const q = url.searchParams.get('q')
      if (!q) return json(res, 400, { error: 'Missing q parameter' })
      // Safety: this proxy must be read-only.
      const ql = q.trim().toUpperCase()
      if (!ql.startsWith('SELECT') && !ql.startsWith('SHOW')) {
        return json(res, 403, { error: 'Only SELECT and SHOW queries allowed' })
      }
      // InfluxQL accepts semicolon-delimited statement chains, and
      // SELECT ... INTO writes to a measurement — reject both.
      if (ql.includes(';') || /\bINTO\b/.test(ql)) {
        return json(res, 403, { error: 'Multi-statement and SELECT INTO queries not allowed' })
      }
      const result = await influxQuery(db, q)
      return json(res, 200, result)
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  return json(res, 404, { error: 'Network endpoint not found' })
}
|
|
|
|
// Public API: `handle` is the HTTP entry point (mounted under /network/*);
// the remaining functions are exported for reuse by other modules
// (schedulers, chat tools, dashboards) and for tests.
module.exports = {
  handle,
  // topology & dependency graph
  discoverTopology,
  buildDependencyMap,
  // log / trap aggregation
  buildLogSummary,
  getRecentTraps,
  // live status sources
  fetchKumaMonitors,
  getInterfaceHealth,
  getNetworkMap,
  getErrorSpikes,
  // AI analysis
  analyzeNetworkIssue,
  // raw InfluxDB access
  influxQuery,
  extractSeries,
}
|