gigafibre-fsm/services/targo-hub/lib/network-intel.js
louispaulb 607ea54b5c refactor: reduce token count, DRY code, consolidate docs
Backend services:
- targo-hub: extract deepGetValue to helpers.js, DRY disconnect reasons
  lookup map, compact CAPABILITIES, consolidate vision.js prompts/schemas,
  extract dispatch scoring weights, trim section dividers across 9 files
- modem-bridge: extract getSession() helper (6 occurrences), resetIdleTimer(),
  consolidate DM query factory, fix duplicate username fill bug, trim headers
  (server.js -36%, tplink-session.js -47%, docker-compose.yml -57%)

Frontend:
- useWifiDiagnostic: extract THRESHOLDS const, split processDiagnostic into
  6 focused helpers (processOnlineStatus, processWanIPs, processRadios,
  processMeshNodes, processClients, checkRadioIssues)
- EquipmentDetail: merge duplicate ROLE_LABELS, remove verbose comments

Documentation (17 → 13 files, -1,400 lines):
- New consolidated README.md (architecture, services, dependencies, auth)
- Merge ECOSYSTEM-OVERVIEW into ARCHITECTURE.md
- Merge MIGRATION-PLAN + ARCHITECTURE-COMPARE + FIELD-GAP + CHANGELOG → MIGRATION.md
- Merge COMPETITIVE-ANALYSIS into PLATFORM-STRATEGY.md
- Update ROADMAP.md with current phase status
- Delete CONTEXT.md (absorbed into README)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-13 08:39:58 -04:00

1222 lines
46 KiB
JavaScript

'use strict'
/**
* Network Intelligence — AI-powered log analysis via InfluxDB + Grafana
*
* Queries InfluxDB (telegraf) for real-time network metrics:
* - Interface status (up/down) across all routers, switches, OLTs
* - Traffic patterns (ingress/egress) for anomaly detection
* - Error counters (CRC, drops, collisions) for fiber/cable issues
* - SNMP trap history for correlating events
* - LLDP/CDP neighbor discovery for topology links
*
* Then feeds these to Gemini for root cause analysis instead of making
* humans stare at 20 Grafana dashboards trying to correlate mentally.
*
* InfluxDB: 10.5.2.65:8086 (no auth, InfluxQL v1)
* Databases: telegraf (live SNMP), OLT_Logs (syslog), graphite (FastNetMon)
*/
const cfg = require('./config')
const { log, json, parseBody } = require('./helpers')
const http = require('http')
const https = require('https')
const INFLUX = cfg.INFLUXDB_URL || 'http://10.5.2.65:8086'
const KUMA_METRICS_URL = cfg.KUMA_METRICS_URL || 'https://kuma.targo.ca/metrics'
// ── InfluxDB query helper ──────────────────────────────────────────────────
/**
 * Run an InfluxQL query against the configured InfluxDB (v1 API, no auth).
 * @param {string} db - database name ('telegraf', 'OLT_Logs', 'graphite')
 * @param {string} query - InfluxQL statement
 * @returns {Promise<object>} parsed JSON body of the /query response
 * @throws {Error} on network error, timeout, or unparseable response body
 */
async function influxQuery (db, query) {
  const url = `${INFLUX}/query?db=${encodeURIComponent(db)}&q=${encodeURIComponent(query)}`
  return new Promise((resolve, reject) => {
    // Use the module-level https import instead of re-requiring it per call.
    const proto = url.startsWith('https') ? https : http
    const req = proto.get(url, { timeout: 15000 }, (res) => {
      let data = ''
      res.on('data', c => data += c)
      res.on('end', () => {
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          reject(new Error(`InfluxDB parse error: ${e.message}`))
        }
      })
    })
    // The 'timeout' option only emits an event — without destroying the
    // request here, a stalled connection would hang forever instead of
    // aborting after 15 s. destroy() routes into the 'error' handler.
    req.on('timeout', () => req.destroy(new Error('InfluxDB query timeout (15s)')))
    req.on('error', reject)
  })
}
/**
 * Flatten an InfluxDB /query JSON response into plain row objects.
 * Each row maps column name → value, then gets `_measurement` (the series
 * name) and any series tags merged in as additional keys.
 * @param {object} result - parsed InfluxDB response ({ results: [...] })
 * @returns {Array<object>} one object per data point, empty if no series
 */
function extractSeries (result) {
  const rows = []
  for (const resultSet of (result.results || [])) {
    for (const series of (resultSet.series || [])) {
      const columns = series.columns || []
      for (const values of (series.values || [])) {
        const row = {}
        columns.forEach((col, idx) => { row[col] = values[idx] })
        row._measurement = series.name
        if (series.tags) Object.assign(row, series.tags)
        rows.push(row)
      }
    }
  }
  return rows
}
// ── Kuma Prometheus metrics scraper ────────────────────────────────────────
// Module-level cache: scrapes are reused for KUMA_CACHE_TTL; on fetch
// failure the stale cache is returned so callers always get a best effort.
let _kumaCache = null
let _kumaCacheTs = 0
const KUMA_CACHE_TTL = 60000 // 1 min
/**
 * Scrape Uptime-Kuma's Prometheus /metrics endpoint and parse
 * `monitor_status{...}` / `monitor_response_time{...}` lines.
 * Never rejects: on error it resolves with the stale cache (or []).
 * @returns {Promise<Array<{id:string,name:string,type:string,hostname:?string,status:number,responseTime:?number}>>}
 *          status: 1=up, 0=down, 2=pending, -1=unparsed
 */
async function fetchKumaMonitors () {
  const now = Date.now()
  if (_kumaCache && (now - _kumaCacheTs) < KUMA_CACHE_TTL) return _kumaCache
  return new Promise((resolve) => {
    const proto = KUMA_METRICS_URL.startsWith('https') ? https : http
    // rejectUnauthorized:false — Kuma endpoint presumably serves an
    // internal/self-signed cert; TODO confirm this is intentional.
    const req = proto.get(KUMA_METRICS_URL, { timeout: 10000, rejectUnauthorized: false }, (res) => {
      let data = ''
      res.on('data', c => data += c)
      res.on('end', () => {
        const monitors = []
        const lines = data.split('\n')
        // Parse Prometheus exposition format: metric_name{labels} value
        const statusLines = lines.filter(l => l.startsWith('monitor_status{'))
        const rtLines = lines.filter(l => l.startsWith('monitor_response_time{'))
        const rtMap = {}
        for (const line of rtLines) {
          const m = line.match(/monitor_id="(\d+)".*}\s+([\d.]+)/)
          if (m) rtMap[m[1]] = parseFloat(m[2])
        }
        for (const line of statusLines) {
          const idM = line.match(/monitor_id="(\d+)"/)
          const nameM = line.match(/monitor_name="([^"]*)"/)
          const typeM = line.match(/monitor_type="([^"]*)"/)
          const hostM = line.match(/monitor_hostname="([^"]*)"/)
          const valM = line.match(/}\s+([\d.]+)$/)
          if (!idM || !nameM) continue
          const id = idM[1]
          const hostname = hostM ? hostM[1] : null
          monitors.push({
            id,
            name: nameM[1],
            type: typeM ? typeM[1] : 'unknown',
            hostname: hostname === 'null' ? null : hostname, // Kuma emits the literal string "null"
            status: valM ? parseInt(valM[1], 10) : -1, // 1=up, 0=down, 2=pending
            responseTime: rtMap[id] ?? null, // ?? preserves a legitimate 0 ms reading
          })
        }
        log(`Kuma: scraped ${monitors.length} monitors, ${monitors.filter(m => m.status === 0).length} down`)
        _kumaCache = monitors
        _kumaCacheTs = now
        resolve(monitors)
      })
    })
    // The 'timeout' option alone only emits an event; without destroying the
    // request a stalled scrape would hang. destroy() feeds the 'error' path.
    req.on('timeout', () => req.destroy(new Error('Kuma metrics timeout (10s)')))
    req.on('error', (e) => {
      log(`Kuma metrics fetch failed: ${e.message}`)
      resolve(_kumaCache || []) // Return stale cache on error
    })
  })
}
// ── Network topology discovery ─────────────────────────────────────────────
// Module-level cache: discovery fans out several InfluxDB queries, so the
// assembled topology is reused for TOPOLOGY_CACHE_TTL before re-discovery.
let _topologyCache = null
let _topologyCacheTs = 0
const TOPOLOGY_CACHE_TTL = 300000 // 5 min
// Known network hierarchy (based on Targo/Gigafibre architecture)
// Core → Edge Router → OLT Gateway → OLTs → ONUs
// level = depth from the core (0 = core); used to trace dependency chains
// toward the core. label/icon are display metadata for the frontend
// (icon names look like Material Symbols identifiers — TODO confirm).
const DEVICE_HIERARCHY = {
'core-switch': { level: 0, label: 'Core Switch', icon: 'hub' },
'edge-router': { level: 1, label: 'Edge Router', icon: 'router' },
'olt-gateway': { level: 2, label: 'OLT Gateway', icon: 'device_hub' },
'public-infra': { level: 1, label: 'Public Infra', icon: 'public' },
'server': { level: 3, label: 'Server', icon: 'dns' },
'unknown': { level: 4, label: 'Unknown', icon: 'help' },
}
/**
 * Classify a device by its management-IP prefix.
 * @param {string} host - agent IP address as tagged by telegraf
 * @returns {string} one of the DEVICE_HIERARCHY keys
 */
function classifyDevice (host) {
  // First matching rule wins; order mirrors specificity of the address plan.
  const rules = [
    ['olt-gateway', (h) => h.startsWith('172.17.')],
    ['edge-router', (h) => /^10\.10[1-6]\./.test(h)],
    ['core-switch', (h) => h.startsWith('10.255.')],
    ['server', (h) => h.startsWith('10.100.')],
    ['public-infra', (h) => ['23.159.', '100.64.', '96.125.'].some(p => h.startsWith(p))],
  ]
  for (const [deviceType, matches] of rules) {
    if (matches(host)) return deviceType
  }
  return 'unknown'
}
// Subnet-based parent inference: which core switch likely serves which edge router
// Based on network architecture: core switches on 10.255.x.x connect to edge routers 10.101-106.x.x
/**
 * Infer parent/child links between discovered devices. Heuristics only —
 * no routing tables are consulted:
 *   core→edge   : core interface descriptions mentioning the edge subnet
 *                 octet, falling back to round-robin by subnet number
 *   edge→olt-gw : first edge router as a placeholder parent
 *   gw→olt      : /16 prefix match against live stats from ./olt-snmp
 * @param {Array<object>} devices - records with { host, type, interfaces }
 * @returns {Array<object>} links { from, to, type, via, status, ... }
 */
function inferLinks (devices) {
const links = []
// Bucket devices by classified type for the per-tier passes below.
const byType = {}
for (const d of devices) {
if (!byType[d.type]) byType[d.type] = []
byType[d.type].push(d)
}
const coreDevices = byType['core-switch'] || []
const edgeDevices = byType['edge-router'] || []
const oltGateways = byType['olt-gateway'] || []
// Infer Core → Edge links by analyzing interface descriptions
// Core switches (10.255.x.x) have uplinks to edge routers with descriptions
// mentioning the router hostname or IP. Fallback: subnet-based matching.
for (const edge of edgeDevices) {
// Find the core switch most likely connected (by matching 3rd octet patterns)
// Example: edge 10.101.0.1 → core 10.255.254.1 or 10.255.254.2
// For now, link to all core switches (the AI analysis will refine)
let linked = false
for (const core of coreDevices) {
// Check if any core interface description mentions the edge IP or subnet
// NOTE(review): `break` only exits the interface loop, so an edge whose
// octet appears in several cores' descriptions gets multiple links —
// verify this duplication is intended.
for (const iface of core.interfaces) {
const d = iface.descr || ''
// Common patterns: "GigaEthernet0/1 - to-router-101", "ae0.101"
const edgeOctet = edge.host.split('.')[1] // "101", "102", etc.
if (d.includes(edgeOctet) || d.toLowerCase().includes(edge.host)) {
links.push({ from: core.host, to: edge.host, type: 'core-to-edge', via: d, status: iface.status })
linked = true
break
}
}
}
// Fallback: distribute edge routers across core switches based on subnet patterns
// 10.101.x → first core, 10.102.x → second core, etc.
if (!linked && coreDevices.length > 0) {
const edgeSubnet = parseInt(edge.host.split('.')[1]) // 101, 102, 103...
const sectorIdx = (edgeSubnet - 101) % coreDevices.length
const assignedCore = coreDevices[Math.max(0, Math.min(sectorIdx, coreDevices.length - 1))]
links.push({ from: assignedCore.host, to: edge.host, type: 'core-to-edge', via: `inferred (subnet .${edgeSubnet})`, status: 'unknown' })
}
}
// Infer Edge → OLT Gateway links by IP range proximity
// OLT gateways (172.17.x.x) connect through edge routers
for (const gw of oltGateways) {
// The gateway's management interface often routes through a specific edge router
// For now we infer by matching OLT registered names to edge subnet areas
if (edgeDevices.length > 0) {
links.push({ from: edgeDevices[0].host, to: gw.host, type: 'edge-to-olt', via: 'inferred', status: 'unknown' })
} else if (coreDevices.length > 0) {
links.push({ from: coreDevices[0].host, to: gw.host, type: 'core-to-olt', via: 'inferred', status: 'unknown' })
}
}
// OLT Gateway → OLTs (from olt-snmp module)
// Best-effort: if olt-snmp is unavailable these links are simply skipped.
try {
const oltSnmp = require('./olt-snmp')
const oltStats = oltSnmp.getOltStats()
for (const olt of (oltStats || [])) {
// Match OLT host to nearest gateway
const oltIp = olt.host
if (!oltIp) continue
let bestGw = null
for (const gw of oltGateways) {
// Same /16 subnet?
const gwParts = gw.host.split('.')
const oltParts = oltIp.split('.')
if (gwParts[0] === oltParts[0] && gwParts[1] === oltParts[1]) {
bestGw = gw
break
}
}
if (bestGw) {
links.push({
from: bestGw.host, to: oltIp, type: 'gateway-to-olt',
via: olt.name || oltIp,
status: olt.reachable ? 'up' : 'down',
oltName: olt.name,
onuCount: olt.onuCount || 0,
onlineCount: olt.onlineCount || 0,
})
}
}
} catch (e) {
log(`OLT link inference failed: ${e.message}`)
}
return links
}
/**
 * Discover the network topology from telegraf SNMP data in InfluxDB,
 * enriched with Kuma reachability and traffic counters. The assembled
 * topology is cached for TOPOLOGY_CACHE_TTL (5 min).
 *
 * Pipeline:
 *   1. enumerate agent_hosts from 'interface' and 'ifTable'
 *   2-3. per-interface oper status / speed from both measurements
 *   4. LLDP neighbors (optional — real link discovery)
 *   5. ifXTable octet counters (optional — traffic totals)
 *   6. Kuma monitors (reachability source of truth)
 * then classifies devices, computes a health score, and infers links.
 *
 * @returns {Promise<object>} { devices, links, deviceCount, linkCount, ts, summary }
 */
async function discoverTopology () {
const now = Date.now()
if (_topologyCache && (now - _topologyCacheTs) < TOPOLOGY_CACHE_TTL) return _topologyCache
log('Network topology discovery starting...')
const devices = new Map() // agent_host → { interfaces: [], ... }
// Step 1: Get list of agent_hosts from BOTH 'interface' and 'ifTable' measurements
// Edge routers may only appear in ifTable (different telegraf collector)
const measurements = ['interface', 'ifTable']
for (const m of measurements) {
try {
const hostResult = await influxQuery('telegraf',
`SHOW TAG VALUES FROM "${m}" WITH KEY = "agent_host"`
)
for (const r of (hostResult.results || [])) {
for (const s of (r.series || [])) {
for (const v of (s.values || [])) {
// SHOW TAG VALUES rows are [key, value] — the tag value is index 1
const host = v[1]
if (host && !devices.has(host)) {
devices.set(host, { host, interfaces: [], type: 'unknown', source: m })
}
}
}
}
} catch (e) {
log(`Topology host discovery from ${m} failed: ${e.message}`)
}
}
// Step 2: Get interface summary from 'interface' measurement
try {
const ifResult = await influxQuery('telegraf',
`SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed"
FROM "interface" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"`
)
for (const r of (ifResult.results || [])) {
for (const s of (r.series || [])) {
const agentHost = s.tags?.agent_host
if (!agentHost || !devices.has(agentHost)) continue
const dev = devices.get(agentHost)
for (const v of (s.values || [])) {
dev.interfaces.push({
descr: s.tags?.ifDescr || '',
// ifOperStatus convention: 1=up, 2=down; anything else is unknown
status: v[1] === 1 ? 'up' : v[1] === 2 ? 'down' : 'unknown',
speed: v[2],
source: 'interface',
})
}
}
}
} catch (e) {
log(`Topology interface query failed: ${e.message}`)
}
// Step 3: Get interface summary from 'ifTable' for devices not yet covered
// (edge routers, additional switches that only report via ifTable)
try {
const ifResult = await influxQuery('telegraf',
`SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed"
FROM "ifTable" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"`
)
for (const r of (ifResult.results || [])) {
for (const s of (r.series || [])) {
const agentHost = s.tags?.agent_host
if (!agentHost) continue
if (!devices.has(agentHost)) {
devices.set(agentHost, { host: agentHost, interfaces: [], type: 'unknown', source: 'ifTable' })
}
const dev = devices.get(agentHost)
// Only add if we don't already have interfaces from 'interface' measurement
const hasIfData = dev.interfaces.some(i => i.source === 'interface')
for (const v of (s.values || [])) {
const ifDescr = s.tags?.ifDescr || ''
// Skip if already have this interface from 'interface' measurement
if (hasIfData && dev.interfaces.some(i => i.descr === ifDescr && i.source === 'interface')) continue
dev.interfaces.push({
descr: ifDescr,
status: v[1] === 1 ? 'up' : v[1] === 2 ? 'down' : 'unknown',
speed: v[2],
source: 'ifTable',
})
}
}
}
} catch (e) {
log(`Topology ifTable query failed: ${e.message}`)
}
// Step 4: Try LLDP neighbor data for real link discovery
let lldpLinks = []
try {
const lldpResult = await influxQuery('telegraf',
`SELECT last("lldpRemSysName") AS "remoteName", last("lldpRemPortDesc") AS "remotePort"
FROM "lldpRemTable" WHERE time > now() - 1h GROUP BY "agent_host", "lldpRemIndex"`
)
for (const r of (lldpResult.results || [])) {
for (const s of (r.series || [])) {
const localHost = s.tags?.agent_host
for (const v of (s.values || [])) {
if (v[1]) {
lldpLinks.push({ localHost, remoteName: v[1], remotePort: v[2] || '' })
}
}
}
}
} catch (e) {
// LLDP not available — fine, we'll infer links
}
// Step 5: Get traffic data for bandwidth utilization (top talkers)
const trafficMap = new Map()
try {
const trResult = await influxQuery('telegraf',
`SELECT last("ifHCInOctets") AS "inOctets", last("ifHCOutOctets") AS "outOctets"
FROM "ifXTable" WHERE time > now() - 10m GROUP BY "agent_host", "ifDescr"`
)
for (const r of (trResult.results || [])) {
for (const s of (r.series || [])) {
const host = s.tags?.agent_host
if (!host) continue
if (!trafficMap.has(host)) trafficMap.set(host, { totalIn: 0, totalOut: 0, topIf: [] })
const t = trafficMap.get(host)
for (const v of (s.values || [])) {
const inO = v[1] || 0
const outO = v[2] || 0
t.totalIn += inO
t.totalOut += outO
if (inO + outO > 0) {
t.topIf.push({ descr: s.tags?.ifDescr, inOctets: inO, outOctets: outO })
}
}
}
}
} catch (e) {
// Traffic data optional
}
// Step 6: Fetch Kuma monitor data for real reachability
let kumaMonitors = []
try {
kumaMonitors = await fetchKumaMonitors()
} catch (e) {
log(`Topology Kuma fetch failed: ${e.message}`)
}
// Build lookup: IP/hostname → Kuma monitor
const kumaByHost = new Map()
for (const m of kumaMonitors) {
if (m.hostname) {
// Strip protocol and port if present (e.g. "http://10.5.2.65:8086" → "10.5.2.65")
const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '')
kumaByHost.set(clean, m)
}
// Also index by monitor name in case it contains an IP
const ipInName = m.name.match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/)
if (ipInName && !kumaByHost.has(ipInName[1])) {
kumaByHost.set(ipInName[1], m)
}
}
// Classify devices by their host tag patterns
for (const [host, dev] of devices) {
dev.type = classifyDevice(host)
dev.level = DEVICE_HIERARCHY[dev.type]?.level ?? 4
dev.label = DEVICE_HIERARCHY[dev.type]?.label ?? 'Unknown'
dev.icon = DEVICE_HIERARCHY[dev.type]?.icon ?? 'help'
// Interface summary stats — only count "active" interfaces (those with speed > 0 or status up)
// Unused switch ports (down + speed 0) are not problems, just empty ports
const activeIfs = dev.interfaces.filter(i => i.status === 'up' || (i.speed && i.speed > 0))
const up = dev.interfaces.filter(i => i.status === 'up').length
const down = activeIfs.filter(i => i.status === 'down').length // only active-but-down ports matter
const total = dev.interfaces.length
const activeTotal = activeIfs.length
const ifHealth = activeTotal > 0 ? Math.round((up / activeTotal) * 100) : (total > 0 ? 0 : null)
// Kuma reachability — source of truth for device being up/down
const kuma = kumaByHost.get(host)
const kumaUp = kuma ? kuma.status === 1 : null // null = not monitored in Kuma
const kumaRt = kuma ? kuma.responseTime : null
// Final health: Kuma UP + has active interfaces → use interface health
// Kuma UP + no active interfaces → 100% (device reachable, just no SNMP polling)
// Kuma DOWN → 0%
// No Kuma data → fall back to interface health
let health
if (kumaUp === true) {
health = activeTotal > 0 ? Math.max(ifHealth, 80) : 100 // reachable = at least 80%
} else if (kumaUp === false) {
health = 0
} else {
health = ifHealth ?? 0
}
dev.ifSummary = { up, down, total, activeTotal, health }
dev.kuma = kuma ? { status: kuma.status, name: kuma.name, responseTime: kumaRt } : null
// Traffic data if available
const traffic = trafficMap.get(host)
if (traffic) {
dev.traffic = {
// 1073741824 = bytes per GiB
totalInGb: Math.round(traffic.totalIn / 1073741824 * 100) / 100,
totalOutGb: Math.round(traffic.totalOut / 1073741824 * 100) / 100,
}
}
}
// Build links (inferred + LLDP-based)
const links = inferLinks([...devices.values()])
// Add LLDP-discovered links
for (const l of lldpLinks) {
// Try to resolve remoteName to an IP in our device list
const remoteDevice = [...devices.values()].find(d =>
d.host === l.remoteName || d.host.includes(l.remoteName)
)
if (remoteDevice) {
// Check if link already exists
const exists = links.some(lk =>
(lk.from === l.localHost && lk.to === remoteDevice.host) ||
(lk.to === l.localHost && lk.from === remoteDevice.host)
)
if (!exists) {
links.push({
from: l.localHost, to: remoteDevice.host, type: 'lldp',
via: `LLDP: ${l.remotePort}`, status: 'up',
})
}
}
}
const topology = {
devices: [...devices.values()],
links,
deviceCount: devices.size,
linkCount: links.length,
ts: new Date().toISOString(),
summary: {
oltGateways: [...devices.values()].filter(d => d.type === 'olt-gateway').length,
edgeRouters: [...devices.values()].filter(d => d.type === 'edge-router').length,
coreSwitches: [...devices.values()].filter(d => d.type === 'core-switch').length,
servers: [...devices.values()].filter(d => d.type === 'server').length,
publicInfra: [...devices.values()].filter(d => d.type === 'public-infra').length,
unknown: [...devices.values()].filter(d => d.type === 'unknown').length,
},
}
log(`Topology discovered: ${topology.deviceCount} devices, ${topology.linkCount} links (${topology.summary.edgeRouters} routers, ${topology.summary.coreSwitches} core, ${topology.summary.oltGateways} OLT gateways)`)
_topologyCache = topology
_topologyCacheTs = now
return topology
}
// ── Interface health snapshot ──────────────────────────────────────────────
/**
 * Per-interface status / error / discard counters for one device over the
 * last 30 min. Tries 'ifTable' first, then 'interface', and returns the
 * rows of the first measurement that has data.
 * @param {string} agentHost - SNMP agent IP as tagged by telegraf
 * @returns {Promise<Array<object>>} rows from extractSeries, [] if no data
 */
async function getInterfaceHealth (agentHost) {
  // Fallback chain over the two measurements; the original `results`
  // accumulator was dead code (only reachable when rows was empty).
  for (const measurement of ['ifTable', 'interface']) {
    try {
      const result = await influxQuery('telegraf',
        `SELECT last("ifOperStatus") AS "status", last("ifInErrors") AS "inErrors", last("ifOutErrors") AS "outErrors",
last("ifInDiscards") AS "inDiscards", last("ifOutDiscards") AS "outDiscards", last("ifSpeed") AS "speed"
FROM "${measurement}" WHERE "agent_host" = '${agentHost}' AND time > now() - 30m GROUP BY "ifIndex", "ifDescr"`
      )
      const rows = extractSeries(result)
      if (rows.length > 0) return rows // Return first measurement that has data
    } catch (e) {
      // Measurement may not exist on this deployment — try the next one
    }
  }
  return []
}
// ── Recent error spikes ────────────────────────────────────────────────────
/**
 * Scan the last `windowMinutes` of ifTable data (5-minute buckets) and
 * report interfaces whose inbound error rate exceeded 10 errors/min.
 * @param {number} windowMinutes - lookback window, default 60
 * @returns {Promise<Array<{time:string,errRate:number,host:?string,ifIndex:?string}>>}
 */
async function getErrorSpikes (windowMinutes = 60) {
  const result = await influxQuery('telegraf',
    `SELECT non_negative_derivative(last("ifInErrors"), 1m) AS "errRate"
FROM "ifTable" WHERE time > now() - ${windowMinutes}m
GROUP BY time(5m), "agent_host", "ifIndex" FILL(0)`
  )
  const spikes = []
  for (const resultSet of (result.results || [])) {
    for (const series of (resultSet.series || [])) {
      const host = series.tags?.agent_host
      const ifIndex = series.tags?.ifIndex
      for (const point of (series.values || [])) {
        const [time, errRate] = point
        if (errRate && errRate > 10) { // >10 errors/min = spike
          spikes.push({ time, errRate, host, ifIndex })
        }
      }
    }
  }
  return spikes
}
// ── Dependency chain builder ──────────────────────────────────────────────
/**
 * Build upstream dependency chains from the discovered topology: for each
 * leaf device (OLT gateway or server) trace toward the core by repeatedly
 * hopping to the lowest-level (closest-to-core) unvisited neighbor, then
 * extend gateway chains with individual OLTs from ./olt-snmp.
 * @returns {Promise<object>} { topology: {devices, links, summary}, chains, ts }
 */
async function buildDependencyMap () {
const topology = await discoverTopology()
// Build adjacency from links
const adj = new Map() // host → [{ target, type, status }]
for (const link of topology.links) {
if (!adj.has(link.from)) adj.set(link.from, [])
adj.get(link.from).push({ target: link.to, ...link })
// Bidirectional
if (!adj.has(link.to)) adj.set(link.to, [])
adj.get(link.to).push({ target: link.from, from: link.to, to: link.from, type: link.type, status: link.status })
}
// Build dependency chains: for each leaf device, trace path to core
const chains = []
const deviceMap = new Map(topology.devices.map(d => [d.host, d]))
for (const dev of topology.devices) {
if (dev.type === 'olt-gateway' || dev.type === 'server') {
// Trace upward to core
const chain = [dev.host]
const visited = new Set([dev.host])
let current = dev.host
// Bounded walk (max 5 hops) guards against cycles the visited set misses
for (let i = 0; i < 5; i++) { // max depth
const neighbors = adj.get(current) || []
// Pick the neighbor with lowest level (closest to core)
let bestNext = null
let bestLevel = 999
for (const n of neighbors) {
const nDev = deviceMap.get(n.target)
if (!nDev || visited.has(n.target)) continue
if (nDev.level < bestLevel) {
bestLevel = nDev.level
bestNext = n.target
}
}
if (!bestNext) break
chain.push(bestNext)
visited.add(bestNext)
current = bestNext
}
chain.reverse() // core → ... → leaf
chains.push({
path: chain.map(h => {
const d = deviceMap.get(h)
return { host: h, type: d?.type, label: d?.label, health: d?.ifSummary?.health }
}),
leaf: dev.host,
leafType: dev.type,
})
}
}
// OLT-specific chains (from olt-snmp module)
// Best-effort: skipped entirely if olt-snmp cannot be loaded
try {
const oltSnmp = require('./olt-snmp')
const stats = oltSnmp.getOltStats()
for (const olt of (stats || [])) {
// Find which gateway this OLT connects through
const gwLink = topology.links.find(l => l.to === olt.host && l.type === 'gateway-to-olt')
if (gwLink) {
// Find the chain for this gateway and extend it
const gwChain = chains.find(c => c.leaf === gwLink.from)
if (gwChain) {
chains.push({
path: [
...gwChain.path,
{ host: olt.host, type: 'olt', label: olt.name || 'OLT', health: olt.reachable ? 100 : 0, onuCount: olt.onuCount, onlineCount: olt.onlineCount },
],
leaf: olt.host,
leafType: 'olt',
oltName: olt.name,
onuCount: olt.onuCount || 0,
onlineCount: olt.onlineCount || 0,
})
}
}
}
} catch (e) {
// OLT data optional
}
return {
topology: {
// Slimmed device records — drop the full interface lists for transport
devices: topology.devices.map(d => ({
host: d.host, type: d.type, level: d.level, label: d.label, icon: d.icon,
ifSummary: d.ifSummary, traffic: d.traffic,
})),
links: topology.links,
summary: topology.summary,
},
chains,
ts: new Date().toISOString(),
}
}
// ── SNMP Trap log reader ──────────────────────────────────────────────────
// Raisecom OLT trap OID → human-readable event type
// Keys are trap names as stored in the snmp_trap measurement (MIB-resolved
// 'enterprises.*' suffixes plus the generic linkDown/linkUp names) —
// NOTE(review): OID-to-event mapping presumably follows the Raisecom and
// TP-Link MIBs; confirm against vendor MIB files before extending.
const RAISECOM_TRAP_MAP = {
'enterprises.8886.18.3.1.4.1': 'onu-register', // ONU registered
'enterprises.8886.18.3.1.4.2': 'onu-deregister', // ONU deregistered (offline)
'enterprises.8886.18.3.1.4.3': 'onu-state-change', // ONU state change
'enterprises.8886.1.27.8.2': 'olt-alarm-power', // Power alarm
'enterprises.8886.1.27.8.6': 'olt-alarm-fan', // Fan alarm
'enterprises.8886.1.27.8.10': 'olt-alarm-temperature',// Temperature alarm
'enterprises.8886.1.27.8.11': 'olt-alarm-los', // Loss of signal
'enterprises.8886.1.27.8.12': 'olt-alarm-dying-gasp',// Dying gasp
'enterprises.8886.1.27.8.17': 'olt-alarm-link-down', // Link down
'enterprises.8886.1.27.8.18': 'olt-alarm-link-up', // Link up
'enterprises.8886.1.27.8.20': 'olt-alarm-onu-los', // ONU loss of signal
'enterprises.8886.1.27.8.21': 'olt-alarm-onu-lof', // ONU loss of frame
'enterprises.8886.1.27.8.28': 'olt-alarm-onu-dying-gasp', // ONU dying gasp
'enterprises.8886.1.27.8.30': 'olt-alarm-onu-power-fail', // ONU power fail
'enterprises.8886.1.27.8.31': 'olt-alarm-onu-offline', // ONU offline
'enterprises.8886.1.27.8.37': 'olt-alarm-fiber-cut', // Fiber cut detected
'enterprises.8886.1.27.8.38': 'olt-alarm-fiber-restored', // Fiber restored
'enterprises.8886.18.3.6.3.11.2': 'gpon-onu-offline',
'enterprises.8886.18.3.6.3.11.3': 'gpon-onu-online',
'enterprises.8886.18.3.6.3.11.4': 'gpon-onu-dying-gasp',
'enterprises.8886.18.3.6.3.11.5': 'gpon-onu-los',
'enterprises.8886.18.3.6.3.11.8': 'gpon-onu-power-fail',
'enterprises.8886.18.3.6.3.11.9': 'gpon-onu-deactivated',
'enterprises.8886.18.3.6.3.11.10': 'gpon-onu-fiber-cut',
'enterprises.8886.18.3.6.3.11.11': 'gpon-onu-fiber-restored',
'enterprises.8886.18.3.6.3.11.12': 'gpon-onu-signal-degraded',
'enterprises.8886.18.3.6.3.11.13': 'gpon-onu-signal-fail',
'enterprises.8886.18.3.6.3.11.14': 'gpon-onu-drift-exceed',
'enterprises.8886.18.3.6.3.11.15': 'gpon-onu-range-exceed',
'enterprises.8886.1.2.1.5': 'raisecom-cold-start',
'enterprises.8886.1.26.1.2.1': 'raisecom-config-change',
'enterprises.8886.1.26.4.4.1': 'raisecom-firmware-event',
'enterprises.17409.2.2.11.1.1.1': 'tplink-olt-alarm',
'linkDown': 'link-down',
'linkUp': 'link-up',
}
/**
 * Read recent rows from the snmp_trap measurement and normalize them into
 * event objects, mapping trap OIDs through RAISECOM_TRAP_MAP and sniffing
 * ONU serial / index out of the varbind fields.
 * @param {number} windowMinutes - lookback window, default 60
 * @param {number} limit - max rows, default 200
 * @returns {Promise<Array<object>>} events, newest first
 */
async function getRecentTraps (windowMinutes = 60, limit = 200) {
  const result = await influxQuery('telegraf',
    `SELECT * FROM snmp_trap WHERE time > now() - ${windowMinutes}m ORDER BY time DESC LIMIT ${limit}`
  )
  // Pick the first varbind value matching a vendor serial pattern.
  const findOnuSerial = (row) => {
    for (const value of Object.values(row)) {
      if (typeof value === 'string' && /^(RCMG|TPLG|GPON)[A-F0-9]{8,}$/i.test(value)) return value
    }
    return null
  }
  // Pick the first numeric enterprise-OID varbind that looks like an ONU index.
  const findOnuIndex = (row) => {
    for (const [key, value] of Object.entries(row)) {
      if (key.startsWith('enterprises.') && typeof value === 'number' && key.includes('.1.1.')) return value
    }
    return null
  }
  const events = []
  for (const resultSet of (result.results || [])) {
    for (const series of (resultSet.series || [])) {
      const cols = series.columns || []
      for (const values of (series.values || [])) {
        const row = {}
        cols.forEach((c, i) => { row[c] = values[i] })
        const trapName = row.name || ''
        events.push({
          time: row.time,
          source: row.source || '', // OLT IP
          event: RAISECOM_TRAP_MAP[trapName] || trapName,
          trapOid: trapName,
          onuSerial: findOnuSerial(row),
          onuIndex: findOnuIndex(row),
          sysUptime: row.sysUpTimeInstance,
        })
      }
    }
  }
  return events
}
// ── Build summary from logs ──────────────────────────────────────────────
/**
 * Aggregate the last `windowMinutes` of SNMP traps, OLT polling stats,
 * outage-monitor incidents, and Kuma reachability into one summary object
 * suitable for feeding to the AI analyzer.
 * @param {number} windowMinutes - lookback window, default 60
 * @returns {Promise<object>} summary (see return literal for shape)
 */
async function buildLogSummary (windowMinutes = 60) {
const traps = await getRecentTraps(windowMinutes, 500)
// Group by OLT source
const byOlt = {}
for (const t of traps) {
if (!byOlt[t.source]) byOlt[t.source] = { events: [], offline: [], online: [], alarms: [] }
byOlt[t.source].events.push(t)
// Classify each event into offline-ish / online-ish buckets by name match
if (t.event.includes('offline') || t.event === 'onu-deregister' || t.event.includes('dying-gasp') || t.event.includes('los') || t.event.includes('power-fail')) {
byOlt[t.source].offline.push(t)
} else if (t.event.includes('online') || t.event === 'onu-register' || t.event === 'link-up') {
byOlt[t.source].online.push(t)
}
if (t.event.startsWith('olt-alarm') || t.event.startsWith('gpon-onu-fiber') || t.event.startsWith('gpon-onu-signal')) {
byOlt[t.source].alarms.push(t)
}
}
// Get OLT SNMP live status (best-effort: empty list if module unavailable)
let oltStats = []
try {
const oltSnmp = require('./olt-snmp')
oltStats = oltSnmp.getOltStats()
} catch (e) {}
// Get outage monitor events (best-effort as well)
let incidents = []
let eventLog = []
try {
const monitor = require('./outage-monitor')
incidents = monitor.getActiveIncidents()
eventLog = monitor.getEventLog(new Date(Date.now() - windowMinutes * 60000).toISOString(), 100)
} catch (e) {}
// ONU-level: find ONUs that went offline and didn't come back
const onuEvents = {}
for (const t of traps) {
if (!t.onuSerial) continue
if (!onuEvents[t.onuSerial]) onuEvents[t.onuSerial] = { serial: t.onuSerial, source: t.source, events: [] }
onuEvents[t.onuSerial].events.push({ time: t.time, event: t.event })
}
// "Stuck offline" = most recent event for the serial is an offline-type one
const stuckOffline = Object.values(onuEvents).filter(o => {
const last = o.events[0] // already sorted desc
return last && (last.event.includes('offline') || last.event.includes('deregister') || last.event.includes('dying-gasp') || last.event.includes('los'))
})
// Get Kuma real-time monitor status (source of truth for device reachability)
let kumaMonitors = []
try {
kumaMonitors = await fetchKumaMonitors()
} catch (e) {
log(`Kuma fetch failed: ${e.message}`)
}
// 'group' monitors are Kuma containers, not real endpoints — excluded
const kumaDown = kumaMonitors.filter(m => m.status === 0 && m.type !== 'group')
const kumaUp = kumaMonitors.filter(m => m.status === 1 && m.type !== 'group')
const kumaSlow = kumaMonitors.filter(m => m.responseTime > 100 && m.type !== 'group')
return {
window: `${windowMinutes}m`,
totalTraps: traps.length,
// Per-OLT rollup, worst (most offline events) first
byOlt: Object.entries(byOlt).map(([ip, data]) => {
const olt = oltStats.find(o => o.host === ip)
return {
oltIp: ip,
oltName: olt?.name || ip,
totalEvents: data.events.length,
offlineEvents: data.offline.length,
onlineEvents: data.online.length,
alarms: data.alarms.length,
recentEvents: data.events.slice(0, 10).map(e => ({
time: e.time, event: e.event, serial: e.onuSerial,
})),
}
}).sort((a, b) => b.offlineEvents - a.offlineEvents),
oltStats: oltStats.map(o => ({
name: o.name, host: o.host,
onuCount: o.onuCount, onlineCount: o.onlineCount,
offlineCount: (o.onuCount || 0) - (o.onlineCount || 0),
lastPoll: o.lastPoll, lastError: o.lastError || null,
})),
stuckOfflineOnus: stuckOffline.length,
stuckOfflineSample: stuckOffline.slice(0, 20).map(o => ({
serial: o.serial, olt: o.source, lastEvent: o.events[0]?.event, lastTime: o.events[0]?.time,
})),
activeIncidents: incidents,
monitorEvents: eventLog.length,
// Kuma — real device reachability (source of truth)
kuma: {
totalMonitors: kumaMonitors.filter(m => m.type !== 'group').length,
up: kumaUp.length,
down: kumaDown.map(m => ({ name: m.name, host: m.hostname, type: m.type })),
slow: kumaSlow.map(m => ({ name: m.name, host: m.hostname, type: m.type, ms: m.responseTime })),
},
ts: new Date().toISOString(),
}
}
// ── AI-powered root cause analysis (LOG-BASED) ───────────────────────────
async function analyzeNetworkIssue (context) {
const { aiCall } = require('./ai')
// Gather real data from logs, not interface status
const logSummary = await buildLogSummary(60)
const systemPrompt = `Tu es un ingénieur réseau senior NOC pour Targo/Gigafibre, un ISP fibre optique (GPON) au Québec.
Tu analyses les LOGS et événements SNMP en temps réel pour diagnostiquer les problèmes réseau.
Architecture réseau:
- OLTs Raisecom et TP-Link desservent les clients via GPON
- Chaque OLT a plusieurs ports, chaque port un splitter, chaque splitter dessert 32-128 ONUs (modems clients)
- Les SNMP traps sont la source de vérité pour les événements réseau
Types d'événements SNMP importants:
- onu-deregister / gpon-onu-offline: ONU déconnecté
- onu-register / gpon-onu-online: ONU reconnecté
- gpon-onu-dying-gasp / olt-alarm-onu-dying-gasp: Perte de courant chez le client
- gpon-onu-los / olt-alarm-onu-los: Perte de signal optique (fibre coupée?)
- gpon-onu-fiber-cut: Coupure fibre détectée
- gpon-onu-power-fail: Panne d'alimentation ONU
- gpon-onu-signal-degraded: Signal optique dégradé (fibre sale, connecteur usé)
- link-down / link-up: Interface réseau up/down
- olt-alarm-power / olt-alarm-temperature: Alarmes OLT
Règles de diagnostic:
1. Beaucoup de dying-gasp sur même OLT en même temps = panne de courant dans le secteur
2. Beaucoup de LOS sur même port = coupure fibre en amont du splitter
3. Dying-gasp isolé = client a débranché ou panne électrique locale
4. OLT unreachable + tous ONUs offline = panne OLT ou lien amont coupé
5. Serials qui vont offline/online en boucle = équipement instable, remplacement nécessaire
6. Signal dégradé = maintenance préventive nécessaire (nettoyage connecteurs)
Réponds en JSON valide:
{
"summary_fr": "Résumé en 2-3 phrases de l'état du réseau",
"severity": "critical|high|medium|low|normal",
"network_health_score": 0-100,
"active_issues": [
{
"type": "power_outage|fiber_cut|olt_failure|unstable_onu|signal_degradation|mass_offline|normal_churn",
"olt": "nom ou IP de l'OLT",
"description_fr": "description du problème",
"affected_customers": number,
"evidence": ["preuve 1", "preuve 2"],
"action_fr": "action recommandée",
"urgency": "immédiat|court-terme|surveillance|aucun"
}
],
"olt_health": [
{"name": "...", "status": "ok|warning|critical|unreachable", "online": number, "total": number, "issues_fr": "..."}
],
"trending": "improving|stable|degrading",
"next_check_fr": "Recommandation pour le prochain contrôle"
}`
const userPrompt = `Analyse les logs réseau de la dernière heure:
CONTEXTE: ${context || 'Rapport de situation réseau — résume les problèmes actifs et identifie les causes'}
UPTIME-KUMA — SOURCE DE VÉRITÉ POUR LA JOIGNABILITÉ (${logSummary.kuma.totalMonitors} moniteurs):
- En ligne: ${logSummary.kuma.up} moniteurs
- HORS LIGNE: ${JSON.stringify(logSummary.kuma.down)}
- Lents (>100ms): ${JSON.stringify(logSummary.kuma.slow)}
NOTE: Si Kuma dit qu'un équipement est UP, il est UP. Ne pas contredire Kuma.
ÉVÉNEMENTS SNMP TRAP (${logSummary.totalTraps} événements en ${logSummary.window}):
${JSON.stringify(logSummary.byOlt, null, 1)}
STATUT OLTs — COMPTAGE ONUs (via SNMP polling targo-hub):
${JSON.stringify(logSummary.oltStats, null, 1)}
NOTE: Si onuCount=0 pour un OLT mais que Kuma dit qu'il est UP, c'est que le polling n'est pas configuré, PAS que l'OLT est down.
ONUs BLOQUÉS OFFLINE (${logSummary.stuckOfflineOnus} ONUs dont le dernier événement est offline):
${JSON.stringify(logSummary.stuckOfflineSample, null, 1)}
INCIDENTS ACTIFS (outage monitor):
${JSON.stringify(logSummary.activeIncidents, null, 1)}
Analyse les patterns, identifie les VRAIS problèmes (pas les faux positifs), et donne un rapport synthétique pour l'équipe.
Si tout est normal (Kuma OK, pas de clusters d'ONUs offline, pas d'alarmes), dis-le clairement avec severity "normal".`
try {
const result = await aiCall(systemPrompt, userPrompt, { maxTokens: 16384 })
result._logSummary = {
totalTraps: logSummary.totalTraps,
oltCount: logSummary.oltStats.length,
stuckOffline: logSummary.stuckOfflineOnus,
incidentCount: logSummary.activeIncidents.length,
}
return result
} catch (e) {
log(`Network AI analysis failed: ${e.message}`)
return {
summary_fr: `Analyse IA indisponible: ${e.message}`,
severity: 'unknown',
network_health_score: null,
active_issues: [],
olt_health: logSummary.oltStats.map(o => ({
name: o.name, status: o.reachable ? 'ok' : 'unreachable',
online: o.onlineCount, total: o.onuCount,
issues_fr: o.lastError || 'OK',
})),
_logSummary: {
totalTraps: logSummary.totalTraps,
oltCount: logSummary.oltStats.length,
stuckOffline: logSummary.stuckOfflineOnus,
},
_error: e.message,
}
}
}
// ── Network Map (static topology + live enrichment) ───────────────────────
let _staticTopo = null

/**
 * Load the hand-maintained network topology document (sites, links, rings)
 * from data/network-topology.json.
 *
 * The parsed object is memoized in `_staticTopo` for the process lifetime.
 * A read/parse failure is logged and yields an empty topology WITHOUT being
 * cached, so a subsequent call retries once the file is fixed.
 *
 * @returns {{sites: Object, links: Array, rings: Array}}
 */
function loadStaticTopology () {
  if (_staticTopo) return _staticTopo
  const fs = require('fs')
  const path = require('path')
  const topoFile = path.join(__dirname, '..', 'data', 'network-topology.json')
  try {
    _staticTopo = JSON.parse(fs.readFileSync(topoFile, 'utf8'))
  } catch (e) {
    log(`Failed to load static topology: ${e.message}`)
    return { sites: {}, links: [], rings: [] }
  }
  log(`Static topology loaded: ${Object.keys(_staticTopo.sites).length} sites, ${_staticTopo.links.length} links`)
  return _staticTopo
}
let _mapCache = null
let _mapCacheTs = 0
const MAP_CACHE_TTL = 60000 // enriched map is rebuilt at most once per minute

/**
 * Build the operator-facing network map: the static topology (sites, links,
 * rings) enriched with live data — Uptime Kuma reachability and OLT SNMP
 * ONU counts. The result is cached for MAP_CACHE_TTL ms.
 *
 * Site status: 'degraded' if any device is down, 'up' if at least one device
 * is up and none are down, otherwise 'unknown'.
 * Link status: 'up' when both endpoint sites are up, 'degraded' when an
 * endpoint is known-degraded, 'unknown' when an endpoint has no live data.
 * (Previously the 'unknown' branch of the link ternary was unreachable —
 * any non-up endpoint was reported as 'degraded'.)
 *
 * @returns {Promise<{sites: Object, links: Array, rings: Array, ts: string, summary: Object}>}
 */
async function getNetworkMap () {
  const now = Date.now()
  if (_mapCache && (now - _mapCacheTs) < MAP_CACHE_TTL) return _mapCache
  const topo = loadStaticTopology()
  const kumaMonitors = await fetchKumaMonitors().catch(() => [])
  // Build Kuma lookup by IP: prefer the monitor hostname (stripped of
  // scheme/port/path), fall back to an IPv4 embedded in the display name.
  const kumaByIp = new Map()
  for (const m of kumaMonitors) {
    if (m.hostname) {
      const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '')
      kumaByIp.set(clean, m)
    }
    // Guard: some monitors may lack a name entirely — don't crash on them.
    const ipInName = (m.name || '').match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/)
    if (ipInName && !kumaByIp.has(ipInName[1])) kumaByIp.set(ipInName[1], m)
  }
  // Get OLT stats (best-effort: the SNMP poller may not be loaded)
  let oltStats = []
  try {
    const oltSnmp = require('./olt-snmp')
    oltStats = oltSnmp.getOltStats() || []
  } catch (e) { /* ok */ }
  const oltByIp = new Map()
  for (const o of oltStats) { if (o.host) oltByIp.set(o.host, o) }
  // Enrich each site with live data
  const sites = {}
  for (const [siteId, site] of Object.entries(topo.sites)) {
    const enrichedDevices = (site.devices || []).map(dev => {
      const kuma = kumaByIp.get(dev.ip)
      const olt = oltByIp.get(dev.ip)
      return {
        ...dev,
        kuma: kuma ? { status: kuma.status, name: kuma.name, responseTime: kuma.responseTime } : null,
        olt: olt ? { name: olt.name, onuCount: olt.onuCount, onlineCount: olt.onlineCount, reachable: olt.reachable } : null,
      }
    })
    // Site-level status: aggregate device statuses (Kuma wins over OLT poll)
    const deviceStatuses = enrichedDevices.map(d => {
      if (d.kuma) return d.kuma.status === 1 ? 'up' : d.kuma.status === 0 ? 'down' : 'pending'
      if (d.olt) return d.olt.reachable ? 'up' : 'down'
      return 'unknown'
    })
    const hasDown = deviceStatuses.includes('down')
    const hasUp = deviceStatuses.includes('up')
    const siteStatus = hasDown ? 'degraded' : hasUp ? 'up' : 'unknown'
    // ONU totals for the site
    const onuTotal = enrichedDevices.reduce((s, d) => s + (d.olt?.onuCount || 0), 0)
    const onuOnline = enrichedDevices.reduce((s, d) => s + (d.olt?.onlineCount || 0), 0)
    sites[siteId] = {
      ...site,
      devices: enrichedDevices,
      status: siteStatus,
      onuTotal,
      onuOnline,
    }
  }
  // Enrich links with status (fixed: 'unknown' is now reachable when an
  // endpoint site has no live data instead of being misreported as degraded)
  const links = (topo.links || []).map(link => {
    const fromStatus = sites[link.from]?.status || 'unknown'
    const toStatus = sites[link.to]?.status || 'unknown'
    let status = 'unknown'
    if (fromStatus === 'up' && toStatus === 'up') status = 'up'
    else if (fromStatus === 'degraded' || toStatus === 'degraded') status = 'degraded'
    return { ...link, status }
  })
  const allSites = Object.values(sites)
  const result = {
    sites,
    links,
    rings: topo.rings || [],
    ts: new Date().toISOString(),
    summary: {
      totalSites: allSites.length,
      totalLinks: links.length,
      totalOlts: allSites.flatMap(s => s.devices).filter(d => d.type === 'olt').length,
      totalOnus: allSites.reduce((s, site) => s + site.onuTotal, 0),
      totalOnusOnline: allSites.reduce((s, site) => s + site.onuOnline, 0),
      sitesUp: allSites.filter(s => s.status === 'up').length,
      // NOTE(review): key name kept for API compatibility, but it counts
      // degraded sites, not fully-down ones.
      sitesDown: allSites.filter(s => s.status === 'degraded').length,
    },
  }
  _mapCache = result
  _mapCacheTs = now
  return result
}
// ── HTTP handler ───────────────────────────────────────────────────────────
/**
 * HTTP router for /network/* endpoints.
 *
 * Routes (all JSON):
 *   GET  /network/topology        — discovered devices and links
 *   GET  /network/dependency-map  — full dependency chain with OLT data
 *   GET  /network/health/:host    — interface health for one device
 *   GET  /network/errors          — recent error spikes (?minutes=60)
 *   GET  /network/kuma            — Uptime Kuma monitor status
 *   GET  /network/logs            — parsed SNMP trap summary (?minutes=60)
 *   GET  /network/traps           — raw trap events (?minutes=60&limit=200)
 *   GET  /network/map             — static topology + live enrichment
 *   GET  /network/influx          — read-only InfluxDB proxy (?db=&q=)
 *   POST /network/analyze         — AI root cause analysis ({context})
 *
 * Any handler error is returned as 500 {error}; unknown paths as 404.
 *
 * @param {http.IncomingMessage} req
 * @param {http.ServerResponse} res
 * @param {string} method  - uppercase HTTP method
 * @param {string} urlPath - request path (starts with /network/)
 */
async function handle (req, res, method, urlPath) {
  const sub = urlPath.replace('/network/', '')
  let url
  try { url = new URL(req.url, 'http://localhost') } catch { url = new URL('http://localhost/') }
  // Integer query param with fallback; NaN and 0 both fall back, matching
  // the previous `parseInt(...) || dflt` behavior on every route.
  const intParam = (name, dflt) => Number.parseInt(url.searchParams.get(name), 10) || dflt
  // Run an async producer and send its result as 200 JSON; any throw
  // becomes a 500 with the error message. Shared by every route below.
  const ok = async (fn) => {
    try {
      return json(res, 200, await fn())
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  if (method === 'GET') {
    if (sub === 'topology') return ok(() => discoverTopology())
    if (sub === 'dependency-map') return ok(() => buildDependencyMap())
    if (sub.startsWith('health/')) {
      return ok(async () => {
        const host = decodeURIComponent(sub.replace('health/', ''))
        return { host, interfaces: await getInterfaceHealth(host) }
      })
    }
    if (sub === 'errors') {
      return ok(async () => {
        const spikes = await getErrorSpikes(intParam('minutes', 60))
        return { spikes, count: spikes.length }
      })
    }
    if (sub === 'kuma') {
      return ok(async () => {
        const monitors = await fetchKumaMonitors()
        // Groups are Kuma folders, not real monitors — report them separately.
        const real = monitors.filter(m => m.type !== 'group')
        const down = real.filter(m => m.status === 0)
        return {
          total: real.length,
          up: real.filter(m => m.status === 1).length,
          down: down.length,
          downList: down,
          groups: monitors.filter(m => m.type === 'group').map(m => ({ name: m.name, status: m.status })),
          monitors: real,
        }
      })
    }
    if (sub === 'logs') return ok(() => buildLogSummary(intParam('minutes', 60)))
    if (sub === 'traps') {
      return ok(async () => {
        const traps = await getRecentTraps(intParam('minutes', 60), intParam('limit', 200))
        return { events: traps, count: traps.length }
      })
    }
    if (sub === 'map') return ok(() => getNetworkMap())
    if (sub === 'influx') {
      const q = url.searchParams.get('q')
      if (!q) return json(res, 400, { error: 'Missing q parameter' })
      // Safety: only allow read-only SELECT and SHOW queries through the proxy
      const ql = q.trim().toUpperCase()
      if (!ql.startsWith('SELECT') && !ql.startsWith('SHOW')) {
        return json(res, 403, { error: 'Only SELECT and SHOW queries allowed' })
      }
      return ok(() => influxQuery(url.searchParams.get('db') || 'telegraf', q))
    }
  }

  if (sub === 'analyze' && method === 'POST') {
    return ok(async () => {
      const body = await parseBody(req)
      return analyzeNetworkIssue(body.context || '')
    })
  }

  return json(res, 404, { error: 'Network endpoint not found' })
}
// Public API: the HTTP router (`handle`) plus the individual data builders,
// exported so other modules can reuse them directly without going through
// the HTTP layer.
module.exports = {
  handle,
  discoverTopology,
  buildDependencyMap,
  buildLogSummary,
  getRecentTraps,
  fetchKumaMonitors,
  getInterfaceHealth,
  getNetworkMap,
  getErrorSpikes,
  analyzeNetworkIssue,
  influxQuery,
  extractSeries,
}