gigafibre-fsm/services/targo-hub/lib/network-intel.js
louispaulb 607ea54b5c refactor: reduce token count, DRY code, consolidate docs
Backend services:
- targo-hub: extract deepGetValue to helpers.js, DRY disconnect reasons
  lookup map, compact CAPABILITIES, consolidate vision.js prompts/schemas,
  extract dispatch scoring weights, trim section dividers across 9 files
- modem-bridge: extract getSession() helper (6 occurrences), resetIdleTimer(),
  consolidate DM query factory, fix duplicate username fill bug, trim headers
  (server.js -36%, tplink-session.js -47%, docker-compose.yml -57%)

Frontend:
- useWifiDiagnostic: extract THRESHOLDS const, split processDiagnostic into
  6 focused helpers (processOnlineStatus, processWanIPs, processRadios,
  processMeshNodes, processClients, checkRadioIssues)
- EquipmentDetail: merge duplicate ROLE_LABELS, remove verbose comments

Documentation (17 → 13 files, -1,400 lines):
- New consolidated README.md (architecture, services, dependencies, auth)
- Merge ECOSYSTEM-OVERVIEW into ARCHITECTURE.md
- Merge MIGRATION-PLAN + ARCHITECTURE-COMPARE + FIELD-GAP + CHANGELOG → MIGRATION.md
- Merge COMPETITIVE-ANALYSIS into PLATFORM-STRATEGY.md
- Update ROADMAP.md with current phase status
- Delete CONTEXT.md (absorbed into README)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-13 08:39:58 -04:00

1222 lines
46 KiB
JavaScript

'use strict'
/**
* Network Intelligence — AI-powered log analysis via InfluxDB + Grafana
*
* Queries InfluxDB (telegraf) for real-time network metrics:
* - Interface status (up/down) across all routers, switches, OLTs
* - Traffic patterns (ingress/egress) for anomaly detection
* - Error counters (CRC, drops, collisions) for fiber/cable issues
* - SNMP trap history for correlating events
* - LLDP/CDP neighbor discovery for topology links
*
* Then feeds these to Gemini for root cause analysis instead of making
* humans stare at 20 Grafana dashboards trying to correlate mentally.
*
* InfluxDB: 10.5.2.65:8086 (no auth, InfluxQL v1)
* Databases: telegraf (live SNMP), OLT_Logs (syslog), graphite (FastNetMon)
*/
const cfg = require('./config')
const { log, json, parseBody } = require('./helpers')
const http = require('http')
const https = require('https')
const INFLUX = cfg.INFLUXDB_URL || 'http://10.5.2.65:8086'
const KUMA_METRICS_URL = cfg.KUMA_METRICS_URL || 'https://kuma.targo.ca/metrics'
// ── InfluxDB query helper ──────────────────────────────────────────────────
/**
 * Run an InfluxQL query against the configured InfluxDB (v1 API, no auth).
 * @param {string} db - database name ('telegraf', 'OLT_Logs', 'graphite')
 * @param {string} query - InfluxQL statement
 * @returns {Promise<object>} parsed JSON body of the /query response
 * @throws {Error} on network error, timeout, or unparseable response body
 */
async function influxQuery (db, query) {
  const url = `${INFLUX}/query?db=${encodeURIComponent(db)}&q=${encodeURIComponent(query)}`
  return new Promise((resolve, reject) => {
    // Use the module-level https import instead of re-requiring it per call.
    const proto = url.startsWith('https') ? https : http
    const req = proto.get(url, { timeout: 15000 }, (res) => {
      let data = ''
      res.on('data', c => data += c)
      res.on('end', () => {
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          reject(new Error(`InfluxDB parse error: ${e.message}`))
        }
      })
    })
    // The 'timeout' option only emits an event — without destroying the
    // request here, a stalled connection would hang forever instead of
    // aborting after 15 s. destroy() routes into the 'error' handler.
    req.on('timeout', () => req.destroy(new Error('InfluxDB query timeout (15s)')))
    req.on('error', reject)
  })
}
/**
 * Flatten an InfluxDB /query JSON response into plain row objects.
 * Each row maps column name → value, then gets `_measurement` (the series
 * name) and any series tags merged in as additional keys.
 * @param {object} result - parsed InfluxDB response ({ results: [...] })
 * @returns {Array<object>} one object per data point, empty if no series
 */
function extractSeries (result) {
  const rows = []
  for (const resultSet of (result.results || [])) {
    for (const series of (resultSet.series || [])) {
      const columns = series.columns || []
      for (const values of (series.values || [])) {
        const row = {}
        columns.forEach((col, idx) => { row[col] = values[idx] })
        row._measurement = series.name
        if (series.tags) Object.assign(row, series.tags)
        rows.push(row)
      }
    }
  }
  return rows
}
// ── Kuma Prometheus metrics scraper ────────────────────────────────────────
// Module-level cache: scrapes are reused for KUMA_CACHE_TTL; on fetch
// failure the stale cache is returned so callers always get a best effort.
let _kumaCache = null
let _kumaCacheTs = 0
const KUMA_CACHE_TTL = 60000 // 1 min
/**
 * Scrape Uptime-Kuma's Prometheus /metrics endpoint and parse
 * `monitor_status{...}` / `monitor_response_time{...}` lines.
 * Never rejects: on error it resolves with the stale cache (or []).
 * @returns {Promise<Array<{id:string,name:string,type:string,hostname:?string,status:number,responseTime:?number}>>}
 *          status: 1=up, 0=down, 2=pending, -1=unparsed
 */
async function fetchKumaMonitors () {
  const now = Date.now()
  if (_kumaCache && (now - _kumaCacheTs) < KUMA_CACHE_TTL) return _kumaCache
  return new Promise((resolve) => {
    const proto = KUMA_METRICS_URL.startsWith('https') ? https : http
    // rejectUnauthorized:false — Kuma endpoint presumably serves an
    // internal/self-signed cert; TODO confirm this is intentional.
    const req = proto.get(KUMA_METRICS_URL, { timeout: 10000, rejectUnauthorized: false }, (res) => {
      let data = ''
      res.on('data', c => data += c)
      res.on('end', () => {
        const monitors = []
        const lines = data.split('\n')
        // Parse Prometheus exposition format: metric_name{labels} value
        const statusLines = lines.filter(l => l.startsWith('monitor_status{'))
        const rtLines = lines.filter(l => l.startsWith('monitor_response_time{'))
        const rtMap = {}
        for (const line of rtLines) {
          const m = line.match(/monitor_id="(\d+)".*}\s+([\d.]+)/)
          if (m) rtMap[m[1]] = parseFloat(m[2])
        }
        for (const line of statusLines) {
          const idM = line.match(/monitor_id="(\d+)"/)
          const nameM = line.match(/monitor_name="([^"]*)"/)
          const typeM = line.match(/monitor_type="([^"]*)"/)
          const hostM = line.match(/monitor_hostname="([^"]*)"/)
          const valM = line.match(/}\s+([\d.]+)$/)
          if (!idM || !nameM) continue
          const id = idM[1]
          const hostname = hostM ? hostM[1] : null
          monitors.push({
            id,
            name: nameM[1],
            type: typeM ? typeM[1] : 'unknown',
            hostname: hostname === 'null' ? null : hostname, // Kuma emits the literal string "null"
            status: valM ? parseInt(valM[1], 10) : -1, // 1=up, 0=down, 2=pending
            responseTime: rtMap[id] ?? null, // ?? preserves a legitimate 0 ms reading
          })
        }
        log(`Kuma: scraped ${monitors.length} monitors, ${monitors.filter(m => m.status === 0).length} down`)
        _kumaCache = monitors
        _kumaCacheTs = now
        resolve(monitors)
      })
    })
    // The 'timeout' option alone only emits an event; without destroying the
    // request a stalled scrape would hang. destroy() feeds the 'error' path.
    req.on('timeout', () => req.destroy(new Error('Kuma metrics timeout (10s)')))
    req.on('error', (e) => {
      log(`Kuma metrics fetch failed: ${e.message}`)
      resolve(_kumaCache || []) // Return stale cache on error
    })
  })
}
// ── Network topology discovery ─────────────────────────────────────────────
// Module-level cache: discovery fans out several InfluxDB queries, so the
// assembled topology is reused for TOPOLOGY_CACHE_TTL before re-discovery.
let _topologyCache = null
let _topologyCacheTs = 0
const TOPOLOGY_CACHE_TTL = 300000 // 5 min
// Known network hierarchy (based on Targo/Gigafibre architecture)
// Core → Edge Router → OLT Gateway → OLTs → ONUs
// level = depth from the core (0 = core); used to trace dependency chains
// toward the core. label/icon are display metadata for the frontend
// (icon names look like Material Symbols identifiers — TODO confirm).
const DEVICE_HIERARCHY = {
'core-switch': { level: 0, label: 'Core Switch', icon: 'hub' },
'edge-router': { level: 1, label: 'Edge Router', icon: 'router' },
'olt-gateway': { level: 2, label: 'OLT Gateway', icon: 'device_hub' },
'public-infra': { level: 1, label: 'Public Infra', icon: 'public' },
'server': { level: 3, label: 'Server', icon: 'dns' },
'unknown': { level: 4, label: 'Unknown', icon: 'help' },
}
/**
 * Classify a device by its management-IP prefix.
 * @param {string} host - agent IP address as tagged by telegraf
 * @returns {string} one of the DEVICE_HIERARCHY keys
 */
function classifyDevice (host) {
  // First matching rule wins; order mirrors specificity of the address plan.
  const rules = [
    ['olt-gateway', (h) => h.startsWith('172.17.')],
    ['edge-router', (h) => /^10\.10[1-6]\./.test(h)],
    ['core-switch', (h) => h.startsWith('10.255.')],
    ['server', (h) => h.startsWith('10.100.')],
    ['public-infra', (h) => ['23.159.', '100.64.', '96.125.'].some(p => h.startsWith(p))],
  ]
  for (const [deviceType, matches] of rules) {
    if (matches(host)) return deviceType
  }
  return 'unknown'
}
// Subnet-based parent inference: which core switch likely serves which edge router
// Based on network architecture: core switches on 10.255.x.x connect to edge routers 10.101-106.x.x
/**
 * Infer parent/child links between discovered devices. Heuristics only —
 * no routing tables are consulted:
 *   core→edge   : core interface descriptions mentioning the edge subnet
 *                 octet, falling back to round-robin by subnet number
 *   edge→olt-gw : first edge router as a placeholder parent
 *   gw→olt      : /16 prefix match against live stats from ./olt-snmp
 * @param {Array<object>} devices - records with { host, type, interfaces }
 * @returns {Array<object>} links { from, to, type, via, status, ... }
 */
function inferLinks (devices) {
const links = []
// Bucket devices by classified type for the per-tier passes below.
const byType = {}
for (const d of devices) {
if (!byType[d.type]) byType[d.type] = []
byType[d.type].push(d)
}
const coreDevices = byType['core-switch'] || []
const edgeDevices = byType['edge-router'] || []
const oltGateways = byType['olt-gateway'] || []
// Infer Core → Edge links by analyzing interface descriptions
// Core switches (10.255.x.x) have uplinks to edge routers with descriptions
// mentioning the router hostname or IP. Fallback: subnet-based matching.
for (const edge of edgeDevices) {
// Find the core switch most likely connected (by matching 3rd octet patterns)
// Example: edge 10.101.0.1 → core 10.255.254.1 or 10.255.254.2
// For now, link to all core switches (the AI analysis will refine)
let linked = false
for (const core of coreDevices) {
// Check if any core interface description mentions the edge IP or subnet
// NOTE(review): `break` only exits the interface loop, so an edge whose
// octet appears in several cores' descriptions gets multiple links —
// verify this duplication is intended.
for (const iface of core.interfaces) {
const d = iface.descr || ''
// Common patterns: "GigaEthernet0/1 - to-router-101", "ae0.101"
const edgeOctet = edge.host.split('.')[1] // "101", "102", etc.
if (d.includes(edgeOctet) || d.toLowerCase().includes(edge.host)) {
links.push({ from: core.host, to: edge.host, type: 'core-to-edge', via: d, status: iface.status })
linked = true
break
}
}
}
// Fallback: distribute edge routers across core switches based on subnet patterns
// 10.101.x → first core, 10.102.x → second core, etc.
if (!linked && coreDevices.length > 0) {
const edgeSubnet = parseInt(edge.host.split('.')[1]) // 101, 102, 103...
const sectorIdx = (edgeSubnet - 101) % coreDevices.length
const assignedCore = coreDevices[Math.max(0, Math.min(sectorIdx, coreDevices.length - 1))]
links.push({ from: assignedCore.host, to: edge.host, type: 'core-to-edge', via: `inferred (subnet .${edgeSubnet})`, status: 'unknown' })
}
}
// Infer Edge → OLT Gateway links by IP range proximity
// OLT gateways (172.17.x.x) connect through edge routers
for (const gw of oltGateways) {
// The gateway's management interface often routes through a specific edge router
// For now we infer by matching OLT registered names to edge subnet areas
if (edgeDevices.length > 0) {
links.push({ from: edgeDevices[0].host, to: gw.host, type: 'edge-to-olt', via: 'inferred', status: 'unknown' })
} else if (coreDevices.length > 0) {
links.push({ from: coreDevices[0].host, to: gw.host, type: 'core-to-olt', via: 'inferred', status: 'unknown' })
}
}
// OLT Gateway → OLTs (from olt-snmp module)
// Best-effort: if olt-snmp is unavailable these links are simply skipped.
try {
const oltSnmp = require('./olt-snmp')
const oltStats = oltSnmp.getOltStats()
for (const olt of (oltStats || [])) {
// Match OLT host to nearest gateway
const oltIp = olt.host
if (!oltIp) continue
let bestGw = null
for (const gw of oltGateways) {
// Same /16 subnet?
const gwParts = gw.host.split('.')
const oltParts = oltIp.split('.')
if (gwParts[0] === oltParts[0] && gwParts[1] === oltParts[1]) {
bestGw = gw
break
}
}
if (bestGw) {
links.push({
from: bestGw.host, to: oltIp, type: 'gateway-to-olt',
via: olt.name || oltIp,
status: olt.reachable ? 'up' : 'down',
oltName: olt.name,
onuCount: olt.onuCount || 0,
onlineCount: olt.onlineCount || 0,
})
}
}
} catch (e) {
log(`OLT link inference failed: ${e.message}`)
}
return links
}
/**
 * Discover the network topology from telegraf SNMP data in InfluxDB,
 * enriched with Kuma reachability and traffic counters. The assembled
 * topology is cached for TOPOLOGY_CACHE_TTL (5 min).
 *
 * Pipeline:
 *   1. enumerate agent_hosts from 'interface' and 'ifTable'
 *   2-3. per-interface oper status / speed from both measurements
 *   4. LLDP neighbors (optional — real link discovery)
 *   5. ifXTable octet counters (optional — traffic totals)
 *   6. Kuma monitors (reachability source of truth)
 * then classifies devices, computes a health score, and infers links.
 *
 * @returns {Promise<object>} { devices, links, deviceCount, linkCount, ts, summary }
 */
async function discoverTopology () {
const now = Date.now()
if (_topologyCache && (now - _topologyCacheTs) < TOPOLOGY_CACHE_TTL) return _topologyCache
log('Network topology discovery starting...')
const devices = new Map() // agent_host → { interfaces: [], ... }
// Step 1: Get list of agent_hosts from BOTH 'interface' and 'ifTable' measurements
// Edge routers may only appear in ifTable (different telegraf collector)
const measurements = ['interface', 'ifTable']
for (const m of measurements) {
try {
const hostResult = await influxQuery('telegraf',
`SHOW TAG VALUES FROM "${m}" WITH KEY = "agent_host"`
)
for (const r of (hostResult.results || [])) {
for (const s of (r.series || [])) {
for (const v of (s.values || [])) {
// SHOW TAG VALUES rows are [key, value] — the tag value is index 1
const host = v[1]
if (host && !devices.has(host)) {
devices.set(host, { host, interfaces: [], type: 'unknown', source: m })
}
}
}
}
} catch (e) {
log(`Topology host discovery from ${m} failed: ${e.message}`)
}
}
// Step 2: Get interface summary from 'interface' measurement
try {
const ifResult = await influxQuery('telegraf',
`SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed"
FROM "interface" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"`
)
for (const r of (ifResult.results || [])) {
for (const s of (r.series || [])) {
const agentHost = s.tags?.agent_host
if (!agentHost || !devices.has(agentHost)) continue
const dev = devices.get(agentHost)
for (const v of (s.values || [])) {
dev.interfaces.push({
descr: s.tags?.ifDescr || '',
// ifOperStatus convention: 1=up, 2=down; anything else is unknown
status: v[1] === 1 ? 'up' : v[1] === 2 ? 'down' : 'unknown',
speed: v[2],
source: 'interface',
})
}
}
}
} catch (e) {
log(`Topology interface query failed: ${e.message}`)
}
// Step 3: Get interface summary from 'ifTable' for devices not yet covered
// (edge routers, additional switches that only report via ifTable)
try {
const ifResult = await influxQuery('telegraf',
`SELECT last("ifOperStatus") AS "status", last("ifSpeed") AS "speed"
FROM "ifTable" WHERE time > now() - 1h GROUP BY "agent_host", "ifDescr"`
)
for (const r of (ifResult.results || [])) {
for (const s of (r.series || [])) {
const agentHost = s.tags?.agent_host
if (!agentHost) continue
if (!devices.has(agentHost)) {
devices.set(agentHost, { host: agentHost, interfaces: [], type: 'unknown', source: 'ifTable' })
}
const dev = devices.get(agentHost)
// Only add if we don't already have interfaces from 'interface' measurement
const hasIfData = dev.interfaces.some(i => i.source === 'interface')
for (const v of (s.values || [])) {
const ifDescr = s.tags?.ifDescr || ''
// Skip if already have this interface from 'interface' measurement
if (hasIfData && dev.interfaces.some(i => i.descr === ifDescr && i.source === 'interface')) continue
dev.interfaces.push({
descr: ifDescr,
status: v[1] === 1 ? 'up' : v[1] === 2 ? 'down' : 'unknown',
speed: v[2],
source: 'ifTable',
})
}
}
}
} catch (e) {
log(`Topology ifTable query failed: ${e.message}`)
}
// Step 4: Try LLDP neighbor data for real link discovery
let lldpLinks = []
try {
const lldpResult = await influxQuery('telegraf',
`SELECT last("lldpRemSysName") AS "remoteName", last("lldpRemPortDesc") AS "remotePort"
FROM "lldpRemTable" WHERE time > now() - 1h GROUP BY "agent_host", "lldpRemIndex"`
)
for (const r of (lldpResult.results || [])) {
for (const s of (r.series || [])) {
const localHost = s.tags?.agent_host
for (const v of (s.values || [])) {
if (v[1]) {
lldpLinks.push({ localHost, remoteName: v[1], remotePort: v[2] || '' })
}
}
}
}
} catch (e) {
// LLDP not available — fine, we'll infer links
}
// Step 5: Get traffic data for bandwidth utilization (top talkers)
const trafficMap = new Map()
try {
const trResult = await influxQuery('telegraf',
`SELECT last("ifHCInOctets") AS "inOctets", last("ifHCOutOctets") AS "outOctets"
FROM "ifXTable" WHERE time > now() - 10m GROUP BY "agent_host", "ifDescr"`
)
for (const r of (trResult.results || [])) {
for (const s of (r.series || [])) {
const host = s.tags?.agent_host
if (!host) continue
if (!trafficMap.has(host)) trafficMap.set(host, { totalIn: 0, totalOut: 0, topIf: [] })
const t = trafficMap.get(host)
for (const v of (s.values || [])) {
const inO = v[1] || 0
const outO = v[2] || 0
t.totalIn += inO
t.totalOut += outO
if (inO + outO > 0) {
t.topIf.push({ descr: s.tags?.ifDescr, inOctets: inO, outOctets: outO })
}
}
}
}
} catch (e) {
// Traffic data optional
}
// Step 6: Fetch Kuma monitor data for real reachability
let kumaMonitors = []
try {
kumaMonitors = await fetchKumaMonitors()
} catch (e) {
log(`Topology Kuma fetch failed: ${e.message}`)
}
// Build lookup: IP/hostname → Kuma monitor
const kumaByHost = new Map()
for (const m of kumaMonitors) {
if (m.hostname) {
// Strip protocol and port if present (e.g. "http://10.5.2.65:8086" → "10.5.2.65")
const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '')
kumaByHost.set(clean, m)
}
// Also index by monitor name in case it contains an IP
const ipInName = m.name.match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/)
if (ipInName && !kumaByHost.has(ipInName[1])) {
kumaByHost.set(ipInName[1], m)
}
}
// Classify devices by their host tag patterns
for (const [host, dev] of devices) {
dev.type = classifyDevice(host)
dev.level = DEVICE_HIERARCHY[dev.type]?.level ?? 4
dev.label = DEVICE_HIERARCHY[dev.type]?.label ?? 'Unknown'
dev.icon = DEVICE_HIERARCHY[dev.type]?.icon ?? 'help'
// Interface summary stats — only count "active" interfaces (those with speed > 0 or status up)
// Unused switch ports (down + speed 0) are not problems, just empty ports
const activeIfs = dev.interfaces.filter(i => i.status === 'up' || (i.speed && i.speed > 0))
const up = dev.interfaces.filter(i => i.status === 'up').length
const down = activeIfs.filter(i => i.status === 'down').length // only active-but-down ports matter
const total = dev.interfaces.length
const activeTotal = activeIfs.length
const ifHealth = activeTotal > 0 ? Math.round((up / activeTotal) * 100) : (total > 0 ? 0 : null)
// Kuma reachability — source of truth for device being up/down
const kuma = kumaByHost.get(host)
const kumaUp = kuma ? kuma.status === 1 : null // null = not monitored in Kuma
const kumaRt = kuma ? kuma.responseTime : null
// Final health: Kuma UP + has active interfaces → use interface health
// Kuma UP + no active interfaces → 100% (device reachable, just no SNMP polling)
// Kuma DOWN → 0%
// No Kuma data → fall back to interface health
let health
if (kumaUp === true) {
health = activeTotal > 0 ? Math.max(ifHealth, 80) : 100 // reachable = at least 80%
} else if (kumaUp === false) {
health = 0
} else {
health = ifHealth ?? 0
}
dev.ifSummary = { up, down, total, activeTotal, health }
dev.kuma = kuma ? { status: kuma.status, name: kuma.name, responseTime: kumaRt } : null
// Traffic data if available
const traffic = trafficMap.get(host)
if (traffic) {
dev.traffic = {
// 1073741824 = bytes per GiB
totalInGb: Math.round(traffic.totalIn / 1073741824 * 100) / 100,
totalOutGb: Math.round(traffic.totalOut / 1073741824 * 100) / 100,
}
}
}
// Build links (inferred + LLDP-based)
const links = inferLinks([...devices.values()])
// Add LLDP-discovered links
for (const l of lldpLinks) {
// Try to resolve remoteName to an IP in our device list
const remoteDevice = [...devices.values()].find(d =>
d.host === l.remoteName || d.host.includes(l.remoteName)
)
if (remoteDevice) {
// Check if link already exists
const exists = links.some(lk =>
(lk.from === l.localHost && lk.to === remoteDevice.host) ||
(lk.to === l.localHost && lk.from === remoteDevice.host)
)
if (!exists) {
links.push({
from: l.localHost, to: remoteDevice.host, type: 'lldp',
via: `LLDP: ${l.remotePort}`, status: 'up',
})
}
}
}
const topology = {
devices: [...devices.values()],
links,
deviceCount: devices.size,
linkCount: links.length,
ts: new Date().toISOString(),
summary: {
oltGateways: [...devices.values()].filter(d => d.type === 'olt-gateway').length,
edgeRouters: [...devices.values()].filter(d => d.type === 'edge-router').length,
coreSwitches: [...devices.values()].filter(d => d.type === 'core-switch').length,
servers: [...devices.values()].filter(d => d.type === 'server').length,
publicInfra: [...devices.values()].filter(d => d.type === 'public-infra').length,
unknown: [...devices.values()].filter(d => d.type === 'unknown').length,
},
}
log(`Topology discovered: ${topology.deviceCount} devices, ${topology.linkCount} links (${topology.summary.edgeRouters} routers, ${topology.summary.coreSwitches} core, ${topology.summary.oltGateways} OLT gateways)`)
_topologyCache = topology
_topologyCacheTs = now
return topology
}
// ── Interface health snapshot ──────────────────────────────────────────────
/**
 * Per-interface status / error / discard counters for one device over the
 * last 30 min. Tries 'ifTable' first, then 'interface', and returns the
 * rows of the first measurement that has data.
 * @param {string} agentHost - SNMP agent IP as tagged by telegraf
 * @returns {Promise<Array<object>>} rows from extractSeries, [] if no data
 */
async function getInterfaceHealth (agentHost) {
  // Fallback chain over the two measurements; the original `results`
  // accumulator was dead code (only reachable when rows was empty).
  for (const measurement of ['ifTable', 'interface']) {
    try {
      const result = await influxQuery('telegraf',
        `SELECT last("ifOperStatus") AS "status", last("ifInErrors") AS "inErrors", last("ifOutErrors") AS "outErrors",
last("ifInDiscards") AS "inDiscards", last("ifOutDiscards") AS "outDiscards", last("ifSpeed") AS "speed"
FROM "${measurement}" WHERE "agent_host" = '${agentHost}' AND time > now() - 30m GROUP BY "ifIndex", "ifDescr"`
      )
      const rows = extractSeries(result)
      if (rows.length > 0) return rows // Return first measurement that has data
    } catch (e) {
      // Measurement may not exist on this deployment — try the next one
    }
  }
  return []
}
// ── Recent error spikes ────────────────────────────────────────────────────
/**
 * Scan the last `windowMinutes` of ifTable data (5-minute buckets) and
 * report interfaces whose inbound error rate exceeded 10 errors/min.
 * @param {number} windowMinutes - lookback window, default 60
 * @returns {Promise<Array<{time:string,errRate:number,host:?string,ifIndex:?string}>>}
 */
async function getErrorSpikes (windowMinutes = 60) {
  const result = await influxQuery('telegraf',
    `SELECT non_negative_derivative(last("ifInErrors"), 1m) AS "errRate"
FROM "ifTable" WHERE time > now() - ${windowMinutes}m
GROUP BY time(5m), "agent_host", "ifIndex" FILL(0)`
  )
  const spikes = []
  for (const resultSet of (result.results || [])) {
    for (const series of (resultSet.series || [])) {
      const host = series.tags?.agent_host
      const ifIndex = series.tags?.ifIndex
      for (const point of (series.values || [])) {
        const [time, errRate] = point
        if (errRate && errRate > 10) { // >10 errors/min = spike
          spikes.push({ time, errRate, host, ifIndex })
        }
      }
    }
  }
  return spikes
}
// ── Dependency chain builder ──────────────────────────────────────────────
/**
 * Build upstream dependency chains from the discovered topology: for each
 * leaf device (OLT gateway or server) trace toward the core by repeatedly
 * hopping to the lowest-level (closest-to-core) unvisited neighbor, then
 * extend gateway chains with individual OLTs from ./olt-snmp.
 * @returns {Promise<object>} { topology: {devices, links, summary}, chains, ts }
 */
async function buildDependencyMap () {
const topology = await discoverTopology()
// Build adjacency from links
const adj = new Map() // host → [{ target, type, status }]
for (const link of topology.links) {
if (!adj.has(link.from)) adj.set(link.from, [])
adj.get(link.from).push({ target: link.to, ...link })
// Bidirectional
if (!adj.has(link.to)) adj.set(link.to, [])
adj.get(link.to).push({ target: link.from, from: link.to, to: link.from, type: link.type, status: link.status })
}
// Build dependency chains: for each leaf device, trace path to core
const chains = []
const deviceMap = new Map(topology.devices.map(d => [d.host, d]))
for (const dev of topology.devices) {
if (dev.type === 'olt-gateway' || dev.type === 'server') {
// Trace upward to core
const chain = [dev.host]
const visited = new Set([dev.host])
let current = dev.host
// Bounded walk (max 5 hops) guards against cycles the visited set misses
for (let i = 0; i < 5; i++) { // max depth
const neighbors = adj.get(current) || []
// Pick the neighbor with lowest level (closest to core)
let bestNext = null
let bestLevel = 999
for (const n of neighbors) {
const nDev = deviceMap.get(n.target)
if (!nDev || visited.has(n.target)) continue
if (nDev.level < bestLevel) {
bestLevel = nDev.level
bestNext = n.target
}
}
if (!bestNext) break
chain.push(bestNext)
visited.add(bestNext)
current = bestNext
}
chain.reverse() // core → ... → leaf
chains.push({
path: chain.map(h => {
const d = deviceMap.get(h)
return { host: h, type: d?.type, label: d?.label, health: d?.ifSummary?.health }
}),
leaf: dev.host,
leafType: dev.type,
})
}
}
// OLT-specific chains (from olt-snmp module)
// Best-effort: skipped entirely if olt-snmp cannot be loaded
try {
const oltSnmp = require('./olt-snmp')
const stats = oltSnmp.getOltStats()
for (const olt of (stats || [])) {
// Find which gateway this OLT connects through
const gwLink = topology.links.find(l => l.to === olt.host && l.type === 'gateway-to-olt')
if (gwLink) {
// Find the chain for this gateway and extend it
const gwChain = chains.find(c => c.leaf === gwLink.from)
if (gwChain) {
chains.push({
path: [
...gwChain.path,
{ host: olt.host, type: 'olt', label: olt.name || 'OLT', health: olt.reachable ? 100 : 0, onuCount: olt.onuCount, onlineCount: olt.onlineCount },
],
leaf: olt.host,
leafType: 'olt',
oltName: olt.name,
onuCount: olt.onuCount || 0,
onlineCount: olt.onlineCount || 0,
})
}
}
}
} catch (e) {
// OLT data optional
}
return {
topology: {
// Slimmed device records — drop the full interface lists for transport
devices: topology.devices.map(d => ({
host: d.host, type: d.type, level: d.level, label: d.label, icon: d.icon,
ifSummary: d.ifSummary, traffic: d.traffic,
})),
links: topology.links,
summary: topology.summary,
},
chains,
ts: new Date().toISOString(),
}
}
// ── SNMP Trap log reader ──────────────────────────────────────────────────
// Raisecom OLT trap OID → human-readable event type
// Keys are trap names as stored in the snmp_trap measurement (MIB-resolved
// 'enterprises.*' suffixes plus the generic linkDown/linkUp names) —
// NOTE(review): OID-to-event mapping presumably follows the Raisecom and
// TP-Link MIBs; confirm against vendor MIB files before extending.
const RAISECOM_TRAP_MAP = {
'enterprises.8886.18.3.1.4.1': 'onu-register', // ONU registered
'enterprises.8886.18.3.1.4.2': 'onu-deregister', // ONU deregistered (offline)
'enterprises.8886.18.3.1.4.3': 'onu-state-change', // ONU state change
'enterprises.8886.1.27.8.2': 'olt-alarm-power', // Power alarm
'enterprises.8886.1.27.8.6': 'olt-alarm-fan', // Fan alarm
'enterprises.8886.1.27.8.10': 'olt-alarm-temperature',// Temperature alarm
'enterprises.8886.1.27.8.11': 'olt-alarm-los', // Loss of signal
'enterprises.8886.1.27.8.12': 'olt-alarm-dying-gasp',// Dying gasp
'enterprises.8886.1.27.8.17': 'olt-alarm-link-down', // Link down
'enterprises.8886.1.27.8.18': 'olt-alarm-link-up', // Link up
'enterprises.8886.1.27.8.20': 'olt-alarm-onu-los', // ONU loss of signal
'enterprises.8886.1.27.8.21': 'olt-alarm-onu-lof', // ONU loss of frame
'enterprises.8886.1.27.8.28': 'olt-alarm-onu-dying-gasp', // ONU dying gasp
'enterprises.8886.1.27.8.30': 'olt-alarm-onu-power-fail', // ONU power fail
'enterprises.8886.1.27.8.31': 'olt-alarm-onu-offline', // ONU offline
'enterprises.8886.1.27.8.37': 'olt-alarm-fiber-cut', // Fiber cut detected
'enterprises.8886.1.27.8.38': 'olt-alarm-fiber-restored', // Fiber restored
'enterprises.8886.18.3.6.3.11.2': 'gpon-onu-offline',
'enterprises.8886.18.3.6.3.11.3': 'gpon-onu-online',
'enterprises.8886.18.3.6.3.11.4': 'gpon-onu-dying-gasp',
'enterprises.8886.18.3.6.3.11.5': 'gpon-onu-los',
'enterprises.8886.18.3.6.3.11.8': 'gpon-onu-power-fail',
'enterprises.8886.18.3.6.3.11.9': 'gpon-onu-deactivated',
'enterprises.8886.18.3.6.3.11.10': 'gpon-onu-fiber-cut',
'enterprises.8886.18.3.6.3.11.11': 'gpon-onu-fiber-restored',
'enterprises.8886.18.3.6.3.11.12': 'gpon-onu-signal-degraded',
'enterprises.8886.18.3.6.3.11.13': 'gpon-onu-signal-fail',
'enterprises.8886.18.3.6.3.11.14': 'gpon-onu-drift-exceed',
'enterprises.8886.18.3.6.3.11.15': 'gpon-onu-range-exceed',
'enterprises.8886.1.2.1.5': 'raisecom-cold-start',
'enterprises.8886.1.26.1.2.1': 'raisecom-config-change',
'enterprises.8886.1.26.4.4.1': 'raisecom-firmware-event',
'enterprises.17409.2.2.11.1.1.1': 'tplink-olt-alarm',
'linkDown': 'link-down',
'linkUp': 'link-up',
}
/**
 * Read recent rows from the snmp_trap measurement and normalize them into
 * event objects, mapping trap OIDs through RAISECOM_TRAP_MAP and sniffing
 * ONU serial / index out of the varbind fields.
 * @param {number} windowMinutes - lookback window, default 60
 * @param {number} limit - max rows, default 200
 * @returns {Promise<Array<object>>} events, newest first
 */
async function getRecentTraps (windowMinutes = 60, limit = 200) {
  const result = await influxQuery('telegraf',
    `SELECT * FROM snmp_trap WHERE time > now() - ${windowMinutes}m ORDER BY time DESC LIMIT ${limit}`
  )
  // Pick the first varbind value matching a vendor serial pattern.
  const findOnuSerial = (row) => {
    for (const value of Object.values(row)) {
      if (typeof value === 'string' && /^(RCMG|TPLG|GPON)[A-F0-9]{8,}$/i.test(value)) return value
    }
    return null
  }
  // Pick the first numeric enterprise-OID varbind that looks like an ONU index.
  const findOnuIndex = (row) => {
    for (const [key, value] of Object.entries(row)) {
      if (key.startsWith('enterprises.') && typeof value === 'number' && key.includes('.1.1.')) return value
    }
    return null
  }
  const events = []
  for (const resultSet of (result.results || [])) {
    for (const series of (resultSet.series || [])) {
      const cols = series.columns || []
      for (const values of (series.values || [])) {
        const row = {}
        cols.forEach((c, i) => { row[c] = values[i] })
        const trapName = row.name || ''
        events.push({
          time: row.time,
          source: row.source || '', // OLT IP
          event: RAISECOM_TRAP_MAP[trapName] || trapName,
          trapOid: trapName,
          onuSerial: findOnuSerial(row),
          onuIndex: findOnuIndex(row),
          sysUptime: row.sysUpTimeInstance,
        })
      }
    }
  }
  return events
}
// ── Build summary from logs ──────────────────────────────────────────────
/**
 * Aggregate the last `windowMinutes` of SNMP traps, OLT polling stats,
 * outage-monitor incidents, and Kuma reachability into one summary object
 * suitable for feeding to the AI analyzer.
 * @param {number} windowMinutes - lookback window, default 60
 * @returns {Promise<object>} summary (see return literal for shape)
 */
async function buildLogSummary (windowMinutes = 60) {
const traps = await getRecentTraps(windowMinutes, 500)
// Group by OLT source
const byOlt = {}
for (const t of traps) {
if (!byOlt[t.source]) byOlt[t.source] = { events: [], offline: [], online: [], alarms: [] }
byOlt[t.source].events.push(t)
// Classify each event into offline-ish / online-ish buckets by name match
if (t.event.includes('offline') || t.event === 'onu-deregister' || t.event.includes('dying-gasp') || t.event.includes('los') || t.event.includes('power-fail')) {
byOlt[t.source].offline.push(t)
} else if (t.event.includes('online') || t.event === 'onu-register' || t.event === 'link-up') {
byOlt[t.source].online.push(t)
}
if (t.event.startsWith('olt-alarm') || t.event.startsWith('gpon-onu-fiber') || t.event.startsWith('gpon-onu-signal')) {
byOlt[t.source].alarms.push(t)
}
}
// Get OLT SNMP live status (best-effort: empty list if module unavailable)
let oltStats = []
try {
const oltSnmp = require('./olt-snmp')
oltStats = oltSnmp.getOltStats()
} catch (e) {}
// Get outage monitor events (best-effort as well)
let incidents = []
let eventLog = []
try {
const monitor = require('./outage-monitor')
incidents = monitor.getActiveIncidents()
eventLog = monitor.getEventLog(new Date(Date.now() - windowMinutes * 60000).toISOString(), 100)
} catch (e) {}
// ONU-level: find ONUs that went offline and didn't come back
const onuEvents = {}
for (const t of traps) {
if (!t.onuSerial) continue
if (!onuEvents[t.onuSerial]) onuEvents[t.onuSerial] = { serial: t.onuSerial, source: t.source, events: [] }
onuEvents[t.onuSerial].events.push({ time: t.time, event: t.event })
}
// "Stuck offline" = most recent event for the serial is an offline-type one
const stuckOffline = Object.values(onuEvents).filter(o => {
const last = o.events[0] // already sorted desc
return last && (last.event.includes('offline') || last.event.includes('deregister') || last.event.includes('dying-gasp') || last.event.includes('los'))
})
// Get Kuma real-time monitor status (source of truth for device reachability)
let kumaMonitors = []
try {
kumaMonitors = await fetchKumaMonitors()
} catch (e) {
log(`Kuma fetch failed: ${e.message}`)
}
// 'group' monitors are Kuma containers, not real endpoints — excluded
const kumaDown = kumaMonitors.filter(m => m.status === 0 && m.type !== 'group')
const kumaUp = kumaMonitors.filter(m => m.status === 1 && m.type !== 'group')
const kumaSlow = kumaMonitors.filter(m => m.responseTime > 100 && m.type !== 'group')
return {
window: `${windowMinutes}m`,
totalTraps: traps.length,
// Per-OLT rollup, worst (most offline events) first
byOlt: Object.entries(byOlt).map(([ip, data]) => {
const olt = oltStats.find(o => o.host === ip)
return {
oltIp: ip,
oltName: olt?.name || ip,
totalEvents: data.events.length,
offlineEvents: data.offline.length,
onlineEvents: data.online.length,
alarms: data.alarms.length,
recentEvents: data.events.slice(0, 10).map(e => ({
time: e.time, event: e.event, serial: e.onuSerial,
})),
}
}).sort((a, b) => b.offlineEvents - a.offlineEvents),
oltStats: oltStats.map(o => ({
name: o.name, host: o.host,
onuCount: o.onuCount, onlineCount: o.onlineCount,
offlineCount: (o.onuCount || 0) - (o.onlineCount || 0),
lastPoll: o.lastPoll, lastError: o.lastError || null,
})),
stuckOfflineOnus: stuckOffline.length,
stuckOfflineSample: stuckOffline.slice(0, 20).map(o => ({
serial: o.serial, olt: o.source, lastEvent: o.events[0]?.event, lastTime: o.events[0]?.time,
})),
activeIncidents: incidents,
monitorEvents: eventLog.length,
// Kuma — real device reachability (source of truth)
kuma: {
totalMonitors: kumaMonitors.filter(m => m.type !== 'group').length,
up: kumaUp.length,
down: kumaDown.map(m => ({ name: m.name, host: m.hostname, type: m.type })),
slow: kumaSlow.map(m => ({ name: m.name, host: m.hostname, type: m.type, ms: m.responseTime })),
},
ts: new Date().toISOString(),
}
}
// ── AI-powered root cause analysis (LOG-BASED) ───────────────────────────
async function analyzeNetworkIssue (context) {
const { aiCall } = require('./ai')
// Gather real data from logs, not interface status
const logSummary = await buildLogSummary(60)
const systemPrompt = `Tu es un ingénieur réseau senior NOC pour Targo/Gigafibre, un ISP fibre optique (GPON) au Québec.
Tu analyses les LOGS et événements SNMP en temps réel pour diagnostiquer les problèmes réseau.
Architecture réseau:
- OLTs Raisecom et TP-Link desservent les clients via GPON
- Chaque OLT a plusieurs ports, chaque port un splitter, chaque splitter dessert 32-128 ONUs (modems clients)
- Les SNMP traps sont la source de vérité pour les événements réseau
Types d'événements SNMP importants:
- onu-deregister / gpon-onu-offline: ONU déconnecté
- onu-register / gpon-onu-online: ONU reconnecté
- gpon-onu-dying-gasp / olt-alarm-onu-dying-gasp: Perte de courant chez le client
- gpon-onu-los / olt-alarm-onu-los: Perte de signal optique (fibre coupée?)
- gpon-onu-fiber-cut: Coupure fibre détectée
- gpon-onu-power-fail: Panne d'alimentation ONU
- gpon-onu-signal-degraded: Signal optique dégradé (fibre sale, connecteur usé)
- link-down / link-up: Interface réseau up/down
- olt-alarm-power / olt-alarm-temperature: Alarmes OLT
Règles de diagnostic:
1. Beaucoup de dying-gasp sur même OLT en même temps = panne de courant dans le secteur
2. Beaucoup de LOS sur même port = coupure fibre en amont du splitter
3. Dying-gasp isolé = client a débranché ou panne électrique locale
4. OLT unreachable + tous ONUs offline = panne OLT ou lien amont coupé
5. Serials qui vont offline/online en boucle = équipement instable, remplacement nécessaire
6. Signal dégradé = maintenance préventive nécessaire (nettoyage connecteurs)
Réponds en JSON valide:
{
"summary_fr": "Résumé en 2-3 phrases de l'état du réseau",
"severity": "critical|high|medium|low|normal",
"network_health_score": 0-100,
"active_issues": [
{
"type": "power_outage|fiber_cut|olt_failure|unstable_onu|signal_degradation|mass_offline|normal_churn",
"olt": "nom ou IP de l'OLT",
"description_fr": "description du problème",
"affected_customers": number,
"evidence": ["preuve 1", "preuve 2"],
"action_fr": "action recommandée",
"urgency": "immédiat|court-terme|surveillance|aucun"
}
],
"olt_health": [
{"name": "...", "status": "ok|warning|critical|unreachable", "online": number, "total": number, "issues_fr": "..."}
],
"trending": "improving|stable|degrading",
"next_check_fr": "Recommandation pour le prochain contrôle"
}`
const userPrompt = `Analyse les logs réseau de la dernière heure:
CONTEXTE: ${context || 'Rapport de situation réseau — résume les problèmes actifs et identifie les causes'}
UPTIME-KUMA — SOURCE DE VÉRITÉ POUR LA JOIGNABILITÉ (${logSummary.kuma.totalMonitors} moniteurs):
- En ligne: ${logSummary.kuma.up} moniteurs
- HORS LIGNE: ${JSON.stringify(logSummary.kuma.down)}
- Lents (>100ms): ${JSON.stringify(logSummary.kuma.slow)}
NOTE: Si Kuma dit qu'un équipement est UP, il est UP. Ne pas contredire Kuma.
ÉVÉNEMENTS SNMP TRAP (${logSummary.totalTraps} événements en ${logSummary.window}):
${JSON.stringify(logSummary.byOlt, null, 1)}
STATUT OLTs — COMPTAGE ONUs (via SNMP polling targo-hub):
${JSON.stringify(logSummary.oltStats, null, 1)}
NOTE: Si onuCount=0 pour un OLT mais que Kuma dit qu'il est UP, c'est que le polling n'est pas configuré, PAS que l'OLT est down.
ONUs BLOQUÉS OFFLINE (${logSummary.stuckOfflineOnus} ONUs dont le dernier événement est offline):
${JSON.stringify(logSummary.stuckOfflineSample, null, 1)}
INCIDENTS ACTIFS (outage monitor):
${JSON.stringify(logSummary.activeIncidents, null, 1)}
Analyse les patterns, identifie les VRAIS problèmes (pas les faux positifs), et donne un rapport synthétique pour l'équipe.
Si tout est normal (Kuma OK, pas de clusters d'ONUs offline, pas d'alarmes), dis-le clairement avec severity "normal".`
try {
const result = await aiCall(systemPrompt, userPrompt, { maxTokens: 16384 })
result._logSummary = {
totalTraps: logSummary.totalTraps,
oltCount: logSummary.oltStats.length,
stuckOffline: logSummary.stuckOfflineOnus,
incidentCount: logSummary.activeIncidents.length,
}
return result
} catch (e) {
log(`Network AI analysis failed: ${e.message}`)
return {
summary_fr: `Analyse IA indisponible: ${e.message}`,
severity: 'unknown',
network_health_score: null,
active_issues: [],
olt_health: logSummary.oltStats.map(o => ({
name: o.name, status: o.reachable ? 'ok' : 'unreachable',
online: o.onlineCount, total: o.onuCount,
issues_fr: o.lastError || 'OK',
})),
_logSummary: {
totalTraps: logSummary.totalTraps,
oltCount: logSummary.oltStats.length,
stuckOffline: logSummary.stuckOfflineOnus,
},
_error: e.message,
}
}
}
// ── Network Map (static topology + live enrichment) ───────────────────────
let _staticTopo = null

/**
 * Load the hand-maintained network topology document (sites, links, rings)
 * from data/network-topology.json.
 *
 * The parsed object is memoized in `_staticTopo` for the process lifetime.
 * A read/parse failure is logged and yields an empty topology WITHOUT being
 * cached, so a subsequent call retries once the file is fixed.
 *
 * @returns {{sites: Object, links: Array, rings: Array}}
 */
function loadStaticTopology () {
  if (_staticTopo) return _staticTopo
  const fs = require('fs')
  const path = require('path')
  const topoFile = path.join(__dirname, '..', 'data', 'network-topology.json')
  try {
    _staticTopo = JSON.parse(fs.readFileSync(topoFile, 'utf8'))
  } catch (e) {
    log(`Failed to load static topology: ${e.message}`)
    return { sites: {}, links: [], rings: [] }
  }
  log(`Static topology loaded: ${Object.keys(_staticTopo.sites).length} sites, ${_staticTopo.links.length} links`)
  return _staticTopo
}
let _mapCache = null
let _mapCacheTs = 0
const MAP_CACHE_TTL = 60000 // enriched map is rebuilt at most once per minute

/**
 * Build the operator-facing network map: the static topology (sites, links,
 * rings) enriched with live data — Uptime Kuma reachability and OLT SNMP
 * ONU counts. The result is cached for MAP_CACHE_TTL ms.
 *
 * Site status: 'degraded' if any device is down, 'up' if at least one device
 * is up and none are down, otherwise 'unknown'.
 * Link status: 'up' when both endpoint sites are up, 'degraded' when an
 * endpoint is known-degraded, 'unknown' when an endpoint has no live data.
 * (Previously the 'unknown' branch of the link ternary was unreachable —
 * any non-up endpoint was reported as 'degraded'.)
 *
 * @returns {Promise<{sites: Object, links: Array, rings: Array, ts: string, summary: Object}>}
 */
async function getNetworkMap () {
  const now = Date.now()
  if (_mapCache && (now - _mapCacheTs) < MAP_CACHE_TTL) return _mapCache
  const topo = loadStaticTopology()
  const kumaMonitors = await fetchKumaMonitors().catch(() => [])
  // Build Kuma lookup by IP: prefer the monitor hostname (stripped of
  // scheme/port/path), fall back to an IPv4 embedded in the display name.
  const kumaByIp = new Map()
  for (const m of kumaMonitors) {
    if (m.hostname) {
      const clean = m.hostname.replace(/^https?:\/\//, '').replace(/:\d+$/, '').replace(/\/.*$/, '')
      kumaByIp.set(clean, m)
    }
    // Guard: some monitors may lack a name entirely — don't crash on them.
    const ipInName = (m.name || '').match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/)
    if (ipInName && !kumaByIp.has(ipInName[1])) kumaByIp.set(ipInName[1], m)
  }
  // Get OLT stats (best-effort: the SNMP poller may not be loaded)
  let oltStats = []
  try {
    const oltSnmp = require('./olt-snmp')
    oltStats = oltSnmp.getOltStats() || []
  } catch (e) { /* ok */ }
  const oltByIp = new Map()
  for (const o of oltStats) { if (o.host) oltByIp.set(o.host, o) }
  // Enrich each site with live data
  const sites = {}
  for (const [siteId, site] of Object.entries(topo.sites)) {
    const enrichedDevices = (site.devices || []).map(dev => {
      const kuma = kumaByIp.get(dev.ip)
      const olt = oltByIp.get(dev.ip)
      return {
        ...dev,
        kuma: kuma ? { status: kuma.status, name: kuma.name, responseTime: kuma.responseTime } : null,
        olt: olt ? { name: olt.name, onuCount: olt.onuCount, onlineCount: olt.onlineCount, reachable: olt.reachable } : null,
      }
    })
    // Site-level status: aggregate device statuses (Kuma wins over OLT poll)
    const deviceStatuses = enrichedDevices.map(d => {
      if (d.kuma) return d.kuma.status === 1 ? 'up' : d.kuma.status === 0 ? 'down' : 'pending'
      if (d.olt) return d.olt.reachable ? 'up' : 'down'
      return 'unknown'
    })
    const hasDown = deviceStatuses.includes('down')
    const hasUp = deviceStatuses.includes('up')
    const siteStatus = hasDown ? 'degraded' : hasUp ? 'up' : 'unknown'
    // ONU totals for the site
    const onuTotal = enrichedDevices.reduce((s, d) => s + (d.olt?.onuCount || 0), 0)
    const onuOnline = enrichedDevices.reduce((s, d) => s + (d.olt?.onlineCount || 0), 0)
    sites[siteId] = {
      ...site,
      devices: enrichedDevices,
      status: siteStatus,
      onuTotal,
      onuOnline,
    }
  }
  // Enrich links with status (fixed: 'unknown' is now reachable when an
  // endpoint site has no live data instead of being misreported as degraded)
  const links = (topo.links || []).map(link => {
    const fromStatus = sites[link.from]?.status || 'unknown'
    const toStatus = sites[link.to]?.status || 'unknown'
    let status = 'unknown'
    if (fromStatus === 'up' && toStatus === 'up') status = 'up'
    else if (fromStatus === 'degraded' || toStatus === 'degraded') status = 'degraded'
    return { ...link, status }
  })
  const allSites = Object.values(sites)
  const result = {
    sites,
    links,
    rings: topo.rings || [],
    ts: new Date().toISOString(),
    summary: {
      totalSites: allSites.length,
      totalLinks: links.length,
      totalOlts: allSites.flatMap(s => s.devices).filter(d => d.type === 'olt').length,
      totalOnus: allSites.reduce((s, site) => s + site.onuTotal, 0),
      totalOnusOnline: allSites.reduce((s, site) => s + site.onuOnline, 0),
      sitesUp: allSites.filter(s => s.status === 'up').length,
      // NOTE(review): key name kept for API compatibility, but it counts
      // degraded sites, not fully-down ones.
      sitesDown: allSites.filter(s => s.status === 'degraded').length,
    },
  }
  _mapCache = result
  _mapCacheTs = now
  return result
}
// ── HTTP handler ───────────────────────────────────────────────────────────
/**
 * HTTP router for /network/* endpoints.
 *
 * Routes (all JSON):
 *   GET  /network/topology        — discovered devices and links
 *   GET  /network/dependency-map  — full dependency chain with OLT data
 *   GET  /network/health/:host    — interface health for one device
 *   GET  /network/errors          — recent error spikes (?minutes=60)
 *   GET  /network/kuma            — Uptime Kuma monitor status
 *   GET  /network/logs            — parsed SNMP trap summary (?minutes=60)
 *   GET  /network/traps           — raw trap events (?minutes=60&limit=200)
 *   GET  /network/map             — static topology + live enrichment
 *   GET  /network/influx          — read-only InfluxDB proxy (?db=&q=)
 *   POST /network/analyze         — AI root cause analysis ({context})
 *
 * Any handler error is returned as 500 {error}; unknown paths as 404.
 *
 * @param {http.IncomingMessage} req
 * @param {http.ServerResponse} res
 * @param {string} method  - uppercase HTTP method
 * @param {string} urlPath - request path (starts with /network/)
 */
async function handle (req, res, method, urlPath) {
  const sub = urlPath.replace('/network/', '')
  let url
  try { url = new URL(req.url, 'http://localhost') } catch { url = new URL('http://localhost/') }
  // Integer query param with fallback; NaN and 0 both fall back, matching
  // the previous `parseInt(...) || dflt` behavior on every route.
  const intParam = (name, dflt) => Number.parseInt(url.searchParams.get(name), 10) || dflt
  // Run an async producer and send its result as 200 JSON; any throw
  // becomes a 500 with the error message. Shared by every route below.
  const ok = async (fn) => {
    try {
      return json(res, 200, await fn())
    } catch (e) {
      return json(res, 500, { error: e.message })
    }
  }

  if (method === 'GET') {
    if (sub === 'topology') return ok(() => discoverTopology())
    if (sub === 'dependency-map') return ok(() => buildDependencyMap())
    if (sub.startsWith('health/')) {
      return ok(async () => {
        const host = decodeURIComponent(sub.replace('health/', ''))
        return { host, interfaces: await getInterfaceHealth(host) }
      })
    }
    if (sub === 'errors') {
      return ok(async () => {
        const spikes = await getErrorSpikes(intParam('minutes', 60))
        return { spikes, count: spikes.length }
      })
    }
    if (sub === 'kuma') {
      return ok(async () => {
        const monitors = await fetchKumaMonitors()
        // Groups are Kuma folders, not real monitors — report them separately.
        const real = monitors.filter(m => m.type !== 'group')
        const down = real.filter(m => m.status === 0)
        return {
          total: real.length,
          up: real.filter(m => m.status === 1).length,
          down: down.length,
          downList: down,
          groups: monitors.filter(m => m.type === 'group').map(m => ({ name: m.name, status: m.status })),
          monitors: real,
        }
      })
    }
    if (sub === 'logs') return ok(() => buildLogSummary(intParam('minutes', 60)))
    if (sub === 'traps') {
      return ok(async () => {
        const traps = await getRecentTraps(intParam('minutes', 60), intParam('limit', 200))
        return { events: traps, count: traps.length }
      })
    }
    if (sub === 'map') return ok(() => getNetworkMap())
    if (sub === 'influx') {
      const q = url.searchParams.get('q')
      if (!q) return json(res, 400, { error: 'Missing q parameter' })
      // Safety: only allow read-only SELECT and SHOW queries through the proxy
      const ql = q.trim().toUpperCase()
      if (!ql.startsWith('SELECT') && !ql.startsWith('SHOW')) {
        return json(res, 403, { error: 'Only SELECT and SHOW queries allowed' })
      }
      return ok(() => influxQuery(url.searchParams.get('db') || 'telegraf', q))
    }
  }

  if (sub === 'analyze' && method === 'POST') {
    return ok(async () => {
      const body = await parseBody(req)
      return analyzeNetworkIssue(body.context || '')
    })
  }

  return json(res, 404, { error: 'Network endpoint not found' })
}
// Public API: the HTTP router (`handle`) plus the individual data builders,
// exported so other modules can reuse them directly without going through
// the HTTP layer.
module.exports = {
  handle,
  discoverTopology,
  buildDependencyMap,
  buildLogSummary,
  getRecentTraps,
  fetchKumaMonitors,
  getInterfaceHealth,
  getNetworkMap,
  getErrorSpikes,
  analyzeNetworkIssue,
  influxQuery,
  extractSeries,
}