'use strict'
const { WebSocket } = require('ws')
const cfg = require('./config')
const { log, json, parseBody, lookupCustomerByPhone, createCommunication, erpFetch } = require('./helpers')
const { execTool } = require('./agent')
// Gemini Live API WebSocket endpoint
const GEMINI_WS = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent'
const LIVE_MODEL = cfg.VOICE_MODEL || 'models/gemini-2.5-flash-live-preview'
// Active voice sessions: callSid → session
const sessions = new Map()
// ── Audio conversion: Twilio mulaw 8kHz ↔ Gemini PCM16 16kHz/24kHz ──
// mulaw → linear PCM16 lookup table (ITU-T G.711)
// mulaw byte → linear PCM16 lookup table (ITU-T G.711 decode).
// Precomputed once at module load; indexed by the raw mulaw byte.
const MULAW_TO_LINEAR = new Int16Array(256)
;(() => {
  for (let code = 0; code < 256; code++) {
    // G.711 transmits the byte bit-inverted; undo that first.
    const mu = ~code & 0xFF
    const segment = (mu >> 4) & 0x07
    const step = mu & 0x0F
    // Equivalent to ((step << 3) + 0x84) << segment, via the doubled-mantissa form.
    let value = ((step << 1) + 33) << (segment + 2)
    value -= 0x84 // remove the encoding bias
    MULAW_TO_LINEAR[code] = (mu & 0x80) ? -value : value
  }
})()
// linear PCM16 → mulaw lookup (bias method)
// linear PCM16 sample → mulaw byte (ITU-T G.711 encode, bias method).
// Input is clipped to the codec range; output is the bit-inverted code.
function linearToMulaw (sample) {
  const BIAS = 0x84
  const CLIP = 32635
  // Capture the sign bit, then work on the magnitude.
  const signBit = (sample >> 8) & 0x80
  let mag = signBit ? -sample : sample
  if (mag > CLIP) mag = CLIP
  mag += BIAS
  // Locate the segment: highest set bit among bits 8..14.
  let seg = 7
  let probe = 0x4000
  while (seg > 0 && (mag & probe) === 0) {
    probe >>= 1
    seg--
  }
  const quant = (mag >> (seg + 3)) & 0x0F
  // Invert per G.711 before transmission.
  return ~(signBit | (seg << 4) | quant) & 0xFF
}
// Twilio mulaw 8kHz base64 → PCM16 16kHz base64 (for Gemini input)
// Twilio mulaw 8kHz base64 → PCM16 16kHz base64 (Gemini input format).
function mulawToGemini (mulawB64) {
  const encoded = Buffer.from(mulawB64, 'base64')
  const count = encoded.length
  // Step 1: mulaw bytes → 16-bit linear samples @ 8kHz.
  const pcm8k = Buffer.alloc(count * 2)
  for (let n = 0; n < count; n++) {
    pcm8k.writeInt16LE(MULAW_TO_LINEAR[encoded[n]], n * 2)
  }
  // Step 2: double the rate to 16kHz by inserting the midpoint between
  // neighbours (linear interpolation; the final sample is repeated).
  const pcm16k = Buffer.alloc(count * 4)
  for (let n = 0; n < count; n++) {
    const cur = pcm8k.readInt16LE(n * 2)
    const next = (n + 1 < count) ? pcm8k.readInt16LE(n * 2 + 2) : cur
    pcm16k.writeInt16LE(cur, n * 4)
    pcm16k.writeInt16LE(Math.round((cur + next) / 2), n * 4 + 2)
  }
  return pcm16k.toString('base64')
}
// Gemini PCM16 24kHz base64 → Twilio mulaw 8kHz base64
// Gemini PCM16 24kHz base64 → Twilio mulaw 8kHz base64.
function geminiToMulaw (pcmB64) {
  const pcm = Buffer.from(pcmB64, 'base64')
  // Naive 3:1 decimation (keep one sample in three) — 24kHz → 8kHz.
  const outCount = Math.floor(pcm.length / 2 / 3)
  const out = Buffer.alloc(outCount)
  for (let n = 0; n < outCount; n++) {
    out[n] = linearToMulaw(pcm.readInt16LE(n * 6))
  }
  return out.toString('base64')
}
// ── Gemini Live tool definitions (different format than OpenAI) ──
function buildGeminiTools () {
const openaiTools = require('./agent-tools.json')
return [{
functionDeclarations: openaiTools
.filter(t => t.function.name !== 'get_chat_link') // voice doesn't need chat link
.map(t => ({
name: t.function.name,
description: t.function.description,
parameters: t.function.parameters.required?.length
? { type: 'OBJECT', properties: Object.fromEntries(Object.entries(t.function.parameters.properties).map(([k, v]) => [k, { type: v.type.toUpperCase(), description: v.description || '' }])), required: t.function.parameters.required }
: { type: 'OBJECT', properties: {} },
})),
}]
}
// ── Voice system prompt ──
// French-language persona/rules for the voice agent. Sent verbatim to
// Gemini Live as the systemInstruction (see openGeminiSession), with
// per-call customer context appended. Runtime string — do not translate.
const VOICE_SYSTEM_PROMPT = `Tu es l'assistant vocal de Gigafibre, fournisseur Internet fibre optique au Québec.
Règles vocales:
- Parle TOUJOURS en français québécois naturel
- Sois conversationnel et chaleureux, comme un vrai agent au téléphone
- Phrases courtes — max 1-2 phrases avant de laisser le client répondre
- Utilise les outils pour consulter le compte en temps réel
- Pour les problèmes techniques, vérifie l'état du modem (check_device_status) avant de suggérer quoi que ce soit
- Signal optique < -25 dBm = problème fibre physique → crée un ticket
- Modem hors ligne > 10 min → suggère un redémarrage
- Ne partage JAMAIS d'infos techniques internes (OLT port, network ID, IP de gestion)
- Si tu ne peux pas résoudre, crée un ticket
- Quand tu obtiens des données d'un outil, résume en langage simple (pas de jargon technique)`
// ── Inbound call handler: lookup caller, build dynamic IVR ──
// ── Inbound call handler: lookup caller, build dynamic IVR ──
// Twilio POSTs the call webhook here; we answer with TwiML. Fix: the
// responses were served as text/xml but contained no XML elements —
// rebuilt as proper <Response>/<Say>/<Gather>/<Redirect> documents.
async function handleInboundCall (req, res) {
  const body = await parseBody(req)
  const from = body.From || ''
  const callSid = body.CallSid || ''
  log(`Voice IN: ${from} (CallSid: ${callSid})`)
  const customer = await lookupCustomerByPhone(from)
  if (!customer) {
    // Unknown caller — polite redirect to the human line, then hang up.
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Say language="fr-CA">Bonjour, bienvenue chez Gigafibre. Nous n'avons pas trouvé votre numéro dans notre système. Veuillez nous joindre au 4 5 0, 6 5 5, 3 2 3 5.</Say>
<Hangup/>
</Response>`
    res.writeHead(200, { 'Content-Type': 'text/xml' })
    return res.end(twiml)
  }
  // Fetch locations that have active subscriptions (not just any Service Location)
  let locations = []
  try {
    const subFields = JSON.stringify(['service_location', 'item_code', 'custom_description'])
    const subFilters = JSON.stringify({ party_type: 'Customer', party: customer.name, status: 'Active' })
    const r = await erpFetch(`/api/resource/Subscription?filters=${encodeURIComponent(subFilters)}&fields=${encodeURIComponent(subFields)}&limit_page_length=50`)
    const subs = (r.status === 200 && r.data?.data) || []
    // Deduplicate by service_location, keep the service description
    const locMap = new Map()
    for (const s of subs) {
      if (!s.service_location) continue
      if (!locMap.has(s.service_location)) locMap.set(s.service_location, { name: s.service_location, service: s.custom_description || s.item_code })
    }
    // Resolve addresses for each unique location — independent lookups,
    // so run them in parallel instead of serially (caller is on hold).
    await Promise.all([...locMap].map(async ([locId, loc]) => {
      try {
        const lr = await erpFetch(`/api/resource/Service Location/${encodeURIComponent(locId)}?fields=["address_line","city"]`)
        if (lr.status === 200 && lr.data?.data) {
          loc.address_line = lr.data.data.address_line || locId
          loc.city = lr.data.data.city || ''
        }
      } catch { loc.address_line = locId; loc.city = '' }
    }))
    locations = [...locMap.values()]
  } catch (e) { log('Voice IN: subscription lookup failed:', e.message) } // best-effort: fall through with no locations
  const firstName = (customer.customer_name || '').split(' ')[0] || 'client'
  if (locations.length <= 1) {
    // 0 locations → agent without location context; 1 → agent with it.
    const twiml = buildStreamTwiml(callSid, customer, locations[0] || null, firstName)
    res.writeHead(200, { 'Content-Type': 'text/xml' })
    return res.end(twiml)
  }
  // Multiple locations — dynamic IVR listing one DTMF digit per address.
  let gatherOptions = ''
  locations.forEach((loc, i) => {
    // '&' would break the XML; the spoken form "et" is what we want anyway.
    const addr = (loc.address_line || '').replace(/&/g, 'et')
    gatherOptions += `Pour le ${addr}, appuyez ${i + 1}. `
  })
  gatherOptions += `Pour autre chose, appuyez 0.`
  // Store locations in session for the gather callback
  sessions.set(`gather:${callSid}`, { customer, locations })
  // NOTE(review): assumes handleGather is routed at /voice/gather — confirm against the HTTP router.
  const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Gather input="dtmf" numDigits="1" timeout="6" action="${cfg.HUB_PUBLIC_URL}/voice/gather" method="POST">
<Say language="fr-CA">Bonjour ${firstName}! ${gatherOptions}</Say>
</Gather>
<Say language="fr-CA">Je n'ai pas reçu de réponse. Je vous transfère à un assistant.</Say>
<Redirect method="POST">${cfg.HUB_PUBLIC_URL}/voice/connect-agent?callSid=${callSid}&amp;customer=${encodeURIComponent(customer.name)}</Redirect>
</Response>`
  res.writeHead(200, { 'Content-Type': 'text/xml' })
  res.end(twiml)
}
// ── Gather callback: customer pressed a digit ──
// ── Gather callback: customer pressed a digit ──
// Digits 1..N select a location stored by handleInboundCall; anything
// else (including 0 and timeouts) connects the agent without location
// context. Fix: the error response was plain text served as text/xml —
// rebuilt as a valid TwiML document.
async function handleGather (req, res) {
  const body = await parseBody(req)
  const digit = body.Digits || '0'
  const callSid = body.CallSid || ''
  const session = sessions.get(`gather:${callSid}`)
  sessions.delete(`gather:${callSid}`) // one-shot: consume the stored IVR state
  if (!session) {
    // Session expired or unknown call — apologize and end the call.
    res.writeHead(200, { 'Content-Type': 'text/xml' })
    return res.end(`<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Say language="fr-CA">Erreur de session. Veuillez rappeler.</Say>
<Hangup/>
</Response>`)
  }
  const { customer, locations } = session
  const idx = parseInt(digit, 10)
  const selectedLoc = (idx >= 1 && idx <= locations.length) ? locations[idx - 1] : null
  const firstName = (customer.customer_name || '').split(' ')[0] || 'client'
  const twiml = buildStreamTwiml(callSid, customer, selectedLoc, firstName)
  res.writeHead(200, { 'Content-Type': 'text/xml' })
  res.end(twiml)
}
// ── Build TwiML that opens a Media Stream to our WebSocket ──
function buildStreamTwiml (callSid, customer, location, firstName) {
const wsUrl = cfg.HUB_PUBLIC_URL.replace('https://', 'wss://').replace('http://', 'ws://') + '/voice/ws'
const meta = JSON.stringify({
callSid, customerId: customer.name, customerName: customer.customer_name,
location: location ? { name: location.name, address: location.address_line, city: location.city } : null,
})
const locContext = location
? ` pour votre adresse au ${(location.address_line || '').replace(/&/g, 'et')}`
: ''
return `
Parfait ${firstName}, je vous mets en ligne avec notre assistant${locContext}. Un moment.
`
}
// ── Connect to agent directly (no location selection) ──
async function handleConnectAgent (req, res) {
const body = await parseBody(req)
const callSid = body.CallSid || req.url?.searchParams?.get('callSid') || ''
const customerName = req.url?.searchParams?.get('customer') || ''
let customer = null
if (customerName) {
try {
const r = await erpFetch(`/api/resource/Customer/${encodeURIComponent(customerName)}`)
if (r.status === 200) customer = r.data.data
} catch {}
}
const firstName = customer ? (customer.customer_name || '').split(' ')[0] : ''
const twiml = buildStreamTwiml(callSid, customer || { name: customerName, customer_name: customerName }, null, firstName || 'client')
res.writeHead(200, { 'Content-Type': 'text/xml' })
res.end(twiml)
}
// ── WebSocket handler: Twilio Media Stream ↔ Gemini Live ──
function handleMediaStream (ws, req) {
let geminiWs = null
let streamSid = null
let meta = null
let callSid = null
ws.on('message', async (data) => {
let msg
try { msg = JSON.parse(data) } catch { return }
switch (msg.event) {
case 'connected':
log('Twilio Media Stream connected')
break
case 'start':
streamSid = msg.start?.streamSid
callSid = msg.start?.callSid
// Decode metadata passed from TwiML
const metaB64 = msg.start?.customParameters?.meta
if (metaB64) {
try { meta = JSON.parse(Buffer.from(metaB64, 'base64').toString()) } catch {}
}
log(`Voice session started: ${callSid} customer=${meta?.customerId}`)
// Open Gemini Live session
geminiWs = await openGeminiSession(ws, streamSid, meta)
if (geminiWs) {
sessions.set(callSid, { ws, geminiWs, streamSid, meta, startedAt: Date.now() })
}
break
case 'media':
// Forward audio to Gemini (convert mulaw → PCM16)
if (geminiWs?.readyState === WebSocket.OPEN && msg.media?.payload) {
try {
const pcmB64 = mulawToGemini(msg.media.payload)
geminiWs.send(JSON.stringify({
realtimeInput: { audio: { data: pcmB64, mimeType: 'audio/pcm;rate=16000' } },
}))
} catch {}
}
break
case 'stop':
log(`Voice session ended: ${callSid}`)
if (geminiWs) { try { geminiWs.close() } catch {} }
if (callSid) {
const session = sessions.get(callSid)
if (session) {
const duration = Math.floor((Date.now() - session.startedAt) / 1000)
logVoiceSession(meta, duration)
sessions.delete(callSid)
}
}
break
}
})
ws.on('close', () => {
if (geminiWs) try { geminiWs.close() } catch {}
})
}
// ── Open Gemini Live WebSocket session ──
async function openGeminiSession (twilioWs, streamSid, meta) {
if (!cfg.AI_API_KEY) {
log('Voice agent: no AI_API_KEY configured')
return null
}
return new Promise((resolve) => {
const url = `${GEMINI_WS}?key=${cfg.AI_API_KEY}`
const ws = new WebSocket(url)
let setupDone = false
const timeout = setTimeout(() => {
if (!setupDone) { log('Gemini Live: setup timeout'); ws.close(); resolve(null) }
}, 10000)
ws.on('open', () => {
// Build context from customer data
const locContext = meta?.location
? `\nAdresse sélectionnée: ${meta.location.address}, ${meta.location.city} (${meta.location.name})`
: ''
ws.send(JSON.stringify({
setup: {
model: LIVE_MODEL,
generationConfig: {
responseModalities: ['AUDIO'],
speechConfig: {
voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Aoede' } },
},
},
systemInstruction: {
parts: [{ text: `${VOICE_SYSTEM_PROMPT}\n\nContexte: Client ${meta?.customerName || 'inconnu'} (${meta?.customerId || '?'})${locContext}` }],
},
tools: buildGeminiTools(),
realtimeInputConfig: {
automaticActivityDetection: {
disabled: false,
startOfSpeechSensitivity: 'HIGH',
endOfSpeechSensitivity: 'MEDIUM',
silenceDurationMs: 1200,
},
activityHandling: 'START_OF_ACTIVITY_INTERRUPTS',
},
inputAudioTranscription: {},
outputAudioTranscription: {},
},
}))
})
ws.on('message', async (data) => {
let msg
try { msg = JSON.parse(data.toString()) } catch { return }
// Setup confirmation
if (msg.setupComplete) {
setupDone = true
clearTimeout(timeout)
log(`Gemini Live session ready (model: ${LIVE_MODEL})`)
resolve(ws)
return
}
// Audio response from Gemini → convert and send to Twilio
if (msg.serverContent?.modelTurn?.parts) {
for (const part of msg.serverContent.modelTurn.parts) {
if (part.inlineData?.data) {
try {
const mulawB64 = geminiToMulaw(part.inlineData.data)
if (twilioWs.readyState === WebSocket.OPEN) {
twilioWs.send(JSON.stringify({
event: 'media',
streamSid,
media: { payload: mulawB64 },
}))
}
} catch {}
}
}
}
// Transcription (for logging)
if (msg.serverContent?.outputTranscription?.text) {
log(`Voice AI: ${msg.serverContent.outputTranscription.text.substring(0, 100)}`)
}
if (msg.serverContent?.inputTranscription?.text) {
log(`Voice Customer: ${msg.serverContent.inputTranscription.text.substring(0, 100)}`)
}
// Tool call from Gemini
if (msg.toolCall?.functionCalls) {
const responses = []
for (const fc of msg.toolCall.functionCalls) {
log(`Voice tool call: ${fc.name}(${JSON.stringify(fc.args)})`)
const result = await execTool(fc.name, fc.args || {})
log(`Voice tool result: ${fc.name} →`, JSON.stringify(result).substring(0, 200))
responses.push({ id: fc.id, name: fc.name, response: { result } })
}
ws.send(JSON.stringify({ toolResponse: { functionResponses: responses } }))
}
})
ws.on('error', (e) => {
log('Gemini Live WS error:', e.message)
if (!setupDone) { clearTimeout(timeout); resolve(null) }
})
ws.on('close', () => {
log('Gemini Live session closed')
})
})
}
// ── Log voice session to ERPNext ──
async function logVoiceSession (meta, duration) {
if (!meta?.customerId) return
try {
const dMin = Math.floor(duration / 60), dSec = duration % 60
const durStr = `${dMin}m${dSec.toString().padStart(2, '0')}s`
await createCommunication({
communication_type: 'Communication', communication_medium: 'Phone',
sent_or_received: 'Received', sender: 'voice-ai@gigafibre.ca',
sender_full_name: 'Gigafibre Voice AI', phone_no: meta.phone || '',
content: `Appel vocal IA — Client: ${meta.customerName}, Durée: ${durStr}${meta.location ? ', Adresse: ' + meta.location.address : ''}`,
subject: `Appel vocal IA: ${meta.customerName}`,
reference_doctype: 'Customer', reference_name: meta.customerId, status: 'Linked',
})
log(`Voice session logged: ${meta.customerId} (${durStr})`)
} catch (e) { log('Voice session log error:', e.message) }
}
module.exports = { handleInboundCall, handleGather, handleConnectAgent, handleMediaStream }