akpande2 committed
Commit 4592755 (verified) · 1 Parent(s): d38b1ea

Upload 6 files

Files changed (6)
  1. Dockerfile +16 -0
  2. README.md +10 -11
  3. compose.yaml +25 -0
  4. main.py +36 -0
  5. pipeline.py +317 -0
  6. requirements.txt +13 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
+
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsndfile1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY pipeline.py .
+ COPY main.py .
+
+ EXPOSE 8000
+ CMD ["python", "main.py"]
README.md CHANGED
@@ -1,12 +1,11 @@
- ---
- title: IAura 1
- emoji: 📉
- colorFrom: yellow
- colorTo: red
- sdk: docker
- pinned: false
- license: mit
- short_description: first model for iAura
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ This is a GPU-accelerated microservice for analyzing call center audio. It performs:
+ - Speaker Diarization (Pyannote 3.1)
+ - Speech-to-Text (Whisper Large-v3)
+ - Emotion Recognition (Wav2Vec2)
+ - Biometric Profiling (Age/Gender Voting System)
+ - Agent KPI Scoring & Customer Journey Mapping
+
+ ## Prerequisites
+ - **Server:** AWS EC2 `g4dn.xlarge` (or any machine with an NVIDIA GPU and 16 GB of VRAM)
+ - **OS:** Ubuntu 22.04 (recommended) with NVIDIA drivers installed
+ - **Docker:** Docker Engine + NVIDIA Container Toolkit
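Once the container is up (see compose.yaml below), the `/analyze` endpoint accepts a multipart audio upload. A minimal client sketch, assuming the service is reachable on `localhost:8000`, the `requests` package is installed on the calling machine, and `call.wav` is a hypothetical local recording:

```python
import requests

# "call.wav" is a hypothetical example file; any audio that the server-side
# ffmpeg/librosa stack can decode should work.
with open("call.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/analyze",
        # The multipart field name "file" matches main.py's UploadFile parameter.
        files={"file": ("call.wav", f, "audio/wav")},
        timeout=600,  # transcribing a long call can take several minutes
    )

resp.raise_for_status()
analysis = resp.json()
print(list(analysis.keys()))  # metadata, identification, agent_metrics, customer_metrics, transcript
```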
compose.yaml ADDED
@@ -0,0 +1,25 @@
+ version: '3.8'
+
+ services:
+   call-center-ai:
+     build: .
+     container_name: call_center_engine
+     restart: always
+     ports:
+       - "8000:8000"
+     environment:
+       # Team will paste the token here or in a .env file
+       - HF_TOKEN=${HF_TOKEN}
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: 1
+               capabilities: [gpu]
+     volumes:
+       # Optional: Persist models so they don't re-download on restart
+       - huggingface_cache:/root/.cache/huggingface
+
+ volumes:
+   huggingface_cache:
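As the comment in the `environment` block suggests, `HF_TOKEN` can be supplied through a `.env` file placed next to compose.yaml; Docker Compose reads it automatically for `${HF_TOKEN}` substitution. A hypothetical example (the token value is a placeholder, never commit a real one):

```
# .env (kept out of version control)
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx
```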
main.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ import shutil
+ import uvicorn
+ from fastapi import FastAPI, UploadFile, File, HTTPException
+ from pipeline import PlatinumAnalyticsPipeline
+
+ app = FastAPI(title="Call Center Analytics Engine")
+ pipeline_engine = None
+
+ @app.on_event("startup")
+ async def startup_event():
+     global pipeline_engine
+     hf_token = os.environ.get("HF_TOKEN")
+     pipeline_engine = PlatinumAnalyticsPipeline(hf_token=hf_token)
+
+ @app.post("/analyze")
+ async def analyze_audio(file: UploadFile = File(...)):
+     if not pipeline_engine:
+         raise HTTPException(status_code=500, detail="Engine not initialized")
+
+     temp_path = f"temp_{file.filename}"
+     try:
+         with open(temp_path, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         result = pipeline_engine.process_call(temp_path)
+         return result
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+     finally:
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)
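For reference, the JSON returned by `/analyze` is the `final_output` dictionary assembled in pipeline.py (shown below). A trimmed, illustrative sketch of its shape; all values here are made-up placeholders, not real model output:

```python
# Shape only -- every value below is an illustrative placeholder.
example_response = {
    "metadata": {  # per-speaker biometric profile from the age/gender voting step
        "SPEAKER_00": {"gender": "MALE", "age_bracket": "26-35", "debug_pitch": 128},
        "SPEAKER_01": {"gender": "FEMALE", "age_bracket": "46-55", "debug_pitch": 190},
    },
    "identification": {"SPEAKER_00": "AGENT", "SPEAKER_01": "CUSTOMER"},
    "agent_metrics": {
        "overall_score": 78,
        "overall_emotion": "NEUTRAL",
        "sentiment_level": "NEUTRAL",
        "engagement_level": "HIGH",
        "avg_pace_wpm": 148.2,
        "avg_pitch_hz": 126.5,
        "avg_volume": 4.1,
    },
    "customer_metrics": {
        "initial_emotion": "ANGRY",
        "final_emotion": "NEUTRAL",
        "impact_score": 1,
        "impact_label": "POSITIVE UPLIFT",
    },
    "transcript": [
        {
            "segment_id": 1,
            "start": 0.0,
            "end": 4.2,
            "speaker": "SPEAKER_00",
            "role": "AGENT",
            "emotion": "NEUTRAL",
            "text": "Thank you for calling, how can I help?",  # placeholder utterance
            "tone": {"pitch_hz": 124.6, "volume": 3.9, "wpm": 152},
        },
    ],
}
```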
pipeline.py ADDED
@@ -0,0 +1,317 @@
+ import os
+ import sys
+ import logging
+ import torch
+ import librosa
+ import whisper
+ import numpy as np
+ import warnings
+ import json
+ import gc
+ from collections import Counter
+ from pyannote.audio import Pipeline
+ from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
+ from datetime import datetime
+
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+ logging.getLogger("pyannote").setLevel(logging.ERROR)
+ logging.getLogger("transformers").setLevel(logging.ERROR)
+ warnings.filterwarnings("ignore")
+
+ class NumpyEncoder(json.JSONEncoder):
+     def default(self, obj):
+         if isinstance(obj, np.integer): return int(obj)
+         if isinstance(obj, np.floating): return float(obj)
+         if isinstance(obj, np.ndarray): return obj.tolist()
+         return super(NumpyEncoder, self).default(obj)
+
+ class PlatinumAnalyticsPipeline:
+     def __init__(self, hf_token=None, device=None):
+         self._flush_memory()
+         self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
+         print(f"Initializing Engine on {self.device}...")
+
+         try:
+             self.stt_model = whisper.load_model("large-v3", device=self.device)
+         except Exception:
+             print("VRAM fallback: Using 'medium.en'")
+             self.stt_model = whisper.load_model("medium.en", device=self.device)
+
+         self.diarization_pipeline = None
+         if hf_token:
+             try:
+                 self.diarization_pipeline = Pipeline.from_pretrained(
+                     "pyannote/speaker-diarization-3.1",
+                     token=hf_token
+                 ).to(torch.device(self.device))
+             except Exception as e: print(f"Diarization disabled: {e}")
+
+         self.emotion_classifier = pipeline(
+             "audio-classification",
+             model="superb/wav2vec2-base-superb-er",
+             device=0 if self.device == "cuda" else -1
+         )
+
+         self.ag_model_name = "audeering/wav2vec2-large-robust-24-ft-age-gender"
+         self.ag_processor = Wav2Vec2Processor.from_pretrained(self.ag_model_name)
+         self.ag_model = Wav2Vec2ForSequenceClassification.from_pretrained(self.ag_model_name)
+         self.ag_model.to(self.device)
+         self.ag_model.eval()
+
+     def process_call(self, audio_path):
+         if not os.path.exists(audio_path): raise FileNotFoundError("File missing")
+         self._flush_memory()
+
+         wav, sr = librosa.load(audio_path, sr=16000, mono=True)
+         wav = wav.astype(np.float32)
+
+         segments = self._run_diarization(wav, sr)
+         merged = self._merge_segments(segments)
+
+         results = []
+         spk_buffer = {}
+         pad = int(0.05 * sr)
+
+         for i, seg in enumerate(merged):
+             duration = seg['end'] - seg['start']
+             if duration < 0.1: continue
+
+             start = max(0, int(seg['start'] * sr) - pad)
+             end = min(len(wav), int(seg['end'] * sr) + pad)
+             chunk = wav[start:end]
+
+             if self._is_silence(chunk): continue
+
+             if duration > 1.0:
+                 if seg['speaker'] not in spk_buffer: spk_buffer[seg['speaker']] = []
+                 spk_buffer[seg['speaker']].append(chunk)
+
+             text = self._transcribe_chunk(chunk, sr)
+             if not text: continue
+
+             results.append({
+                 "segment_id": i + 1,
+                 "start": float(f"{seg['start']:.2f}"),
+                 "end": float(f"{seg['end']:.2f}"),
+                 "speaker": seg['speaker'],
+                 "role": "UNKNOWN",
+                 "emotion": self._detect_emotion(chunk),
+                 "text": text,
+                 "tone": self._calculate_tone_physics(chunk, sr, text)
+             })
+
+         results = self._assign_roles(results)
+
+         identification = {}
+         for r in results:
+             identification[r['speaker']] = r['role']
+
+         biometrics = self._analyze_biometrics_voting(spk_buffer, results)
+         cust_metrics = self._analyze_customer_journey(results)
+         agent_metrics = self._analyze_agent_kpi(results, cust_metrics['impact_score'])
+
+         final_output = {
+             "metadata": biometrics,
+             "identification": identification,
+             "agent_metrics": agent_metrics,
+             "customer_metrics": cust_metrics,
+             "transcript": results
+         }
+
+         self._flush_memory()
+         return final_output
+
+     def _analyze_biometrics_voting(self, buffer, transcript):
+         profiles = {}
+         context_map = {}
+
+         for line in transcript:
+             if line['role'] == "AGENT":
+                 txt = line['text'].lower()
+                 target_list = [x['speaker'] for x in transcript if x['role'] == "CUSTOMER"]
+                 if not target_list: continue
+                 target = target_list[0]
+                 if any(w in txt for w in ["sir", "mr.", "mister", "man"]): context_map[target] = "MALE"
+                 if any(w in txt for w in ["ma'am", "miss", "mrs", "madam"]): context_map[target] = "FEMALE"
+
+         for spk, chunks in buffer.items():
+             if not chunks: continue
+
+             pitches = [t['tone']['pitch_hz'] for t in transcript if t['speaker'] == spk and t['tone']['pitch_hz'] > 60]
+             avg_pitch = float(np.mean(pitches)) if pitches else 0.0
+
+             raw = np.concatenate(chunks)
+             if len(raw) > 16000 * 10: raw = raw[:16000*10]
+             norm = (raw - np.mean(raw)) / (np.std(raw) + 1e-7)
+
+             ai_gender = "UNKNOWN"
+             age_bracket = "26-35"
+
+             try:
+                 inputs = self.ag_processor(norm, sampling_rate=16000, return_tensors="pt").to(self.device)
+                 with torch.no_grad(): logits = self.ag_model(**inputs).logits
+                 probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
+                 scores = {self.ag_model.config.id2label[i]: float(probs[i]) for i in range(len(probs))}
+
+                 ai_gender = "MALE" if scores.get('male', 0) > scores.get('female', 0) else "FEMALE"
+
+                 s_ch = scores.get('child', 0) + scores.get('teen', 0)
+                 s_sn = scores.get('senior', 0)
+
+                 if s_ch > 0.35: age_bracket = "18-25"
+                 elif s_sn > 0.40: age_bracket = "56+"
+                 elif s_sn > 0.15: age_bracket = "46-55"
+                 else: age_bracket = "26-35"
+             except Exception: pass
+
+             final_gender = ai_gender
+             if spk in context_map: final_gender = context_map[spk]
+             elif avg_pitch > 0 and avg_pitch < 155 and ai_gender == "FEMALE": final_gender = "MALE"
+
+             role = next((r['role'] for r in transcript if r['speaker'] == spk), "UNKNOWN")
+             if role == "AGENT" and age_bracket == "18-25": age_bracket = "18-25 (Young Adult)"
+
+             profiles[spk] = {
+                 "gender": final_gender,
+                 "age_bracket": age_bracket,
+                 "debug_pitch": int(avg_pitch)
+             }
+         return profiles
+
+     def _analyze_customer_journey(self, results):
+         cust_segs = [r for r in results if r['role'] == "CUSTOMER"]
+         if not cust_segs:
+             return {"initial_emotion": "N/A", "final_emotion": "N/A", "impact_score": 0, "impact_label": "N/A"}
+
+         def get_mode(segs): return Counter([s['emotion'] for s in segs]).most_common(1)[0][0]
+
+         initial = get_mode(cust_segs[:5])
+         final = get_mode(cust_segs[-5:])
+
+         val_map = {"HAPPY": 2, "NEUTRAL": 1, "SAD": 0, "ANGRY": 0}
+         score_diff = val_map.get(final, 1) - val_map.get(initial, 1)
+
+         impact_label = "STANDARD"
+         if score_diff > 0: impact_label = "POSITIVE UPLIFT"
+         elif score_diff < 0: impact_label = "NEGATIVE CHURN RISK"
+
+         return {
+             "initial_emotion": initial,
+             "final_emotion": final,
+             "impact_score": score_diff,
+             "impact_label": impact_label
+         }
+
+     def _analyze_agent_kpi(self, results, impact_bonus):
+         segs = [r for r in results if r['role'] == "AGENT"]
+         if not segs: return {}
+
+         wpm = float(np.mean([s['tone']['wpm'] for s in segs]))
+         vol = float(np.mean([s['tone']['volume'] for s in segs]))
+         pitch = float(np.mean([s['tone']['pitch_hz'] for s in segs if s['tone']['pitch_hz'] > 0]))
+
+         score = 60
+
+         if 130 <= wpm <= 165: score += 15
+         elif wpm > 185 or wpm < 110: score -= 10
+
+         pitch_std = float(np.std([s['tone']['pitch_hz'] for s in segs if s['tone']['pitch_hz'] > 0]))
+         if pitch_std > 20: score += 10
+
+         emotions = [s['emotion'] for s in segs]
+         neg_ratio = (emotions.count("ANGRY") + emotions.count("SAD")) / len(segs)
+
+         sentiment = "NEUTRAL"
+         if neg_ratio > 0.1:
+             sentiment = "NEGATIVE"
+             score -= 20
+         elif emotions.count("HAPPY") > len(segs) * 0.2:
+             sentiment = "POSITIVE"
+             score += 10
+
+         engagement = "NORMAL"
+         if score > 75: engagement = "HIGH"
+         if score < 50: engagement = "LOW/WITHDRAWN"
+
+         score += (impact_bonus * 15)
+         score = max(0, min(100, int(score)))
+
+         return {
+             "overall_score": score,
+             "overall_emotion": Counter(emotions).most_common(1)[0][0],
+             "sentiment_level": sentiment,
+             "engagement_level": engagement,
+             "avg_pace_wpm": round(wpm, 1),
+             "avg_pitch_hz": round(pitch, 1),
+             "avg_volume": round(vol, 1)
+         }
+
+     def _is_silence(self, chunk): return np.sqrt(np.mean(chunk**2)) < 0.003
+
+     def _transcribe_chunk(self, chunk, sr):
+         if len(chunk) < sr:
+             pad = np.zeros(int(sr*0.5), dtype=np.float32)
+             chunk = np.concatenate([pad, chunk, pad])
+         try:
+             res = self.stt_model.transcribe(chunk.astype(np.float32), language="en", beam_size=5, temperature=0.0)
+             text = res['text'].strip()
+             if len(text) < 2 or text.lower() in ["you", "bye."]: return None
+             return text
+         except Exception: return None
+
+     def _detect_emotion(self, chunk):
+         try:
+             emotions = self.emotion_classifier(chunk.astype(np.float32), top_k=None)
+             scores = {e['label']: e['score'] for e in emotions}
+             if scores.get('ang', 0) > 0.25: return "ANGRY"
+             if scores.get('hap', 0) > 0.40: return "HAPPY"
+             if scores.get('sad', 0) > 0.40: return "SAD"
+             return "NEUTRAL"
+         except Exception: return "NEUTRAL"
+
+     def _calculate_tone_physics(self, chunk, sr, text):
+         rms = float(np.mean(librosa.feature.rms(y=chunk))) * 1000
+         f0 = librosa.yin(chunk.astype(np.float64), fmin=60, fmax=400, sr=sr)
+         f0 = f0[f0 > 0]
+         pitch = float(np.mean(f0)) if len(f0) > 0 else 0.0
+         wpm = int(len(text.split()) / ((len(chunk)/sr)/60))
+         return {"pitch_hz": round(pitch, 1), "volume": round(rms, 2), "wpm": wpm}
+
+     def _run_diarization(self, wav, sr):
+         segments = []
+         if not self.diarization_pipeline: return [{"start":0,"end":len(wav)/sr,"speaker":"SPEAKER_00"}]
+         try:
+             tensor = torch.from_numpy(wav).float().unsqueeze(0)
+             output = self.diarization_pipeline({"waveform": tensor, "sample_rate": sr})
+             dia = output.speaker_diarization if hasattr(output, "speaker_diarization") else output
+             for t, _, s in dia.itertracks(yield_label=True): segments.append({"start":t.start,"end":t.end,"speaker":s})
+         except Exception: segments = [{"start":0,"end":len(wav)/sr,"speaker":"SPEAKER_00"}]
+         return sorted(segments, key=lambda x: x['start']) if segments else [{"start":0,"end":len(wav)/sr,"speaker":"SPEAKER_00"}]
+
+     def _merge_segments(self, segments):
+         if not segments: return []
+         merged = [segments[0]]
+         for curr in segments[1:]:
+             if curr['speaker'] == merged[-1]['speaker'] and (curr['start'] - merged[-1]['end'] < 1.0):
+                 merged[-1]['end'] = curr['end']
+             else: merged.append(curr)
+         return merged
+
+     def _assign_roles(self, results):
+         if not results: return results
+         counts = Counter([r['speaker'] for r in results for _ in r['text'].split()])
+         if not counts: return results
+         agent = counts.most_common(1)[0][0]
+         for r in results: r['role'] = "AGENT" if r['speaker'] == agent else "CUSTOMER"
+         return results
+
+     def _flush_memory(self):
+         gc.collect()
+         torch.cuda.empty_cache()
+
+     def save_json(self, data, base):
+         fn = f"{base}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+         with open(fn, 'w', encoding='utf-8') as f:
+             json.dump(data, f, indent=4, ensure_ascii=False, cls=NumpyEncoder)
+         return fn
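The class can also be driven directly for offline or batch processing, without the FastAPI layer. A minimal sketch, assuming `HF_TOKEN` is exported in the environment and `call.wav` is a hypothetical local recording:

```python
import os

from pipeline import PlatinumAnalyticsPipeline

# Picks CUDA automatically when available, otherwise falls back to CPU (see __init__).
engine = PlatinumAnalyticsPipeline(hf_token=os.environ.get("HF_TOKEN"))

# "call.wav" is a hypothetical example file.
report = engine.process_call("call.wav")

# save_json appends a timestamp, e.g. call_analysis_YYYYMMDD_HHMMSS.json
out_file = engine.save_json(report, "call_analysis")
print(f"Wrote {out_file}")
```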
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ torch
+ torchaudio
+ torchvision
+ openai-whisper
+ pyannote.audio
+ transformers
+ accelerate
+ librosa
+ soundfile
+ numpy
+ fastapi
+ uvicorn
+ python-multipart