"""
ULTRA-ROBUST CALL CENTER ANALYTICS
===================================
✅ Multiple gender detection models with voting
✅ Best STT model (Whisper Large-v3 + optimizations)
✅ Enhanced for European accents
✅ Robust pitch analysis with multiple methods
✅ Production-grade accuracy
MODELS USED:
- STT: Whisper Large-v3 (best for accents)
- Gender: 3 models + voting system
- Age: Wav2Vec2 Large + validation
- Diarization: pyannote 3.1 (SOTA)
"""
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import os
import sys
import logging
import torch
import librosa
import whisper
import numpy as np
import warnings
import json
import gc
from collections import Counter, defaultdict
from pyannote.audio import Pipeline
from transformers import (
pipeline,
Wav2Vec2Processor,
Wav2Vec2ForSequenceClassification,
AutoModelForAudioClassification,
AutoFeatureExtractor
)
from datetime import datetime
from scipy import signal as scipy_signal
from scipy.stats import mode as scipy_mode
import parselmouth
from parselmouth.praat import call
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
logging.getLogger("pyannote").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.integer): return int(obj)
if isinstance(obj, np.floating): return float(obj)
if isinstance(obj, np.ndarray): return obj.tolist()
return super(NumpyEncoder, self).default(obj)
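# NumpyEncoder is meant to be passed as the `cls` argument of json.dumps so that numpy
# scalars/arrays inside the report serialize cleanly, e.g. (illustrative):
#     json.dumps({"score": np.float32(0.87)}, cls=NumpyEncoder)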
class UltraRobustCallAnalytics:
def __init__(self, hf_token=None, device=None):
        # Select the compute device before loading models or flushing memory
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"🚀 Initializing ULTRA-ROBUST Analytics Engine on {self.device}...")
        print("="*70)
        # Free any stale GPU memory before the heavy model loads below
        self._flush_memory()
        # ===== BEST STT MODEL: Whisper Large-v3 =====
        try:
            print(" → Loading Whisper Large-v3 (BEST for accents)...")
            self.stt_model = whisper.load_model("large-v3", device=self.device)
            self.stt_model_name = "large-v3"
            print(" ✓ Whisper Large-v3 loaded")
        except Exception:
            print(" ⚠ Falling back to Large-v2...")
            try:
                self.stt_model = whisper.load_model("large-v2", device=self.device)
                self.stt_model_name = "large-v2"
                print(" ✓ Whisper Large-v2 loaded")
            except Exception:
                print(" ⚠ Final fallback to Medium...")
                self.stt_model = whisper.load_model("medium", device=self.device)
                self.stt_model_name = "medium"
                print(" ✓ Whisper Medium loaded")
# ===== DIARIZATION =====
self.diarization_pipeline = None
if hf_token:
print(f" β†’ Attempting to load Pyannote with token starting: {hf_token[:4]}...")
# Universal Loader: Tries 'token' (New) then 'use_auth_token' (Old)
try:
# Attempt 1: New Syntax
self.diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
token=hf_token
).to(torch.device(self.device))
print(" βœ“ Diarization loaded (New Syntax)")
except TypeError:
# Attempt 2: Old Syntax (Fallback)
print(" ⚠ New syntax failed, trying legacy syntax...")
try:
self.diarization_pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
use_auth_token=hf_token
).to(torch.device(self.device))
print(" βœ“ Diarization loaded (Legacy Syntax)")
except Exception as e:
print(f" ❌ CRITICAL PYANNOTE ERROR (Legacy): {e}")
except Exception as e:
print(f" ❌ CRITICAL PYANNOTE ERROR: {e}")
# ===== EMOTION CLASSIFIER =====
print(" β†’ Loading emotion classifier...")
self.emotion_classifier = pipeline(
"audio-classification",
model="superb/wav2vec2-base-superb-er",
device=0 if self.device == "cuda" else -1
)
print(" βœ“ Emotion classifier loaded")
# ===== MULTIPLE GENDER MODELS FOR VOTING =====
print("\n β†’ Loading MULTIPLE gender detection models...")
self.gender_models = {}
# Model 1: Age-Gender (Primary)
try:
print(" Loading Gender Model 1: audeering/wav2vec2-large...")
self.ag_model_name = "audeering/wav2vec2-large-robust-24-ft-age-gender"
self.ag_processor = Wav2Vec2Processor.from_pretrained(self.ag_model_name)
self.ag_model = Wav2Vec2ForSequenceClassification.from_pretrained(self.ag_model_name)
self.ag_model.to(self.device).eval()
self.gender_models['audeering'] = {
'processor': self.ag_processor,
'model': self.ag_model
}
print(" βœ“ Model 1 loaded")
except Exception as e:
print(f" βœ— Model 1 failed: {e}")
# Model 2: Alefiury Gender Classifier
try:
print(" Loading Gender Model 2: alefiury/wav2vec2-large-xlsr-53-gender...")
model2_name = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
processor2 = AutoFeatureExtractor.from_pretrained(model2_name)
model2 = AutoModelForAudioClassification.from_pretrained(model2_name)
model2.to(self.device).eval()
self.gender_models['alefiury'] = {
'processor': processor2,
'model': model2
}
print(" βœ“ Model 2 loaded")
except Exception as e:
print(f" βœ— Model 2 failed: {e}")
        # Model 3: MIT AST fine-tuned on Speech Commands v2. Note: its label set contains
        # no "male"/"female" classes, so the voting code below skips it unless a
        # gender-like label is present.
try:
print(" Loading Gender Model 3: MIT/ast-finetuned-speech-commands...")
model3_name = "MIT/ast-finetuned-speech-commands-v2"
processor3 = AutoFeatureExtractor.from_pretrained(model3_name)
model3 = AutoModelForAudioClassification.from_pretrained(model3_name)
model3.to(self.device).eval()
self.gender_models['mit'] = {
'processor': processor3,
'model': model3
}
print(" βœ“ Model 3 loaded")
except Exception as e:
print(f" βœ— Model 3 failed: {e}")
print(f" βœ“ Loaded {len(self.gender_models)} gender detection models")
print("\n" + "="*70)
print("βœ… Engine initialized successfully")
print("="*70 + "\n")
print(" β†’ Loading KeyBERT for keyword extraction...")
try:
self.keyword_model = KeyBERT('all-MiniLM-L6-v2')
print(" βœ“ Keyword extractor loaded")
except Exception as e:
print(f" ⚠ Keyword model failed: {e}")
self.keyword_model = None
print(" β†’ Loading zero-shot topic classifier...")
try:
self.topic_classifier = pipeline(
"zero-shot-classification",
model="facebook/bart-large-mnli",
device=0 if self.device == "cuda" else -1
)
self.topic_labels = [
"billing_payment",
"technical_support",
"product_inquiry",
"complaint_issue",
"account_management",
"sales_marketing",
"service_cancellation",
"feedback_survey",
"appointment_scheduling",
"general_inquiry"
]
print(" βœ“ Topic classifier loaded")
except Exception as e:
print(f" ⚠ Topic classifier failed: {e}")
self.topic_classifier = None
def process_call(self, audio_path):
"""Main processing with maximum robustness"""
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")
self._flush_memory()
print(f"πŸ“ Processing: {audio_path}")
print("="*70)
# Load and preprocess
wav, sr = librosa.load(audio_path, sr=16000, mono=True)
wav = wav.astype(np.float32)
# Audio enhancement for call center quality
wav = self._enhance_audio_for_callcenter(wav, sr)
duration = len(wav) / sr
print(f" βœ“ Audio loaded: {duration:.1f}s @ {sr}Hz")
# Enhanced diarization
print("\n β†’ Running enhanced diarization...")
segments = self._run_enhanced_diarization(wav, sr, audio_path)
print(f" βœ“ Found {len(set(s['speaker'] for s in segments))} speakers, {len(segments)} segments")
# Smart merging
merged = self._merge_segments_smart(segments, min_gap=0.25)
print(f" βœ“ Merged to {len(merged)} segments")
# Process segments
results = []
spk_audio_buffer = defaultdict(list)
        pad = int(0.1 * sr)  # 100 ms of context padding around each segment
print("\n β†’ Transcribing with Whisper Large-v3...")
for i, seg in enumerate(merged):
seg_duration = seg['end'] - seg['start']
if seg_duration < 0.1:
continue
start_idx = max(0, int(seg['start'] * sr) - pad)
end_idx = min(len(wav), int(seg['end'] * sr) + pad)
chunk = wav[start_idx:end_idx]
if self._is_silence(chunk):
continue
# Collect audio for biometrics
if seg_duration > 0.4:
spk_audio_buffer[seg['speaker']].append(chunk)
# ENHANCED TRANSCRIPTION
text = self._transcribe_chunk_robust(chunk, sr)
if not text:
continue
emotion = self._detect_emotion(chunk)
sentiment = self._map_emotion_to_sentiment(emotion)
speech_rate = self._calculate_speech_rate(text, seg_duration)
keywords = self._extract_keywords(text, top_n=5)
topic = self._classify_topic(text)
results.append({
"segment_id": i + 1,
"start": float(f"{seg['start']:.2f}"),
"end": float(f"{seg['end']:.2f}"),
"duration": float(f"{seg_duration:.2f}"),
"speaker": seg['speaker'],
"role": "UNKNOWN",
"text": text,
"emotion": emotion,
"sentiment": sentiment, # NEW
"speech_rate": speech_rate, # NEW
"keywords": keywords, # NEW
"topic": topic, # NEW
"tone": self._calculate_tone_advanced(chunk, sr, text)
})
if (i + 1) % 10 == 0:
print(f" Processed {i + 1}/{len(merged)} segments...")
print(f" βœ“ Transcribed {len(results)} segments with text")
# Assign roles
print("\n β†’ Assigning speaker roles...")
results = self._assign_roles_smart(results)
identification = {}
for r in results:
identification[r['speaker']] = r['role']
print(f" βœ“ Roles: {identification}")
# ULTRA-ROBUST BIOMETRICS WITH VOTING
print("\n β†’ Analyzing biometrics with multi-model voting...")
biometrics = self._analyze_biometrics_ultra_robust(spk_audio_buffer, results, wav, sr)
for spk, bio in biometrics.items():
print(f" {spk}: {bio['gender']} (confidence: {bio['gender_confidence']:.2f}), {bio['age_bracket']}")
# Customer journey
print("\n β†’ Analyzing customer journey...")
cust_metrics = self._analyze_customer_journey(results)
print(f" βœ“ Journey: {cust_metrics['emotional_arc']}")
# Agent KPI
print("\n β†’ Analyzing agent performance...")
agent_metrics = self._analyze_agent_kpi(results, cust_metrics['impact_score'])
print(f" βœ“ Agent score: {agent_metrics.get('overall_score', 'N/A')}/100")
# Compile output
call_summary = self._aggregate_call_insights(results)
final_output = {
"metadata": {
"file": os.path.basename(audio_path),
"duration_seconds": float(f"{duration:.2f}"),
"sample_rate": sr,
"total_segments": len(results),
"stt_model": self.stt_model_name,
"gender_models_used": len(self.gender_models),
"speakers": biometrics,
"call_summary": call_summary # NEW
},
"identification": identification,
"agent_metrics": agent_metrics,
"customer_metrics": cust_metrics,
"transcript": results
}
self._flush_memory()
print("\n" + "="*70)
print("βœ… Processing complete")
print("="*70 + "\n")
return final_output
def _enhance_audio_for_callcenter(self, wav, sr):
"""Enhance audio quality for better transcription"""
# 1. Normalize
wav = wav / (np.max(np.abs(wav)) + 1e-7)
# 2. High-pass filter to remove low-frequency noise
try:
sos = scipy_signal.butter(4, 80, 'hp', fs=sr, output='sos')
wav = scipy_signal.sosfilt(sos, wav)
except:
pass
# 3. Gentle compression to balance volume
wav = np.sign(wav) * np.log1p(np.abs(wav) * 10) / np.log1p(10)
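        # log1p soft compression maps 0 -> 0 and 1 -> 1 while lifting quieter samples
        # (e.g. 0.1 -> ~0.29), evening out level differences between the two speakers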
return wav.astype(np.float32)
def _transcribe_chunk_robust(self, chunk, sr):
"""
ULTRA-ROBUST TRANSCRIPTION
Optimized for:
- European accents
- Call center quality
- Background noise
"""
# Ensure minimum length
if len(chunk) < sr * 0.3:
pad = np.zeros(int(sr * 0.5), dtype=np.float32)
chunk = np.concatenate([pad, chunk, pad])
try:
# BEST SETTINGS FOR CALL CENTER + EUROPEAN ACCENTS
result = self.stt_model.transcribe(
chunk.astype(np.float32),
language="en", # English only
task="transcribe",
# Quality settings
beam_size=5, # Higher = more accurate but slower
best_of=5, # Sample best of 5 runs
temperature=0.0, # Deterministic
# Accent handling
condition_on_previous_text=True, # Use context
# Noise handling
compression_ratio_threshold=2.4, # More lenient
logprob_threshold=-1.0, # More lenient
no_speech_threshold=0.6, # Standard
# Speed vs accuracy
fp16=(self.device == "cuda"), # Use FP16 on GPU
# Word timestamps for quality check
word_timestamps=True
)
text = result['text'].strip()
# Quality filters
if len(text) < 2:
return None
# Filter garbage
garbage = ["you", "thank you", ".", "...", "bye", "okay"]
if text.lower() in garbage:
return None
            # Check that it looks like actual speech (contains at least one vowel)
if not any(c in text.lower() for c in 'aeiou'):
return None
            # Check word-level confidence if available. In openai-whisper the word
            # timestamps live inside result['segments'], not at the top level.
            word_probs = [
                w.get('probability', 1.0)
                for seg in result.get('segments', [])
                for w in seg.get('words', [])
            ]
            if word_probs and np.mean(word_probs) < 0.3:  # Very low confidence
                return None
return text
except Exception as e:
print(f" ⚠ Transcription error: {e}")
return None
def _analyze_biometrics_ultra_robust(self, audio_buffer, transcript, full_wav, sr):
"""
ULTRA-ROBUST GENDER DETECTION
Uses multiple models + voting + pitch + conversation context
"""
profiles = {}
# Collect conversation context
context_gender = self._extract_gender_from_conversation(transcript)
for spk, chunks in audio_buffer.items():
if not chunks:
continue
print(f"\n Analyzing {spk}...")
# Concatenate audio (max 15 seconds from different parts)
raw_audio = self._prepare_audio_for_analysis(chunks, sr)
# ===== METHOD 1: ADVANCED PITCH ANALYSIS =====
pitch_gender, pitch_confidence, pitch_stats = self._analyze_pitch_robust(raw_audio, sr, full_wav, transcript, spk)
print(f" Pitch analysis: {pitch_gender} (conf: {pitch_confidence:.2f})")
# ===== METHOD 2: MULTI-MODEL AI VOTING =====
ai_gender, ai_confidence, all_predictions = self._multi_model_gender_detection(raw_audio, sr)
print(f" AI models: {ai_gender} (conf: {ai_confidence:.2f})")
print(f" Individual: {all_predictions}")
# ===== METHOD 3: CONVERSATION CONTEXT =====
context_gend = context_gender.get(spk, "UNKNOWN")
print(f" Context clues: {context_gend}")
# ===== METHOD 4: FORMANT ANALYSIS =====
formant_gender, formant_confidence = self._analyze_formants(raw_audio, sr)
print(f" Formant analysis: {formant_gender} (conf: {formant_confidence:.2f})")
# ===== VOTING SYSTEM WITH CONFIDENCE WEIGHTING =====
votes = []
# Context vote (HIGHEST priority if available)
if context_gend != "UNKNOWN":
votes.extend([context_gend] * 4) # 4 votes for context
# Pitch vote (HIGH priority)
if pitch_confidence > 0.6:
votes.extend([pitch_gender] * 3) # 3 votes for confident pitch
elif pitch_confidence > 0.4:
votes.append(pitch_gender) # 1 vote for moderate pitch
# AI models vote (MEDIUM priority)
if ai_confidence > 0.7:
votes.extend([ai_gender] * 2) # 2 votes for confident AI
elif ai_confidence > 0.5:
votes.append(ai_gender) # 1 vote for moderate AI
# Formant vote (MEDIUM priority)
if formant_confidence > 0.6:
votes.extend([formant_gender] * 2)
elif formant_confidence > 0.4:
votes.append(formant_gender)
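            # Worked example of the weighting: context=MALE (4 votes), pitch=MALE@0.70
            # (3 votes), AI=FEMALE@0.75 (2 votes), formants=FEMALE@0.50 (1 vote)
            # -> MALE wins 7 of 10 votes, final confidence 0.70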
# Count votes
if votes:
vote_counts = Counter(votes)
final_gender = vote_counts.most_common(1)[0][0]
total_votes = len(votes)
winning_votes = vote_counts[final_gender]
final_confidence = winning_votes / total_votes
else:
# Fallback
final_gender = ai_gender if ai_confidence > 0.5 else "UNKNOWN"
final_confidence = ai_confidence
print(f" FINAL: {final_gender} (confidence: {final_confidence:.2f})")
print(f" Vote breakdown: {dict(Counter(votes))}")
# ===== AGE DETECTION =====
age_bracket = self._detect_age_robust(raw_audio, sr, pitch_stats)
# Get role
role = [r['role'] for r in transcript if r['speaker'] == spk]
role = role[0] if role else "UNKNOWN"
profiles[spk] = {
"gender": final_gender,
"gender_confidence": round(final_confidence, 2),
"gender_methods": {
"context": context_gend,
"pitch": f"{pitch_gender} ({pitch_confidence:.2f})",
"ai_models": f"{ai_gender} ({ai_confidence:.2f})",
"formants": f"{formant_gender} ({formant_confidence:.2f})",
"vote_breakdown": dict(Counter(votes))
},
"age_bracket": age_bracket,
"voice_stats": {
"avg_pitch_hz": pitch_stats['mean'],
"pitch_range": f"{pitch_stats['min']:.0f}-{pitch_stats['max']:.0f}Hz",
"pitch_std": pitch_stats['std']
}
}
return profiles
def _prepare_audio_for_analysis(self, chunks, sr, max_duration=15):
"""Prepare audio by taking samples from different parts"""
raw = np.concatenate(chunks)
# Take samples from beginning, middle, end
if len(raw) > sr * max_duration:
segment_len = sr * 5 # 5 seconds each
total_len = len(raw)
samples = []
# Beginning
samples.append(raw[:segment_len])
# Middle
mid_start = (total_len // 2) - (segment_len // 2)
samples.append(raw[mid_start:mid_start + segment_len])
# End
samples.append(raw[-segment_len:])
raw = np.concatenate(samples)
# Normalize
raw = raw - np.mean(raw)
std = np.std(raw)
if std > 1e-7:
raw = raw / std
return raw
def _analyze_pitch_robust(self, audio, sr, full_wav, transcript, speaker):
"""Advanced pitch analysis using multiple methods"""
# Collect all pitch values from transcript
transcript_pitches = [
t['tone']['pitch_hz']
for t in transcript
if t['speaker'] == speaker and t['tone']['pitch_hz'] > 60
]
# Method 1: YIN algorithm
try:
f0_yin = librosa.yin(audio.astype(np.float64), fmin=60, fmax=400, sr=sr)
f0_yin_valid = f0_yin[f0_yin > 0]
except:
f0_yin_valid = []
# Method 2: PYIN (probabilistic YIN)
try:
f0_pyin, voiced_flag, voiced_probs = librosa.pyin(
audio.astype(np.float64),
fmin=60,
fmax=400,
sr=sr
)
f0_pyin_valid = f0_pyin[~np.isnan(f0_pyin)]
except:
f0_pyin_valid = []
# Combine all pitch measurements
all_pitches = []
if len(f0_yin_valid) > 0:
all_pitches.extend(f0_yin_valid)
if len(f0_pyin_valid) > 0:
all_pitches.extend(f0_pyin_valid)
if len(transcript_pitches) > 0:
all_pitches.extend(transcript_pitches)
if len(all_pitches) == 0:
return "UNKNOWN", 0.0, {'mean': 0, 'std': 0, 'min': 0, 'max': 0}
# Calculate statistics
mean_pitch = np.mean(all_pitches)
std_pitch = np.std(all_pitches)
min_pitch = np.min(all_pitches)
max_pitch = np.max(all_pitches)
pitch_stats = {
'mean': round(mean_pitch, 1),
'std': round(std_pitch, 1),
'min': round(min_pitch, 1),
'max': round(max_pitch, 1)
}
# Gender classification with refined thresholds
# Research-based ranges:
# Male: 85-180 Hz (average ~120 Hz)
# Female: 165-255 Hz (average ~210 Hz)
if mean_pitch < 150:
gender = "MALE"
# Confidence based on how far below 150
confidence = min(1.0, (150 - mean_pitch) / 40)
elif mean_pitch > 180:
gender = "FEMALE"
# Confidence based on how far above 180
confidence = min(1.0, (mean_pitch - 180) / 40)
else:
# Ambiguous range (150-180 Hz)
if mean_pitch < 165:
gender = "MALE"
confidence = 0.5
else:
gender = "FEMALE"
confidence = 0.5
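        # Example: mean_pitch = 130 Hz -> MALE with confidence min(1.0, (150-130)/40) = 0.5;
        # mean_pitch = 210 Hz -> FEMALE with confidence min(1.0, (210-180)/40) = 0.75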
return gender, confidence, pitch_stats
def _multi_model_gender_detection(self, audio, sr):
"""Run multiple AI models and aggregate predictions"""
predictions = []
confidences = []
for model_name, model_dict in self.gender_models.items():
try:
processor = model_dict['processor']
model = model_dict['model']
# Prepare inputs
inputs = processor(
audio,
sampling_rate=sr,
return_tensors="pt",
padding=True
).to(self.device)
# Predict
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
# Extract gender prediction
labels = model.config.id2label
# Find male/female labels (different models use different names)
male_score = 0
female_score = 0
for idx, label in labels.items():
label_lower = label.lower()
if 'male' in label_lower and 'female' not in label_lower:
male_score = max(male_score, probs[idx])
elif 'female' in label_lower:
female_score = max(female_score, probs[idx])
                # Skip models whose label set contains no gender classes at all
                # (e.g. a keyword-spotting model); otherwise they would default to FEMALE
                if male_score == 0 and female_score == 0:
                    continue
                if male_score > female_score:
                    predictions.append("MALE")
                    confidences.append(male_score)
                else:
                    predictions.append("FEMALE")
                    confidences.append(female_score)
except Exception as e:
print(f" Model {model_name} error: {e}")
continue
if not predictions:
return "UNKNOWN", 0.0, {}
# Aggregate predictions
pred_counter = Counter(predictions)
majority_vote = pred_counter.most_common(1)[0][0]
# Calculate confidence
majority_indices = [i for i, p in enumerate(predictions) if p == majority_vote]
avg_confidence = np.mean([confidences[i] for i in majority_indices])
# Individual predictions
individual = {
f"model_{i+1}": f"{pred} ({conf:.2f})"
for i, (pred, conf) in enumerate(zip(predictions, confidences))
}
return majority_vote, float(avg_confidence), individual
def _extract_gender_from_conversation(self, transcript):
"""Extract gender clues from conversation"""
context_map = {}
# Extended keyword lists
male_keywords = [
"sir", "mr.", "mister", "mr ", "gentleman", "he", "him", "his",
"man", "guy", "male", "father", "dad", "son", "brother", "husband"
]
female_keywords = [
"ma'am", "miss", "mrs", "mrs.", "madam", "madame", "ms", "ms.",
"she", "her", "hers", "woman", "lady", "female", "mother", "mom",
"daughter", "sister", "wife"
]
for line in transcript:
if line['role'] == "AGENT":
txt = line['text'].lower()
# Find who agent is talking to
customers = [x['speaker'] for x in transcript if x['role'] == "CUSTOMER"]
if not customers:
continue
target = customers[0]
                # Check for keywords on word boundaries, so that e.g. "the" or "she"
                # does not spuriously match the male keyword "he"
                tokens = {w.strip(".,!?'\"") for w in txt.split()}
                if any(k.strip(". ") in tokens for k in male_keywords):
                    context_map[target] = "MALE"
                elif any(k.strip(". ") in tokens for k in female_keywords):
                    context_map[target] = "FEMALE"
return context_map
def _analyze_formants(self, audio, sr):
"""Analyze formant frequencies (F1, F2) for gender detection"""
try:
# Use Praat for formant analysis
import parselmouth
from parselmouth.praat import call
snd = parselmouth.Sound(audio, sampling_frequency=sr)
formant = snd.to_formant_burg()
# Extract F1 and F2 for voiced segments
f1_values = []
f2_values = []
duration = snd.get_total_duration()
time_step = 0.01 # 10ms steps
for t in np.arange(0, duration, time_step):
f1 = formant.get_value_at_time(1, t)
f2 = formant.get_value_at_time(2, t)
if not np.isnan(f1) and not np.isnan(f2):
f1_values.append(f1)
f2_values.append(f2)
if len(f1_values) < 10:
return "UNKNOWN", 0.0
avg_f1 = np.mean(f1_values)
avg_f2 = np.mean(f2_values)
            # Gender heuristic on average formant values (F1/F2). These cutoffs are
            # tuned for this pipeline rather than textbook vowel formant averages;
            # F2 is treated as the more reliable separator, hence the fallback below.
if avg_f1 < 170 and avg_f2 < 1650:
gender = "MALE"
confidence = 0.7
elif avg_f1 > 190 and avg_f2 > 1750:
gender = "FEMALE"
confidence = 0.7
else:
# Use F2 as primary indicator
if avg_f2 < 1600:
gender = "MALE"
else:
gender = "FEMALE"
confidence = 0.5
return gender, confidence
except ImportError:
return "UNKNOWN", 0.0
except Exception as e:
return "UNKNOWN", 0.0
def _detect_age_robust(self, audio, sr, pitch_stats):
"""Robust age detection"""
try:
if 'audeering' not in self.gender_models:
return "26-35" # Default
processor = self.gender_models['audeering']['processor']
model = self.gender_models['audeering']['model']
inputs = processor(audio, sampling_rate=sr, return_tensors="pt").to(self.device)
with torch.no_grad():
logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
# Map labels to age buckets (aggregating across genders)
# Labels usually look like: 'female_20-29', 'male_20-29', etc.
labels = model.config.id2label
age_scores = defaultdict(float)
for i, score in enumerate(probs):
label = labels[i]
# Extract age part (assuming format gender_age)
parts = label.split('_')
if len(parts) > 1:
age_group = parts[-1] # e.g., "20-29"
age_scores[age_group] += score
# Get best age bracket
if age_scores:
best_age = max(age_scores, key=age_scores.get)
return best_age
return "UNKNOWN"
except Exception as e:
print(f" ⚠ Age detection failed: {e}")
return "UNKNOWN"
def _run_enhanced_diarization(self, wav, sr, file_path):
"""
Run Pyannote diarization or fallback to simple segmentation
"""
if self.diarization_pipeline is None:
print(" ⚠ No auth token provided, using energy-based fallback segmentation")
return self._energy_based_segmentation(wav, sr)
try:
# Run pipeline
diarization = self.diarization_pipeline(file_path, min_speakers=2, max_speakers=2)
segments = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
segments.append({
"start": turn.start,
"end": turn.end,
"speaker": speaker
})
return segments
except Exception as e:
print(f" ⚠ Diarization error: {e}, using fallback")
return self._energy_based_segmentation(wav, sr)
def _energy_based_segmentation(self, wav, sr):
"""Fallback if deep learning diarization fails"""
# Simple energy detection to split speech from silence
# Treating as single speaker (SPEAKER_00)
intervals = librosa.effects.split(wav, top_db=30)
segments = []
for start, end in intervals:
segments.append({
"start": start / sr,
"end": end / sr,
"speaker": "SPEAKER_00"
})
return segments
def _merge_segments_smart(self, segments, min_gap=0.5):
"""Merge segments from same speaker that are close together"""
if not segments:
return []
merged = []
current = segments[0]
for next_seg in segments[1:]:
# If same speaker and gap is small
if (next_seg['speaker'] == current['speaker'] and
(next_seg['start'] - current['end']) < min_gap):
# Extend current segment
current['end'] = next_seg['end']
else:
merged.append(current)
current = next_seg
merged.append(current)
return merged
def _is_silence(self, chunk, threshold=0.005):
"""Check if audio chunk is essentially silence"""
return np.max(np.abs(chunk)) < threshold
def _detect_emotion(self, chunk):
"""Detect emotion from audio chunk"""
try:
# Ensure chunk is long enough for model
if len(chunk) < 16000 * 0.5:
return "neutral"
# Use the pipeline loaded in init
# Note: Pipeline expects file path or numpy array
preds = self.emotion_classifier(chunk, top_k=1)
return preds[0]['label']
except:
return "neutral"
def _calculate_tone_advanced(self, chunk, sr, text):
"""
Calculate pitch, jitter, and shimmer using Parselmouth (Praat)
"""
try:
if len(chunk) < sr * 0.1:
return {"pitch_hz": 0, "jitter": 0, "shimmer": 0}
snd = parselmouth.Sound(chunk, sampling_frequency=sr)
# Pitch
pitch = snd.to_pitch()
pitch_val = pitch.selected_array['frequency']
pitch_val = pitch_val[pitch_val != 0]
avg_pitch = np.mean(pitch_val) if len(pitch_val) > 0 else 0
# Pulses for Jitter/Shimmer
point_process = call(snd, "To PointProcess (periodic, cc)", 75, 500)
try:
jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
except:
jitter = 0
try:
shimmer = call([snd, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
except:
shimmer = 0
return {
"pitch_hz": round(float(avg_pitch), 1),
"jitter": round(float(jitter * 100), 2), # percentage
"shimmer": round(float(shimmer * 100), 2) # db
}
except:
return {"pitch_hz": 0, "jitter": 0, "shimmer": 0}
def _assign_roles_smart(self, results):
"""
Assign AGENT vs CUSTOMER roles using Golden Phrases and Verbosity.
"""
speakers = list(set(r['speaker'] for r in results))
if len(speakers) == 1:
# If only one speaker found, assume it's the Agent monologuing
for r in results: r['role'] = "AGENT"
return results
speaker_scores = defaultdict(int)
word_counts = defaultdict(int)
# 1. GOLDEN PHRASES (Almost 100% guarantee of Agent)
# These override normal scoring
golden_agent_phrases = [
"my name is", "this is steve", "this is sam", "this is mike", # Common names
"calling from", "on a recorded line", "green solutions",
"energy solutions", "federal government", "rebate program"
]
# 2. STANDARD SCORING KEYWORDS
agent_keywords = [
"manager", "supervisor", "qualified", "eligible",
"whatsapp", "ping you", "verification", "consumption"
]
customer_keywords = [
"who is this", "stop calling", "not interested",
"take me off", "do not call", "why are you asking"
]
agent_found_via_golden = None
for res in results:
text = res['text'].lower()
spk = res['speaker']
# Count words for verbosity check
words = text.split()
word_counts[spk] += len(words)
# Check Golden Phrases (Instant Win)
if agent_found_via_golden is None:
for phrase in golden_agent_phrases:
if phrase in text:
print(f" β˜… Golden Phrase found for {spk}: '{phrase}'")
agent_found_via_golden = spk
break
# Standard Scoring
if any(k in text for k in agent_keywords):
speaker_scores[spk] += 2
if any(k in text for k in customer_keywords):
speaker_scores[spk] -= 3 # Strong negative for objections
# 3. DECISION LOGIC
final_agent = None
if agent_found_via_golden:
# If we found a golden phrase, trust it implicitly
final_agent = agent_found_via_golden
else:
# Fallback: Verbosity Check (Agent usually talks more)
# Get speaker with max words
talkative_spk = max(word_counts, key=word_counts.get)
total_words = sum(word_counts.values())
# If one speaker dominates >60% of conversation, likely the agent
if word_counts[talkative_spk] / max(1, total_words) > 0.60:
speaker_scores[talkative_spk] += 5
            # Validate scores; if nothing scored at all, default to the most talkative speaker
            if speaker_scores:
                final_agent = max(speaker_scores, key=speaker_scores.get)
            else:
                final_agent = talkative_spk
# 4. ASSIGN ROLES
print(f" βœ“ Role Assignment: Identified {final_agent} as AGENT")
identification = {}
for res in results:
if res['speaker'] == final_agent:
res['role'] = "AGENT"
else:
res['role'] = "CUSTOMER"
identification[res['speaker']] = res['role']
return results
def _analyze_customer_journey(self, results):
"""Analyze sentiment flow of the customer"""
cust_segments = [r for r in results if r['role'] == "CUSTOMER"]
if not cust_segments:
return {"emotional_arc": "No customer audio", "impact_score": 0}
# Map emotions to scores
emo_map = {
"happy": 1.0, "joy": 1.0, "neutral": 0.1,
"sad": -0.5, "angry": -1.0, "frustrated": -1.0
}
start_score = sum(emo_map.get(s['emotion'], 0) for s in cust_segments[:3]) / min(3, len(cust_segments))
end_score = sum(emo_map.get(s['emotion'], 0) for s in cust_segments[-3:]) / min(3, len(cust_segments))
impact = end_score - start_score
if impact > 0.2: arc = "Positive Resolution"
elif impact < -0.2: arc = "Negative Escalation"
else: arc = "Neutral/Unresolved"
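        # Example: first three emotions [angry, angry, neutral] -> start = -0.63,
        # last three [neutral, happy, happy] -> end = 0.70, impact = 1.33 -> "Positive Resolution"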
return {
"emotional_arc": arc,
"start_sentiment": round(start_score, 2),
"end_sentiment": round(end_score, 2),
"impact_score": round(impact, 2)
}
def _analyze_agent_kpi(self, results, customer_impact):
"""Calculate Agent performance metrics"""
agent_segments = [r for r in results if r['role'] == "AGENT"]
if not agent_segments:
return {"overall_score": 0}
# 1. Politeness (Keyword based)
polite_words = ["please", "thank", "sorry", "apologize", "appreciate"]
total_words = sum(len(s['text'].split()) for s in agent_segments)
polite_count = sum(1 for s in agent_segments if any(w in s['text'].lower() for w in polite_words))
politeness_score = min(100, (polite_count / max(1, len(agent_segments))) * 200)
# 2. Tone Consistency (Jitter/Shimmer variance)
jitter_vals = [s['tone']['jitter'] for s in agent_segments]
tone_stability = 100 - min(100, np.std(jitter_vals) * 10) if jitter_vals else 50
# 3. Resolution Impact (from customer journey)
# Map -1.0 to 1.0 range -> 0 to 100
resolution_score = 50 + (customer_impact * 50)
resolution_score = max(0, min(100, resolution_score))
# Overall Weighted Score
overall = (
(politeness_score * 0.3) +
(tone_stability * 0.2) +
(resolution_score * 0.5)
)
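        # Example: politeness 80, stability 70, resolution 60 -> 0.3*80 + 0.2*70 + 0.5*60 = 68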
return {
"overall_score": int(overall),
"politeness": int(politeness_score),
"tone_stability": int(tone_stability),
"resolution_effectiveness": int(resolution_score)
}
def _flush_memory(self):
"""Aggressive memory cleanup"""
gc.collect()
if self.device == "cuda":
torch.cuda.empty_cache()
def _map_emotion_to_sentiment(self, emotion):
"""Map emotion labels to sentiment with polarity score"""
emotion_lower = emotion.lower()
positive_emotions = {
'happy': 0.8, 'joy': 0.9, 'excited': 0.85,
'pleased': 0.7, 'satisfied': 0.75, 'content': 0.6
}
negative_emotions = {
'sad': -0.6, 'angry': -0.9, 'frustrated': -0.8,
'annoyed': -0.7, 'disappointed': -0.65, 'upset': -0.75
}
if emotion_lower in positive_emotions:
return {
"sentiment": "positive",
"polarity_score": positive_emotions[emotion_lower],
"confidence": "high"
}
if emotion_lower in negative_emotions:
return {
"sentiment": "negative",
"polarity_score": negative_emotions[emotion_lower],
"confidence": "high"
}
return {
"sentiment": "neutral",
"polarity_score": 0.0,
"confidence": "medium"
}
def _calculate_speech_rate(self, text, duration_seconds):
"""Calculate words per minute (WPM) and classify pace"""
if duration_seconds < 0.1:
return {"wpm": 0, "word_count": 0, "speech_pace": "unknown"}
words = text.split()
word_count = len(words)
wpm = (word_count / (duration_seconds / 60.0)) if duration_seconds > 0 else 0
if wpm < 100: pace = "slow"
elif wpm < 140: pace = "normal"
elif wpm < 180: pace = "fast"
else: pace = "very_fast"
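        # Example: 25 words in 12 s -> 25 / (12 / 60) = 125 WPM -> "normal"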
return {
"wpm": round(wpm, 1),
"word_count": word_count,
"speech_pace": pace
}
def _extract_keywords(self, text, top_n=5):
"""Extract keywords/keyphrases using KeyBERT"""
if self.keyword_model is None or len(text.split()) < 3:
return []
try:
keywords = self.keyword_model.extract_keywords(
text,
keyphrase_ngram_range=(1, 2),
stop_words='english',
top_n=top_n,
use_maxsum=True,
nr_candidates=20
)
return [
{"keyword": kw[0], "relevance": round(float(kw[1]), 3)}
for kw in keywords
]
except:
return []
def _classify_topic(self, text):
"""Classify text into call center topics"""
if self.topic_classifier is None or len(text.split()) < 5:
return {"topic": "unknown", "confidence": 0.0}
try:
result = self.topic_classifier(text, self.topic_labels, multi_label=False)
return {
"topic": result['labels'][0],
"confidence": round(float(result['scores'][0]), 3),
"top_3_topics": [
{"topic": label, "score": round(float(score), 3)}
for label, score in zip(result['labels'][:3], result['scores'][:3])
]
}
except:
return {"topic": "unknown", "confidence": 0.0}
def _aggregate_call_insights(self, results):
"""Aggregate keywords and topics at call level"""
if not results:
return {"top_keywords": [], "primary_topic": {"topic": "unknown"}}
all_keywords = {}
for seg in results:
if 'keywords' in seg:
for kw in seg['keywords']:
keyword = kw['keyword']
score = kw['relevance']
all_keywords[keyword] = max(all_keywords.get(keyword, 0), score)
top_keywords = [
{"keyword": k, "relevance": round(v, 3)}
for k, v in sorted(all_keywords.items(), key=lambda x: x[1], reverse=True)[:10]
]
# Aggregate topics
topic_votes = defaultdict(float)
for seg in results:
if 'topic' in seg and seg['topic']['confidence'] > 0.5:
topic_votes[seg['topic']['topic']] += seg['topic']['confidence']
primary_topic = {
"topic": max(topic_votes, key=topic_votes.get) if topic_votes else "unknown",
"confidence": round(topic_votes[max(topic_votes, key=topic_votes.get)] / len(results), 3) if topic_votes else 0.0
}
# Calculate stats
total_words = sum(seg.get('speech_rate', {}).get('word_count', 0) for seg in results)
wpm_values = [seg.get('speech_rate', {}).get('wpm', 0) for seg in results if seg.get('speech_rate', {}).get('wpm', 0) > 0]
average_wpm = round(np.mean(wpm_values), 1) if wpm_values else 0
return {
"top_keywords": top_keywords,
"primary_topic": primary_topic,
"total_words": total_words,
"average_wpm": average_wpm
}
if __name__ == "__main__":
# Example usage
print("Initialize with: analyzer = UltraRobustCallAnalytics(hf_token='YOUR_TOKEN')")
print("Process with: result = analyzer.process_call('path/to/audio.wav')")