satyamr196 committed on
Commit
09343ba
·
1 Parent(s): aa23397

Instead of pre-loading the dataset, the server now uses lazy loading, i.e. the dataset is loaded only when a user sends a request; overall it is still loaded only once.

Browse files
Files changed (1) hide show
  1. ASR_Server.py +39 -7
ASR_Server.py CHANGED
@@ -38,16 +38,32 @@ job_status = {
38
  "error_trace": None
39
  }
40
 
 
41
  csv_path = "test.csv"
42
- # csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
43
- # csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
44
  df = pd.read_csv(csv_path)
45
  print(f"CSV Loaded with {len(df)} rows")
46
 
47
- # Load dataset without decoding audio (required!)
48
- dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
49
- # dataset = dataset.with_format("python", decode_audio=False)
50
- dataset = dataset.cast_column("audio", Audio(decode=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def generateTranscript(ASR_model):
53
  import os
@@ -76,7 +92,23 @@ def generateTranscript(ASR_model):
76
  csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
77
 
78
  try:
79
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  # Check if transcript already exists
81
  df_transcript = download_csv(csv_transcript)
82
  if(df_transcript is None):
 
38
  "error_trace": None
39
  }
40
 
41
+
42
  csv_path = "test.csv"
 
 
43
  df = pd.read_csv(csv_path)
44
  print(f"CSV Loaded with {len(df)} rows")
45
 
46
+ # lazy loading of dataset code:
47
+ _dataset = None
48
+ _dataset_lock = threading.Lock()
49
+
50
+ def get_dataset():
51
+ """Lazily loads the dataset and ensures it's only loaded once."""
52
+ global _dataset
53
+ # Use a lock to prevent race conditions (two requests loading at once)
54
+ with _dataset_lock:
55
+ if _dataset is None:
56
+ print("Loading dataset for the first time...")
57
+ try:
58
+ ds = load_dataset("satyamr196/asr_fairness_audio", split="train")
59
+ _dataset = ds.cast_column("audio", Audio(decode=False))
60
+ print("Dataset loaded successfully.")
61
+ except Exception as e:
62
+ print(f"FATAL: Failed to load dataset: {e}")
63
+ # Propagate the error
64
+ raise
65
+ return _dataset
66
+ # --- END LAZY LOADING ---
67
 
68
  def generateTranscript(ASR_model):
69
  import os
 
92
  csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
93
 
94
  try:
95
+
96
+ # --- BLOCK TO LOAD THE DATASET ---
97
+ try:
98
+ print("Attempting to get dataset...")
99
+ dataset = get_dataset()
100
+ except Exception as e:
101
+ tb = traceback.format_exc()
102
+ print(f"❌ Failed to load dataset: {e}")
103
+ job_status.update({
104
+ "running": False,
105
+ "message": f"Critical error: Failed to load dataset.",
106
+ "error": str(e),
107
+ "error_trace": tb[:1000],
108
+ })
109
+ return # Stop the function if dataset fails to load
110
+ # --- END OF dataset load BLOCK ---
111
+
112
  # Check if transcript already exists
113
  df_transcript = download_csv(csv_transcript)
114
  if(df_transcript is None):