WilhelmT committed (verified)
Commit 579b0dd · 1 Parent(s): 5a0c6b9

Update README.md

Files changed (1):
1. README.md (+4 -2)
README.md CHANGED
````diff
@@ -81,6 +81,7 @@ The `embedl-models` package is required, it provides the optimized FlashHead imp
 ---
 
 ## Usage Examples
+**Note (vLLM context length):** `max_model_len=131072` may fail on GPUs without enough free VRAM for the KV cache. If you see a KV cache memory error, lower `max_model_len` (or increase `gpu_memory_utilization`).
 
 ### vLLM Inference
 
@@ -92,7 +93,7 @@ model_id = "embedl/Llama-3.2-1B-Instruct-FlashHead-W4A16"
 
 if __name__ == "__main__":
     sampling = SamplingParams(max_tokens=128, temperature=0.0)
-    llm = LLM(model=model_id, trust_remote_code=True)
+    llm = LLM(model=model_id, trust_remote_code=True, max_model_len=131072)
 
     prompt = "Write a haiku about coffee."
     output = llm.generate([prompt], sampling)
@@ -115,7 +116,8 @@ model_id = "embedl/Llama-3.2-1B-Instruct-FlashHead-W4A16"
 if __name__ == "__main__":
     asyncio.run(
         run_repl(
-            model=model_id
+            model=model_id,
+            max_model_len=131072
         )
     )
 ```
````
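For readers who hit the situation the new note describes, its advice maps onto the vLLM `LLM` constructor roughly as follows. This is a minimal sketch, not part of the README: the fallback values `max_model_len=32768` and `gpu_memory_utilization=0.95` are illustrative assumptions, not taken from the commit.

```python
from vllm import LLM, SamplingParams

model_id = "embedl/Llama-3.2-1B-Instruct-FlashHead-W4A16"

if __name__ == "__main__":
    sampling = SamplingParams(max_tokens=128, temperature=0.0)

    # As in the updated README example: request the full 131072-token context.
    # On GPUs where the KV cache for this length does not fit in free VRAM,
    # vLLM raises an error at engine start-up.
    llm = LLM(model=model_id, trust_remote_code=True, max_model_len=131072)

    # Fallback per the note: shrink the context window and/or let vLLM use a
    # larger share of GPU memory. The values below are illustrative only.
    # llm = LLM(
    #     model=model_id,
    #     trust_remote_code=True,
    #     max_model_len=32768,
    #     gpu_memory_utilization=0.95,
    # )

    output = llm.generate(["Write a haiku about coffee."], sampling)
    print(output[0].outputs[0].text)
```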