import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import spaces
import os
# Model configuration
MODEL_PATH = "ibm-granite/granite-4.0-h-1b"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global variables to store the model and tokenizer
tokenizer = None
model = None
def load_model():
    """Load the model and tokenizer"""
    global tokenizer, model
    if tokenizer is None or model is None:
        print("Loading model and tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map=DEVICE)
        model.eval()
        print("Model loaded successfully!")
@spaces.GPU  # Use GPU for inference
def chat_with_model(message, history):
    """
    Chat function that processes user input and generates responses.

    Args:
        message (str): Current user message
        history (list): Previous conversation turns as (user, assistant) tuples

    Returns:
        str: Model response
    """
    try:
        # Load model if not already loaded
        load_model()

        # Prepare chat format
        messages = []

        # Add system message for better performance
        messages.append({
            "role": "system",
            "content": "You are a helpful AI assistant. Provide clear, accurate, and helpful responses."
        })

        # Add conversation history
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})

        # Add current message
        messages.append({"role": "user", "content": message})

        # Apply chat template
        chat = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize input
        input_tokens = tokenizer(chat, return_tensors="pt").to(DEVICE)

        # Generate response
        with torch.no_grad():
            output = model.generate(
                **input_tokens,
                max_new_tokens=200,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode response (prompt plus newly generated tokens)
        full_response = tokenizer.batch_decode(output)[0]

        # Extract only the assistant's response.
        # Use rfind so assistant turns already present in the prompt history are
        # skipped and only the newly generated turn is returned.
        assistant_start = full_response.rfind('<|start_of_role|>assistant<|end_of_role|>')
        if assistant_start != -1:
            assistant_start += len('<|start_of_role|>assistant<|end_of_role|>')
            assistant_response = full_response[assistant_start:].strip()
        else:
            # Fallback if the expected role-marker pattern is not found
            response_start = full_response.rfind('<|assistant|>')
            if response_start != -1:
                response_start += len('<|assistant|>')
                assistant_response = full_response[response_start:].strip()
            else:
                assistant_response = full_response.strip()

        # Clean up the response - remove end markers
        assistant_response = assistant_response.replace('<|endoftext|>', '').replace('<|end_of_text|>', '').strip()

        return assistant_response

    except Exception as e:
        print(f"Error generating response: {e}")
        return f"I apologize, but I encountered an error: {str(e)}. Please try again."
def clear_chat():
    """Clear the chat history"""
    return []
# Create the Gradio chat interface
def create_chat_app():
    with gr.Blocks(title="IBM Granite Chat", css="""
        .header {
            text-align: center;
            padding: 10px;
            background: linear-gradient(90deg, #0066cc, #004499);
            color: white;
            margin-bottom: 20px;
            border-radius: 10px;
        }
        .header a {
            color: #ffffff;
            text-decoration: none;
            font-weight: bold;
        }
        .header a:hover {
            text-decoration: underline;
        }
    """) as demo:
        # Header with attribution
        gr.HTML("""
            <div class="header">
                <h1>IBM Granite 4.0 Chat</h1>
                <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a></p>
            </div>
        """)

        # Chat interface
        chatbot = gr.ChatInterface(
            fn=chat_with_model,
            title="Chat with IBM Granite 4.0",
            description="Chat with the IBM Granite 4.0 1B parameter language model. Ask questions, get help, or have a conversation!",
            examples=[
                "What is machine learning?",
                "Explain quantum computing in simple terms",
                "How can I improve my programming skills?",
                "What are the latest developments in AI?",
                "Tell me about IBM Research"
            ],
        )

        # Additional info
        with gr.Accordion("Model Information", open=False):
            gr.Markdown(f"""
            ## Model Details
            - **Model**: {MODEL_PATH}
            - **Parameters**: 1B
            - **Device**: {DEVICE.upper()}
            - **Max Tokens**: 200 per response
            - **Temperature**: 0.7 (for balanced creativity and accuracy)

            ## Tips
            - Ask specific questions for better results
            - The model works best with clear, concise prompts
            - Try asking follow-up questions to dive deeper into topics
            - The model can help with programming, explanations, and general knowledge
            """)

    return demo
if __name__ == "__main__":
    # Create and launch the app
    app = create_chat_app()

    # Launch configuration
    app.launch()
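# Local usage sketch (assumption, not part of the Space configuration): install
# the dependencies and run the script directly; the `spaces` package is on PyPI
# and its @spaces.GPU decorator is documented to have no effect outside Spaces.
#   pip install torch transformers gradio spaces
#   python app.py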