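// NOTE: file-viewer page chrome captured with this source ("download", "raw", "19.6 kB");
// it is not part of the program.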
#include "vision_app.hpp"
#include <iostream>
#include <string>
#include <cstdlib>
#include <thread>
#include <chrono>
#include <pybind11/embed.h>
#include <pybind11/stl.h>
#include "httplib.h"
namespace py = pybind11;
using namespace pybind11::literals;
namespace vision_app {
// Self-contained single-page frontend (HTML + CSS + JavaScript) that
// launch_app() serves verbatim for `GET /`.
//
// The page lets the user pick or drag-and-drop an image, choose a task
// category (Query / Caption / Point / Detect) and type a prompt. On
// "Process Image" it POSTs a JSON body {image, category, prompt} to
// `/api/process`, where `image` is the data URL produced by
// FileReader.readAsDataURL. The JSON reply is expected to carry either a
// `text` field (shown as the result) or an `error` field (shown in red).
//
// Everything between R"HTML( and )HTML" is sent to the browser as-is, so any
// edit inside the literal changes runtime behavior.
const char* HTML_CONTENT = R"HTML(
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Qwen3.5-Vision C++ App</title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
<style>
:root {
--primary: #4F46E5;
--primary-hover: #4338CA;
--bg-color: #F3F4F6;
--panel-bg: #FFFFFF;
--text-main: #111827;
--text-muted: #6B7280;
--border-color: #E5E7EB;
--border-radius: 12px;
}
body {
font-family: 'Inter', sans-serif;
background-color: var(--bg-color);
color: var(--text-main);
margin: 0;
padding: 2rem;
min-height: 100vh;
display: flex;
justify-content: center;
}
.container {
max-width: 1200px;
width: 100%;
display: flex;
flex-direction: column;
gap: 2rem;
}
.header {
text-align: center;
}
.header h1 {
margin: 0;
font-size: 2.5rem;
font-weight: 700;
color: var(--text-main);
letter-spacing: -0.025em;
}
.header p {
margin: 0.5rem 0 0;
color: var(--text-muted);
font-size: 1.125rem;
}
.main-content {
display: grid;
grid-template-columns: 1fr 1.2fr;
gap: 2rem;
}
@media (max-width: 768px) {
.main-content {
grid-template-columns: 1fr;
}
}
.panel {
background: var(--panel-bg);
padding: 2rem;
border-radius: var(--border-radius);
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
border: 1px solid var(--border-color);
}
.form-group {
margin-bottom: 1.5rem;
}
label {
font-weight: 500;
display: block;
margin-bottom: 0.5rem;
color: var(--text-main);
font-size: 0.875rem;
}
.image-upload-wrapper {
position: relative;
border: 2px dashed var(--border-color);
border-radius: var(--border-radius);
padding: 2rem;
text-align: center;
cursor: pointer;
transition: all 0.2s ease;
background: #F9FAFB;
}
.image-upload-wrapper:hover {
border-color: var(--primary);
background: #EEF2FF;
}
.image-upload-wrapper input[type="file"] {
position: absolute;
top: 0; left: 0; width: 100%; height: 100%;
opacity: 0; cursor: pointer;
}
.upload-text {
color: var(--text-muted);
font-size: 0.875rem;
pointer-events: none;
}
#preview {
max-width: 100%;
max-height: 300px;
display: none;
margin-top: 1rem;
border-radius: 8px;
object-fit: contain;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
select, textarea {
width: 100%;
padding: 0.75rem 1rem;
border-radius: 8px;
border: 1px solid var(--border-color);
font-family: inherit;
font-size: 0.875rem;
box-sizing: border-box;
transition: border-color 0.2s;
background: #FFFFFF;
}
select:focus, textarea:focus {
outline: none;
border-color: var(--primary);
box-shadow: 0 0 0 3px rgba(79, 70, 229, 0.1);
}
textarea { resize: vertical; min-height: 100px; }
button {
background: var(--primary);
color: white;
border: none;
padding: 0.875rem 1.5rem;
font-size: 1rem;
font-weight: 500;
border-radius: 8px;
cursor: pointer;
width: 100%;
transition: background-color 0.2s, transform 0.1s;
}
button:hover { background: var(--primary-hover); }
button:active { transform: scale(0.98); }
button:disabled { background: #9CA3AF; cursor: not-allowed; transform: none; }
.output-panel {
display: flex;
flex-direction: column;
}
.output-container {
flex-grow: 1;
background: #F9FAFB;
padding: 1.5rem;
border-radius: 8px;
border: 1px solid var(--border-color);
font-family: 'JetBrains Mono', 'Courier New', Courier, monospace;
font-size: 0.875rem;
white-space: pre-wrap;
word-wrap: break-word;
overflow-y: auto;
color: #374151;
min-height: 400px;
}
/* Spinner */
.loader-container {
display: none;
align-items: center;
justify-content: center;
gap: 0.5rem;
margin-top: 1rem;
color: var(--primary);
font-weight: 500;
font-size: 0.875rem;
}
.spinner {
width: 1.5rem;
height: 1.5rem;
border: 3px solid rgba(79, 70, 229, 0.2);
border-top-color: var(--primary);
border-radius: 50%;
animation: spin 1s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>Qwen3.5 Vision</h1>
<p>Advanced Image Understanding Native C++ Interface</p>
</div>
<div class="main-content">
<!-- Left Panel: Controls -->
<div class="panel">
<div class="form-group">
<label>Upload Image</label>
<div class="image-upload-wrapper" id="dropZone">
<div class="upload-text" id="uploadText">
<svg style="width: 2rem; height: 2rem; margin: 0 auto 0.5rem auto; color: #9CA3AF;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-8l-4-4m0 0L8 8m4-4v12"></path></svg>
<br/>Click or drag image here
</div>
<input type="file" id="imageInput" accept="image/*" />
</div>
<img id="preview" />
</div>
<div class="form-group">
<label>Task Category</label>
<select id="category">
<option value="Query">General Query</option>
<option value="Caption">Image Captioning</option>
<option value="Point">Point Coordinates</option>
<option value="Detect">Object Detection</option>
</select>
</div>
<div class="form-group">
<label>Prompt</label>
<textarea id="prompt" placeholder="e.g., Count the total number of objects and describe their context..."></textarea>
</div>
<button id="processBtn">Process Image</button>
<div class="loader-container" id="loader">
<div class="spinner"></div>
<span>Processing through C++ Pybind11 Engine...</span>
</div>
</div>
<!-- Right Panel: Output -->
<div class="panel output-panel">
<label>Inference Result</label>
<div class="output-container" id="output">Waiting for input...</div>
</div>
</div>
</div>
<script>
let base64Image = "";
const imageInput = document.getElementById('imageInput');
const preview = document.getElementById('preview');
const uploadText = document.getElementById('uploadText');
const dropZone = document.getElementById('dropZone');
function handleFile(file) {
if (!file) return;
const reader = new FileReader();
reader.onload = function(event) {
base64Image = event.target.result;
preview.src = base64Image;
preview.style.display = 'block';
uploadText.style.display = 'none';
dropZone.style.padding = '0';
dropZone.style.border = 'none';
};
reader.readAsDataURL(file);
}
imageInput.addEventListener('change', (e) => handleFile(e.target.files[0]));
// Drag and drop handling
dropZone.addEventListener('dragover', (e) => { e.preventDefault(); dropZone.style.borderColor = 'var(--primary)'; });
dropZone.addEventListener('dragleave', () => { dropZone.style.borderColor = 'var(--border-color)'; });
dropZone.addEventListener('drop', (e) => {
e.preventDefault();
dropZone.style.borderColor = 'var(--border-color)';
if (e.dataTransfer.files.length) {
imageInput.files = e.dataTransfer.files;
handleFile(e.dataTransfer.files[0]);
}
});
document.getElementById('processBtn').addEventListener('click', async function() {
if (!base64Image) { alert("Please upload an image first."); return; }
const category = document.getElementById('category').value;
const prompt = document.getElementById('prompt').value;
document.getElementById('loader').style.display = 'flex';
document.getElementById('output').innerText = "Running inference...\nThis might take a moment depending on your hardware.";
document.getElementById('processBtn').disabled = true;
try {
const response = await fetch('/api/process', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image: base64Image, category: category, prompt: prompt })
});
const result = await response.json();
if (result.error) {
document.getElementById('output').innerText = "Error:\n\n" + result.error;
document.getElementById('output').style.color = "#DC2626";
} else {
document.getElementById('output').innerText = result.text;
document.getElementById('output').style.color = "#374151";
}
} catch (err) {
document.getElementById('output').innerText = "Request failed:\n\n" + err;
document.getElementById('output').style.color = "#DC2626";
} finally {
document.getElementById('loader').style.display = 'none';
document.getElementById('processBtn').disabled = false;
}
});
</script>
</body>
</html>
)HTML";
namespace {

/// Escapes `raw` so it can be placed between double quotes in a JSON document.
///
/// Quotes, backslashes and all control characters below 0x20 are escaped
/// (RFC 8259 section 7). Bytes >= 0x80 are copied unchanged, i.e. the input is
/// presumed to already be UTF-8.
std::string json_escape(const std::string& raw) {
    static const char kHex[] = "0123456789abcdef";
    std::string escaped;
    escaped.reserve(raw.size() + 8);
    for (const char ch : raw) {
        switch (ch) {
            case '"':  escaped += "\\\""; break;
            case '\\': escaped += "\\\\"; break;
            case '\n': escaped += "\\n";  break;
            case '\r': escaped += "\\r";  break;
            case '\t': escaped += "\\t";  break;
            default: {
                const auto byte = static_cast<unsigned char>(ch);
                if (byte < 0x20) {
                    escaped += "\\u00";
                    escaped += kHex[byte >> 4];
                    escaped += kHex[byte & 0x0F];
                } else {
                    escaped += ch;
                }
            }
        }
    }
    return escaped;
}

}  // namespace

/// Loads a vision-language model through embedded Python and serves a web UI
/// for it over HTTP, blocking until the server loop ends.
///
/// NOTE(review): this function calls into pybind11 immediately, so it assumes
/// the Python interpreter is already initialized and that the calling thread
/// holds the GIL -- confirm against the caller.
///
/// @param model_name   Identifier passed to `from_pretrained` for both the
///                     model class and `transformers.AutoProcessor`.
/// @param architecture Name of the attribute looked up on the `transformers`
///                     module and used as the model class.
/// @param port         TCP port the HTTP server binds to.
/// @return `true` if the server was bound and its accept loop ended without
///         error; `false` if the port could not be bound, the accept loop
///         failed, or a Python/C++ exception was raised during setup.
bool launch_app(
    const std::string& model_name,
    const std::string& architecture,
    int port
) {
    try {
        py::print("Initializing Python environment for Inference...");

        // Load Python modules
        py::module_ torch = py::module_::import("torch");
        py::module_ transformers = py::module_::import("transformers");

        // Compute device & dtype: float16 by default, bfloat16 when CUDA reports support.
        py::object cuda = torch.attr("cuda");
        std::string device = cuda.attr("is_available")().cast<bool>() ? "cuda" : "cpu";
        py::object dtype = torch.attr("float16");
        if (device == "cuda" && cuda.attr("is_bf16_supported")().cast<bool>()) {
            dtype = torch.attr("bfloat16");
        }

        py::print("Loading model:", model_name, "with architecture:", architecture);
        py::object ModelClass = transformers.attr(architecture.c_str());
        py::object model = ModelClass.attr("from_pretrained")(
            model_name,
            "torch_dtype"_a=dtype,
            "device_map"_a=device
        ).attr("eval")();
        py::object processor = transformers.attr("AutoProcessor").attr("from_pretrained")(model_name);

        py::print("Model loaded successfully. Starting Native C++ HTTP Server on port", port, "...");

        httplib::Server svr;

        // Route: Serve Frontend GUI
        svr.Get("/", [](const httplib::Request&, httplib::Response& res) {
            res.set_content(HTML_CONTENT, "text/html");
        });

        // Route: API Endpoint (Handled by Pybind).
        // The captured references stay valid because this function blocks in the
        // server loop below for as long as handlers can run.
        svr.Post("/api/process", [&model, &processor, &device](const httplib::Request& req, httplib::Response& res) {
            // httplib runs handlers on its own threads, so the GIL must be taken
            // here. It is declared outside the try block so that it is still held
            // while the catch clause reads the exception message.
            py::gil_scoped_acquire acquire;
            try {
                py::module_ base64 = py::module_::import("base64");
                py::module_ io = py::module_::import("io");
                py::module_ PIL_Image = py::module_::import("PIL.Image");
                py::module_ json = py::module_::import("json");

                // Decode incoming JSON; a missing key raises and is reported below.
                py::object parsed = json.attr("loads")(req.body);
                std::string image_b64_full = parsed["image"].cast<std::string>();
                std::string category = parsed["category"].cast<std::string>();
                std::string prompt = parsed["prompt"].cast<std::string>();

                // Strip a data-URI prefix such as 'data:image/jpeg;base64,' if present.
                size_t comma_pos = image_b64_full.find(',');
                std::string image_b64 = (comma_pos != std::string::npos)
                    ? image_b64_full.substr(comma_pos + 1)
                    : image_b64_full;

                // Load image into PIL and shrink it in place to at most 512x512.
                py::object image_bytes = base64.attr("b64decode")(image_b64);
                py::object bytes_io = io.attr("BytesIO")(image_bytes);
                py::object rgb_image = PIL_Image.attr("open")(bytes_io).attr("convert")("RGB");
                rgb_image.attr("thumbnail")(py::make_tuple(512, 512));

                // Wrap the user's prompt according to the selected task category.
                std::string full_prompt = prompt;
                if (category == "Caption") full_prompt = "Provide a " + prompt + " length caption for the image.";
                else if (category == "Point") full_prompt = "Provide 2d point coordinates for " + prompt + ". Report in JSON format.";
                else if (category == "Detect") full_prompt = "Provide bounding box coordinates for " + prompt + ". Report in JSON format.";

                // Build a single-turn chat message holding the image and the text.
                py::list content;
                content.append(py::dict("type"_a="image", "image"_a=rgb_image));
                content.append(py::dict("type"_a="text", "text"_a=full_prompt));
                py::list messages;
                messages.append(py::dict("role"_a="user", "content"_a=content));

                py::object text_prompt = processor.attr("apply_chat_template")(
                    messages, "tokenize"_a=false, "add_generation_prompt"_a=true
                );

                py::list texts; texts.append(text_prompt);
                py::list images; images.append(rgb_image);
                py::object batch_encoding = processor(
                    "text"_a=texts,
                    "images"_a=images,
                    "return_tensors"_a="pt",
                    "padding"_a=true
                );
                py::object inputs = batch_encoding.attr("to")(device);
                py::dict inputs_dict = py::dict(inputs);

                py::object generated_ids = model.attr("generate")(
                    **inputs_dict,
                    "max_new_tokens"_a=512
                );

                // Strip the input prompt from generated output ids
                py::object input_ids = inputs.attr("get")("input_ids");
                py::int_ input_len = py::len(input_ids[py::int_(0)]);
                py::object output_ids = generated_ids[py::make_tuple(py::int_(0), py::slice(input_len, py::none(), py::none()))];

                // Final Decoding
                py::object out_text = processor.attr("decode")(
                    output_ids,
                    "skip_special_tokens"_a=true,
                    "clean_up_tokenization_spaces"_a=false
                );

                // Let Python's json module serialize the reply so the text is escaped correctly.
                py::dict result;
                result["text"] = out_text;
                std::string json_res = json.attr("dumps")(result).cast<std::string>();
                res.set_content(json_res, "application/json");
            } catch (const std::exception& e) {
                // Python error messages routinely contain quotes, backslashes and
                // newlines; escape them so the reply is still valid JSON and the
                // browser can display the real error.
                res.status = 500;
                res.set_content("{\"error\": \"" + json_escape(e.what()) + "\"}", "application/json");
            }
        });

        // SECURITY NOTE(review): binding to 0.0.0.0 exposes this unauthenticated
        // endpoint on every network interface, although the banner advertises
        // localhost. Kept as-is for compatibility; consider "127.0.0.1" if remote
        // access is not required.
        // Bind before opening the browser so a port conflict is reported instead
        // of being silently ignored.
        if (!svr.bind_to_port("0.0.0.0", port)) {
            std::cerr << "Failed to bind HTTP server to port " << port << std::endl;
            return false;
        }

        std::string url = "http://localhost:" + std::to_string(port);
        std::cout << "\n=============================================\n";
        std::cout << "App ready! Automatically opening: " << url << "\n";
        std::cout << "=============================================\n\n";

        // Launch browser in a detached thread, slightly delayed so the accept loop is running.
        // `url` is captured by value because the thread may outlive this scope.
        std::thread([url]() {
            std::this_thread::sleep_for(std::chrono::milliseconds(500));
#if defined(_WIN32)
            std::string cmd = "start " + url;
#elif defined(__APPLE__)
            std::string cmd = "open " + url;
#else
            std::string cmd = "xdg-open " + url;
#endif
            // Best effort: a failure to open the browser must not stop the server.
            (void)std::system(cmd.c_str());
        }).detach();

        // Release the Python GIL for the duration of the server lifecycle so incoming requests can acquire it
        py::gil_scoped_release release;

        // Block execution and run the server
        const bool served = svr.listen_after_bind();
        if (!served) {
            std::cerr << "HTTP server loop terminated with an error on port " << port << std::endl;
        }
        return served;
    } catch (const py::error_already_set& e) {
        std::cerr << "Python error occurred: " << e.what() << std::endl;
        return false;
    } catch (const std::exception& e) {
        std::cerr << "C++ error occurred: " << e.what() << std::endl;
        return false;
    }
}
} // namespace vision_app

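// NOTE: the remainder is file-viewer page chrome captured with this source; it is
// not part of the program.
//
// Xet Storage Details
//
// Size: 19.6 kB · Xet hash:
// 7f3c47919d6a60628cce6f6a615f982c25a4c3a68772241bed5da3941f8f6d1e
//
// Xet efficiently stores files, intelligently splitting them into unique chunks and
// accelerating uploads and downloads. More info.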