19.6 kB

	#include "vision_app.hpp"
	#include <iostream>
	#include <string>
	#include <cstdlib>
	#include <thread>
	#include <chrono>
	#include <pybind11/embed.h>
	#include <pybind11/stl.h>

	#include "httplib.h"

	namespace py = pybind11;
	using namespace pybind11::literals;

	namespace vision_app {

	/// Single-page web frontend, embedded as a raw string literal and served
	/// verbatim with content type "text/html" by the "/" route in launch_app().
	///
	/// What the page does (all of it visible in the markup/script below):
	///  - Lets the user choose an image via file picker or drag-and-drop; the
	///    file is read with FileReader.readAsDataURL, so `base64Image` holds a
	///    full data URL (e.g. "data:image/...;base64,<payload>").
	///  - Offers a task category <select> with the values "Query", "Caption",
	///    "Point" and "Detect", plus a free-text prompt.
	///  - On "Process Image" it POSTs JSON `{image, category, prompt}` to
	///    `/api/process`, then shows `result.error` (in red) if that key is
	///    present, otherwise `result.text`.
	///
	/// Everything between `R"HTML(` and `)HTML"` is sent to the browser as-is,
	/// including leading whitespace, so C++ comments must not be placed inside.
	const char* HTML_CONTENT = R"HTML(
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Qwen3.5-Vision C++ App</title>
	<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
	<style>
	:root {
	--primary: #4F46E5;
	--primary-hover: #4338CA;
	--bg-color: #F3F4F6;
	--panel-bg: #FFFFFF;
	--text-main: #111827;
	--text-muted: #6B7280;
	--border-color: #E5E7EB;
	--border-radius: 12px;
	}

	body {
	font-family: 'Inter', sans-serif;
	background-color: var(--bg-color);
	color: var(--text-main);
	margin: 0;
	padding: 2rem;
	min-height: 100vh;
	display: flex;
	justify-content: center;
	}

	.container {
	max-width: 1200px;
	width: 100%;
	display: flex;
	flex-direction: column;
	gap: 2rem;
	}

	.header {
	text-align: center;
	}

	.header h1 {
	margin: 0;
	font-size: 2.5rem;
	font-weight: 700;
	color: var(--text-main);
	letter-spacing: -0.025em;
	}

	.header p {
	margin: 0.5rem 0 0;
	color: var(--text-muted);
	font-size: 1.125rem;
	}

	.main-content {
	display: grid;
	grid-template-columns: 1fr 1.2fr;
	gap: 2rem;
	}

	@media (max-width: 768px) {
	.main-content {
	grid-template-columns: 1fr;
	}
	}

	.panel {
	background: var(--panel-bg);
	padding: 2rem;
	border-radius: var(--border-radius);
	box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
	border: 1px solid var(--border-color);
	}

	.form-group {
	margin-bottom: 1.5rem;
	}

	label {
	font-weight: 500;
	display: block;
	margin-bottom: 0.5rem;
	color: var(--text-main);
	font-size: 0.875rem;
	}

	.image-upload-wrapper {
	position: relative;
	border: 2px dashed var(--border-color);
	border-radius: var(--border-radius);
	padding: 2rem;
	text-align: center;
	cursor: pointer;
	transition: all 0.2s ease;
	background: #F9FAFB;
	}

	.image-upload-wrapper:hover {
	border-color: var(--primary);
	background: #EEF2FF;
	}

	.image-upload-wrapper input[type="file"] {
	position: absolute;
	top: 0; left: 0; width: 100%; height: 100%;
	opacity: 0; cursor: pointer;
	}

	.upload-text {
	color: var(--text-muted);
	font-size: 0.875rem;
	pointer-events: none;
	}

	#preview {
	max-width: 100%;
	max-height: 300px;
	display: none;
	margin-top: 1rem;
	border-radius: 8px;
	object-fit: contain;
	box-shadow: 0 1px 3px rgba(0,0,0,0.1);
	}

	select, textarea {
	width: 100%;
	padding: 0.75rem 1rem;
	border-radius: 8px;
	border: 1px solid var(--border-color);
	font-family: inherit;
	font-size: 0.875rem;
	box-sizing: border-box;
	transition: border-color 0.2s;
	background: #FFFFFF;
	}

	select:focus, textarea:focus {
	outline: none;
	border-color: var(--primary);
	box-shadow: 0 0 0 3px rgba(79, 70, 229, 0.1);
	}

	textarea { resize: vertical; min-height: 100px; }

	button {
	background: var(--primary);
	color: white;
	border: none;
	padding: 0.875rem 1.5rem;
	font-size: 1rem;
	font-weight: 500;
	border-radius: 8px;
	cursor: pointer;
	width: 100%;
	transition: background-color 0.2s, transform 0.1s;
	}

	button:hover { background: var(--primary-hover); }
	button:active { transform: scale(0.98); }
	button:disabled { background: #9CA3AF; cursor: not-allowed; transform: none; }

	.output-panel {
	display: flex;
	flex-direction: column;
	}

	.output-container {
	flex-grow: 1;
	background: #F9FAFB;
	padding: 1.5rem;
	border-radius: 8px;
	border: 1px solid var(--border-color);
	font-family: 'JetBrains Mono', 'Courier New', Courier, monospace;
	font-size: 0.875rem;
	white-space: pre-wrap;
	word-wrap: break-word;
	overflow-y: auto;
	color: #374151;
	min-height: 400px;
	}

	/* Spinner */
	.loader-container {
	display: none;
	align-items: center;
	justify-content: center;
	gap: 0.5rem;
	margin-top: 1rem;
	color: var(--primary);
	font-weight: 500;
	font-size: 0.875rem;
	}

	.spinner {
	width: 1.5rem;
	height: 1.5rem;
	border: 3px solid rgba(79, 70, 229, 0.2);
	border-top-color: var(--primary);
	border-radius: 50%;
	animation: spin 1s linear infinite;
	}

	@keyframes spin {
	to { transform: rotate(360deg); }
	}
	</style>
	</head>
	<body>
	<div class="container">
	<div class="header">
	<h1>Qwen3.5 Vision</h1>
	<p>Advanced Image Understanding Native C++ Interface</p>
	</div>

	<div class="main-content">
	<!-- Left Panel: Controls -->
	<div class="panel">
	<div class="form-group">
	<label>Upload Image</label>
	<div class="image-upload-wrapper" id="dropZone">
	<div class="upload-text" id="uploadText">
	<svg style="width: 2rem; height: 2rem; margin: 0 auto 0.5rem auto; color: #9CA3AF;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-8l-4-4m0 0L8 8m4-4v12"></path></svg>
	<br/>Click or drag image here
	</div>
	<input type="file" id="imageInput" accept="image/*" />
	</div>
	<img id="preview" />
	</div>

	<div class="form-group">
	<label>Task Category</label>
	<select id="category">
	<option value="Query">General Query</option>
	<option value="Caption">Image Captioning</option>
	<option value="Point">Point Coordinates</option>
	<option value="Detect">Object Detection</option>
	</select>
	</div>

	<div class="form-group">
	<label>Prompt</label>
	<textarea id="prompt" placeholder="e.g., Count the total number of objects and describe their context..."></textarea>
	</div>

	<button id="processBtn">Process Image</button>
	<div class="loader-container" id="loader">
	<div class="spinner"></div>
	<span>Processing through C++ Pybind11 Engine...</span>
	</div>
	</div>

	<!-- Right Panel: Output -->
	<div class="panel output-panel">
	<label>Inference Result</label>
	<div class="output-container" id="output">Waiting for input...</div>
	</div>
	</div>
	</div>

	<script>
	let base64Image = "";
	const imageInput = document.getElementById('imageInput');
	const preview = document.getElementById('preview');
	const uploadText = document.getElementById('uploadText');
	const dropZone = document.getElementById('dropZone');

	function handleFile(file) {
	if (!file) return;
	const reader = new FileReader();
	reader.onload = function(event) {
	base64Image = event.target.result;
	preview.src = base64Image;
	preview.style.display = 'block';
	uploadText.style.display = 'none';
	dropZone.style.padding = '0';
	dropZone.style.border = 'none';
	};
	reader.readAsDataURL(file);
	}

	imageInput.addEventListener('change', (e) => handleFile(e.target.files[0]));

	// Drag and drop handling
	dropZone.addEventListener('dragover', (e) => { e.preventDefault(); dropZone.style.borderColor = 'var(--primary)'; });
	dropZone.addEventListener('dragleave', () => { dropZone.style.borderColor = 'var(--border-color)'; });
	dropZone.addEventListener('drop', (e) => {
	e.preventDefault();
	dropZone.style.borderColor = 'var(--border-color)';
	if (e.dataTransfer.files.length) {
	imageInput.files = e.dataTransfer.files;
	handleFile(e.dataTransfer.files[0]);
	}
	});

	document.getElementById('processBtn').addEventListener('click', async function() {
	if (!base64Image) { alert("Please upload an image first."); return; }

	const category = document.getElementById('category').value;
	const prompt = document.getElementById('prompt').value;

	document.getElementById('loader').style.display = 'flex';
	document.getElementById('output').innerText = "Running inference...\nThis might take a moment depending on your hardware.";
	document.getElementById('processBtn').disabled = true;

	try {
	const response = await fetch('/api/process', {
	method: 'POST',
	headers: { 'Content-Type': 'application/json' },
	body: JSON.stringify({ image: base64Image, category: category, prompt: prompt })
	});

	const result = await response.json();
	if (result.error) {
	document.getElementById('output').innerText = "Error:\n\n" + result.error;
	document.getElementById('output').style.color = "#DC2626";
	} else {
	document.getElementById('output').innerText = result.text;
	document.getElementById('output').style.color = "#374151";
	}
	} catch (err) {
	document.getElementById('output').innerText = "Request failed:\n\n" + err;
	document.getElementById('output').style.color = "#DC2626";
	} finally {
	document.getElementById('loader').style.display = 'none';
	document.getElementById('processBtn').disabled = false;
	}
	});
	</script>
	</body>
	</html>
	)HTML";


	/// Escapes `raw` so it can be placed between double quotes in a JSON document.
	///
	/// Double quotes, backslashes and control characters below 0x20 are replaced
	/// by JSON escape sequences; every other byte is copied through unchanged.
	/// Needed because Python exception messages routinely contain quotes and
	/// newlines, which would otherwise produce a body the browser cannot parse.
	static std::string escape_json_string(const std::string& raw) {
		static const char kHexDigits[] = "0123456789abcdef";

		std::string escaped;
		escaped.reserve(raw.size() + 8);
		for (const char c : raw) {
			switch (c) {
				case '"':  escaped += "\\\""; break;
				case '\\': escaped += "\\\\"; break;
				case '\b': escaped += "\\b";  break;
				case '\f': escaped += "\\f";  break;
				case '\n': escaped += "\\n";  break;
				case '\r': escaped += "\\r";  break;
				case '\t': escaped += "\\t";  break;
				default: {
					const unsigned char byte = static_cast<unsigned char>(c);
					if (byte < 0x20) {
						// Remaining control characters have no short form: emit \u00XX.
						escaped += "\\u00";
						escaped += kHexDigits[byte >> 4];
						escaped += kHexDigits[byte & 0x0F];
					} else {
						escaped += c;
					}
				}
			}
		}
		return escaped;
	}

	/// Turns the user's prompt into the instruction text for the selected task.
	///
	/// "Caption", "Point" and "Detect" wrap the prompt in a fixed template; any
	/// other category (including "Query") passes the prompt through unchanged.
	static std::string build_task_prompt(const std::string& category, const std::string& prompt) {
		if (category == "Caption") return "Provide a " + prompt + " length caption for the image.";
		if (category == "Point") return "Provide 2d point coordinates for " + prompt + ". Report in JSON format.";
		if (category == "Detect") return "Provide bounding box coordinates for " + prompt + ". Report in JSON format.";
		return prompt;
	}

	/// @brief Loads a vision-language model through embedded Python and serves a
	///        browser UI for it over HTTP until the server stops.
	///
	/// Routes:
	///  - `GET /`             returns HTML_CONTENT.
	///  - `POST /api/process` expects JSON `{image, category, prompt}` and answers
	///                        with `{"text": ...}` or `{"error": ...}`.
	///
	/// No Python interpreter is created here — presumably the caller already owns
	/// one (e.g. a `py::scoped_interpreter`) and holds the GIL; confirm at call site.
	///
	/// @param model_name   Identifier passed to `from_pretrained` for both the model
	///                     and `AutoProcessor`.
	/// @param architecture Name of the attribute looked up on the `transformers`
	///                     module and used as the model class.
	/// @param port         TCP port the HTTP server binds to.
	/// @return `true` if the server was bound, ran, and its listen loop ended
	///         without error; `false` if initialization threw, the port could not
	///         be bound, or the listen loop reported failure.
	bool launch_app(
		const std::string& model_name,
		const std::string& architecture,
		int port
	) {
		try {
			py::print("Initializing Python environment for Inference...");

			// Load Python modules
			py::module_ torch = py::module_::import("torch");
			py::module_ transformers = py::module_::import("transformers");

			// Compute Devices & Dtypes
			py::object cuda = torch.attr("cuda");
			std::string device = cuda.attr("is_available")().cast<bool>() ? "cuda" : "cpu";
			// NOTE(review): float16 is also what gets used on CPU; some CPU operators
			// may be slow or unsupported in half precision — confirm whether a
			// float32 fallback is wanted there.
			py::object dtype = torch.attr("float16");

			if (device == "cuda" && cuda.attr("is_bf16_supported")().cast<bool>()) {
				dtype = torch.attr("bfloat16");
			}

			py::print("Loading model:", model_name, "with architecture:", architecture);
			py::object ModelClass = transformers.attr(architecture.c_str());

			py::object model = ModelClass.attr("from_pretrained")(
				model_name,
				"torch_dtype"_a=dtype,
				"device_map"_a=device
			).attr("eval")();

			py::object processor = transformers.attr("AutoProcessor").attr("from_pretrained")(model_name);

			py::print("Model loaded successfully. Starting Native C++ HTTP Server on port", port, "...");

			// Start HTTP Server
			httplib::Server svr;

			// Route: Serve Frontend GUI
			svr.Get("/", [](const httplib::Request&, httplib::Response& res) {
				res.set_content(HTML_CONTENT, "text/html");
			});

			// Route: API Endpoint (Handled by Pybind).
			// Capturing by reference is safe: model/processor/device are declared
			// before `svr` and the server is only run from within this scope.
			svr.Post("/api/process", [&model, &processor, &device](const httplib::Request& req, httplib::Response& res) {

				// Re-acquire the Global Interpreter Lock since httplib handles requests on background threads
				py::gil_scoped_acquire acquire;

				try {
					py::module_ base64 = py::module_::import("base64");
					py::module_ io = py::module_::import("io");
					py::module_ PIL_Image = py::module_::import("PIL.Image");
					py::module_ json = py::module_::import("json");

					// Decode incoming JSON (a missing key raises and is reported below)
					py::object parsed = json.attr("loads")(req.body);
					const std::string image_b64_full = parsed["image"].cast<std::string>();
					const std::string category = parsed["category"].cast<std::string>();
					const std::string prompt = parsed["prompt"].cast<std::string>();

					// Strip data URI standard: 'data:image/jpeg;base64,'
					const size_t comma_pos = image_b64_full.find(',');
					const std::string image_b64 = (comma_pos != std::string::npos)
						? image_b64_full.substr(comma_pos + 1)
						: image_b64_full;

					// Load image into PIL and shrink it in place to at most 512x512
					py::object image_bytes = base64.attr("b64decode")(image_b64);
					py::object bytes_io = io.attr("BytesIO")(image_bytes);
					py::object rgb_image = PIL_Image.attr("open")(bytes_io).attr("convert")("RGB");
					rgb_image.attr("thumbnail")(py::make_tuple(512, 512));

					// Process Prompt
					const std::string full_prompt = build_task_prompt(category, prompt);

					py::list content;
					content.append(py::dict("type"_a="image", "image"_a=rgb_image));
					content.append(py::dict("type"_a="text", "text"_a=full_prompt));

					py::list messages;
					messages.append(py::dict("role"_a="user", "content"_a=content));

					py::object text_prompt = processor.attr("apply_chat_template")(
						messages, "tokenize"_a=false, "add_generation_prompt"_a=true
					);

					py::list texts; texts.append(text_prompt);
					py::list images; images.append(rgb_image);

					py::object batch_encoding = processor(
						"text"_a=texts,
						"images"_a=images,
						"return_tensors"_a="pt",
						"padding"_a=true
					);

					py::object inputs = batch_encoding.attr("to")(device);
					py::dict inputs_dict = py::dict(inputs);

					py::object generated_ids = model.attr("generate")(
						**inputs_dict,
						"max_new_tokens"_a=512
					);

					// Strip the input prompt from generated output ids
					py::object input_ids = inputs.attr("get")("input_ids");
					py::int_ input_len = py::len(input_ids[py::int_(0)]);
					py::object output_ids = generated_ids[py::make_tuple(py::int_(0), py::slice(input_len, py::none(), py::none()))];

					// Final Decoding
					py::object out_text = processor.attr("decode")(
						output_ids,
						"skip_special_tokens"_a=true,
						"clean_up_tokenization_spaces"_a=false
					);

					// Format successful return packet
					py::dict result;
					result["text"] = out_text;
					const std::string json_res = json.attr("dumps")(result).cast<std::string>();

					res.set_content(json_res, "application/json");

				} catch (const std::exception& e) {
					// Escape the message: Python errors contain quotes/newlines that
					// would otherwise make this body invalid JSON for the client.
					const std::string error_msg =
						std::string(R"({"error": ")") + escape_json_string(e.what()) + R"("})";
					res.set_content(error_msg, "application/json");
				}
			});

			// Bind before announcing anything, so a busy/forbidden port is reported
			// as a failure instead of opening a browser on a dead URL.
			// NOTE(review): "0.0.0.0" accepts connections on every interface while the
			// banner advertises localhost; the endpoint has no authentication.
			// Consider "127.0.0.1" if remote access is not intended.
			if (!svr.bind_to_port("0.0.0.0", port)) {
				std::cerr << "C++ error occurred: could not bind HTTP server to port " << port << std::endl;
				return false;
			}

			const std::string url = "http://localhost:" + std::to_string(port);
			std::cout << "\n=============================================\n";
			std::cout << "App ready! Automatically opening: " << url << "\n";
			std::cout << "=============================================\n\n";

			// Launch browser in a detached thread slightly delayed so the server is up.
			// Detached on purpose: the opener command may block for as long as the
			// browser runs, and must not hold up server shutdown.
			std::thread([url]() {
				std::this_thread::sleep_for(std::chrono::milliseconds(500));
	#if defined(_WIN32)
				const std::string cmd = "start " + url;
	#elif defined(__APPLE__)
				const std::string cmd = "open " + url;
	#else
				const std::string cmd = "xdg-open " + url;
	#endif
				if (std::system(cmd.c_str()) != 0) {
					std::cerr << "Could not open a browser automatically; visit " << url << " manually.\n";
				}
			}).detach();

			bool served = false;
			{
				// Release the Python GIL for the duration of the server lifecycle so incoming requests can acquire it.
				// The scope ends (re-acquiring the GIL) before any py::object above is destroyed.
				py::gil_scoped_release release;

				// Block execution and run the server
				served = svr.listen_after_bind();
			}

			if (!served) {
				std::cerr << "C++ error occurred: HTTP server on port " << port << " stopped with an error" << std::endl;
			}
			return served;
		} catch (const py::error_already_set& e) {
			std::cerr << "Python error occurred: " << e.what() << std::endl;
			return false;
		} catch (const std::exception& e) {
			std::cerr << "C++ error occurred: " << e.what() << std::endl;
			return false;
		}
	}

	} // namespace vision_app

Xet Storage Details

Size:: 19.6 kB
Xet hash:: 7f3c47919d6a60628cce6f6a615f982c25a4c3a68772241bed5da3941f8f6d1e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.