# In this tutorial, we build an end-to-end implementation around Qwen
# 3.6-35B-A3B and explore how a modern multimodal MoE model can be used in
# practical workflows. We begin by setting up the environment, loading the
# model adaptively based on available GPU memory, and creating a reusable chat
# framework that supports both standard responses and explicit thinking
# traces. From there, we work through important capabilities such as
# thinking-budget control, streamed generation with separated reasoning and
# answers, vision input handling, tool calling, structured JSON generation,
# MoE routing inspection, benchmarking, retrieval-augmented generation, and
# session persistence. Through this process, we run the model for inference
# and also examine how to design a robust application layer on top of
# Qwen 3.6 for real experimentation and advanced prototyping.

import subprocess
import sys


def _pip(*a):
    """Quietly pip-install the given argument list with the current interpreter."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *a])


_pip("--upgrade", "pip")
_pip("--upgrade", "transformers>=4.48.0", "accelerate>=1.2.0",
     "bitsandbytes>=0.44.0", "pillow", "requests", "sentencepiece",
     "qwen-vl-utils[decord]", "sentence-transformers", "jsonschema")

import torch, os, json, time, re, gc, io, threading, textwrap, warnings
from collections import Counter
from typing import Any, Optional

warnings.filterwarnings("ignore")

# This notebook is GPU-only: quantized loading and generation require CUDA.
assert torch.cuda.is_available(), "GPU required. Switch runtime to A100 / L4."
p = torch.cuda.get_device_properties(0)
VRAM_GB = p.total_memory / 1e9
print(f"GPU: {p.name} | VRAM: {VRAM_GB:.1f} GB | CUDA {torch.version.cuda} | torch {torch.__version__}")

# Pick the cheapest precision that plausibly fits the card:
# >= 75 GB -> full bf16, >= 40 GB -> 8-bit, otherwise 4-bit NF4.
if VRAM_GB >= 75:
    LOAD_MODE = "bf16"
elif VRAM_GB >= 40:
    LOAD_MODE = "int8"
else:
    LOAD_MODE = "int4"

# Prefer FlashAttention 2 when the wheel is installed; fall back to SDPA.
try:
    import flash_attn  # noqa: F401
    ATTN_IMPL = "flash_attention_2"
except Exception:
    ATTN_IMPL = "sdpa"
print(f"-> mode={LOAD_MODE} attn={ATTN_IMPL}")

from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
)

MODEL_ID = "Qwen/Qwen3.6-35B-A3B"
kwargs = dict(device_map="auto", trust_remote_code=True, low_cpu_mem_usage=True,
              attn_implementation=ATTN_IMPL, torch_dtype=torch.bfloat16)
if LOAD_MODE == "int8":
    kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
elif LOAD_MODE == "int4":
    kwargs["quantization_config"] = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)

print("Loading processor...")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
print(f"Loading model in {LOAD_MODE} (first run downloads ~70GB) ...")
t0 = time.time()
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, **kwargs)
model.eval()
print(f"Loaded in {time.time()-t0:.0f}s | VRAM used: {torch.cuda.memory_allocated()/1e9:.1f} GB")

# Sampling presets keyed by interaction style; presence_penalty values are kept
# for reference even though HF generate() does not consume that key directly.
SAMPLING = {
    "thinking_general": dict(temperature=1.0, top_p=0.95, top_k=20, presence_penalty=1.5),
    "thinking_coding":  dict(temperature=0.6, top_p=0.95, top_k=20, presence_penalty=0.0),
    "instruct_general": dict(temperature=0.7, top_p=0.80, top_k=20, presence_penalty=1.5),
    "instruct_reason":  dict(temperature=1.0, top_p=1.00, top_k=40, presence_penalty=2.0),
}

THINK_OPEN, THINK_CLOSE = "<think>", "</think>"


def split_thinking(text: str):
    """Split raw decoded output into (thinking_trace, answer).

    Handles three shapes: a full <think>...</think> pair, a bare closing tag
    (the opening tag was consumed by the chat template), or no tags at all.
    """
    if THINK_OPEN in text and THINK_CLOSE in text:
        a = text.index(THINK_OPEN) + len(THINK_OPEN)
        b = text.index(THINK_CLOSE)
        return text[a:b].strip(), text[b + len(THINK_CLOSE):].strip()
    if THINK_CLOSE in text:
        b = text.index(THINK_CLOSE)
        return text[:b].strip(), text[b + len(THINK_CLOSE):].strip()
    return "", text.strip()


# We set up the full environment required to run Qwen 3.6-35B-A3B in Google
# Colab and installed all supporting libraries for quantization, multimodal
# processing, retrieval, and schema validation. We then probe the available
# GPU, dynamically select the loading mode based on VRAM, and configure the
# attention backend so the model runs as efficiently as possible on the given
# hardware. After that, we load the processor and model from Hugging Face and
# define the core sampling presets and the thinking-splitting utility, which
# lay the foundation for all later interactions.


class QwenChat:
    """Conversation manager: history, tool messages, generation, persistence."""

    def __init__(self, model, processor, system=None, tools=None):
        self.model, self.processor = model, processor
        self.tokenizer = processor.tokenizer
        self.history: list[dict] = []
        if system:
            self.history.append({"role": "system", "content": system})
        self.tools = tools

    def user(self, content):
        """Append a user turn; returns self for chaining."""
        self.history.append({"role": "user", "content": content})
        return self

    def assistant(self, content, reasoning=""):
        """Append an assistant turn, optionally carrying its reasoning trace."""
        m = {"role": "assistant", "content": content}
        if reasoning:
            m["reasoning_content"] = reasoning
        self.history.append(m)
        return self

    def tool_result(self, name, result):
        """Append a tool-role message; non-string results are JSON-encoded."""
        self.history.append({"role": "tool", "name": name,
                             "content": result if isinstance(result, str) else json.dumps(result)})
        return self

    def _inputs(self, enable_thinking, preserve_thinking):
        # Render the full history (plus tool schemas) through the chat template
        # and move the tensors onto the model's device.
        return self.processor.apply_chat_template(
            self.history, tools=self.tools, tokenize=True,
            add_generation_prompt=True, return_dict=True, return_tensors="pt",
            enable_thinking=enable_thinking, preserve_thinking=preserve_thinking,
        ).to(self.model.device)

    def generate(self, *, enable_thinking=True, preserve_thinking=False,
                 max_new_tokens=2048, preset="thinking_general",
                 stopping_criteria=None, append_to_history=True):
        """Blocking generation. Returns (thinking, answer) strings."""
        inp = self._inputs(enable_thinking, preserve_thinking)
        cfg = SAMPLING[preset]
        gk = dict(**inp, max_new_tokens=max_new_tokens, do_sample=True,
                  temperature=cfg["temperature"], top_p=cfg["top_p"],
                  top_k=cfg["top_k"], repetition_penalty=1.0,
                  pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id)
        if stopping_criteria is not None:
            gk["stopping_criteria"] = stopping_criteria
        with torch.inference_mode():
            out = self.model.generate(**gk)
        # Decode only the newly generated tokens (skip the prompt).
        raw = self.tokenizer.decode(out[0, inp["input_ids"].shape[-1]:],
                                    skip_special_tokens=True)
        think, ans = split_thinking(raw)
        if append_to_history:
            self.assistant(ans, reasoning=think)
        return think, ans

    def stream(self, *, enable_thinking=True, preserve_thinking=False,
               max_new_tokens=2048, preset="thinking_general",
               on_thinking=None, on_answer=None):
        """Streamed generation, routing text to on_thinking / on_answer callbacks.

        Generation runs on a background thread; this thread consumes the
        streamer and switches from "thinking" to "answer" mode when the
        </think> tag appears in the accumulated buffer.
        """
        inp = self._inputs(enable_thinking, preserve_thinking)
        cfg = SAMPLING[preset]
        streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True,
                                        skip_special_tokens=True)
        gk = dict(**inp, streamer=streamer, max_new_tokens=max_new_tokens,
                  do_sample=True, temperature=cfg["temperature"],
                  top_p=cfg["top_p"], top_k=cfg["top_k"],
                  pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id)
        t = threading.Thread(target=self.model.generate, kwargs=gk)
        t.start()
        buf, in_think = "", enable_thinking
        think_text, answer_text = "", ""
        for piece in streamer:
            buf += piece
            if in_think:
                if THINK_CLOSE in buf:
                    # The close tag may straddle chunk boundaries, so we test
                    # the accumulated buffer rather than the piece.
                    close_at = buf.index(THINK_CLOSE)
                    resid = buf[:close_at]
                    if on_thinking:
                        # Emit only the not-yet-delivered suffix.
                        on_thinking(resid[len(think_text):])
                    think_text = resid
                    buf = buf[close_at + len(THINK_CLOSE):]
                    in_think = False
                    if buf and on_answer:
                        on_answer(buf)
                    answer_text = buf
                    buf = ""
                else:
                    if on_thinking:
                        on_thinking(piece)
                    think_text += piece
            else:
                if on_answer:
                    on_answer(piece)
                answer_text += piece
        t.join()
        self.assistant(answer_text.strip(), reasoning=think_text.strip())
        return think_text.strip(), answer_text.strip()

    def save(self, path):
        """Persist history and tool schemas as pretty-printed JSON."""
        with open(path, "w") as f:
            json.dump({"history": self.history, "tools": self.tools}, f, indent=2)

    @classmethod
    def load(cls, model, processor, path):
        """Rebuild a chat session from a file written by save()."""
        with open(path) as f:
            data = json.load(f)
        c = cls(model, processor, tools=data.get("tools"))
        c.history = data["history"]
        return c


class ThinkingBudget(StoppingCriteria):
    """Stop generation once the open <think> span exceeds `budget` tokens.

    Never fires before <think> appears or after </think> has closed the span.
    """

    def __init__(self, tokenizer, budget: int):
        self.budget = budget
        self.open_ids = tokenizer.encode(THINK_OPEN, add_special_tokens=False)
        self.close_ids = tokenizer.encode(THINK_CLOSE, add_special_tokens=False)
        self.start = None  # token index just past <think>, once seen

    def _find(self, seq, needle):
        # Naive subsequence search over token-id lists; returns start index or None.
        n = len(needle)
        for i in range(len(seq) - n + 1):
            if seq[i:i + n] == needle:
                return i
        return None

    def __call__(self, input_ids, scores, **kwargs):
        seq = input_ids[0].tolist()
        if self.start is None:
            idx = self._find(seq, self.open_ids)
            if idx is not None:
                self.start = idx + len(self.open_ids)
            return False
        if self._find(seq[self.start:], self.close_ids) is not None:
            return False  # thinking already closed naturally
        return (len(seq) - self.start) >= self.budget


# Extract the JSON payload out of a <tool_call>...</tool_call> wrapper.
TOOL_CALL_RE = re.compile(r"<tool_call>\s*({.*?})\s*</tool_call>", re.S)


def run_calculate(expr: str) -> str:
    """Evaluate a whitelisted arithmetic expression; returns a JSON string."""
    if any(c not in "0123456789+-*/().% " for c in expr):
        return json.dumps({"error": "illegal chars"})
    try:
        # NOTE(security): eval on model-supplied text is only acceptable here
        # because the character whitelist above excludes names and quotes.
        # Do not loosen the whitelist without replacing eval entirely.
        return json.dumps({"result": eval(expr, {"__builtins__": {}}, {})})
    except Exception as e:
        return json.dumps({"error": str(e)})


# Tiny in-memory corpus for the retrieval demo.
_DOCS = {
    "qwen3.6": "Qwen3.6-35B-A3B is a 35B MoE with 3B active params and 262k native context.",
    "deltanet": "Gated DeltaNet is a linear-attention variant used in Qwen3.6's hybrid layers.",
    "moe": "Qwen3.6 uses 256 experts with 8 routed + 1 shared per token.",
}


def run_search_docs(q):
    """Keyword lookup over _DOCS; returns a JSON string of matching values."""
    hits = [v for k, v in _DOCS.items() if k in q.lower()]
    return json.dumps({"results": hits or ["no hits"]})


def run_get_time():
    """Return the current UTC time as a JSON string with a trailing Z."""
    import datetime as dt
    # NOTE(review): datetime.utcnow() is deprecated in Python 3.12+; kept for
    # identical output format — migrating to now(timezone.utc) would change
    # the isoformat suffix.
    return json.dumps({"iso": dt.datetime.utcnow().isoformat() + "Z"})


# Dispatch table from tool name to executor over the parsed arguments dict.
TOOL_FNS = {
    "calculate": lambda a: run_calculate(a["expression"]),
    "search_docs": lambda a: run_search_docs(a["query"]),
    "get_time": lambda a: run_get_time(),
}

# OpenAI-style function schemas advertised to the chat template.
TOOLS_SCHEMA = [
    {"type": "function", "function": {
        "name": "calculate", "description": "Evaluate arithmetic.",
        "parameters": {"type": "object",
                       "properties": {"expression": {"type": "string"}},
                       "required": ["expression"]}}},
    {"type": "function", "function": {
        "name": "search_docs", "description": "Search internal docs.",
        "parameters": {"type": "object",
                       "properties": {"query": {"type": "string"}},
                       "required": ["query"]}}},
    {"type": "function", "function": {
        "name": "get_time", "description": "Get current UTC time.",
        "parameters": {"type": "object", "properties": {}}}},
]

# We build the main QwenChat conversation manager, which handles message
# history, tool messages, chat template formatting, standard generation,
# streaming generation, and session persistence. We also define the
# ThinkingBudget stopping criterion to ... [source text truncated here]