In [1]:
!pip install qwen-vl-utils
import torch
from qwen_vl_utils import process_vision_info
In [2]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
# default: Load the model on the available device(s)
model = Qwen3VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-2B-Instruct", dtype="float16", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
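A quick sanity check that the weights actually landed on an accelerator; device_map="auto" sets hf_device_map when it places the model, so we print both (the getattr guard is just defensive):
print(model.device)
print(getattr(model, "hf_device_map", None))  # mapping of module names to devices, if one was applied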
Can we rewrite the generation process by hand? I want to understand how the image is fed into the model and processed.
In [25]:
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
}
]
Here's the pipeline.
The structured message contains the dummy <|image_pad|> placeholder that the processor will later expand and swap for the actual image embeddings.
In [26]:
structured_message = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# <|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n
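At this stage the template holds exactly one <|image_pad|> token; the expansion happens later, in the processor. A quick count to confirm (this assumes the processor exposes its tokenizer and the config an image_token_id, as in earlier Qwen-VL releases):
ids = processor.tokenizer(structured_message)["input_ids"]
print(sum(1 for t in ids if t == model.config.image_token_id))  # 1 placeholder so far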
extracting the PIL image from the message content. This is how Qwen does it
In [27]:
image_inputs, video_inputs, metadata = process_vision_info(
messages,
return_video_kwargs=True,
return_video_metadata=True,
)
# additional video information but not used for now
In [30]:
image_inputs
Out[30]:
[<PIL.Image.Image image mode=RGB size=2044x1372>]
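For intuition, here is a rough sketch of what process_vision_info does for a URL image. This is an approximation, not the library's exact code (which also enforces min/max pixel budgets):
import requests
from io import BytesIO
from PIL import Image

url = messages[0]["content"][0]["image"]
img = Image.open(BytesIO(requests.get(url).content)).convert("RGB")
# snap both sides to multiples of 28, qwen-vl-utils' default rounding factor --
# consistent with the 2044x1372 (= 73*28 x 49*28) output above
factor = 28
img = img.resize((round(img.width / factor) * factor, round(img.height / factor) * factor))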
processing everything to get tokenized inputs
In [28]:
inputs = processor(
text=structured_message,
images=image_inputs,
videos=video_inputs,
**metadata,
return_tensors="pt",
padding=True,
).to(model.device)
# inputs will contain:
# input_ids (B, L) - token ids, with <|image_pad|> repeated once per visual token
# attention_mask (B, L)
# pixel_values (N, dim) - patchified and flattened image pixels
# image_grid_thw (num_images, 3) - the (t, h, w) patch grid per image, used to expand the image padding and hot-swap the image embeddings
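We can verify the placeholder expansion: the number of <|image_pad|> tokens in input_ids should equal the number of merged patches, t*h*w / merge^2 (assuming a 2x2 spatial merge and a config image_token_id, as in earlier Qwen-VL releases):
grid_t, grid_h, grid_w = inputs['image_grid_thw'][0].tolist()
merge = 2  # assumed spatial merge size: 2x2 patches -> one visual token
expected = grid_t * grid_h * grid_w // (merge ** 2)
actual = (inputs['input_ids'] == model.config.image_token_id).sum().item()
print(expected, actual)  # the two counts should match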
time to feed the inputs to the model and generate greedily, token by token
In [29]:
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
question = input_ids.clone()
max_new_tokens = 100
eos_id = processor.tokenizer.eos_token_id  # <|im_end|> for Qwen chat models
with torch.no_grad():
    for _ in range(max_new_tokens):
        # full forward pass every step (no KV cache) - slow but easy to follow;
        # the image tokens in input_ids are re-matched to pixel_values each time
        out = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=inputs['pixel_values'], image_grid_thw=inputs['image_grid_thw'])
        logits = out['logits'][:, -1, :]                       # logits at the last position
        next_tok = torch.argmax(logits, dim=-1, keepdim=True)  # greedy pick, shape (B, 1)
        if next_tok.item() == eos_id:                          # batch size 1 assumed
            break
        input_ids = torch.cat((input_ids, next_tok), dim=-1)
        attention_mask = torch.cat((attention_mask, torch.ones_like(next_tok)), dim=-1)
answer = processor.batch_decode(input_ids[:, question.shape[1]:])
print(answer)
['This is a heartwarming photograph capturing a tender moment between a woman and her dog on a serene beach at sunset.\n\nThe scene is set on a wide, sandy beach with the ocean stretching out to the horizon. The sun is low, casting a warm, golden glow across the water and the sand, creating a soft, peaceful atmosphere. Gentle waves can be seen breaking in the distance.\n\nIn the foreground, a woman with long, dark hair is sitting on the sand, smiling warmly at her dog']
of course, this is the bare minimum needed to interact with the VLM. We will dive deeper later.
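For reference, the built-in generate performs the same greedy decoding with a KV cache, so it is much faster; its output should roughly match the manual loop above:
gen_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
answer = processor.batch_decode(gen_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print(answer)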