In [1]:
!pip install qwen-vl-utils
import torch
from qwen_vl_utils import process_vision_info
In [2]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
# default: Load the model on the available device(s)
model = Qwen3VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-2B-Instruct", dtype="float16", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
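A quick sanity check that the weights actually landed on an accelerator; device_map="auto" sets hf_device_map when it places the model, so we print both (the getattr guard is just defensive):
print(model.device)
print(getattr(model, "hf_device_map", None))  # mapping of module names to devices, if one was applied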
Can we rewrite the generation process by hand? I want to understand how the image is fed into the model and processed.
In [25]:
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
}
]
Here's the pipeline.
The structured message contains the dummy <|image_pad|> placeholder that the processor will later expand and swap for the actual image embeddings.
In [26]:
structured_message = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# <|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n
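At this stage the template holds exactly one <|image_pad|> token; the expansion happens later, in the processor. A quick count to confirm (this assumes the processor exposes its tokenizer and the config an image_token_id, as in earlier Qwen-VL releases):
ids = processor.tokenizer(structured_message)["input_ids"]
print(sum(1 for t in ids if t == model.config.image_token_id))  # 1 placeholder so far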
extracting the PIL image from the message content. This is how Qwen does it
In [27]:
image_inputs, video_inputs, metadata = process_vision_info(
messages,
return_video_kwargs=True,
return_video_metadata=True,
)
# additional video information but not used for now
In [30]:
image_inputs
Out[30]:
[<PIL.Image.Image image mode=RGB size=2044x1372>]
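For intuition, here is a rough sketch of what process_vision_info does for a URL image. This is an approximation, not the library's exact code (which also enforces min/max pixel budgets):
import requests
from io import BytesIO
from PIL import Image

url = messages[0]["content"][0]["image"]
img = Image.open(BytesIO(requests.get(url).content)).convert("RGB")
# snap both sides to multiples of 28, qwen-vl-utils' default rounding factor --
# consistent with the 2044x1372 (= 73*28 x 49*28) output above
factor = 28
img = img.resize((round(img.width / factor) * factor, round(img.height / factor) * factor))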
processing everything to get tokenized inputs
In [28]:
inputs = processor(
text=structured_message,
images=image_inputs,
videos=video_inputs,
**metadata,
return_tensors="pt",
padding=True,
).to(model.device)
# inputs will contain:
# input_ids (B, L) - token ids, with <|image_pad|> repeated once per visual token
# attention_mask (B, L)
# pixel_values (N, dim) - patchified and flattened image pixels
# image_grid_thw (num_images, 3) - the (t, h, w) patch grid per image, used to expand the image padding and hot-swap the image embeddings
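We can verify the placeholder expansion: the number of <|image_pad|> tokens in input_ids should equal the number of merged patches, t*h*w / merge^2 (assuming a 2x2 spatial merge and a config image_token_id, as in earlier Qwen-VL releases):
grid_t, grid_h, grid_w = inputs['image_grid_thw'][0].tolist()
merge = 2  # assumed spatial merge size: 2x2 patches -> one visual token
expected = grid_t * grid_h * grid_w // (merge ** 2)
actual = (inputs['input_ids'] == model.config.image_token_id).sum().item()
print(expected, actual)  # the two counts should match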
time to feed the inputs to the model and generate greedily, token by token
In [29]:
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
question = input_ids.clone()
max_new_tokens = 100
eos_id = processor.tokenizer.eos_token_id  # <|im_end|> for Qwen chat models
with torch.no_grad():
    for _ in range(max_new_tokens):
        # full forward pass every step (no KV cache) - slow but easy to follow;
        # the image tokens in input_ids are re-matched to pixel_values each time
        out = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=inputs['pixel_values'], image_grid_thw=inputs['image_grid_thw'])
        logits = out['logits'][:, -1, :]                       # logits at the last position
        next_tok = torch.argmax(logits, dim=-1, keepdim=True)  # greedy pick, shape (B, 1)
        if next_tok.item() == eos_id:                          # batch size 1 assumed
            break
        input_ids = torch.cat((input_ids, next_tok), dim=-1)
        attention_mask = torch.cat((attention_mask, torch.ones_like(next_tok)), dim=-1)
answer = processor.batch_decode(input_ids[:, question.shape[1]:])
print(answer)
['This is a heartwarming photograph capturing a tender moment between a woman and her dog on a serene beach at sunset.\n\nThe scene is set on a wide, sandy beach with the ocean stretching out to the horizon. The sun is low, casting a warm, golden glow across the water and the sand, creating a soft, peaceful atmosphere. Gentle waves can be seen breaking in the distance.\n\nIn the foreground, a woman with long, dark hair is sitting on the sand, smiling warmly at her dog']
of course, this is the bare minimum needed to interact with the VLM. We will dive deeper later.
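For reference, the built-in generate performs the same greedy decoding with a KV cache, so it is much faster; its output should roughly match the manual loop above:
gen_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
answer = processor.batch_decode(gen_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print(answer)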