agentic-search/try6b.py

from mlx_lm import load, stream_generate

# Load a small 8-bit-quantized Qwen3 model and its tokenizer from the MLX community hub.
model, tokenizer = load("mlx-community/Qwen3-0.6B-8bit")

# "/no_think" is Qwen3's soft switch for disabling thinking mode on this turn.
prompt = "/no_think How do I make the best use of the Qwen3 0.6B model?"

# Wrap the prompt in the model's chat template when one is defined.
if tokenizer.chat_template is not None:
    messages = [{"role": "user", "content": prompt}]
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True
    )

# Stream tokens to stdout as they are generated.
for response in stream_generate(model, tokenizer, prompt, max_tokens=2048):
    print(response.text, end="", flush=True)
print()