from mlx_lm import load, stream_generate

# Load the 8-bit quantized Qwen3-0.6B model and its tokenizer from the Hub.
model, tokenizer = load("mlx-community/Qwen3-0.6B-8bit")

# "/no_think" is Qwen3's soft switch that disables thinking mode for this turn.
prompt = "/no_think How do I make best use of the Qwen3 0.6B model?"

# Wrap the prompt in the model's chat template when one is available.
if tokenizer.chat_template is not None:
    messages = [{"role": "user", "content": prompt}]
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True
    )

# Stream the response and print each chunk as it arrives.
for response in stream_generate(model, tokenizer, prompt, max_tokens=2048):
    print(response.text, end="", flush=True)
print()
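
# --- Optional: non-streaming variant ------------------------------------
# A minimal sketch, assuming the same mlx_lm API as above: generate()
# returns the whole completion as a single string instead of yielding
# chunks, which is handy when incremental output isn't needed.
from mlx_lm import generate

text = generate(model, tokenizer, prompt=prompt, max_tokens=2048)
print(text)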