#!/usr/bin/env bash
# Launch a local vLLM OpenAI-compatible API server for a GPTQ-quantized
# Qwen2.5 Coder model, then smoke-test it with a chat-completions request.
set -euo pipefail

export VLLM_MODEL=Qwen2.5-Coder-32B-GPTQ-Int4

# Start the server (foreground). --enforce-eager disables CUDA graphs;
# --trust-remote-code is required for models shipping custom modeling code.
python3 -m vllm.entrypoints.openai.api_server \
  --model "$VLLM_MODEL" \
  --device=auto \
  --enforce-eager \
  --tensor-parallel-size=1 \
  --max-model-len=4096 \
  --dtype=float16 \
  --block-size=32 \
  --trust-remote-code \
  --port=9000

# Smoke test.
# NOTE: the JSON body must splice $VLLM_MODEL in from the shell — inside a
# single-quoted string the variable would be sent literally as "$VLLM_MODEL".
curl -X POST "http://127.0.0.1:9000/v1/chat/completions" \
  -H "Authorization: Bearer xxxx" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"$VLLM_MODEL"'",
    "messages": [{"role": "user", "content": "What are some fun things to do in New York?"}],
    "max_tokens": 2048,
    "temperature": 0.0,
    "stream": false
}'