开源领域的桌面端 Manus?最懂打工人的智能体?DeepChat!
(xinference-env) root@6138 :/home/pc# xinference launch --model-name QwQ-32B --model-type LLM --model-engine vLLM --model-format awq --size-in-billions 32 --quantization Int4 --n-gpu auto --replica 1 --n-worker 1 --reasoning-content false
Launch model name: QwQ-32B with kwargs: {'reasoning-content': False}
Traceback (most recent call last):
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/connection.py", line 203, in _new_conn
sock = connection.create_connection(
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
raise err
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/connectionpool.py", line 791, in urlopen
response = self._make_request(
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/connectionpool.py", line 497, in _make_request
conn.request(
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/connection.py", line 395, in request
self.endheaders()
File "/usr/lib/python3.10/http/client.py", line 1278, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/lib/python3.10/http/client.py", line 1038, in _send_output
self.send(msg)
File "/usr/lib/python3.10/http/client.py", line 976, in send
self.connect()
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/connection.py", line 243, in connect
self.sock = self._new_conn()
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/connection.py", line 218, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7ff740b07430>: Failed to establish a new connection: [Errno 111] Connection refused
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/pc/xinference-env/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
resp = conn.urlopen(
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/connectionpool.py", line 845, in urlopen
retries = retries.increment(
File "/home/pc/xinference-env/lib/python3.10/site-packages/urllib3/util/retry.py", line 515, in increment
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=9997): Max retries exceeded with url: /v1/cluster/auth (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff740b07430>: Failed to establish a new connection: [Errno 111] Connection refused'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/pc/xinference-env/bin/xinference", line 8, in <module>
sys.exit(cli())
File "/home/pc/xinference-env/lib/python3.10/site-packages/click/core.py", line 1161, in __call__
return self.main(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/click/core.py", line 1082, in main
rv = self.invoke(ctx)
File "/home/pc/xinference-env/lib/python3.10/site-packages/click/core.py", line 1697, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/pc/xinference-env/lib/python3.10/site-packages/click/core.py", line 1443, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/pc/xinference-env/lib/python3.10/site-packages/click/core.py", line 788, in invoke
return __callback(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/click/decorators.py", line 33, in new_func
return f(get_current_context(), *args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/deploy/cmdline.py", line 924, in model_launch
client = RESTfulClient(base_url=endpoint, api_key=api_key)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/client/restful/restful_client.py", line 828, in __init__
self._check_cluster_authenticated()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/client/restful/restful_client.py", line 846, in _check_cluster_authenticated
response = requests.get(url)
File "/home/pc/xinference-env/lib/python3.10/site-packages/requests/api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
resp = self.send(prep, **send_kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
r = adapter.send(request, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/requests/adapters.py", line 700, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=9997): Max retries exceeded with url: /v1/cluster/auth (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff740b07430>: Failed to establish a new connection: [Errno 111] Connection refused'))
(xinference-env) root@6138 :/home/pc# xinference --host 0.0.0.0 --port 9997
/home/pc/xinference-env/lib/python3.10/site-packages/xinference/deploy/cmdline.py:166: DeprecationWarning: Starting a local 'xinference' cluster via the 'xinference' command line is deprecated and will be removed in a future release. Please use the new 'xinference-local' command.
warnings.warn(
INFO 03-24 20:30:30 __init__.py:190] Automatically detected platform cuda.
2025-03-24 20:30:31,698 xinference.core.supervisor 4007711 INFO Xinference supervisor 0.0.0.0:49460 started
2025-03-24 20:30:31,718 xinference.core.worker 4007711 INFO Purge cache directory: /root/.xinference/cache
2025-03-24 20:30:31,719 xinference.core.worker 4007711 INFO Connected to supervisor as a fresh worker
2025-03-24 20:30:31,733 xinference.core.worker 4007711 INFO Xinference worker 0.0.0.0:49460 started
2025-03-24 20:30:38,484 xinference.api.restful_api 4007478 INFO Starting Xinference at endpoint: http://0.0.0.0:9997
2025-03-24 20:30:38,616 uvicorn.error 4007478 INFO Uvicorn running on http://0.0.0.0:9997 (Press CTRL+C to quit)
2025-03-25 08:38:32,227 xinference.core.worker 4007711 INFO [request 7a78173a-0911-11f0-809b-1fb5340f90ef] Enter launch_builtin_model, args: <xinference.core.worker.WorkerActor object at 0x7ed2aa70b4c0>, kwargs: model_uid=QwQ-32B-0,model_name=QwQ-32B,model_size_in_billions=32,model_format=awq,quantization=Int4,model_engine=vLLM,model_type=LLM,n_gpu=auto,request_limits=None,peft_model_config=None,gpu_idx=None,download_hub=None,model_path=None,xavier_config=None,reasoning_content=False
2025-03-25 08:38:32,973 xinference.model.llm.llm_family 4007711 INFO Caching from Modelscope: Qwen/QwQ-32B-AWQ
ERROR:bitsandbytes.cextension:Could not load bitsandbytes native library: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Traceback (most recent call last):
File "/home/pc/xinference-env/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 85, in <module>
lib = get_native_library()
File "/home/pc/xinference-env/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 64, in get_native_library
cuda_specs = get_cuda_specs()
File "/home/pc/xinference-env/lib/python3.10/site-packages/bitsandbytes/cuda_specs.py", line 38, in get_cuda_specs
highest_compute_capability=(get_compute_capabilities()[-1]),
File "/home/pc/xinference-env/lib/python3.10/site-packages/bitsandbytes/cuda_specs.py", line 19, in get_compute_capabilities
return sorted(torch.cuda.get_device_capability(torch.cuda.device(i)) for i in range(torch.cuda.device_count()))
File "/home/pc/xinference-env/lib/python3.10/site-packages/bitsandbytes/cuda_specs.py", line 19, in <genexpr>
return sorted(torch.cuda.get_device_capability(torch.cuda.device(i)) for i in range(torch.cuda.device_count()))
File "/home/pc/xinference-env/lib/python3.10/site-packages/torch/cuda/__init__.py", line 509, in get_device_capability
prop = get_device_properties(device)
File "/home/pc/xinference-env/lib/python3.10/site-packages/torch/cuda/__init__.py", line 523, in get_device_properties
_lazy_init() # will define _get_device_properties
File "/home/pc/xinference-env/lib/python3.10/site-packages/torch/cuda/__init__.py", line 305, in _lazy_init
raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
WARNING:bitsandbytes.cextension:
CUDA Setup failed despite CUDA being available. Please run the following command to get more information:
python -m bitsandbytes
Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
and open an issue at: https://github.com/bitsandbytes-foundation/bitsandbytes/issues
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/QwQ-32B-AWQ
2025-03-25 08:38:35,242 - modelscope - INFO - Got 18 files, start to download ...
Downloading [config.json]: 100%|████████████████████████████████████████████████████████████████████████████████| 863/863 [00:00<00:00, 2.32kB/s]
Downloading [configuration.json]: 100%|█████████████████████████████████████████████████████████████████████████| 73.0/73.0 [00:00<00:00, 152B/s]
Downloading [added_tokens.json]: 100%|██████████████████████████████████████████████████████████████████████████| 707/707 [00:00<00:00, 1.39kB/s]
Downloading [generation_config.json]: 100%|███████████████████████████████████████████████████████████████████████| 243/243 [00:00<00:00, 488B/s]
Downloading [LICENSE]: 100%|████████████████████████████████████████████████████████████████████████████████| 11.1k/11.1k [00:00<00:00, 22.9kB/s]
Downloading [figures/benchmark.jpg]: 100%|█████████████████████████████████████████████████████████████████████| 313k/313k [00:00<00:00, 569kB/s]
Downloading [merges.txt]: 100%|█████████████████████████████████████████████████████████████████████████████| 1.59M/1.59M [00:00<00:00, 2.99MB/s]
Downloading [special_tokens_map.json]: 100%|████████████████████████████████████████████████████████████████████| 613/613 [00:00<00:00, 1.39kB/s]
Downloading [README.md]: 100%|██████████████████████████████████████████████████████████████████████████████| 7.17k/7.17k [00:00<00:00, 14.8kB/s]
Downloading [model.safetensors.index.json]: 100%|██████████████████████████████████████████████████████████████| 133k/133k [00:00<00:00, 209kB/s]
Downloading [tokenizer_config.json]: 100%|██████████████████████████████████████████████████████████████████| 8.10k/8.10k [00:00<00:00, 18.5kB/s]
Downloading [vocab.json]: 100%|█████████████████████████████████████████████████████████████████████████████| 2.65M/2.65M [00:00<00:00, 3.42MB/s]
Downloading [tokenizer.json]: 100%|█████████████████████████████████████████████████████████████████████████| 6.71M/6.71M [00:01<00:00, 6.25MB/s]
Downloading [model-00005-of-00005.safetensors]: 100%|███████████████████████████████████████████████████████| 3.24G/3.24G [02:56<00:00, 19.7MB/s]
Downloading [model-00001-of-00005.safetensors]: 100%|███████████████████████████████████████████████████████| 3.67G/3.67G [03:23<00:00, 19.4MB/s]
Downloading [model-00003-of-00005.safetensors]: 100%|███████████████████████████████████████████████████████| 3.68G/3.68G [03:24<00:00, 19.3MB/s]
Downloading [model-00002-of-00005.safetensors]: 100%|███████████████████████████████████████████████████████| 3.71G/3.71G [03:26<00:00, 19.3MB/s]
Downloading [model-00004-of-00005.safetensors]: 100%|███████████████████████████████████████████████████████| 3.71G/3.71G [03:26<00:00, 19.3MB/s]
Processing 18 items: 100%|████████████████████████████████████████████████████████████████████████████████████| 18.0/18.0 [03:26<00:00, 11.5s/it]
2025-03-25 08:42:01,869 - modelscope - INFO - Download model 'Qwen/QwQ-32B-AWQ' successfully.
INFO 03-25 08:42:07 __init__.py:190] Automatically detected platform cuda.
2025-03-25 08:42:08,195 xinference.core.model 304284 INFO Start requests handler.
2025-03-25 08:42:08,200 xinference.model.llm.vllm.core 304284 INFO Loading QwQ-32B with following model config: {'tokenizer_mode': 'auto', 'trust_remote_code': True, 'tensor_parallel_size': 1, 'block_size': 16, 'swap_space': 4, 'gpu_memory_utilization': 0.9, 'max_num_seqs': 256, 'quantization': None, 'max_model_len': None, 'guided_decoding_backend': 'outlines', 'scheduling_policy': 'fcfs'}Enable lora: False. Lora count: 0.
2025-03-25 08:42:08,212 transformers.configuration_utils 304284 INFO loading configuration file /root/.xinference/cache/QwQ-32B-awq-32b/config.json
loading configuration file /root/.xinference/cache/QwQ-32B-awq-32b/config.json
2025-03-25 08:42:08,212 transformers.configuration_utils 304284 INFO loading configuration file /root/.xinference/cache/QwQ-32B-awq-32b/config.json
loading configuration file /root/.xinference/cache/QwQ-32B-awq-32b/config.json
2025-03-25 08:42:08,213 transformers.configuration_utils 304284 INFO Model config Qwen2Config {
"_name_or_path": "/root/.xinference/cache/QwQ-32B-awq-32b",
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 27648,
"max_position_embeddings": 40960,
"max_window_layers": 64,
"model_type": "qwen2",
"num_attention_heads": 40,
"num_hidden_layers": 64,
"num_key_value_heads": 8,
"quantization_config": {
"bits": 4,
"group_size": 128,
"modules_to_not_convert": null,
"quant_method": "awq",
"version": "gemm",
"zero_point": true
},
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 1000000.0,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.48.3",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 152064
}
Model config Qwen2Config {
"_name_or_path": "/root/.xinference/cache/QwQ-32B-awq-32b",
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 27648,
"max_position_embeddings": 40960,
"max_window_layers": 64,
"model_type": "qwen2",
"num_attention_heads": 40,
"num_hidden_layers": 64,
"num_key_value_heads": 8,
"quantization_config": {
"bits": 4,
"group_size": 128,
"modules_to_not_convert": null,
"quant_method": "awq",
"version": "gemm",
"zero_point": true
},
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 1000000.0,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.48.3",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 152064
}
2025-03-25 08:42:08,214 transformers.models.auto.image_processing_auto 304284 INFO Could not locate the image processor configuration file, will try to use the model config instead.
Could not locate the image processor configuration file, will try to use the model config instead.
INFO 03-25 08:42:15 config.py:542] This model supports multiple tasks: {'classify', 'reward', 'score', 'generate', 'embed'}. Defaulting to 'generate'.
WARNING 03-25 08:42:15 config.py:621] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
WARNING 03-25 08:42:15 arg_utils.py:1135] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.
INFO 03-25 08:42:15 config.py:1556] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 03-25 08:42:15 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/root/.xinference/cache/QwQ-32B-awq-32b', speculative_config=None, tokenizer='/root/.xinference/cache/QwQ-32B-awq-32b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/root/.xinference/cache/QwQ-32B-awq-32b, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False,
2025-03-25 08:42:15,870 transformers.tokenization_utils_base 304284 INFO loading file vocab.json
loading file vocab.json
2025-03-25 08:42:15,870 transformers.tokenization_utils_base 304284 INFO loading file merges.txt
loading file merges.txt
2025-03-25 08:42:15,870 transformers.tokenization_utils_base 304284 INFO loading file tokenizer.json
loading file tokenizer.json
2025-03-25 08:42:15,870 transformers.tokenization_utils_base 304284 INFO loading file added_tokens.json
loading file added_tokens.json
2025-03-25 08:42:15,870 transformers.tokenization_utils_base 304284 INFO loading file special_tokens_map.json
loading file special_tokens_map.json
2025-03-25 08:42:15,870 transformers.tokenization_utils_base 304284 INFO loading file tokenizer_config.json
loading file tokenizer_config.json
2025-03-25 08:42:15,870 transformers.tokenization_utils_base 304284 INFO loading file chat_template.jinja
loading file chat_template.jinja
2025-03-25 08:42:16,167 transformers.tokenization_utils_base 304284 INFO Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2025-03-25 08:42:16,257 transformers.generation.configuration_utils 304284 INFO loading configuration file /root/.xinference/cache/QwQ-32B-awq-32b/generation_config.json
loading configuration file /root/.xinference/cache/QwQ-32B-awq-32b/generation_config.json
2025-03-25 08:42:16,258 transformers.generation.configuration_utils 304284 INFO Generate config GenerationConfig {
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"pad_token_id": 151643,
"temperature": 0.6,
"top_k": 40,
"top_p": 0.95
}
Generate config GenerationConfig {
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"pad_token_id": 151643,
"temperature": 0.6,
"top_k": 40,
"top_p": 0.95
}
INFO 03-25 08:42:16 cuda.py:179] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 03-25 08:42:16 cuda.py:227] Using XFormers backend.
[rank0]:[W325 08:42:16.728141501 ProcessGroupGloo.cpp:715] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())
INFO 03-25 08:42:16 model_runner.py:1110] Starting to load model /root/.xinference/cache/QwQ-32B-awq-32b...
Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 20% Completed | 1/5 [00:03<00:12, 3.22s/it]
Loading safetensors checkpoint shards: 40% Completed | 2/5 [00:06<00:10, 3.49s/it]
Loading safetensors checkpoint shards: 60% Completed | 3/5 [00:10<00:07, 3.51s/it]
Loading safetensors checkpoint shards: 80% Completed | 4/5 [00:13<00:03, 3.32s/it]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:16<00:00, 3.30s/it]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:16<00:00, 3.34s/it]
INFO 03-25 08:42:34 model_runner.py:1115] Loading model weights took 18.1467 GB
INFO 03-25 08:42:37 worker.py:267] Memory profiling takes 3.34 seconds
INFO 03-25 08:42:37 worker.py:267] the current vLLM instance can use total_gpu_memory (21.66GiB) x gpu_memory_utilization (0.90) = 19.50GiB
INFO 03-25 08:42:37 worker.py:267] model weights take 18.15GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 1.41GiB; the rest of the memory reserved for KV Cache is -0.11GiB.
INFO 03-25 08:42:38 executor_base.py:110] # CUDA blocks: 0, # CPU blocks: 1024
INFO 03-25 08:42:38 executor_base.py:115] Maximum concurrency for 40960 tokens per request: 0.00x
2025-03-25 08:42:38,041 xinference.core.worker 4007711 ERROR Failed to load model QwQ-32B-0
Traceback (most recent call last):
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/worker.py", line 926, in launch_builtin_model
await model_ref.load()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 667, in send
result = await self._run_coro(message.message_id, coro)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/api.py", line 384, in __on_receive__
return await super().__on_receive__(message) # type: ignore
File "xoscar/core.pyx", line 558, in __on_receive__
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
result = await result
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/model.py", line 466, in load
self._model.load()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 330, in load
self._engine = AsyncLLMEngine.from_engine_args(engine_args)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 644, in from_engine_args
engine = cls(
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 594, in __init__
self.engine = self._engine_class(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 267, in __init__
super().__init__(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 276, in __init__
self._initialize_kv_caches()
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 429, in _initialize_kv_caches
self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 121, in initialize_cache
self.collective_rpc("initialize_cache",
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 51, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/utils.py", line 2220, in run_method
return func(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/worker/worker.py", line 291, in initialize_cache
raise_if_cache_size_invalid(num_gpu_blocks,
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/worker/worker.py", line 539, in raise_if_cache_size_invalid
raise ValueError("No available memory for the cache blocks. "
ValueError: [address=0.0.0.0:33337, pid=304284] No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
2025-03-25 08:42:38,234 xinference.core.worker 4007711 ERROR [request 7a78173a-0911-11f0-809b-1fb5340f90ef] Leave launch_builtin_model, error: [address=0.0.0.0:33337, pid=304284] No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine., elapsed time: 246 s
Traceback (most recent call last):
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/utils.py", line 93, in wrapped
ret = await func(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/worker.py", line 926, in launch_builtin_model
await model_ref.load()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 667, in send
result = await self._run_coro(message.message_id, coro)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/api.py", line 384, in __on_receive__
return await super().__on_receive__(message) # type: ignore
File "xoscar/core.pyx", line 558, in __on_receive__
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
result = await result
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/model.py", line 466, in load
self._model.load()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 330, in load
self._engine = AsyncLLMEngine.from_engine_args(engine_args)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 644, in from_engine_args
engine = cls(
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 594, in __init__
self.engine = self._engine_class(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 267, in __init__
super().__init__(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 276, in __init__
self._initialize_kv_caches()
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 429, in _initialize_kv_caches
self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 121, in initialize_cache
self.collective_rpc("initialize_cache",
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 51, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/utils.py", line 2220, in run_method
return func(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/worker/worker.py", line 291, in initialize_cache
raise_if_cache_size_invalid(num_gpu_blocks,
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/worker/worker.py", line 539, in raise_if_cache_size_invalid
raise ValueError("No available memory for the cache blocks. "
ValueError: [address=0.0.0.0:33337, pid=304284] No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
2025-03-25 08:42:38,252 xinference.api.restful_api 4007478 ERROR [address=0.0.0.0:33337, pid=304284] No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
Traceback (most recent call last):
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/api/restful_api.py", line 1002, in launch_model
model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 667, in send
result = await self._run_coro(message.message_id, coro)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/api.py", line 384, in __on_receive__
return await super().__on_receive__(message) # type: ignore
File "xoscar/core.pyx", line 558, in __on_receive__
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
result = await result
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/supervisor.py", line 1190, in launch_builtin_model
await _launch_model()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/supervisor.py", line 1125, in _launch_model
subpool_address = await _launch_one_model(
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/supervisor.py", line 1083, in _launch_one_model
subpool_address = await worker_ref.launch_builtin_model(
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 667, in send
result = await self._run_coro(message.message_id, coro)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/api.py", line 384, in __on_receive__
return await super().__on_receive__(message) # type: ignore
File "xoscar/core.pyx", line 558, in __on_receive__
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
result = await result
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/utils.py", line 93, in wrapped
ret = await func(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/worker.py", line 926, in launch_builtin_model
await model_ref.load()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 667, in send
result = await self._run_coro(message.message_id, coro)
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/home/pc/xinference-env/lib/python3.10/site-packages/xoscar/api.py", line 384, in __on_receive__
return await super().__on_receive__(message) # type: ignore
File "xoscar/core.pyx", line 558, in __on_receive__
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
result = await result
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/core/model.py", line 466, in load
self._model.load()
File "/home/pc/xinference-env/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 330, in load
self._engine = AsyncLLMEngine.from_engine_args(engine_args)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 644, in from_engine_args
engine = cls(
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 594, in __init__
self.engine = self._engine_class(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 267, in __init__
super().__init__(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 276, in __init__
self._initialize_kv_caches()
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 429, in _initialize_kv_caches
self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 121, in initialize_cache
self.collective_rpc("initialize_cache",
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 51, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/utils.py", line 2220, in run_method
return func(*args, **kwargs)
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/worker/worker.py", line 291, in initialize_cache
raise_if_cache_size_invalid(num_gpu_blocks,
File "/home/pc/xinference-env/lib/python3.10/site-packages/vllm/worker/worker.py", line 539, in raise_if_cache_size_invalid
raise ValueError("No available memory for the cache blocks. "
ValueError: [address=0.0.0.0:33337, pid=304284] No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
2025-03-25 08:47:37,313 xinference.core.cache_tracker 4007711 WARNING Not record version info for model_name: CogVideoX-2b
2025-03-25 08:47:37,315 xinference.core.cache_tracker 4007711 WARNING Not record version info for model_name: CogVideoX-5b
2025-03-25 08:47:37,316 xinference.core.cache_tracker 4007711 WARNING Not record version info for model_name: HunyuanVideo
2025-03-25 08:50:00,673 xinference.core.cache_tracker 4007711 WARNING Not record version info for model_name: CogVideoX-2b
2025-03-25 08:50:00,675 xinference.core.cache_tracker 4007711 WARNING Not record version info for model_name: CogVideoX-5b
2025-03-25 08:50:00,677 xinference.core.cache_tracker 4007711 WARNING Not record version info for model_name: HunyuanVideo