Text Generation
Transformers
Safetensors
English
Chinese
deepseek_v3
conversational
custom_code
text-generation-inference
4-bit precision
awq
Instructions to use jasonyux/DeepSeek-R1-0528-AWQ with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use jasonyux/DeepSeek-R1-0528-AWQ with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="jasonyux/DeepSeek-R1-0528-AWQ", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("jasonyux/DeepSeek-R1-0528-AWQ", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("jasonyux/DeepSeek-R1-0528-AWQ", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use jasonyux/DeepSeek-R1-0528-AWQ with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "jasonyux/DeepSeek-R1-0528-AWQ"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "jasonyux/DeepSeek-R1-0528-AWQ",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/jasonyux/DeepSeek-R1-0528-AWQ
- SGLang
How to use jasonyux/DeepSeek-R1-0528-AWQ with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "jasonyux/DeepSeek-R1-0528-AWQ" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "jasonyux/DeepSeek-R1-0528-AWQ",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "jasonyux/DeepSeek-R1-0528-AWQ" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "jasonyux/DeepSeek-R1-0528-AWQ",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use jasonyux/DeepSeek-R1-0528-AWQ with Docker Model Runner:
docker model run hf.co/jasonyux/DeepSeek-R1-0528-AWQ
#!/usr/bin/env python3
"""
Deep debugging of the chat template issue.

Loads the tokenizer at MODEL_PATH and runs six print-only diagnostics that
show whether an ``enable_thinking=False`` argument reaches the tokenizer's
chat template and changes the rendered prompt.
"""
import inspect
import json
import re

import jinja2
import transformers
from transformers import AutoTokenizer

MODEL_PATH = "/home/hotaisle/workspace/models/DeepSeek-R1-0528"

# Text the checks look for in a rendered prompt when thinking is disabled.
# It holds real newline characters: rendered output contains newlines, not
# the two-character sequence backslash + "n".  Keeping it in a constant also
# keeps backslashes out of f-string replacement fields, which is a
# SyntaxError before Python 3.12.
EMPTY_THINK_BLOCK = "<think>\n\n</think>\n\n"

# Conversation shared by Test 2 and Test 4.  Defined up front so Test 4 does
# not depend on how far Test 2 got before raising.
messages = [{"role": "user", "content": "What is 2+2?"}]


def _find_enclosing_statement(template_text, needle):
    """Return the ``{% ... %}`` statement that contains *needle*, or None.

    None is returned when *needle* does not occur in *template_text*, or when
    its first occurrence is not located between a ``{%`` and its matching
    ``%}``.
    """
    needle_idx = template_text.find(needle)
    if needle_idx == -1:
        return None
    open_idx = template_text.rfind("{%", 0, needle_idx)
    close_idx = template_text.find("%}", needle_idx)
    if open_idx == -1 or close_idx == -1:
        return None
    # A "%}" between the opener and the needle means that statement already
    # ended, so the needle lives outside of it.
    if template_text.find("%}", open_idx, needle_idx) != -1:
        return None
    return template_text[open_idx:close_idx + 2]


print(f"Transformers version: {transformers.__version__}")
print("-" * 60)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Test 1: Check if the tokenizer supports custom kwargs
print("\nTest 1: Checking tokenizer's apply_chat_template signature")
sig = inspect.signature(tokenizer.apply_chat_template)
print(f"Parameters: {list(sig.parameters.keys())}")

# Test 2: Try to apply template manually with Jinja2
print("\n\nTest 2: Manual Jinja2 template application")
try:
    from jinja2 import Environment, BaseLoader

    # Create Jinja2 environment
    # NOTE(review): this is a plain Environment; transformers may render with
    # different environment settings, so output can differ from
    # apply_chat_template -- confirm before comparing the two byte-for-byte.
    env = Environment(loader=BaseLoader())
    template_str = tokenizer.chat_template
    template = env.from_string(template_str)

    # Test with enable_thinking=False
    output = template.render(
        messages=messages,
        bos_token=tokenizer.bos_token,
        eos_token=tokenizer.eos_token,
        add_generation_prompt=True,
        enable_thinking=False,  # This is what we're testing
    )
    has_empty_think = EMPTY_THINK_BLOCK in output
    print("Manual render with enable_thinking=False:")
    print(f"Output ends with: {output[-130:]!r}")
    print(f"Contains empty think block: {has_empty_think}")
except Exception as e:
    # Diagnostic script: report the failure and carry on with the next test.
    print(f"Error in manual rendering: {e}")

# Test 3: Check the exact template condition
print("\n\nTest 3: Analyzing template condition")
template_str = tokenizer.chat_template
if not isinstance(template_str, str):
    print("Tokenizer has no string chat_template to analyze")
elif "enable_thinking" not in template_str:
    # Jinja ignores render variables the template never reads.
    print("Template never references 'enable_thinking'; passing it has no effect")
else:
    condition = _find_enclosing_statement(template_str, "enable_thinking")
    if condition is None:
        print("'enable_thinking' is not inside a {% ... %} statement")
    else:
        print(f"Found condition: {condition}")
        # Check for potential issues
        if "is false" in condition:
            print("✓ Uses 'is false' (correct for Jinja2)")
        elif "== false" in condition:
            print("⚠ Uses '== false' (might need 'is false')")
        elif "== False" in condition:
            print("⚠ Uses '== False' (Python style, might need 'is false')")

# Test 4: Try different ways to pass the parameter
print("\n\nTest 4: Testing different parameter passing methods")

# Method 1: Direct kwargs (what we've been trying)
try:
    result1 = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
except Exception as e:
    print(f"Method 1 (kwargs): Error - {e}")
else:
    print("Method 1 (kwargs): Works")
    # "Works" only means no exception was raised; also show whether the
    # argument actually changed the rendered prompt.
    print(f"  Contains empty think block: {EMPTY_THINK_BLOCK in result1}")

# Method 2: Through a dict
try:
    result2 = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        **{"enable_thinking": False},
    )
except Exception as e:
    print(f"Method 2 (dict unpacking): Error - {e}")
else:
    print("Method 2 (dict unpacking): Works")
    print(f"  Contains empty think block: {EMPTY_THINK_BLOCK in result2}")

# Test 5: Check if newer transformers supports it
print("\n\nTest 5: Checking transformers version compatibility")
print(f"Current version: {transformers.__version__}")
print("Note: Custom chat template parameters require transformers >= 4.34.0")

# Parse version: only the leading "<major>.<minor>" digits matter, so
# suffixes such as ".dev0" or "rc1" are ignored instead of crashing int().
version_match = re.match(r"(\d+)\.(\d+)", transformers.__version__)
if version_match is None:
    print("✗ Could not parse the transformers version string")
else:
    major, minor = (int(part) for part in version_match.groups())
    if (major, minor) >= (4, 34):
        print("✓ Version should support custom parameters")
    else:
        print("✗ Version too old for custom parameters!")

# Test 6: Alternative - modify the template to always inject empty think
print("\n\nTest 6: Testing a simpler template modification")
print("If all else fails, you could modify the template to always inject empty think")
print("when a specific string is in the user message, like 'NOTHINK'")