Subhadip007 committed on
Commit
76e224e
·
1 Parent(s): ff94536

feat: overhaul model routing — GLM-5.1 primary, strict 4-model fallback chain, purge all stale refs

Browse files
Files changed (3) hide show
  1. config/settings.py +1 -1
  2. src/api/main.py +1 -1
  3. src/rag/llm_client.py +34 -19
config/settings.py CHANGED
@@ -82,7 +82,7 @@ GROQ_API_KEY = os.getenv('GROQ_API_KEY') # Loaded from .env
82
  HF_API_KEY = os.getenv('HF_API_KEY')
83
  if HF_API_KEY:
84
  os.environ["HF_TOKEN"] = HF_API_KEY
85
- LLM_MODEL_NAME = 'llama-3.3-70b-versatile' # Groq model ID
86
  LLM_TEMPERATURE = 0.1 # Low = More factual/consistent
87
  LLM_MAX_TOKENS = 2048 # Max response tokens
88
 
 
82
  HF_API_KEY = os.getenv('HF_API_KEY')
83
  if HF_API_KEY:
84
  os.environ["HF_TOKEN"] = HF_API_KEY
85
+ LLM_MODEL_NAME = 'zai-org/GLM-5.1' # Primary model ID
86
  LLM_TEMPERATURE = 0.1 # Low = More factual/consistent
87
  LLM_MAX_TOKENS = 2048 # Max response tokens
88
 
src/api/main.py CHANGED
@@ -155,7 +155,7 @@ async def health_check(request: Request) -> HealthResponse:
155
 
156
  return HealthResponse(
157
  status = "healthy",
158
- model = "llama-3.3-70b-versatile",
159
  vector_db_size = qdrant_size,
160
  bm25_index_size = bm25_size,
161
  version = "1.0.0",
 
155
 
156
  return HealthResponse(
157
  status = "healthy",
158
+ model = "zai-org/GLM-5.1",
159
  vector_db_size = qdrant_size,
160
  bm25_index_size = bm25_size,
161
  version = "1.0.0",
src/rag/llm_client.py CHANGED
@@ -12,12 +12,34 @@ from config.settings import (
12
 
13
  logger = get_logger(__name__)
14
 
 
 
 
 
 
 
 
 
 
15
  class MultiModelClient:
16
  """
17
- Multi-model LLM client with Qwen primary and Groq backup.
18
- Supports code routing based on keywords.
 
 
 
 
 
19
  """
20
 
 
 
 
 
 
 
 
 
21
  def __init__(self):
22
  if GROQ_API_KEY:
23
  self.groq_client = Groq(api_key=GROQ_API_KEY)
@@ -26,18 +48,9 @@ class MultiModelClient:
26
 
27
  self.hf_api_key = HF_API_KEY
28
 
29
- self.primary_model = "Qwen/Qwen3.5-9B"
30
- self.secondary_model = "llama-3.3-70b-versatile"
31
- self.code_model = "Qwen/Qwen2.5-Coder-7B-Instruct"
32
-
33
- self.code_keywords = ["code", "implement", "function", "class", "python", "algorithm", "write a", "script"]
34
-
35
- def get_model_for_query(self, question: str):
36
- q_lower = question.lower()
37
- if any(kw in q_lower for kw in self.code_keywords):
38
- return [self.code_model, self.primary_model, self.secondary_model]
39
- return [self.primary_model, self.secondary_model]
40
-
41
  def _call_hf(self, model_id, messages, temperature, max_tokens, stream=False):
42
  if not self.hf_api_key:
43
  raise ValueError("HF_API_KEY not configured")
@@ -108,6 +121,9 @@ class MultiModelClient:
108
  else:
109
  return response.choices[0].message.content
110
 
 
 
 
111
  def generate(
112
  self,
113
  system_prompt: str,
@@ -119,26 +135,25 @@ class MultiModelClient:
119
  stream: bool = False
120
  ):
121
  """
122
- Generate response trying models in priority order.
123
  Returns a tuple of (result, model_used).
124
  If stream=True, result is a generator.
125
  Otherwise, result is a string.
126
  """
127
- models_to_try = self.get_model_for_query(original_query)
128
  messages = [{"role": "system", "content": system_prompt}]
129
  if history:
130
  messages.extend(history)
131
  messages.append({"role": "user", "content": user_prompt})
132
 
133
- for model in models_to_try:
134
  try:
135
- is_hf = "Qwen" in model
136
  logger.info(f"Attempting model: {model}")
137
  if is_hf:
138
  out = self._call_hf(model, messages, temperature, max_tokens, stream)
139
  else:
140
  out = self._call_groq(model, messages, temperature, max_tokens, stream)
141
-
142
  logger.info(f"Model {model} selected successfully.")
143
  return out, model
144
  except Exception as e:
 
12
 
13
  logger = get_logger(__name__)
14
 
15
+ # ---------------------------------------------------------------------------
16
+ # Model registry — groups each known model ID by the transport used to call it
17
+ # ---------------------------------------------------------------------------
18
+ # HF models are called via the HuggingFace Router endpoint.
19
+ # Groq models are called via the Groq SDK.
20
+ HF_MODELS = {"zai-org/GLM-5.1", "Qwen/Qwen3.5-9B", "Qwen/Qwen2.5-Coder-7B-Instruct"}
21
+ GROQ_MODELS = {"llama-3.3-70b-versatile"}
22
+
23
+
24
  class MultiModelClient:
25
  """
26
+ Multi-model LLM client with strict linear fallback.
27
+
28
+ Fallback order (never changes regardless of query content):
29
+ 1. zai-org/GLM-5.1 (HF — primary)
30
+ 2. Qwen/Qwen3.5-9B (HF — first fallback)
31
+ 3. llama-3.3-70b-versatile (Groq — second fallback)
32
+ 4. Qwen/Qwen2.5-Coder-7B-Instruct (HF — final fallback)
33
  """
34
 
35
+ # Strict, ordered fallback chain — do NOT re-order at runtime
36
+ MODEL_CHAIN = [
37
+ "zai-org/GLM-5.1",
38
+ "Qwen/Qwen3.5-9B",
39
+ "llama-3.3-70b-versatile",
40
+ "Qwen/Qwen2.5-Coder-7B-Instruct",
41
+ ]
42
+
43
  def __init__(self):
44
  if GROQ_API_KEY:
45
  self.groq_client = Groq(api_key=GROQ_API_KEY)
 
48
 
49
  self.hf_api_key = HF_API_KEY
50
 
51
+ # ------------------------------------------------------------------
52
+ # Transport helpers
53
+ # ------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
54
  def _call_hf(self, model_id, messages, temperature, max_tokens, stream=False):
55
  if not self.hf_api_key:
56
  raise ValueError("HF_API_KEY not configured")
 
121
  else:
122
  return response.choices[0].message.content
123
 
124
+ # ------------------------------------------------------------------
125
+ # Public API
126
+ # ------------------------------------------------------------------
127
  def generate(
128
  self,
129
  system_prompt: str,
 
135
  stream: bool = False
136
  ):
137
  """
138
+ Generate response trying models in strict fallback order.
139
  Returns a tuple of (result, model_used).
140
  If stream=True, result is a generator.
141
  Otherwise, result is a string.
142
  """
 
143
  messages = [{"role": "system", "content": system_prompt}]
144
  if history:
145
  messages.extend(history)
146
  messages.append({"role": "user", "content": user_prompt})
147
 
148
+ for model in self.MODEL_CHAIN:
149
  try:
150
+ is_hf = model in HF_MODELS
151
  logger.info(f"Attempting model: {model}")
152
  if is_hf:
153
  out = self._call_hf(model, messages, temperature, max_tokens, stream)
154
  else:
155
  out = self._call_groq(model, messages, temperature, max_tokens, stream)
156
+
157
  logger.info(f"Model {model} selected successfully.")
158
  return out, model
159
  except Exception as e: