CodeBasics FAQ & Text Generation System
An AI system for answering CodeBasics bootcamp questions, with two capabilities:
- Smart FAQ retrieval for accurate answers to bootcamp questions
- Text generation for general AI/ML topics
Model Details
- Developed by: callidus
- Model type: Hybrid (TF-IDF FAQ + Transformer)
- Language: English
- License: Apache 2.0
Quick Start
Installation
pip install torch pandas scikit-learn huggingface_hub
Complete Inference Code
Copy and paste this complete code to use the model:
# ============================================================================
# COMBINED INFERENCE: TRANSFORMER MODEL + FAQ SYSTEM
# ============================================================================
!pip install -q torch huggingface_hub pandas scikit-learn
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import math
from huggingface_hub import hf_hub_download, login
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# ============================================================================
# CONFIGURATION
# ============================================================================
HF_TOKEN = "hf_your_token_here" # Replace with your token
REPO_ID = "callidus/good"
login(token=HF_TOKEN, add_to_git_credential=False)
# ============================================================================
# TRANSFORMER MODEL ARCHITECTURE
# ============================================================================
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.d_k)
        return x.transpose(1, 2)

    def forward(self, x, mask=None):
        batch_size = x.size(0)
        Q = self.split_heads(self.W_q(x), batch_size)
        K = self.split_heads(self.W_k(x), batch_size)
        V = self.split_heads(self.W_v(x), batch_size)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention_weights = F.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, V)
        attention_output = attention_output.transpose(1, 2).contiguous()
        attention_output = attention_output.view(batch_size, -1, self.d_model)
        return self.W_o(attention_output), attention_weights
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output, attn_weights = self.attention(x, mask)
        x = self.norm1(x + self.dropout1(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout2(ff_output))
        return x, attn_weights
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, num_heads=8,
                 num_layers=6, d_ff=2048, dropout=0.1, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, x, mask=None):
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        for transformer_block in self.transformer_blocks:
            x, attn_weights = transformer_block(x, mask)
        logits = self.fc_out(x)
        return logits
class Tokenizer:
    def __init__(self, tokenizer_data):
        self.word2idx = tokenizer_data['word2idx']
        self.idx2word = {int(k): v for k, v in tokenizer_data['idx2word'].items()}
        self.vocab_size = tokenizer_data['vocab_size']
        self.special_tokens = tokenizer_data['special_tokens']

    def encode(self, text):
        words = re.findall(r'\w+', text.lower())
        return [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words]

    def decode(self, indices):
        words = []
        for idx in indices:
            if idx in self.idx2word:
                word = self.idx2word[idx]
                if word not in ['<PAD>', '<SOS>', '<EOS>']:
                    words.append(word)
        return ' '.join(words)
class TransformerInference:
    def __init__(self, repo_id, token=None, device=None):
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = None
        self.tokenizer = None
        self.config = None
        self.token = token
        self.load_from_hub(repo_id)

    def load_from_hub(self, repo_id):
        # Fetch config, weights and tokenizer from the Hugging Face Hub
        config_path = hf_hub_download(repo_id=repo_id, filename="model_config.json", token=self.token)
        weights_path = hf_hub_download(repo_id=repo_id, filename="model_weights.pt", token=self.token)
        tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.json", token=self.token)
        with open(config_path, 'r') as f:
            self.config = json.load(f)
        with open(tokenizer_path, 'r') as f:
            tokenizer_data = json.load(f)
        self.tokenizer = Tokenizer(tokenizer_data)
        self.model = TransformerModel(
            vocab_size=self.config['vocab_size'],
            d_model=self.config['d_model'],
            num_heads=self.config['num_heads'],
            num_layers=self.config['num_layers'],
            d_ff=self.config['d_ff'],
            dropout=self.config.get('dropout', 0.1),
            max_len=self.config.get('max_len', 512)
        )
        state_dict = torch.load(weights_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model = self.model.to(self.device)
        self.model.eval()

    def generate(self, prompt, max_length=50, temperature=0.8, top_k=50, top_p=0.9):
        self.model.eval()
        tokens = self.tokenizer.encode(prompt)
        if not tokens or all(t == self.tokenizer.word2idx['<UNK>'] for t in tokens):
            tokens = [self.tokenizer.word2idx['<SOS>']]
        generated = tokens.copy()
        with torch.no_grad():
            for _ in range(max_length):
                # Fixed 64-token context window, left-padded with <PAD>
                input_tokens = generated[-64:]
                if len(input_tokens) < 64:
                    input_tokens = [self.tokenizer.word2idx['<PAD>']] * (64 - len(input_tokens)) + input_tokens
                input_ids = torch.tensor([input_tokens], dtype=torch.long).to(self.device)
                logits = self.model(input_ids)
                next_token_logits = logits[0, -1, :] / temperature
                next_token_logits[self.tokenizer.word2idx['<PAD>']] = -float('inf')
                next_token_logits[self.tokenizer.word2idx['<UNK>']] = -float('inf')
                # Top-k filtering
                if top_k > 0:
                    indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                    next_token_logits[indices_to_remove] = -float('inf')
                # Nucleus (top-p) filtering
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0
                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    next_token_logits[indices_to_remove] = -float('inf')
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1).item()
                if next_token == self.tokenizer.word2idx['<EOS>']:
                    break
                generated.append(next_token)
        return self.tokenizer.decode(generated)
# ============================================================================
# FAQ SYSTEM
# ============================================================================
class CodeBasicsFAQ:
    def __init__(self, csv_path):
        # Try a few common encodings, since the CSV may not be UTF-8
        encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
        df = None
        for encoding in encodings:
            try:
                df = pd.read_csv(csv_path, encoding=encoding)
                break
            except Exception:
                continue
        if df is None:
            raise ValueError("Could not load FAQ CSV with any of the attempted encodings")
        self.df = df
        self.questions = df['prompt'].tolist()
        self.answers = df['response'].tolist()
        self.vectorizer = TfidfVectorizer(
            lowercase=True,
            stop_words='english',
            ngram_range=(1, 2),
            max_features=1000
        )
        self.question_vectors = self.vectorizer.fit_transform(self.questions)

    def find_best_match(self, query, threshold=0.2):
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.question_vectors)[0]
        best_idx = np.argmax(similarities)
        best_score = similarities[best_idx]
        if best_score >= threshold:
            return {
                'question': self.questions[best_idx],
                'answer': self.answers[best_idx],
                'confidence': best_score
            }
        return None
# ============================================================================
# LOAD BOTH SYSTEMS
# ============================================================================
print("Loading systems...")
transformer = TransformerInference(repo_id=REPO_ID, token=HF_TOKEN)
csv_path = hf_hub_download(repo_id=REPO_ID, filename="codebasics_faqs.csv", token=HF_TOKEN)
faq = CodeBasicsFAQ(csv_path)
print("Ready!")
# ============================================================================
# SMART INFERENCE FUNCTION
# ============================================================================
def smart_inference(query):
    """Automatically chooses FAQ retrieval or text generation"""
    faq_match = faq.find_best_match(query)
    if faq_match:
        return faq_match['answer']
    else:
        return transformer.generate(query, max_length=50, temperature=0.8)
# ============================================================================
# USAGE
# ============================================================================
# Ask questions - system automatically picks best method
result = smart_inference("Can I take this bootcamp without programming experience?")
print(result)
# Interactive mode
while True:
    user_input = input("Ask me: ").strip()
    if user_input.lower() in ['quit', 'exit']:
        break
    print(smart_inference(user_input))
Usage Examples
FAQ Questions (Returns Accurate Answers)
result = smart_inference("Can I take this bootcamp without programming experience?")
# Returns: "Yes, this is the perfect bootcamp for anyone..."
result = smart_inference("Why should I trust Codebasics?")
# Returns: "Till now 9000+ learners have benefitted..."
General Topics (Returns Generated Text)
result = smart_inference("machine learning algorithms")
# Returns: Generated text about ML
result = smart_inference("artificial intelligence")
# Returns: Generated text about AI
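Direct Generation (Optional)
If you want finer control over the generated text, you can bypass smart_inference and call the transformer directly. A minimal sketch; the sampling values below are illustrative, not tuned defaults:
# Direct generation with explicit sampling parameters (illustrative values)
result = transformer.generate(
    "neural networks",
    max_length=80,     # generate up to 80 tokens
    temperature=0.7,   # lower = more conservative sampling
    top_k=30,          # keep only the 30 most likely tokens
    top_p=0.9          # nucleus sampling threshold
)
print(result)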
Example Questions
Bootcamp Questions (FAQ System)
- "Can I take this bootcamp without programming experience?"
- "Why should I trust Codebasics?"
- "What are the prerequisites?"
- "Do you provide job assistance?"
- "Is there lifetime access?"
- "Can I attend while working full time?"
- "What is the duration of this bootcamp?"
General Topics (Text Generation)
- "machine learning"
- "artificial intelligence"
- "neural networks"
- "data science"
Files in Repository
- codebasics_faqs.csv - FAQ database (50+ Q&A pairs)
- model_config.json - Transformer configuration
- model_weights.pt - Transformer weights
- tokenizer.json - Tokenizer vocabulary
- README.md - This documentation
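If you prefer to fetch the whole repository at once instead of downloading individual files, huggingface_hub's snapshot_download can be used. A minimal sketch, assuming HF_TOKEN from the configuration section above:
from huggingface_hub import snapshot_download

# Download every file in the repo to the local HF cache and return its path
local_dir = snapshot_download(repo_id="callidus/good", token=HF_TOKEN)
print(local_dir)  # contains codebasics_faqs.csv, model_weights.pt, etc.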
Model Architecture
FAQ System
- Method: TF-IDF + cosine similarity
- Accuracy: ~90% on rephrased versions of the stored FAQ questions
- Threshold: 0.2 cosine-similarity score (adjustable; see the sketch below)
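The threshold is exposed as an argument of find_best_match, so you can inspect the match confidence or tighten the cutoff yourself. A minimal sketch using the faq object from the inference code above; the query and threshold are illustrative:
# Inspect the best FAQ match and its cosine-similarity confidence
match = faq.find_best_match("Do I need to know Python before joining?", threshold=0.3)
if match:
    print(f"Matched FAQ: {match['question']}")
    print(f"Confidence:  {match['confidence']:.2f}")
    print(f"Answer:      {match['answer']}")
else:
    print("No FAQ above the threshold; would fall back to text generation.")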
Transformer Model
- Layers: 6 transformer blocks
- Hidden size: 512
- Attention heads: 8
- Vocabulary: 229 tokens
- Max length: 512 tokens
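These hyperparameters map one-to-one onto the TransformerModel constructor from the inference code above. As a rough cross-check, you can instantiate the architecture and count its parameters (the exact figure depends on the vocabulary actually shipped in tokenizer.json):
# Instantiate the architecture with the documented hyperparameters
model = TransformerModel(
    vocab_size=229,   # from tokenizer.json
    d_model=512,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    max_len=512,
)
num_params = sum(p.numel() for p in model.parameters())
print(f"{num_params / 1e6:.1f}M parameters")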
How It Works
The system intelligently routes queries:
- FAQ match? → Returns the accurate FAQ answer
- No match? → Falls back to text generation
Users don't need to specify which system to use - it's automatic!
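If you want to know which path answered a query (for example, for logging), a small variant of smart_inference can report the source and confidence as well. This is a sketch, not part of the shipped code:
def smart_inference_verbose(query):
    """Like smart_inference, but also reports which system produced the answer."""
    faq_match = faq.find_best_match(query)
    if faq_match:
        return {"source": "faq", "confidence": faq_match['confidence'], "answer": faq_match['answer']}
    return {"source": "generated", "confidence": None,
            "answer": transformer.generate(query, max_length=50, temperature=0.8)}

print(smart_inference_verbose("Do you provide job assistance?"))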
Limitations
- FAQ requires questions similar to training data
- Text generation has limited vocabulary (229 tokens)
- Best for CodeBasics bootcamp questions
- English language only
Citation
@misc{codebasics-faq-2024,
author = {callidus},
title = {CodeBasics FAQ and Text Generation System},
year = {2024},
publisher = {HuggingFace},
howpublished = {\url{https://huggingface.co/callidus/good}}
}
License
Apache 2.0
Contact
For CodeBasics courses: codebasics.io