tommytracx committed on
Commit 9104d9c · verified · 1 Parent(s): 36a53a4

Add tokenization_ollama.py

Files changed (1)
  1. tokenization_ollama.py +125 -0
tokenization_ollama.py ADDED
@@ -0,0 +1,125 @@
"""
NeuralQuantum Ollama Tokenizer for Hugging Face Transformers
"""

import json
from typing import List, Optional, Union

from transformers import PreTrainedTokenizer


class NeuralQuantumOllamaTokenizer(PreTrainedTokenizer):
    """Tokenizer for the NeuralQuantum Ollama model."""

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        quantum_token="<|quantum|>",
        classical_token="<|classical|>",
        system_token="<|system|>",
        user_token="<|user|>",
        assistant_token="<|assistant|>",
        add_prefix_space=False,
        **kwargs
    ):
        # Simple vocabulary for demonstration
        vocab = {
            "<|endoftext|>": 0,
            "<|quantum|>": 1,
            "<|classical|>": 2,
            "<|system|>": 3,
            "<|user|>": 4,
            "<|assistant|>": 5,
        }

        # Add a basic character-level vocabulary
        for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:'\"-()[]{}"):
            vocab[char] = i + 6

        # Set the vocab before calling super().__init__, which may query get_vocab()
        self._vocab = vocab
        self._ids_to_tokens = {v: k for k, v in vocab.items()}

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs
        )

        self.quantum_token = quantum_token
        self.classical_token = classical_token
        self.system_token = system_token
        self.user_token = user_token
        self.assistant_token = assistant_token

    @property
    def vocab_size(self):
        return len(self._vocab)

    def get_vocab(self):
        return dict(self._vocab)

    def _tokenize(self, text):
        """Basic tokenization: split on whitespace."""
        tokens = []
        current_token = ""

        for char in text:
            if char.isspace():
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
            else:
                current_token += char

        if current_token:
            tokens.append(current_token)

        return tokens

    def _convert_token_to_id(self, token):
        """Convert a token to its ID, falling back to the unknown-token ID."""
        return self._vocab.get(token, self._vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Convert an ID back to its token."""
        return self._ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Convert tokens back to a string."""
        return " ".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save the vocabulary to a JSON file."""
        vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        vocab_path = f"{save_directory}/{vocab_file}"

        with open(vocab_path, "w") as f:
            json.dump(self._vocab, f, indent=2)

        return (vocab_path,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Build model inputs in the Ollama format by appending the EOS token."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """Get the special-tokens mask, matching the trailing EOS added by build_inputs_with_special_tokens."""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]
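
For context on how the class behaves: _tokenize splits purely on whitespace, while the vocabulary holds only the special tokens and single characters, so whitespace-delimited words fall back to the unknown-token ID. A minimal usage sketch under that reading of the code, importing the module by the filename added in this commit:

from tokenization_ollama import NeuralQuantumOllamaTokenizer

tokenizer = NeuralQuantumOllamaTokenizer()

# Single characters are in the vocabulary, so they map to real IDs.
print(tokenizer.convert_tokens_to_ids(["a", "b", "?"]))   # [6, 7, 72] with the character table above

# Whole words are not in the vocabulary; each falls back to the unk ID (0), and
# encode() appends the EOS token (also ID 0 here) via build_inputs_with_special_tokens.
print(tokenizer.encode("hello quantum world"))            # [0, 0, 0, 0]

To load the class through AutoTokenizer.from_pretrained(..., trust_remote_code=True), the repository would also need a tokenizer_config.json whose auto_map entry points at tokenization_ollama.NeuralQuantumOllamaTokenizer; that configuration file is not part of this commit.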
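The role and mode markers (<|system|>, <|user|>, <|assistant|>, <|quantum|>, <|classical|>) are placed in the vocabulary and exposed as attributes, but no chat template uses them in this file. A hypothetical sketch of composing a prompt with them; the layout itself is an assumption, not something the commit defines:

def build_prompt_ids(tokenizer, system_msg, user_msg):
    # Hypothetical layout: the commit defines the role tokens but no template.
    text = f"{tokenizer.system_token} {system_msg} {tokenizer.user_token} {user_msg} {tokenizer.assistant_token}"
    return tokenizer.encode(text, add_special_tokens=False)

ids = build_prompt_ids(NeuralQuantumOllamaTokenizer(), "Be concise.", "Explain qubits.")
# The whitespace splitter keeps each role marker intact, so they encode to their own
# IDs (3, 4, 5), while ordinary words fall back to the unk ID (0) under this
# demonstration vocabulary: [3, 0, 0, 4, 0, 0, 5]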
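save_vocabulary writes the mapping as plain JSON, so a round-trip check is straightforward; a small sketch assuming only a temporary directory:

import json
import tempfile

from tokenization_ollama import NeuralQuantumOllamaTokenizer

tok = NeuralQuantumOllamaTokenizer()
with tempfile.TemporaryDirectory() as save_dir:
    (vocab_path,) = tok.save_vocabulary(save_dir)   # writes save_dir/vocab.json
    with open(vocab_path) as f:
        assert json.load(f) == tok.get_vocab()      # the JSON file mirrors get_vocab()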