pengzhendong committed on
Commit 6139032 · 1 Parent(s): 0a34f48

update images
.gitattributes CHANGED
@@ -1,35 +1,49 @@
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *.tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.db* filter=lfs diff=lfs merge=lfs -text
+ *.ark* filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.gguf* filter=lfs diff=lfs merge=lfs -text
+ *.ggml filter=lfs diff=lfs merge=lfs -text
+ *.llamafile* filter=lfs diff=lfs merge=lfs -text
+ *.pt2 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
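
These rules decide which paths are stored as Git LFS pointers rather than raw blobs. As a quick sanity check after this change (a hypothetical invocation, not part of the commit), `git check-attr` shows which attributes a given path picks up:

```shell
# From the repository root: show the LFS-related attributes for two paths.
# model.pt matches the *.pt rule; Qwen3-0.6B/config.json matches the new *.json rule.
git check-attr filter diff merge -- model.pt Qwen3-0.6B/config.json
# Each matching path should report "filter: lfs", "diff: lfs", and "merge: lfs".
```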
Qwen3-0.6B/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:660db3b73d788119c04535e48cf9be5f55bc3100841a718637ae695b442f27dd
+ size 726
Qwen3-0.6B/generation_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2325da0f15bb848e018c5ae071b7943332e9f871d6b60e2ed22ca97d4cb993d2
+ size 239
Qwen3-0.6B/merges.txt ADDED
The diff for this file is too large to render.
 
Qwen3-0.6B/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b7828c3b5b16f2f3a6a784d7174c8d8af6b41ef639c79e655ccd6a1b84d7e87
+ size 1503300296
Qwen3-0.6B/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
Qwen3-0.6B/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5d09f07b48c3086c508b30d1c9114bd1189145b74e982a265350c923acd8101
+ size 9732
Qwen3-0.6B/vocab.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
+ size 2776833
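
The files above are committed as Git LFS pointer stubs rather than raw contents: `version` names the pointer spec, `oid sha256:…` is the hash of the real payload, and `size` is its byte count (so `model.safetensors` is about 1.5 GB). A sketch of fetching the actual payloads after cloning, using standard `git lfs` flags:

```shell
# Materialize just the Qwen3-0.6B files (-I/--include filters by path pattern):
git lfs pull --include="Qwen3-0.6B/*"
# Or fetch every object tracked by the .gitattributes rules:
git lfs pull
```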
README.md CHANGED
@@ -1,3 +1,134 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ ---
+
+ # Fun-ASR
+
+ 「[简体中文](README_zh.md)」|「English」
+
+ Fun-ASR is an end-to-end speech recognition large model from Tongyi Lab. Trained on tens of millions of hours of real speech data, it has strong contextual understanding and industry adaptability, supports low-latency real-time transcription, and covers 31 languages. It excels in vertical domains such as education and finance, accurately recognizing professional terminology and industry-specific expressions, and it effectively counters challenges such as hallucinated output and language confusion, achieving the goal of "hearing clearly, understanding the meaning, and writing it accurately."
+
+ <div align="center">
+ <img src="images/funasr-v2.png">
+ </div>
+
+ <div align="center">
+ <h4>
+ <a href="https://funaudiollm.github.io/Fun-ASR/"> Homepage </a>
+ |<a href="#core-features"> Core Features </a>
+ |<a href="#performance-evaluation"> Performance Evaluation </a>
+ |<a href="#environment-setup"> Environment Setup </a>
+ |<a href="#usage-tutorial"> Usage Tutorial </a>
+
+ </h4>
+
+ Model Repository: [modelscope](https://www.modelscope.cn/models/FunAudioLLM/Fun-ASR-Nano-2512), [huggingface](https://huggingface.co/FunAudioLLM/FunASR)
+
+ Online Experience:
+ [ModelScope Community Space](https://modelscope.cn/studios/FunAudioLLM/Fun-ASR-Nano), [huggingface space](https://huggingface.co/spaces/FunAudioLLM/FunASR)
+
+ </div>
+
+ # Core Features 🎯
+
+ **Fun-ASR** focuses on high-precision speech recognition, multi-language support, and industry customization:
+
+ - **Far-field, high-noise recognition:** Deeply optimized for distant pickup and high-noise scenarios (conference rooms, in-vehicle environments, industrial sites, etc.), raising recognition accuracy to **93%**.
+ - **Chinese dialects and regional accents:**
+   - Supports **7 major dialect groups**: Wu, Cantonese, Min, Hakka, Gan, Xiang, and Jin
+   - Covers **26 regional accents** from more than 20 regions, including Henan, Shaanxi, Hubei, Sichuan, Chongqing, Yunnan, Guizhou, Guangdong, and Guangxi
+ - **Multi-language free speech:** Recognizes **31 languages**, with focused optimization for East and Southeast Asian languages, and supports free language switching and mixed-language recognition.
+ - **Lyric recognition over background music:** Strengthened recognition under background-music interference, including accurate transcription of the lyrics in songs.
+
+ # Environment Setup 🐍
+
+ ```shell
+ pip install -r requirements.txt
+ ```
+
+ <a name="usage-tutorial"></a>
+
+ # TODO
+
+ - [ ] Support returning timestamps
+ - [ ] Support speaker diarization
+ - [ ] Support model training
+
+ # Usage 🛠️
+
+ ## Inference
+
+ ### Using funasr for inference
+
+ ```python
+ from funasr import AutoModel
+
+
+ def main():
+     model_dir = "FunAudioLLM/fun-asr-nano"
+     model = AutoModel(
+         model=model_dir,
+         trust_remote_code=True,
+         remote_code="./model.py",
+         device="cuda:0",
+     )
+
+     wav_path = f"{model.model_path}/example/zh.mp3"
+     res = model.generate(input=[wav_path], cache={}, batch_size=1)
+     text = res[0]["text"]
+     print(text)
+
+     # Second pass: add a VAD front-end so long audio is segmented before recognition.
+     model = AutoModel(
+         model=model_dir,
+         trust_remote_code=True,
+         vad_model="fsmn-vad",
+         vad_kwargs={"max_single_segment_time": 30000},
+         remote_code="./model.py",
+         device="cuda:0",
+     )
+     res = model.generate(input=[wav_path], cache={}, batch_size=1)
+     text = res[0]["text"]
+     print(text)
+
+
+ if __name__ == "__main__":
+     main()
+ ```
+
+ ### Direct Inference
+
+ ```python
+ from model import FunASRNano
+
+
+ def main():
+     model_dir = "FunAudioLLM/fun-asr-nano"
+     m, kwargs = FunASRNano.from_pretrained(model=model_dir, device="cuda:0")
+     m.eval()
+
+     wav_path = f"{kwargs['model_path']}/example/zh.mp3"
+     res = m.inference(data_in=[wav_path], **kwargs)
+     text = res[0][0]["text"]
+     print(text)
+
+
+ if __name__ == "__main__":
+     main()
+ ```
+
+ <details><summary> Parameter Description (click to expand) </summary>
+
+ - `model_dir`: Model name, or the path to a model on local disk.
+ - `trust_remote_code`: Whether to trust remote code when loading a custom model implementation.
+ - `remote_code`: Location of the model implementation (e.g., `model.py` in the current directory); absolute and relative paths are both supported.
+ - `device`: Device to run on, such as "cuda:0" or "cpu".
+
+ </details>
+
+ # Performance Evaluation 📝
+
+ We compared the multilingual speech recognition performance of Fun-ASR with other models on open-source benchmark datasets, including AISHELL-1, AISHELL-2, WenetSpeech, LibriSpeech, and Common Voice.
+
+ <div align="center">
+ <img src="images/compare_en.png" width="800" />
+ </div>
README_zh.md ADDED
@@ -0,0 +1,134 @@
+ ---
+ license: apache-2.0
+ ---
+
+ # Fun-ASR
+
+ 「简体中文」|「[English](README.md)」
+
+ Fun-ASR is an end-to-end speech recognition large model from Tongyi Lab. Trained on tens of millions of hours of real speech data, it has strong contextual understanding and industry adaptability, supports low-latency real-time transcription, and covers 31 languages. It excels in vertical domains such as education and finance, accurately recognizing professional terminology and industry-specific expressions, and it effectively counters challenges such as hallucinated output and language confusion, achieving the goal of "hearing clearly, understanding the meaning, and writing it accurately."
+
+ <div align="center">
+ <img src="images/funasr-v2.png">
+ </div>
+
+ <div align="center">
+ <h4>
+ <a href="https://funaudiollm.github.io/Fun-ASR/"> Homepage </a>
+ |<a href="#核心特性"> Core Features </a>
+ |<a href="#性能评测"> Performance Evaluation </a>
+ |<a href="#环境安装"> Environment Setup </a>
+ |<a href="#用法教程"> Usage Tutorial </a>
+
+ </h4>
+
+ Model Repository: [modelscope](https://www.modelscope.cn/models/FunAudioLLM/Fun-ASR-Nano-2512), [huggingface (coming)](https://huggingface.co/FunAudioLLM/FunASR)
+
+ Online Experience:
+ [ModelScope Community Space](https://modelscope.cn/studios/FunAudioLLM/Fun-ASR-Nano), [huggingface space (coming)](https://huggingface.co/spaces/FunAudioLLM/FunASR)
+
+ </div>
+
+ # Core Features 🎯
+
+ **Fun-ASR** focuses on high-precision speech recognition, multi-language support, and industry customization:
+
+ - **Far-field, high-noise recognition:** Deeply optimized for distant pickup and high-noise scenarios (conference rooms, in-vehicle environments, industrial sites, etc.), raising recognition accuracy to **93%**.
+ - **Chinese dialects and regional accents:**
+   - Supports **7 major dialect groups**: Wu, Cantonese, Min, Hakka, Gan, Xiang, and Jin
+   - Covers **26 regional accents** from more than 20 regions, including Henan, Shaanxi, Hubei, Sichuan, Chongqing, Yunnan, Guizhou, Guangdong, and Guangxi
+ - **Multi-language free speech:** Recognizes **31 languages**, with focused optimization for East and Southeast Asian languages, and supports free language switching and mixed-language recognition.
+ - **Lyric recognition over background music:** Strengthened recognition under background-music interference, including accurate transcription of the lyrics in songs.
+
+ # Environment Setup 🐍
+
+ ```shell
+ pip install -r requirements.txt
+ ```
+
+ <a name="用法教程"></a>
+
+ # TODO
+
+ - [ ] Support returning timestamps
+ - [ ] Support speaker diarization
+ - [ ] Support model training
+
+ # Usage 🛠️
+
+ ## Inference
+
+ ### Using funasr for inference
+
+ ```python
+ from funasr import AutoModel
+
+
+ def main():
+     model_dir = "FunAudioLLM/fun-asr-nano"
+     model = AutoModel(
+         model=model_dir,
+         trust_remote_code=True,
+         remote_code="./model.py",
+         device="cuda:0",
+     )
+
+     wav_path = f"{model.model_path}/example/zh.mp3"
+     res = model.generate(input=[wav_path], cache={}, batch_size=1)
+     text = res[0]["text"]
+     print(text)
+
+     # Second pass: add a VAD front-end so long audio is segmented before recognition.
+     model = AutoModel(
+         model=model_dir,
+         trust_remote_code=True,
+         vad_model="fsmn-vad",
+         vad_kwargs={"max_single_segment_time": 30000},
+         remote_code="./model.py",
+         device="cuda:0",
+     )
+     res = model.generate(input=[wav_path], cache={}, batch_size=1)
+     text = res[0]["text"]
+     print(text)
+
+
+ if __name__ == "__main__":
+     main()
+ ```
+
+ ### Direct Inference
+
+ ```python
+ from model import FunASRNano
+
+
+ def main():
+     model_dir = "FunAudioLLM/fun-asr-nano"
+     m, kwargs = FunASRNano.from_pretrained(model=model_dir, device="cuda:0")
+     m.eval()
+
+     wav_path = f"{kwargs['model_path']}/example/zh.mp3"
+     res = m.inference(data_in=[wav_path], **kwargs)
+     text = res[0][0]["text"]
+     print(text)
+
+
+ if __name__ == "__main__":
+     main()
+ ```
+
+ <details><summary> Parameter Description (click to expand) </summary>
+
+ - `model_dir`: Model name, or the path to a model on local disk.
+ - `trust_remote_code`: Whether to trust remote code when loading a custom model implementation.
+ - `remote_code`: Location of the model implementation (e.g., `model.py` in the current directory); absolute and relative paths are both supported.
+ - `device`: Device to run on, such as "cuda:0" or "cpu".
+
+ </details>
+
+ # Performance Evaluation 📝
+
+ We compared the multilingual speech recognition performance of Fun-ASR with other models on open-source benchmark datasets, Chinese dialect test sets, and industrial test sets. Fun-ASR shows a clear advantage across all of them.
+
+ <div align="center">
+ <img src="images/compare_zh.png" width="800" />
+ </div>
config.yaml ADDED
@@ -0,0 +1,136 @@
+ model: FunASRNano
+ model_conf:
+   lsm_weight: 0.1
+   length_normalized_loss: true
+ audio_encoder: iic/SenseVoiceSmall
+ audio_encoder_conf:
+   hub: ms
+   freeze: true
+   freeze_layer_num: -1
+   feat_permute: true
+ llm: Qwen3-0.6b
+ llm_conf:
+   hub: hf
+   freeze: true
+   llm_dtype: bf16
+   init_param_path: Qwen3-0.6B
+   use_lora: false
+   lora_conf:
+     freeze_lora: true
+     task_type: CAUSAL_LM
+     r: 16
+     lora_alpha: 32
+     lora_dropout: 0.05
+     bias: none
+     target_modules:
+     - q_proj
+     - v_proj
+     init_param_path: ""
+ audio_adaptor: Transformer
+ audio_adaptor_conf:
+   downsample_rate: 1
+   ffn_dim: 2048
+   llm_dim: 1024
+   encoder_dim: 512
+   n_layer: 2
+   freeze: true
+ ctc_decoder: Transformer
+ detach_ctc_decoder: true
+ ctc_decoder_conf:
+   downsample_rate: 1
+   ffn_dim: 2048
+   llm_dim: 512
+   encoder_dim: 512
+   n_layer: 5
+   freeze: false
+ ctc_weight: 1.0
+ ctc_conf:
+   dropout_rate: 0.0
+   ctc_type: builtin
+   reduce: true
+   ignore_nan_grad: true
+ frontend: WavFrontend
+ frontend_conf:
+   fs: 16000
+   window: hamming
+   n_mels: 80
+   frame_length: 25
+   frame_shift: 10
+   lfr_m: 7
+   lfr_n: 6
+   cmvn_file: null
+ train_conf:
+   use_lora: ${llm_conf.use_lora}
+   accum_grad: 1
+   grad_clip: 5
+   max_epoch: 2
+   keep_nbest_models: 200
+   log_interval: 100
+   effective_save_name_excludes:
+   - llm.
+   resume: true
+   validate_interval: 2000
+   save_checkpoint_interval: 2000
+   avg_nbest_model: 100
+   use_bf16: false
+   use_deepspeed: true
+   deepspeed_config: null
+   save_init_model: false
+ optim: adamw
+ optim_conf:
+   lr: 5.0e-06
+   weight_decay: 0.0
+ scheduler: warmuplr
+ scheduler_conf:
+   warmup_steps: 2500
+ dataset: FunASRNano
+ dataset_conf:
+   index_ds: FunASRNano
+   batch_sampler: BatchSampler
+   batch_type: token
+   batch_size: 6000
+   max_token_length: 3500
+   shuffle: true
+   sort_size: 1024
+   batch_size_scale_ratio_max: 2
+   num_workers: 4
+   audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
+   audio_encoder_downsample_rate: 6
+   data_split_num: 256
+   batch_size_sample_max: 10
+   retry: 2000
+   batch_size_token_max: 6000
+   max_source_length: 12000
+   max_target_length: 2048
+   preprocessor_text: TextPreprocessHasRepeatedWords
+   preprocessor_text_conf:
+     max_ngram_length: 15
+     max_occurrences: 10
+   prompt_classes: MultiContextPrompt
+   prompt_conf:
+     max_neg_hotwords_num: 0
+     min_neg_hotwords_num: 0
+     use_hist: false
+     use_one_pass_result: true
+     use_hotwords: true
+     use_asr_hotwords: true
+     chinese_hotwords_list: null
+     english_hotwords_list: null
+   ctc_tokenizer: SenseVoiceTokenizer
+   ctc_target_normalize: true
+   ctc_tokenizer_conf:
+     vocab_path: null
+     is_multilingual: true
+     num_languages: 8749
+   min_source_length: 10
+   batch_size_scale_threshold: 3000
+   use_dynamic_output_ratio: 0.0
+ tokenizer: HuggingfaceTokenizer
+ tokenizer_conf:
+   init_param_path: ${llm_conf.init_param_path}
+ enable_tf32: true
+ debug: false
+ train_data_set_list: null
+ valid_data_set_list: null
+ init_param: null
+ output_dir: null
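
Values such as `${llm_conf.use_lora}` and `${llm_conf.init_param_path}` are interpolations that resolve against other keys in the same file, not literals. A minimal sketch of reading them, assuming an OmegaConf-compatible loader (FunASR's own config machinery may differ):

```python
# A minimal sketch, assuming the `omegaconf` package; FunASR's own loader may differ.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")
# Interpolations resolve on access against the same config tree:
print(cfg.tokenizer_conf.init_param_path)  # "Qwen3-0.6B", via ${llm_conf.init_param_path}
print(cfg.train_conf.use_lora)             # False, via ${llm_conf.use_lora}
```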
configuration.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b64a3a55d35bcbe2cf4d31f2d3ef25a423d3ba2ebff203298c27fa055f3c7612
+ size 398
example/en.mp3 ADDED
Binary file (57.4 kB).

example/ja.mp3 ADDED
Binary file (57.8 kB).

example/ko.mp3 ADDED
Binary file (27.9 kB).

example/yue.mp3 ADDED
Binary file (31.2 kB).

example/zh.mp3 ADDED
Binary file (45 kB).
 
images/compare_en.png ADDED
Git LFS Details
  • SHA256: 9c30014920ba46df9dca1030acdc4ca15311e75b3b26e5a00ea6ae80a019c451
  • Pointer size: 131 Bytes
  • Size of remote file: 214 kB

images/compare_zh.png ADDED
Git LFS Details
  • SHA256: 8c56704ba4aed9756884251e3a499e44784b4dac2bb5252afe118e37498ea728
  • Pointer size: 131 Bytes
  • Size of remote file: 162 kB

images/funasr-v2.png ADDED
Git LFS Details
  • SHA256: 0663f9c2fd384ac840df8045e76a52cccd3eb963fe3d351c63ae7fbbf686cd99
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6caabc91f77998f97fadca160a488ece73dfab78c292c6af561fe31f12b2ddb
+ size 467729307
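
Since each pointer records an `oid` (the SHA-256 of the file contents) and a `size` in bytes, a download can be verified end to end. A minimal sketch, assuming the file has already been materialized with `git lfs pull` (the helper name is hypothetical, not part of this repo):

```python
import hashlib
from pathlib import Path


def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Compare a local file against the oid/size recorded in its LFS pointer."""
    p = Path(path)
    if p.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with p.open("rb") as f:
        # Stream in 1 MiB chunks; model.pt is ~468 MB, model.safetensors ~1.5 GB.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid


# Values copied from the model.pt pointer above.
print(verify_lfs_object(
    "model.pt",
    "b6caabc91f77998f97fadca160a488ece73dfab78c292c6af561fe31f12b2ddb",
    467729307,
))
```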
multilingual.tiktoken ADDED
The diff for this file is too large to render.