Instructions to use speechbrain/tts-fastspeech2-internal-alignment-ljspeech with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- speechbrain
How to use speechbrain/tts-fastspeech2-internal-alignment-ljspeech with speechbrain:
# interface not specified in config.json
- Notebooks
- Google Colab
- Kaggle
| # ################################ | |
| # Model: Fastspeech2 Internal Alignment | |
| # Authors: Yingzhi Wang | |
| # ################################ | |
| # Input parameters | |
| lexicon: | |
| - "AA" | |
| - "AE" | |
| - "AH" | |
| - "AO" | |
| - "AW" | |
| - "AY" | |
| - "B" | |
| - "CH" | |
| - "D" | |
| - "DH" | |
| - "EH" | |
| - "ER" | |
| - "EY" | |
| - "F" | |
| - "G" | |
| - "HH" | |
| - "IH" | |
| - "IY" | |
| - "JH" | |
| - "K" | |
| - "L" | |
| - "M" | |
| - "N" | |
| - "NG" | |
| - "OW" | |
| - "OY" | |
| - "P" | |
| - "R" | |
| - "S" | |
| - "SH" | |
| - "T" | |
| - "TH" | |
| - "UH" | |
| - "UW" | |
| - "V" | |
| - "W" | |
| - "Y" | |
| - "Z" | |
| - "ZH" | |
| - "-" | |
| - "!" | |
| - "'" | |
| - "(" | |
| - ")" | |
| - "," | |
| - "." | |
| - ":" | |
| - ";" | |
| - "?" | |
| - " " | |
| n_symbols: 52 # fixed by the lexicon above: 50 symbols, +1 for a dummy symbol used for padding, +1 for unknown | |
| padding_idx: 0 | |
| n_mel_channels: 80 | |
| hidden_channels: 512 | |
| # Encoder parameters | |
| enc_num_layers: 4 | |
| enc_num_head: 2 | |
| enc_d_model: !ref <hidden_channels> | |
| enc_ffn_dim: 1024 | |
| enc_k_dim: !ref <hidden_channels> | |
| enc_v_dim: !ref <hidden_channels> | |
| enc_dropout: 0.2 | |
| # Aligner parameters | |
| in_query_channels: 80 | |
| in_key_channels: !ref <hidden_channels> | |
| attn_channels: 80 | |
| temperature: 0.0005 | |
| # Decoder parameters | |
| dec_num_layers: 4 | |
| dec_num_head: 2 | |
| dec_d_model: !ref <hidden_channels> | |
| dec_ffn_dim: 1024 | |
| dec_k_dim: !ref <hidden_channels> | |
| dec_v_dim: !ref <hidden_channels> | |
| dec_dropout: 0.2 | |
| # Postnet parameters | |
| postnet_embedding_dim: 512 | |
| postnet_kernel_size: 5 | |
| postnet_n_convolutions: 5 | |
| postnet_dropout: 0.2 | |
| # Common | |
| normalize_before: True | |
| ffn_type: 1dcnn #1dcnn or ffn | |
| ffn_cnn_kernel_size_list: [9, 1] | |
| # Variance predictor | |
| dur_pred_kernel_size: 3 | |
| pitch_pred_kernel_size: 3 | |
| energy_pred_kernel_size: 3 | |
| variance_predictor_dropout: 0.5 | |
| # Model | |
| model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment | |
|     enc_num_layers: !ref <enc_num_layers> | |
|     enc_num_head: !ref <enc_num_head> | |
|     enc_d_model: !ref <enc_d_model> | |
|     enc_ffn_dim: !ref <enc_ffn_dim> | |
|     enc_k_dim: !ref <enc_k_dim> | |
|     enc_v_dim: !ref <enc_v_dim> | |
|     enc_dropout: !ref <enc_dropout> | |
|     in_query_channels: !ref <in_query_channels> | |
|     in_key_channels: !ref <in_key_channels> | |
|     attn_channels: !ref <attn_channels> | |
|     temperature: !ref <temperature> | |
|     dec_num_layers: !ref <dec_num_layers> | |
|     dec_num_head: !ref <dec_num_head> | |
|     dec_d_model: !ref <dec_d_model> | |
|     dec_ffn_dim: !ref <dec_ffn_dim> | |
|     dec_k_dim: !ref <dec_k_dim> | |
|     dec_v_dim: !ref <dec_v_dim> | |
|     dec_dropout: !ref <dec_dropout> | |
|     normalize_before: !ref <normalize_before> | |
|     ffn_type: !ref <ffn_type> | |
|     ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list> | |
|     n_char: !ref <n_symbols> | |
|     n_mels: !ref <n_mel_channels> | |
|     postnet_embedding_dim: !ref <postnet_embedding_dim> | |
|     postnet_kernel_size: !ref <postnet_kernel_size> | |
|     postnet_n_convolutions: !ref <postnet_n_convolutions> | |
|     postnet_dropout: !ref <postnet_dropout> | |
|     padding_idx: !ref <padding_idx> | |
|     dur_pred_kernel_size: !ref <dur_pred_kernel_size> | |
|     pitch_pred_kernel_size: !ref <pitch_pred_kernel_size> | |
|     energy_pred_kernel_size: !ref <energy_pred_kernel_size> | |
|     variance_predictor_dropout: !ref <variance_predictor_dropout> | |
| input_encoder: !new:speechbrain.dataio.encoder.TextEncoder | |
| modules: | |
|     model: !ref <model> | |
| pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer | |
|     loadables: | |
|         model: !ref <model> | |