Speech AI
Examples



Toni Heittola
toni.heittola@tuni.fi


Introduction¶

Applications covered:

  • Speech separation
  • Speech enhancement
  • Language identification
  • Speech recognition
  • Text to speech
  • Voice activity detection

Key tools used:¶

  • speechbrain – PyTorch powered speech toolkit, https://speechbrain.github.io/
  • espnet – end-to-end speech processing toolkit, https://github.com/espnet/espnet

Pretrained models are mostly downloaded from Hugging Face

Speech separation¶

The task of extracting the overlapping speech sources from a given mixed speech signal

SepFormer¶

Transformer-based neural network for speech separation

  • State-of-the-art speech separation system published 2021
  • Replaces RNNs with a multi-scale pipeline of transformers that learns both short- and long-term dependencies
  • Model composed of multi-head attention and feed-forward layers
  • Trained on the WSJ0-2mix dataset, which contains mixtures of two overlapping speakers
  • Designed for single-channel audio with an 8kHz sampling rate

C. Subakan, M. Ravanelli, S. Cornell, M. Bronzi and J. Zhong, "Attention Is All You Need In Speech Separation," ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, pp. 21-25. PDF

Usage¶

We will use the SepformerSeparation class from the SpeechBrain library with a pretrained model (speechbrain/sepformer-wsj02mix) from Hugging Face (https://huggingface.co/speechbrain/sepformer-wsj02mix).

# Import 
from speechbrain.pretrained import SepformerSeparation

# Create and download pretrained model
speech_separator = SepformerSeparation.from_hparams(
    source='speechbrain/sepformer-wsj02mix',        # Model name
    savedir='pretrained_models/sepformer-wsj02mix', 
    run_opts={'device':'cuda'}                      # Use GPU 
)

# Apply model to example audio
estimated_sources = speech_separator.separate_file(
    path='speechbrain/sepformer-wsj02mix/test_mixture.wav'
) 
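
The separated signals can be written to disk for listening. A minimal sketch using torchaudio (the output tensor has shape [batch, time, n_sources] and the model operates at 8kHz; the output file names here are arbitrary):

# Import
import torchaudio

# Save each estimated source as an 8kHz mono WAV file
torchaudio.save('source1_hat.wav', estimated_sources[:, :, 0].detach().cpu(), 8000)
torchaudio.save('source2_hat.wav', estimated_sources[:, :, 1].detach().cpu(), 8000)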

Example¶

Signal with two male speakers speaking different languages (English and French) at the same time

(audio example)

Estimate sound sources¶

# Import
import torch

# Estimate the sources from the mixture signal
estimated_sources = speech_separator.separate_batch(
    torch.from_numpy(mixture.data).float()[None,:]
)
# Create audio containers for source1 and source2
source1 = AudioContainer(
    data=tensor_to_numpy(estimated_sources[:, :, 0]), 
    fs=8000
)
source2 = AudioContainer(
    data=tensor_to_numpy(estimated_sources[:, :, 1]), 
    fs=8000
)
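
Note that SepFormer expects 8kHz input; if the mixture has a different sampling rate, it can be resampled first with the container's resample method (the same method used later for the 16kHz models). A small sketch:

# Resample the mixture to the 8kHz rate expected by the model before separation
estimated_sources = speech_separator.separate_batch(
    torch.from_numpy(mixture.resample(8000).data).float()[None, :]
)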

Source 1¶

(audio example)

Source 2¶

(audio example)

Speech enhancement¶

The task of improving speech quality

SepFormer for Speech Enhancement¶

The same transformer-based separation network, here applied to speech enhancement

  • The speech separation system can also be used for speech enhancement
  • Trained with WHAMR! dataset
    • Augmented version of WHAM! dataset with synthetic reverberated sources
    • WHAM! is a noise augmented version of the WSJ0-2mix dataset
    • Dataset contains overlapping speech with varying gains and environmental noises recorded in various urban environments
    • Dataset contains 28000 samples, 81 hours of audio in total
  • Designed for single-channel audio with an 8kHz sampling rate

C. Subakan, M. Ravanelli, S. Cornell, M. Bronzi and J. Zhong, "Attention Is All You Need In Speech Separation," ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, pp. 21-25. PDF

Usage¶

We will again use the SepformerSeparation class from the SpeechBrain library, this time with a pretrained enhancement model (speechbrain/sepformer-whamr-enhancement) from Hugging Face (https://huggingface.co/speechbrain/sepformer-whamr-enhancement).

from speechbrain.pretrained import SepformerSeparation

speech_enhancer = SepformerSeparation.from_hparams(
    source='speechbrain/sepformer-whamr-enhancement', 
    savedir='pretrained_models/sepformer-whamr-enhancement'
)

est_sources = speech_enhancer.separate_file(
    path='speechbrain/sepformer-whamr-enhancement/example_whamr.wav'
)

Example with two sources in a noisy environment¶

Mixture signal with two male speakers speaking different languages (English and French) at the same time in a metro station.

(audio example)

Separate sound sources¶

# Separate the sources from the noisy mixture
noisy_estimated_sources = speech_separator.separate_batch(
    torch.from_numpy(noisy_mixture.data).float()[None,:]
)
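
The separated sources used below (noisy_source1 and noisy_source2) can be wrapped for listening and further processing in the same way as in the speech separation example; a sketch assuming the same AudioContainer and tensor_to_numpy helpers and the model's 8kHz rate:

# Create audio containers for the separated noisy sources
# (same helpers as in the speech separation example above)
noisy_source1 = AudioContainer(
    data=tensor_to_numpy(noisy_estimated_sources[:, :, 0]), 
    fs=8000
)
noisy_source2 = AudioContainer(
    data=tensor_to_numpy(noisy_estimated_sources[:, :, 1]), 
    fs=8000
)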

Source 1¶

(audio example)

Source 2¶

(audio example)

Enhanced source 1¶

# Enhance the separated source 1 with the enhancement model
estimated_source = speech_enhancer.separate_batch(
    torch.from_numpy(noisy_source1.data).float()[None,:]
)

(audio example)

Enhanced source 2¶

# Enhance the separated source 2 with the enhancement model
estimated_source = speech_enhancer.separate_batch(
    torch.from_numpy(noisy_source2.data).float()[None,:]
)

(audio example)

Language identification¶

The task of automatically identifying the language of a given spoken utterance

ECAPA-TDNN Spoken Language Identification¶

  • ECAPA-TDNN architecture
    • Based on the X-vector architecture: a Time Delay Neural Network (TDNN) enhanced with a statistics pooling layer, additional skip connections that propagate and aggregate channel information, and channel attention
    • Also used for speech recognition and speaker verification
  • Model trained with VoxLingua107
    • Dataset consists of short speech segments automatically extracted from YouTube videos and labeled according to the language of the video title and description
    • Data for 107 languages, ~62 hours per language, 6628 hours in total
  • Designed for single-channel audio with a 16kHz sampling rate

Desplanques, B., Thienpondt, J., Demuynck, K., ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification. Proc. Interspeech 2020, pp. 3830-3834. PDF

Usage¶

We will use the EncoderClassifier class from the SpeechBrain library with a pretrained model (speechbrain/lang-id-voxlingua107-ecapa) from Hugging Face (https://huggingface.co/speechbrain/lang-id-voxlingua107-ecapa).

# Import
from speechbrain.pretrained import EncoderClassifier

# Create and download pretrained model
language_id = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa", 
    savedir='pretrained_models/lang-id-voxlingua107-ecapa',
    run_opts={"device":"cuda"}
)

# Identify language
signal = language_id.load_audio("https://omniglot.com/soundfiles/udhr/udhr_fi.mp3")
lang = language_id.classify_batch(signal)[-1][0]
Identified language: Finnish
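
classify_batch returns a tuple (out_prob, score, index, text_lab); the best score can be inspected alongside the label, for example:

# Unpack the full output of classify_batch to also see the score of the best class
out_prob, score, index, text_lab = language_id.classify_batch(signal)
print('Identified language: {} (score: {:.2f})'.format(text_lab[0], score.item()))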

Example¶

Let's estimate languages for the sound sources separated earlier in the speech separation section.

# Identify languages for both sources
prediction1 = language_id.classify_batch(
    wavs=torch.from_numpy(source1.data).float()
)
prediction2 = language_id.classify_batch(
    wavs=torch.from_numpy(source2.data).float()
)
Identified language for source 1: French
Identified language for source 2: English

Speech recognition¶

The task of automatically recognizing the content of a given spoken utterance

Encoder-Decoder based Speech Recognition¶

  • Encoder-Decoder architecture, a common architecture for sequence modeling
    • Encoder - sequence of speech is converted into a hidden representation
      • Pretrained wav2vec 2.0 embeddings combined with two DNN layers that are finetuned on the training dataset
    • Decoder - consumes the hidden representation and predicts the output
  • Model trained with CommonVoice dataset
    • Multi-language dataset created with volunteer contributors
    • For English there are 81085 speakers and 2953 hours of audio in total; 75% of the annotations are human-validated
  • Designed for single-channel audio with a 16kHz sampling rate

S. Watanabe, T. Hori, S. Kim, J. R. Hershey, and T. Hayashi, Hybrid CTC/Attention Architecture for End-to-End Speech Recognition, in IEEE Journal of Selected Topics in Signal Processing, vol. 11, no. 8, pp. 1240-1253, Dec. 2017. PDF

Usage¶

We will use the EncoderASR or EncoderDecoderASR classes from the SpeechBrain library with pretrained models (speechbrain/asr-wav2vec2-commonvoice-LANG) from Hugging Face (https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-en).

# Import
from speechbrain.pretrained import EncoderDecoderASR

# Create and download pretrained model
ASR = EncoderDecoderASR.from_hparams(
    source='speechbrain/asr-wav2vec2-commonvoice-en',     
    savedir='pretrained_models/asr-wav2vec2-commonvoice-en',
    run_opts={'device':'cuda'}       # for GPU   
)

# Apply speech recognition
text = ASR.transcribe_file('speechbrain/asr-wav2vec2-commonvoice-en/example.wav')
Recognized speech: THE BIRCH CANOE SLID ON SMOOTH PLANKS

Example¶

Let's estimate the content for both speech signals separated earlier in the speech separation section.

from speechbrain.pretrained import EncoderASR, EncoderDecoderASR

# Create ASR models for both languages (French and English)
asr_models = {
    'fr': EncoderASR.from_hparams(
        source='speechbrain/asr-wav2vec2-commonvoice-fr', 
        savedir='pretrained_models/asr-wav2vec2-commonvoice-fr',
        run_opts={"device":"cuda"}     # for GPU
    ),
    'en': EncoderDecoderASR.from_hparams(
        source='speechbrain/asr-wav2vec2-commonvoice-en', 
        savedir='pretrained_models/asr-wav2vec2-commonvoice-en',
        run_opts={"device":"cuda"}     # for GPU
    ),
}
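
The keys lang1 and lang2 used below are the language codes of the identified languages. A minimal sketch of deriving them from the language identification predictions (the exact label format returned by the VoxLingua107 model is an assumption here; if the model returns full language names, a small lookup table is needed instead):

def to_lang_code(text_label):
    # Hypothetical helper: keep only the ISO code part of labels such as
    # 'fr' or 'fr: French'
    return text_label.split(':')[0].strip()

lang1 = to_lang_code(prediction1[-1][0])   # e.g. 'fr'
lang2 = to_lang_code(prediction2[-1][0])   # e.g. 'en'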

Apply speech recognition¶

pred_str1, pred_tokens1 = asr_models[lang1].transcribe_batch(
    wavs=torch.from_numpy(source1.resample(16000).data).float().unsqueeze(0), 
    wav_lens=torch.tensor([1.0])
)
text1 = pred_str1[0]
Recognized content for source 1 [fr]: 
TOUS LES ÊTRES HUMAINS NAISSENT LIBRES ET ÉGAUX EN DIGNITÉ ET EN DROIT ILS SONT DOUÉS DE RAISONS ET DE CONSCIENCE ET DOIVENT AGIR LES UNS ENVERS LES AUTRES DANS UN ESPRIT DE FRATERNITÉ
pred_str2, pred_tokens2 = asr_models[lang2].transcribe_batch(
    wavs=torch.from_numpy(source2.resample(16000).data).float().unsqueeze(0), 
    wav_lens=torch.tensor([1.0])
)
text2 = pred_str2[0]
Recognized content for source 2 [en]: 
ALL HUMAN BEINGS ON FREE AND EQUAL IN DIGNITY AND RIGHT THEY ARE ENDOWED WITH REASON AND CONSCIENCE AND SHOULD ACT TOWARDS ONE ANOTHER IN A SPIRIT OF BROTHERHOOD

Text to speech¶

The task of automatically converting text into speech

Tacotron¶

An end-to-end speech synthesis system by Google

  • One of the first successful deep learning-based text-to-mel spectrogram models
  • Model:
    • Encoder-decoder architecture with attention
    • Input: character sequence
    • Output: mel spectrogram (50ms frame length and 12.5ms frame hop)
  • Model trained with LJ Speech dataset
    • Speech dataset consisting of 13100 short audio clips of a single speaker reading passages from 7 non-fiction books
    • In total 24 hours of audio

Wang, Yuxuan, R. J. Skerry-Ryan, Daisy Stanton, Yonghui Wu, Ron J. Weiss, Navdeep Jaitly, Zongheng Yang, Ying Xiao, Z. Chen, Samy Bengio, Quoc V. Le, Yannis Agiomyrgiannakis, Robert A. J. Clark and Rif A. Saurous. Tacotron: Towards End-to-End Speech Synthesis, INTERSPEECH (2017) PDF

Example¶

We will use the Text2Speech class from the espnet2 library with a pretrained model to convert the recognized content of sound source 2 back into speech.

from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
d = ModelDownloader('pretrained_models')  # Model downloader

# Create and download pretrained model
text2speech = Text2Speech(
    **d.download_and_unpack(
        'kan-bayashi/ljspeech_tts_train_vits_raw_phn_tacotron_g2p_en_no_space_train.total_count.ave'
    ), device="cuda"
)
# Apply model to text
speech = text2speech(text2)["wav"]
ALL HUMAN BEINGS ON FREE AND EQUAL IN DIGNITY AND RIGHT THEY ARE ENDOWED WITH REASON AND CONSCIENCE AND SHOULD ACT TOWARDS ONE ANOTHER IN A SPIRIT OF BROTHERHOOD

(audio example)
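
The synthesized waveform can also be written to disk; a minimal sketch using the soundfile package (assuming text2speech.fs exposes the model's output sampling rate, as in the ESPnet demo notebooks):

# Import
import soundfile as sf

# Save the synthesized speech as a WAV file at the model's sampling rate
sf.write('tts_output.wav', speech.view(-1).cpu().numpy(), text2speech.fs)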

Voice activity detection¶

The task of detecting active speech segments

CRNN Voice activity detection¶

  • CRNN architecture
    • Two CNN blocks, two bidirectional LSTM blocks, and a fully-connected layer
  • Model trained with LibriParty dataset
    • Dataset consists of ~5min long recordings created by mixing speech samples from LibriSpeech dataset with background sounds (environmental sounds)
    • 250 recordings, 20 hours in total
  • Designed for single-channel audio with a 16kHz sampling rate
  • The system outputs posterior probabilities with values close to one for speech frames and close to zero for non-speech frames. A threshold is applied on top of the posteriors to detect candidate speech boundaries.

Usage¶

We will use the VAD class from the SpeechBrain library with a pretrained model (speechbrain/vad-crdnn-libriparty) from Hugging Face (https://huggingface.co/speechbrain/vad-crdnn-libriparty).

# Import
from speechbrain.pretrained import VAD

# Create and download pretrained model
VAD = VAD.from_hparams(
    source='speechbrain/vad-crdnn-libriparty',           # Model name
    savedir='pretrained_models/vad-crdnn-libriparty'
)

# Detect segments with speech, segment boundaries in seconds
speech_segments = VAD.get_speech_segments(    
    audio_file='speechbrain/vad-crdnn-libriparty/example_vad.wav'
)
Start	 Stop
-----	 ----
14.3s	 17.3s
18.1s	 21.6s
28.6s	 36.9s
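
The detected boundaries can also be printed in the same format, or written to a text file, with the save_boundaries method of the VAD object:

# Print the detected boundaries and store them in a text file
VAD.save_boundaries(speech_segments, save_path='vad_segments.txt')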

Example¶

Let's run voice activity detection on a long speech signal with environmental sounds in the background.

(audio example)

Process the signal¶

# Compute speech vs non-speech probabilities
prob_chunks = VAD.get_speech_prob_chunk(
    wavs=torch.from_numpy(mixture.data).float()    
)

# Apply a threshold to get candidate speech segments
prob_threshold = VAD.apply_threshold(prob_chunks).float()

# Compute the boundaries of the speech segments
boundaries = VAD.get_boundaries(prob_threshold, output_value="seconds")

# Merge segments closer than 0.25 s to each other
boundaries = VAD.merge_close_segments(boundaries, close_th=0.25)

# Remove segments shorter than 0.25 s
boundaries = VAD.remove_short_segments(boundaries, len_th=0.25)

Visualize speech segments¶
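
A minimal plotting sketch with matplotlib, shading the detected segments on top of the waveform (assuming the mixture container exposes its sampling rate as mixture.fs):

# Import
import numpy as np
import matplotlib.pyplot as plt

# Plot the waveform and shade the detected speech segments
time_axis = np.arange(len(mixture.data)) / mixture.fs
plt.figure(figsize=(12, 3))
plt.plot(time_axis, mixture.data, linewidth=0.5)
for start, stop in boundaries.tolist():
    plt.axvspan(start, stop, color='green', alpha=0.3)
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.title('Detected speech segments')
plt.tight_layout()
plt.show()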