Speech Recognition Basics: Building a Modern Speech-to-Text System
Objective
Build a practical speech recognition system using modern deep learning approaches. You’ll learn to process audio data, implement speech-to-text conversion, and understand how to use both local models and cloud APIs for real-world applications.
Learning Outcomes
By completing this project, you will:
- Master fundamental audio processing and feature extraction techniques
- Implement modern speech recognition pipelines using deep learning
- Learn to use industry-standard speech recognition APIs
- Understand evaluation metrics specific to speech recognition
- Gain practical experience with real-world audio data
- Learn to handle challenges like noise and different accents
Skills Gained
- Processing and analyzing audio data using modern libraries
- Implementing speech recognition using deep learning approaches
- Using cloud-based speech recognition APIs effectively
- Building end-to-end audio processing pipelines
- Evaluating speech recognition systems
- Handling real-world audio challenges
Tools Required
```bash
# Core libraries
pip install torch torchaudio
pip install transformers
pip install librosa
pip install soundfile
pip install google-cloud-speech
pip install pyaudio
pip install jiwer  # for WER calculation

# Visualization
pip install matplotlib
pip install seaborn
```
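Note: `pyaudio` wraps the PortAudio library, so you may need a system-level install first (for example `sudo apt-get install portaudio19-dev` on Debian/Ubuntu or `brew install portaudio` on macOS) before `pip install pyaudio` succeeds.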
Project Structure
```
speech_recognition/
│
├── data/
│   ├── LibriSpeech/
│   │   ├── train-clean-100/
│   │   └── test-clean/
│   └── custom_audio/
│
├── src/
│   ├── audio_processing.py
│   ├── feature_extraction.py
│   ├── model.py
│   ├── cloud_apis.py
│   └── evaluation.py
│
└── notebooks/
    ├── 1_audio_exploration.ipynb
    ├── 2_model_training.ipynb
    └── 3_evaluation.ipynb
```
Steps and Tasks
1. Data Acquisition and Setup
First, let’s download and set up the LibriSpeech dataset:
```python
import torchaudio

# Download LibriSpeech dataset (clean subsets)
train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url="train-clean-100", download=True)
test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url="test-clean", download=True)

# Check dataset info
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
```
Basic audio exploration:
```python
import librosa
import librosa.display
import matplotlib.pyplot as plt

def explore_audio(waveform, sample_rate, title="Waveform"):
    """Plot the waveform and mel spectrogram of a 1-D audio signal"""
    plt.figure(figsize=(15, 5))

    # Plot waveform
    plt.subplot(1, 2, 1)
    plt.plot(waveform)
    plt.title(title)

    # Plot mel spectrogram
    plt.subplot(1, 2, 2)
    spec = librosa.feature.melspectrogram(y=waveform, sr=sample_rate)
    librosa.display.specshow(librosa.power_to_db(spec), sr=sample_rate,
                             y_axis='mel', x_axis='time')
    plt.title("Mel Spectrogram")
    plt.colorbar(format='%+2.0f dB')

    plt.tight_layout()
    plt.show()
```
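To try this out, pull one item from the LibriSpeech dataset loaded above and pass it in as a 1-D NumPy array. A minimal usage sketch (any index works, 0 is just an example):

```python
# Each LIBRISPEECH item is (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
waveform, sample_rate, transcript, *_ = train_dataset[0]
print(f"Sample rate: {sample_rate} Hz | Transcript: {transcript}")

# torchaudio returns a (channels, samples) tensor; librosa expects a 1-D float array
explore_audio(waveform.squeeze().numpy(), sample_rate)
```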
Click to view advanced audio analysis
```python
import numpy as np

class AudioAnalyzer:
    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def analyze_audio_file(self, file_path):
        """Comprehensive audio analysis"""
        # Load audio
        waveform, sr = librosa.load(file_path, sr=self.sample_rate)

        # Basic properties
        duration = librosa.get_duration(y=waveform, sr=sr)
        rms = librosa.feature.rms(y=waveform)

        # Extract features
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)

        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(waveform)

        return {
            'duration': duration,
            'rms': rms,                      # frame-level RMS curve (for plotting)
            'rms_energy': np.mean(rms),      # average RMS energy
            'mfccs': mfccs,
            'spectral_centroids': spectral_centroids,
            'spectral_rolloff': spectral_rolloff,
            'zero_crossing_rate': np.mean(zcr)
        }

    def plot_features(self, features):
        """Visualize extracted features"""
        fig, axes = plt.subplots(3, 1, figsize=(15, 10))

        # Plot MFCCs
        librosa.display.specshow(features['mfccs'], x_axis='time', ax=axes[0])
        axes[0].set_title('MFCCs')

        # Plot spectral centroid
        axes[1].plot(features['spectral_centroids'][0])
        axes[1].set_title('Spectral Centroid')

        # Plot frame-level RMS energy
        axes[2].plot(features['rms'][0])
        axes[2].set_title('RMS Energy')

        plt.tight_layout()
        plt.show()
```
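A quick way to exercise the analyzer on one of your own recordings (the file path below is only a placeholder):

```python
analyzer = AudioAnalyzer(sample_rate=16000)
features = analyzer.analyze_audio_file("data/custom_audio/example.wav")  # placeholder path
print(f"Duration: {features['duration']:.2f} s | Mean RMS energy: {features['rms_energy']:.4f}")
analyzer.plot_features(features)
```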
2. Audio Processing Pipeline
Create a robust audio processing pipeline:
```python
import torch
import torchaudio

class AudioProcessor:
    def __init__(self, target_sample_rate=16000, duration=10):
        self.target_sample_rate = target_sample_rate
        self.duration = duration

    def preprocess_audio(self, waveform, sample_rate):
        """Basic audio preprocessing"""
        # Resample if needed
        if sample_rate != self.target_sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, self.target_sample_rate
            )

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Normalize to [-1, 1]
        waveform = waveform / waveform.abs().max()

        return waveform
```
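For example, a LibriSpeech item from step 1 can be run through the processor like this (a small sketch; the tensor comes back mono, normalized, and resampled to 16 kHz):

```python
processor = AudioProcessor(target_sample_rate=16000)
waveform, sample_rate, transcript, *_ = train_dataset[0]
clean_waveform = processor.preprocess_audio(waveform, sample_rate)
print(clean_waveform.shape)  # torch.Size([1, num_samples])
```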
Click to view advanced audio processing
```python
import torch.nn as nn

class AdvancedAudioProcessor:
    def __init__(self, target_sample_rate=16000, duration=10):
        self.target_sample_rate = target_sample_rate
        self.duration = duration
        # SpecAugment-style masking, applied to spectrogram features
        self.augmentation = nn.Sequential(
            torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
            torchaudio.transforms.TimeMasking(time_mask_param=100)
        )

    def apply_noise_reduction(self, waveform):
        """Apply noise reduction using spectral subtraction"""
        # Placeholder: plug in a spectral-subtraction routine here
        # (see the sketch below); returns the input unchanged for now.
        return waveform

    def apply_augmentation(self, spectrogram):
        """Apply masking augmentation to a (log-)mel spectrogram"""
        return self.augmentation(spectrogram)

    def extract_features(self, waveform):
        """Extract audio features"""
        # Mel spectrogram
        mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.target_sample_rate,
            n_mels=128
        )(waveform)

        # Log-mel spectrogram
        log_mel_spec = torchaudio.transforms.AmplitudeToDB()(mel_spec)

        return log_mel_spec
```
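The `apply_noise_reduction` method above is left as a stub. Below is one minimal way the spectral-subtraction idea could look for a mono `(1, num_samples)` tensor; treat it as an illustrative sketch rather than a tuned denoiser:

```python
import torch

def spectral_subtraction(waveform, n_fft=512, hop_length=128, noise_frames=10):
    """Rough spectral subtraction: estimate the noise spectrum from the
    first few frames and subtract it from the magnitude spectrogram."""
    window = torch.hann_window(n_fft)
    stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length,
                      window=window, return_complex=True)
    magnitude, phase = stft.abs(), stft.angle()
    # Assume the first few frames contain only background noise
    noise_estimate = magnitude[..., :noise_frames].mean(dim=-1, keepdim=True)
    # Subtract the noise floor and clamp to avoid negative magnitudes
    cleaned = torch.clamp(magnitude - noise_estimate, min=0.0)
    denoised = cleaned * torch.exp(1j * phase)
    return torch.istft(denoised, n_fft=n_fft, hop_length=hop_length,
                       window=window, length=waveform.shape[-1])
```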
3. Model Implementation
We’ll use the Wav2Vec2 model from HuggingFace for speech recognition:
```python
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

class SpeechRecognizer:
    def __init__(self):
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        self.model.eval()

    def transcribe(self, waveform):
        """Transcribe a 1-D array of 16 kHz audio samples to text"""
        # Preprocess
        inputs = self.processor(
            waveform,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )

        # Get logits
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits

        # Greedy CTC decode
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)

        return transcription[0]
```
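To sanity-check the model, transcribe one test-set utterance and compare it with the reference text (a minimal sketch reusing `test_dataset` from step 1; LibriSpeech audio is already 16 kHz, so no resampling is needed):

```python
recognizer = SpeechRecognizer()
waveform, sample_rate, reference, *_ = test_dataset[0]
prediction = recognizer.transcribe(waveform.squeeze().numpy())
print(f"Reference:  {reference}")
print(f"Prediction: {prediction}")
```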
Click to view advanced model implementations
```python
class AdvancedSpeechRecognizer:
    def __init__(self, model_name="facebook/wav2vec2-large-960h-lv60-self"):
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)

    def transcribe_with_timestamps(self, waveform):
        """Transcribe audio with word-level timestamps"""
        # Left as an exercise: CTC decoding with frame-to-time alignment
        pass

    def transcribe_with_confidence(self, waveform):
        """Transcribe audio with confidence scores"""
        # Get logits and compute per-frame probabilities
        inputs = self.processor(waveform, sampling_rate=16000, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits
        probs = torch.softmax(logits, dim=-1)

        # Placeholder: decode and attach confidence scores (see the sketch below)
        predictions = []
        confidences = []

        return predictions, confidences
```
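One simple (if crude) way to finish `transcribe_with_confidence` is to average the per-frame maximum probabilities; a sketch of what could go after `probs` is computed inside that method:

```python
# Inside transcribe_with_confidence, after computing `probs`:
predicted_ids = torch.argmax(probs, dim=-1)
frame_confidences = probs.max(dim=-1).values                  # best probability per frame
transcription = self.processor.batch_decode(predicted_ids)[0]
utterance_confidence = frame_confidences.mean().item()        # rough utterance-level score
return [transcription], [utterance_confidence]
```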
4. Cloud API Integration
Integrate the Google Cloud Speech-to-Text API:
```python
from google.cloud import speech

def transcribe_audio_google(audio_path):
    """Transcribe audio using Google Cloud Speech-to-Text"""
    client = speech.SpeechClient()

    # Read audio file
    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Perform transcription
    response = client.recognize(config=config, audio=audio)

    return " ".join(result.alternatives[0].transcript
                    for result in response.results)
```
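Before this runs, the client needs credentials. The standard approach is to point the `GOOGLE_APPLICATION_CREDENTIALS` environment variable at a service-account key file; the config above also assumes 16 kHz, 16-bit PCM WAV input. The paths below are placeholders:

```python
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/service-account.json"  # placeholder
print(transcribe_audio_google("data/custom_audio/example.wav"))                # placeholder
```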
Click to view multi-API implementation
```python
class CloudSpeechRecognizer:
    def __init__(self):
        self.google_client = speech.SpeechClient()
        # Initialize other API clients (Azure, AWS) as needed

    def transcribe_google(self, audio_path, language="en-US"):
        """Transcribe using Google Cloud"""
        # Implementation as in transcribe_audio_google above

    def transcribe_azure(self, audio_path, language="en-US"):
        """Transcribe using Azure Speech Services"""
        # Azure implementation

    def transcribe_aws(self, audio_path, language="en-US"):
        """Transcribe using Amazon Transcribe"""
        # AWS implementation

    def transcribe_multiple(self, audio_path):
        """Transcribe using multiple APIs and compare results"""
        results = {
            'google': self.transcribe_google(audio_path),
            'azure': self.transcribe_azure(audio_path),
            'aws': self.transcribe_aws(audio_path)
        }
        return results
```
5. Evaluation
Implement comprehensive evaluation metrics:
```python
from jiwer import wer, mer, wil

def evaluate_transcription(reference, hypothesis):
    """Evaluate transcription using multiple metrics"""
    metrics = {
        'WER': wer(reference, hypothesis),   # Word Error Rate
        'MER': mer(reference, hypothesis),   # Match Error Rate
        'WIL': wil(reference, hypothesis)    # Word Information Lost
    }
    return metrics

# Example usage
reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown fox jumps over the lazy"
metrics = evaluate_transcription(reference, hypothesis)
print(metrics)
```
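For reference, WER counts word-level substitutions (S), deletions (D), and insertions (I) against the number of reference words (N): WER = (S + D + I) / N. In the example above the hypothesis drops the single word "dog" from a 9-word reference, so WER = 1/9 ≈ 0.11.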
Click to view advanced evaluation tools
```python
import numpy as np
import pandas as pd

class TranscriptionEvaluator:
    def __init__(self):
        self.metrics_history = []

    def evaluate_single(self, reference, hypothesis):
        """Evaluate a single reference/hypothesis pair"""
        return evaluate_transcription(reference, hypothesis)

    def evaluate_batch(self, references, hypotheses):
        """Evaluate a batch of transcriptions"""
        results = []
        for ref, hyp in zip(references, hypotheses):
            metrics = self.evaluate_single(ref, hyp)
            results.append(metrics)

        # Compute average metrics
        avg_metrics = {
            metric: np.mean([r[metric] for r in results])
            for metric in results[0].keys()
        }

        self.metrics_history.append(avg_metrics)
        return avg_metrics

    def plot_metrics_history(self):
        """Plot metrics over time"""
        metrics_df = pd.DataFrame(self.metrics_history)

        plt.figure(figsize=(12, 6))
        for metric in metrics_df.columns:
            plt.plot(metrics_df[metric], label=metric)

        plt.title('Transcription Metrics Over Time')
        plt.xlabel('Batch')
        plt.ylabel('Score')
        plt.legend()
        plt.show()
```
6. Real-Time Speech Recognition
Implement real-time speech recognition:
```python
import pyaudio

class RealtimeSpeechRecognizer:
    def __init__(self):
        self.recognizer = SpeechRecognizer()
        self.chunk = 1024
        self.format = pyaudio.paFloat32
        self.channels = 1
        self.rate = 16000

    def record_audio(self, seconds=5):
        """Record audio from microphone"""
        p = pyaudio.PyAudio()
        stream = p.open(format=self.format,
                        channels=self.channels,
                        rate=self.rate,
                        input=True,
                        frames_per_buffer=self.chunk)

        frames = []
        for _ in range(0, int(self.rate / self.chunk * seconds)):
            data = stream.read(self.chunk)
            frames.append(data)

        stream.stop_stream()
        stream.close()
        p.terminate()

        return b''.join(frames)
```
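The recorded bytes are 32-bit float PCM, so they can be converted to a NumPy array and passed straight to the local model. A minimal sketch of a method you could add to the class above:

```python
import numpy as np

# Hypothetical extension of RealtimeSpeechRecognizer:
def transcribe_from_mic(self, seconds=5):
    """Record from the microphone and transcribe with the local Wav2Vec2 model."""
    raw_bytes = self.record_audio(seconds=seconds)
    waveform = np.frombuffer(raw_bytes, dtype=np.float32)  # paFloat32 -> float32 samples
    return self.recognizer.transcribe(waveform)
```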
Click to view streaming implementation
class StreamingSpeechRecognizer:
    def __init__(self):
        self.recognizer = SpeechRecognizer()
        self.audio_config = speech.StreamingRecognitionConfig(
            config=speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=16000,
                language_code="en-US",
                enable_automatic_punctuation=True,
            ),