Speech Recognition Basics: Building a Modern Speech-to-Text System

Objective

Build a practical speech recognition system using modern deep learning approaches. You’ll learn to process audio data, implement speech-to-text conversion, and understand how to use both local models and cloud APIs for real-world applications.


Learning Outcomes

By completing this project, you will:

  • Master fundamental audio processing and feature extraction techniques
  • Implement modern speech recognition pipelines using deep learning
  • Learn to use industry-standard speech recognition APIs
  • Understand evaluation metrics specific to speech recognition
  • Gain practical experience with real-world audio data
  • Learn to handle challenges like noise and different accents

Skills Gained

  • Processing and analyzing audio data using modern libraries
  • Implementing speech recognition using deep learning approaches
  • Using cloud-based speech recognition APIs effectively
  • Building end-to-end audio processing pipelines
  • Evaluating speech recognition systems
  • Handling real-world audio challenges

Tools Required

# Core libraries
pip install torch torchaudio
pip install transformers
pip install librosa
pip install soundfile
pip install google-cloud-speech
pip install pyaudio  # may need the PortAudio system library installed first
pip install jiwer  # for WER calculation

# Visualization
pip install matplotlib
pip install seaborn
pip install pandas  # used by the evaluation utilities

Project Structure

speech_recognition/
│
├── data/
│   ├── LibriSpeech/
│   │   ├── train-clean-100/
│   │   └── test-clean/
│   └── custom_audio/
│
├── src/
│   ├── audio_processing.py
│   ├── feature_extraction.py
│   ├── model.py
│   ├── cloud_apis.py
│   └── evaluation.py
│
└── notebooks/
    ├── 1_audio_exploration.ipynb
    ├── 2_model_training.ipynb
    └── 3_evaluation.ipynb

Steps and Tasks

1. Data Acquisition and Setup

First, let’s download and set up the LibriSpeech dataset (the 100-hour clean training subset plus the clean test set; note the training download is several gigabytes):

import torchaudio

# Download LibriSpeech dataset (clean subset)
train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url="train-clean-100", download=True)
test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url="test-clean", download=True)

# Check dataset info
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Basic audio exploration:

import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

def explore_audio(waveform, sample_rate, title="Waveform"):
    """Plot the waveform and mel spectrogram of a mono signal (1-D numpy array)"""
    plt.figure(figsize=(15, 5))
    
    # Plot waveform
    plt.subplot(1, 2, 1)
    plt.plot(waveform)
    plt.title(title)  # use the caller-supplied title
    
    # Plot spectrogram
    plt.subplot(1, 2, 2)
    spec = librosa.feature.melspectrogram(y=waveform, sr=sample_rate)
    librosa.display.specshow(librosa.power_to_db(spec, ref=np.max),
                             sr=sample_rate, y_axis='mel', x_axis='time')
    plt.title("Mel Spectrogram")
    plt.colorbar(format='%+2.0f dB')
    
    plt.tight_layout()
    plt.show()
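
A minimal usage sketch for the helper above: torchaudio returns a [channels, samples] tensor, so squeeze it into a 1-D numpy array before handing it to librosa.

# Visualize the first training sample with the helper defined above
waveform, sample_rate, transcript, *_ = train_dataset[0]
explore_audio(waveform.squeeze().numpy(), sample_rate, title=transcript[:50])
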
Click to view advanced audio analysis
class AudioAnalyzer:
    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate
        
    def analyze_audio_file(self, file_path):
        """Comprehensive audio analysis"""
        # Load audio
        waveform, sr = librosa.load(file_path, sr=self.sample_rate)
        
        # Basic properties
        duration = librosa.get_duration(y=waveform, sr=sr)
        rms = librosa.feature.rms(y=waveform)
        
        # Extract features
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)
        
        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(waveform)
        
        return {
            'duration': duration,
            'rms_energy': rms[0],               # frame-wise RMS energy curve
            'mean_rms_energy': np.mean(rms),
            'mfccs': mfccs,
            'spectral_centroids': spectral_centroids,
            'spectral_rolloff': spectral_rolloff,
            'zero_crossing_rate': np.mean(zcr)
        }
    
    def plot_features(self, features):
        """Visualize extracted features"""
        fig, axes = plt.subplots(3, 1, figsize=(15, 10))
        
        # Plot MFCCs
        librosa.display.specshow(features['mfccs'], ax=axes[0])
        axes[0].set_title('MFCCs')
        
        # Plot spectral features
        axes[1].plot(features['spectral_centroids'][0])
        axes[1].set_title('Spectral Centroid')
        
        # Plot energy
        axes[2].plot(features['rms_energy'])
        axes[2].set_title('RMS Energy')
        
        plt.tight_layout()
        plt.show()

2. Audio Processing Pipeline

Create a robust audio processing pipeline:

import torch
import torchaudio

class AudioProcessor:
    def __init__(self, target_sample_rate=16000, duration=10):
        self.target_sample_rate = target_sample_rate
        self.duration = duration
        
    def preprocess_audio(self, waveform, sample_rate):
        """Basic audio preprocessing"""
        # Resample if needed
        if sample_rate != self.target_sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, self.target_sample_rate
            )
            
        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
            
        # Peak-normalize (the small epsilon avoids division by zero on silent clips)
        waveform = waveform / (waveform.abs().max() + 1e-8)
        
        return waveform
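
As a quick sanity check (a sketch assuming the dataset objects from step 1), run the preprocessing on a raw LibriSpeech sample:

processor = AudioProcessor(target_sample_rate=16000)

waveform, sample_rate, transcript, *_ = train_dataset[0]
clean = processor.preprocess_audio(waveform, sample_rate)
print(clean.shape, clean.abs().max())  # mono waveform, peak close to 1.0
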
Click to view advanced audio processing
import torch.nn as nn
import torchaudio

class AdvancedAudioProcessor:
    def __init__(self, target_sample_rate=16000, duration=10):
        self.target_sample_rate = target_sample_rate
        self.duration = duration
        
        # Initialize audio augmentation
        self.augmentation = nn.Sequential(
            torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
            torchaudio.transforms.TimeMasking(time_mask_param=100)
        )
        
    def apply_noise_reduction(self, waveform):
        """Apply noise reduction using spectral subtraction"""
        # TODO: implement spectral subtraction; return the input unchanged until then
        return waveform
    
    def apply_augmentation(self, waveform):
        """Apply audio augmentation"""
        return self.augmentation(waveform)
    
    def extract_features(self, waveform):
        """Extract audio features"""
        # Mel spectrogram
        mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.target_sample_rate,
            n_mels=128
        )(waveform)
        
        # Log-Mel spectrogram
        log_mel_spec = torchaudio.transforms.AmplitudeToDB()(mel_spec)
        
        return log_mel_spec

3. Model Implementation

We’ll use a pretrained Wav2Vec2 model from Hugging Face for speech recognition:

import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

class SpeechRecognizer:
    def __init__(self):
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        
    def transcribe(self, waveform):
        """Transcribe audio to text"""
        # Preprocess
        inputs = self.processor(
            waveform, 
            sampling_rate=16000, 
            return_tensors="pt", 
            padding=True
        )
        
        # Get logits
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits
            
        # Decode
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)
        
        return transcription[0]
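
An end-to-end sketch tying steps 1–3 together (the model weights download on first use, and the output may differ slightly from the reference transcript):

recognizer = SpeechRecognizer()

waveform, sample_rate, reference, *_ = test_dataset[0]
waveform = AudioProcessor().preprocess_audio(waveform, sample_rate)

# The processor expects a 1-D array per utterance
prediction = recognizer.transcribe(waveform.squeeze().numpy())
print("Reference: ", reference)
print("Prediction:", prediction)
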
Click to view advanced model implementations
class AdvancedSpeechRecognizer:
    def __init__(self, model_name="facebook/wav2vec2-large-960h-lv60-self"):
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        
    def transcribe_with_timestamps(self, waveform):
        """Transcribe audio with word-level timestamps"""
        # Implementation of CTC decoding with timestamps
        pass
    
    def transcribe_with_confidence(self, waveform):
        """Transcribe audio with per-frame confidence scores"""
        # Get logits and compute probabilities
        inputs = self.processor(waveform, sampling_rate=16000, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits
            probs = torch.softmax(logits, dim=-1)
            
        # Greedy decoding; the max probability per frame serves as a rough confidence
        predicted_ids = torch.argmax(logits, dim=-1)
        confidences = probs.max(dim=-1).values
        transcription = self.processor.batch_decode(predicted_ids)[0]
        
        return transcription, confidences

4. Cloud API Integration

Integrate the Google Cloud Speech-to-Text API (this requires a Google Cloud project with the Speech-to-Text API enabled and service-account credentials):

from google.cloud import speech

def transcribe_audio_google(audio_path):
    """Transcribe audio using Google Cloud Speech-to-Text"""
    client = speech.SpeechClient()
    
    # Read audio file
    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()
        
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    
    # Perform transcription
    response = client.recognize(config=config, audio=audio)
    
    return " ".join(result.alternatives[0].transcript 
                   for result in response.results)
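
A hedged usage sketch: the client picks up credentials from the GOOGLE_APPLICATION_CREDENTIALS environment variable, and sample.wav is a hypothetical 16 kHz mono LINEAR16 recording:

import os

# Path to your service-account key (illustrative)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/service_account.json"

print(transcribe_audio_google("data/custom_audio/sample.wav"))

Note that the synchronous recognize call is intended for short clips (up to about one minute); longer recordings should go through long_running_recognize or the streaming API.
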
Click to view multi-API implementation
class CloudSpeechRecognizer:
    def __init__(self):
        self.google_client = speech.SpeechClient()
        # Initialize other API clients as needed
        
    def transcribe_google(self, audio_path, language="en-US"):
        """Transcribe using Google Cloud"""
        # Implementation as above
        
    def transcribe_azure(self, audio_path, language="en-US"):
        """Transcribe using Azure Speech Services"""
        # Azure implementation
        
    def transcribe_aws(self, audio_path, language="en-US"):
        """Transcribe using Amazon Transcribe"""
        # AWS implementation
        
    def transcribe_multiple(self, audio_path):
        """Transcribe using multiple APIs and compare results"""
        results = {
            'google': self.transcribe_google(audio_path),
            'azure': self.transcribe_azure(audio_path),
            'aws': self.transcribe_aws(audio_path)
        }
        return results

5. Evaluation

Evaluate transcriptions with standard metrics: word error rate (WER), match error rate (MER), and word information lost (WIL):

from jiwer import wer, mer, wil

def evaluate_transcription(reference, hypothesis):
    """Evaluate transcription using multiple metrics"""
    metrics = {
        'WER': wer(reference, hypothesis),
        'MER': mer(reference, hypothesis),
        'WIL': wil(reference, hypothesis)
    }
    
    return metrics

# Example usage
reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown fox jumps over the lazy"
metrics = evaluate_transcription(reference, hypothesis)
print(metrics)
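
To connect evaluation back to the model from step 3, the sketch below (assuming SpeechRecognizer, AudioProcessor, and test_dataset from the earlier steps) computes the average WER over a handful of test clips; jiwer is case-sensitive, so both sides are lowercased first:

recognizer = SpeechRecognizer()
audio_processor = AudioProcessor()

scores = []
for i in range(5):  # a handful of clips keeps this quick on CPU
    waveform, sample_rate, reference, *_ = test_dataset[i]
    waveform = audio_processor.preprocess_audio(waveform, sample_rate)
    hypothesis = recognizer.transcribe(waveform.squeeze().numpy())
    scores.append(wer(reference.lower(), hypothesis.lower()))

print(f"Average WER over {len(scores)} clips: {sum(scores) / len(scores):.3f}")
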
Click to view advanced evaluation tools
import numpy as np
import pandas as pd

class TranscriptionEvaluator:
    def __init__(self):
        self.metrics_history = []
        
    def evaluate_single(self, reference, hypothesis):
        """Evaluate one reference/hypothesis pair (reuses evaluate_transcription above)"""
        return evaluate_transcription(reference, hypothesis)
        
    def evaluate_batch(self, references, hypotheses):
        """Evaluate batch of transcriptions"""
        results = []
        for ref, hyp in zip(references, hypotheses):
            metrics = self.evaluate_single(ref, hyp)
            results.append(metrics)
        
        # Compute average metrics
        avg_metrics = {
            metric: np.mean([r[metric] for r in results])
            for metric in results[0].keys()
        }
        
        self.metrics_history.append(avg_metrics)
        return avg_metrics
    
    def plot_metrics_history(self):
        """Plot metrics over time"""
        metrics_df = pd.DataFrame(self.metrics_history)
        
        plt.figure(figsize=(12, 6))
        for metric in metrics_df.columns:
            plt.plot(metrics_df[metric], label=metric)
            
        plt.title('Transcription Metrics Over Time')
        plt.xlabel('Batch')
        plt.ylabel('Score')
        plt.legend()
        plt.show()

6. Real-Time Speech Recognition

Implement real-time speech recognition:

import pyaudio
import wave

class RealtimeSpeechRecognizer:
    def __init__(self):
        self.recognizer = SpeechRecognizer()
        self.chunk = 1024
        self.format = pyaudio.paFloat32
        self.channels = 1
        self.rate = 16000
        
    def record_audio(self, seconds=5):
        """Record audio from microphone"""
        p = pyaudio.PyAudio()
        
        stream = p.open(format=self.format,
                       channels=self.channels,
                       rate=self.rate,
                       input=True,
                       frames_per_buffer=self.chunk)
        
        frames = []
        for _ in range(0, int(self.rate / self.chunk * seconds)):
            data = stream.read(self.chunk)
            frames.append(data)
            
        stream.stop_stream()
        stream.close()
        p.terminate()
        
        return b''.join(frames)
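
The recorder returns raw paFloat32 bytes, so a little glue code is needed before they can be fed to the Wav2Vec2 recognizer. A sketch of that conversion (microphone quality and background noise will strongly affect the output):

import numpy as np

realtime = RealtimeSpeechRecognizer()
raw_bytes = realtime.record_audio(seconds=5)

# paFloat32 frames -> 1-D float32 numpy array in [-1, 1]
audio = np.frombuffer(raw_bytes, dtype=np.float32)

print(realtime.recognizer.transcribe(audio))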

Click to view streaming implementation

from google.cloud import speech

class StreamingSpeechRecognizer:
    def __init__(self):
        self.recognizer = SpeechRecognizer()
        self.audio_config = speech.StreamingRecognitionConfig(
            config=speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=16000,
                language_code="en-US",
                enable_automatic_punctuation=True,
            ),
            interim_results=True,  # return partial hypotheses while audio is still streaming
        )