Restoring voice_codec.py

2025-07-07 00:47:17 +02:00 · 2025-07-07 00:47:17 +02:00 · 07361b8448
commit 07361b8448
parent f8a7aa0147
1 changed files with 714 additions and 0 deletions
--- a/protocol_prototype/DryBox/voice_codec.py
+++ b/protocol_prototype/DryBox/voice_codec.py
@@ -0,0 +1,714 @@
+"""
+Voice codec integration for encrypted voice over GSM.
+Implements Codec2 compression with FSK modulation for transmitting
+encrypted voice data over standard GSM voice channels.
+"""
+
+import array
+import math
+import struct
+from typing import Optional, Tuple, List
+from dataclasses import dataclass
+from enum import IntEnum
+
+try:
+    import numpy as np
+    HAS_NUMPY = True
+except ImportError:
+    HAS_NUMPY = False
+
+# ANSI colors
+# Terminal escape sequences used to colorize the "[VOICE]" log prefixes and
+# the section banners printed by this module.
+RED = "\033[91m"  # used for the decryption-failure message
+GREEN = "\033[92m"  # used for the initialization message
+YELLOW = "\033[93m"  # used for the low-confidence warning
+BLUE = "\033[94m"  # used for the self-test section banners
+RESET = "\033[0m"  # switches the terminal back to its default attributes
+
+
+# Identifiers for the Codec2 bitrates this module can simulate. Each member
+# is a key into Codec2Wrapper.FRAME_BITS and Codec2Wrapper.FRAME_MS, and its
+# integer value is what gets packed into the first header byte of a voice
+# frame before encryption, so renumbering members changes the wire format.
+class Codec2Mode(IntEnum):
+    """Codec2 bitrate modes."""
+    MODE_3200 = 0  # 3200 bps
+    MODE_2400 = 1  # 2400 bps
+    MODE_1600 = 2  # 1600 bps
+    MODE_1400 = 3  # 1400 bps
+    MODE_1300 = 4  # 1300 bps
+    MODE_1200 = 5  # 1200 bps (recommended for robustness)
+    MODE_700C = 6  # 700 bps
+    # NOTE(review): libcodec2 is believed to number 700C as 8 rather than 6;
+    # confirm before passing these values to real codec bindings.
+
+
+# One compressed voice frame plus the metadata needed to decode it.
+@dataclass
+class Codec2Frame:
+    """Represents a single Codec2 compressed voice frame."""
+    # Bitrate mode the payload was produced with; the decoder rejects a
+    # frame whose mode differs from its own.
+    mode: Codec2Mode
+    # Compressed payload, padded to a whole number of bytes.
+    bits: bytes
+    # Seconds since the encoder's first frame; set to 0 when the frame is
+    # rebuilt on the receiving side.
+    timestamp: float
+    # Encoder's running frame counter, starting at 0.
+    frame_number: int
+
+
+class Codec2Wrapper:
+    """
+    Wrapper for Codec2 voice codec.
+    In production, this would use py_codec2 or ctypes bindings to libcodec2.
+    This is a simulation interface for protocol development.
+    """
+
+    # Frame sizes in bits for each mode
+    FRAME_BITS = {
+        Codec2Mode.MODE_3200: 64,
+        Codec2Mode.MODE_2400: 48,
+        Codec2Mode.MODE_1600: 64,
+        Codec2Mode.MODE_1400: 56,
+        Codec2Mode.MODE_1300: 52,
+        Codec2Mode.MODE_1200: 48,
+        Codec2Mode.MODE_700C: 28
+    }
+
+    # Frame duration in ms
+    FRAME_MS = {
+        Codec2Mode.MODE_3200: 20,
+        Codec2Mode.MODE_2400: 20,
+        Codec2Mode.MODE_1600: 40,
+        Codec2Mode.MODE_1400: 40,
+        Codec2Mode.MODE_1300: 40,
+        Codec2Mode.MODE_1200: 40,
+        Codec2Mode.MODE_700C: 40
+    }
+
+    def __init__(self, mode: Codec2Mode = Codec2Mode.MODE_1200):
+        """
+        Initialize Codec2 wrapper.
+
+        Args:
+            mode: Codec2 bitrate mode (default 1200 bps for robustness)
+        """
+        self.mode = mode
+        self.frame_bits = self.FRAME_BITS[mode]
+        self.frame_bytes = (self.frame_bits + 7) // 8
+        self.frame_ms = self.FRAME_MS[mode]
+        self.frame_samples = int(8000 * self.frame_ms / 1000)  # 8kHz sampling
+        self.frame_counter = 0
+
+        # Quiet initialization - no print
+
+    def encode(self, audio_samples) -> Optional[Codec2Frame]:
+        """
+        Encode PCM audio samples to Codec2 frame.
+
+        Args:
+            audio_samples: PCM samples (8kHz, 16-bit signed)
+
+        Returns:
+            Codec2Frame or None if insufficient samples
+        """
+        if len(audio_samples) < self.frame_samples:
+            return None
+
+        # In production: call codec2_encode(state, bits, samples)
+        # Simulation: create pseudo-compressed data
+        compressed = self._simulate_compression(audio_samples[:self.frame_samples])
+
+        frame = Codec2Frame(
+            mode=self.mode,
+            bits=compressed,
+            timestamp=self.frame_counter * self.frame_ms / 1000.0,
+            frame_number=self.frame_counter
+        )
+
+        self.frame_counter += 1
+        return frame
+
+    def decode(self, frame: Codec2Frame):
+        """
+        Decode Codec2 frame to PCM audio samples.
+
+        Args:
+            frame: Codec2 compressed frame
+
+        Returns:
+            PCM samples (8kHz, 16-bit signed)
+        """
+        if frame.mode != self.mode:
+            raise ValueError(f"Frame mode {frame.mode} doesn't match decoder mode {self.mode}")
+
+        # In production: call codec2_decode(state, samples, bits)
+        # Simulation: decompress to audio
+        return self._simulate_decompression(frame.bits)
+
+    def _simulate_compression(self, samples) -> bytes:
+        """Simulate Codec2 compression (for testing)."""
+        # Convert to list if needed
+        if hasattr(samples, 'tolist'):
+            sample_list = samples.tolist()
+        elif hasattr(samples, '__iter__'):
+            sample_list = list(samples)
+        else:
+            sample_list = samples
+
+        # Extract basic features for simulation
+        if HAS_NUMPY and hasattr(samples, '__array__'):
+            # Convert to numpy array if needed
+            np_samples = np.asarray(samples, dtype=np.float32)
+            if len(np_samples) > 0:
+                mean_square = np.mean(np_samples ** 2)
+                energy = np.sqrt(mean_square) if not np.isnan(mean_square) else 0.0
+                zero_crossings = np.sum(np.diff(np.sign(np_samples)) != 0)
+            else:
+                energy = 0.0
+                zero_crossings = 0
+        else:
+            # Manual calculation without numpy
+            if sample_list and len(sample_list) > 0:
+                energy = math.sqrt(sum(s**2 for s in sample_list) / len(sample_list))
+                zero_crossings = sum(1 for i in range(1, len(sample_list))
+                                   if (sample_list[i-1] >= 0) != (sample_list[i] >= 0))
+            else:
+                energy = 0.0
+                zero_crossings = 0
+
+        # Pack into bytes (simplified)
+        # Ensure values are valid
+        energy_int = max(0, min(65535, int(energy)))
+        zc_int = max(0, min(65535, int(zero_crossings)))
+        data = struct.pack('<HH', energy_int, zc_int)
+
+        # Pad to expected frame size
+        data += b'\x00' * (self.frame_bytes - len(data))
+
+        return data[:self.frame_bytes]
+
+    def _simulate_decompression(self, compressed: bytes):
+        """Simulate Codec2 decompression (for testing)."""
+        # Unpack features
+        if len(compressed) >= 4:
+            energy, zero_crossings = struct.unpack('<HH', compressed[:4])
+        else:
+            energy, zero_crossings = 1000, 100
+
+        # Generate synthetic speech-like signal
+        if HAS_NUMPY:
+            t = np.linspace(0, self.frame_ms/1000, self.frame_samples)
+
+            # Base frequency from zero crossings
+            freq = zero_crossings * 10  # Simplified mapping
+
+            # Generate harmonics
+            signal = np.zeros(self.frame_samples)
+            for harmonic in range(1, 4):
+                signal += np.sin(2 * np.pi * freq * harmonic * t) / harmonic
+
+            # Apply energy envelope
+            signal *= energy / 10000.0
+
+            # Convert to 16-bit PCM
+            return (signal * 32767).astype(np.int16)
+        else:
+            # Manual generation without numpy
+            samples = []
+            freq = zero_crossings * 10
+
+            for i in range(self.frame_samples):
+                t = i / 8000.0  # 8kHz sample rate
+                value = 0
+                for harmonic in range(1, 4):
+                    value += math.sin(2 * math.pi * freq * harmonic * t) / harmonic
+
+                value *= energy / 10000.0
+                # Clamp to 16-bit range
+                sample = int(value * 32767)
+                sample = max(-32768, min(32767, sample))
+                samples.append(sample)
+
+            return array.array('h', samples)
+
+
+class FSKModem:
+    """
+    4-FSK modem for transmitting digital data over voice channels.
+    Designed to survive GSM/AMR/EVS vocoders.
+    """
+
+    def __init__(self, sample_rate: int = 8000, baud_rate: int = 600):
+        """
+        Initialize FSK modem.
+
+        Args:
+            sample_rate: Audio sample rate (Hz)
+            baud_rate: Symbol rate (baud)
+        """
+        self.sample_rate = sample_rate
+        self.baud_rate = baud_rate
+        self.samples_per_symbol = int(sample_rate / baud_rate)
+
+        # 4-FSK frequencies (300-3400 Hz band)
+        self.frequencies = [
+            600,   # 00
+            1200,  # 01
+            1800,  # 10
+            2400   # 11
+        ]
+
+        # Preamble for synchronization (800 Hz, 100ms)
+        self.preamble_freq = 800
+        self.preamble_duration = 0.1  # seconds
+
+        # Quiet initialization - no print
+
+    def modulate(self, data: bytes, add_preamble: bool = True):
+        """
+        Modulate binary data to FSK audio signal.
+
+        Args:
+            data: Binary data to modulate
+            add_preamble: Whether to add synchronization preamble
+
+        Returns:
+            Audio signal (normalized float32 array or list)
+        """
+        # Convert bytes to dibits (2-bit symbols)
+        symbols = []
+        for byte in data:
+            symbols.extend([
+                (byte >> 6) & 0x03,
+                (byte >> 4) & 0x03,
+                (byte >> 2) & 0x03,
+                byte & 0x03
+            ])
+
+        # Generate audio signal
+        signal = []
+
+        # Add preamble
+        if add_preamble:
+            preamble_samples = int(self.preamble_duration * self.sample_rate)
+            if HAS_NUMPY:
+                t = np.arange(preamble_samples) / self.sample_rate
+                preamble = np.sin(2 * np.pi * self.preamble_freq * t)
+                signal.extend(preamble)
+            else:
+                for i in range(preamble_samples):
+                    t = i / self.sample_rate
+                    value = math.sin(2 * math.pi * self.preamble_freq * t)
+                    signal.append(value)
+
+        # Modulate symbols
+        for symbol in symbols:
+            freq = self.frequencies[symbol]
+            if HAS_NUMPY:
+                t = np.arange(self.samples_per_symbol) / self.sample_rate
+                tone = np.sin(2 * np.pi * freq * t)
+                signal.extend(tone)
+            else:
+                for i in range(self.samples_per_symbol):
+                    t = i / self.sample_rate
+                    value = math.sin(2 * math.pi * freq * t)
+                    signal.append(value)
+
+        # Apply smoothing to reduce clicks
+        if HAS_NUMPY:
+            audio = np.array(signal, dtype=np.float32)
+        else:
+            audio = array.array('f', signal)
+        audio = self._apply_envelope(audio)
+
+        return audio
+
+    def demodulate(self, audio) -> Tuple[bytes, float]:
+        """
+        Demodulate FSK audio signal to binary data.
+
+        Args:
+            audio: Audio signal
+
+        Returns:
+            Tuple of (demodulated data, confidence score)
+        """
+        # Find preamble
+        preamble_start = self._find_preamble(audio)
+        if preamble_start < 0:
+            return b'', 0.0
+
+        # Skip preamble
+        data_start = preamble_start + int(self.preamble_duration * self.sample_rate)
+
+        # Demodulate symbols
+        symbols = []
+        confidence_scores = []
+
+        pos = data_start
+        while pos + self.samples_per_symbol <= len(audio):
+            symbol_audio = audio[pos:pos + self.samples_per_symbol]
+            symbol, confidence = self._demodulate_symbol(symbol_audio)
+            symbols.append(symbol)
+            confidence_scores.append(confidence)
+            pos += self.samples_per_symbol
+
+        # Convert symbols to bytes
+        data = bytearray()
+        for i in range(0, len(symbols), 4):
+            if i + 3 < len(symbols):
+                byte = (symbols[i] << 6) | (symbols[i+1] << 4) | (symbols[i+2] << 2) | symbols[i+3]
+                data.append(byte)
+
+        if HAS_NUMPY and confidence_scores:
+            avg_confidence = np.mean(confidence_scores)
+        else:
+            avg_confidence = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0.0
+        return bytes(data), avg_confidence
+
+    def _find_preamble(self, audio) -> int:
+        """Find preamble in audio signal."""
+        # Simple energy-based detection
+        window_size = int(0.01 * self.sample_rate)  # 10ms window
+
+        if HAS_NUMPY:
+            for i in range(0, len(audio) - window_size, window_size // 2):
+                window = audio[i:i + window_size]
+
+                # Check for preamble frequency
+                fft = np.fft.fft(window)
+                freqs = np.fft.fftfreq(len(window), 1/self.sample_rate)
+
+                # Find peak near preamble frequency
+                idx = np.argmax(np.abs(fft[:len(fft)//2]))
+                peak_freq = abs(freqs[idx])
+
+                if abs(peak_freq - self.preamble_freq) < 50:  # 50 Hz tolerance
+                    return i
+        else:
+            # Simple zero-crossing based detection without FFT
+            for i in range(0, len(audio) - window_size, window_size // 2):
+                window = list(audio[i:i + window_size])
+
+                # Count zero crossings
+                zero_crossings = 0
+                for j in range(1, len(window)):
+                    if (window[j-1] >= 0) != (window[j] >= 0):
+                        zero_crossings += 1
+
+                # Estimate frequency from zero crossings
+                estimated_freq = (zero_crossings * self.sample_rate) / (2 * len(window))
+
+                if abs(estimated_freq - self.preamble_freq) < 100:  # 100 Hz tolerance
+                    return i
+
+        return -1
+
+    def _demodulate_symbol(self, audio) -> Tuple[int, float]:
+        """Demodulate a single FSK symbol."""
+        if HAS_NUMPY:
+            # FFT-based demodulation
+            fft = np.fft.fft(audio)
+            freqs = np.fft.fftfreq(len(audio), 1/self.sample_rate)
+            magnitude = np.abs(fft[:len(fft)//2])
+
+            # Find energy at each FSK frequency
+            energies = []
+            for freq in self.frequencies:
+                idx = np.argmin(np.abs(freqs[:len(freqs)//2] - freq))
+                energy = magnitude[idx]
+                energies.append(energy)
+
+            # Select symbol with highest energy
+            symbol = np.argmax(energies)
+        else:
+            # Goertzel algorithm for specific frequency detection
+            audio_list = list(audio) if hasattr(audio, '__iter__') else audio
+            energies = []
+
+            for freq in self.frequencies:
+                # Goertzel algorithm
+                omega = 2 * math.pi * freq / self.sample_rate
+                coeff = 2 * math.cos(omega)
+
+                s_prev = 0
+                s_prev2 = 0
+
+                for sample in audio_list:
+                    s = sample + coeff * s_prev - s_prev2
+                    s_prev2 = s_prev
+                    s_prev = s
+
+                # Calculate magnitude
+                power = s_prev2 * s_prev2 + s_prev * s_prev - coeff * s_prev * s_prev2
+                energies.append(math.sqrt(abs(power)))
+
+            # Select symbol with highest energy
+            symbol = energies.index(max(energies))
+
+        # Confidence is ratio of strongest to second strongest
+        sorted_energies = sorted(energies, reverse=True)
+        confidence = sorted_energies[0] / (sorted_energies[1] + 1e-6)
+
+        return symbol, min(confidence, 10.0) / 10.0
+
+    def _apply_envelope(self, audio):
+        """Apply smoothing envelope to reduce clicks."""
+        # Simple raised cosine envelope
+        ramp_samples = int(0.002 * self.sample_rate)  # 2ms ramps
+
+        if len(audio) > 2 * ramp_samples:
+            if HAS_NUMPY:
+                # Fade in
+                t = np.linspace(0, np.pi/2, ramp_samples)
+                audio[:ramp_samples] *= np.sin(t) ** 2
+
+                # Fade out
+                audio[-ramp_samples:] *= np.sin(t[::-1]) ** 2
+            else:
+                # Manual fade in
+                for i in range(ramp_samples):
+                    t = (i / ramp_samples) * (math.pi / 2)
+                    factor = math.sin(t) ** 2
+                    audio[i] *= factor
+
+                # Manual fade out
+                for i in range(ramp_samples):
+                    t = ((ramp_samples - 1 - i) / ramp_samples) * (math.pi / 2)
+                    factor = math.sin(t) ** 2
+                    audio[-(i+1)] *= factor
+
+        return audio
+
+
+class VoiceProtocol:
+    """
+    Integrates voice codec and modem with the Icing protocol
+    for encrypted voice transmission over GSM.
+    """
+
+    def __init__(self, protocol_instance):
+        """
+        Initialize voice protocol handler.
+
+        Args:
+            protocol_instance: IcingProtocol instance
+        """
+        self.protocol = protocol_instance
+        self.codec = Codec2Wrapper(Codec2Mode.MODE_1200)
+        self.modem = FSKModem(sample_rate=8000, baud_rate=600)
+
+        # Voice crypto state
+        self.voice_iv_counter = 0
+        self.voice_sequence = 0
+
+        # Buffers
+        if HAS_NUMPY:
+            self.audio_buffer = np.array([], dtype=np.int16)
+        else:
+            self.audio_buffer = array.array('h')  # 16-bit signed integers
+        self.frame_buffer = []
+
+        print(f"{GREEN}[VOICE]{RESET} Voice protocol initialized")
+
+    def process_voice_input(self, audio_samples):
+        """
+        Process voice input: compress, encrypt, and modulate.
+
+        Args:
+            audio_samples: PCM audio samples (8kHz, 16-bit)
+
+        Returns:
+            Modulated audio signal ready for transmission (numpy array or array.array)
+        """
+        # Add to buffer
+        if HAS_NUMPY:
+            self.audio_buffer = np.concatenate([self.audio_buffer, audio_samples])
+        else:
+            self.audio_buffer.extend(audio_samples)
+
+        # Process complete frames
+        modulated_audio = []
+
+        while len(self.audio_buffer) >= self.codec.frame_samples:
+            # Extract frame
+            if HAS_NUMPY:
+                frame_audio = self.audio_buffer[:self.codec.frame_samples]
+                self.audio_buffer = self.audio_buffer[self.codec.frame_samples:]
+            else:
+                frame_audio = array.array('h', self.audio_buffer[:self.codec.frame_samples])
+                del self.audio_buffer[:self.codec.frame_samples]
+
+            # Compress with Codec2
+            compressed_frame = self.codec.encode(frame_audio)
+            if not compressed_frame:
+                continue
+
+            # Encrypt frame
+            encrypted = self._encrypt_voice_frame(compressed_frame)
+
+            # Add FEC
+            protected = self._add_fec(encrypted)
+
+            # Modulate to audio
+            audio_signal = self.modem.modulate(protected, add_preamble=True)
+            modulated_audio.append(audio_signal)
+
+        if modulated_audio:
+            if HAS_NUMPY:
+                return np.concatenate(modulated_audio)
+            else:
+                # Concatenate array.array objects
+                result = array.array('f')
+                for audio in modulated_audio:
+                    result.extend(audio)
+                return result
+        return None
+
+    def process_voice_output(self, modulated_audio):
+        """
+        Process received audio: demodulate, decrypt, and decompress.
+
+        Args:
+            modulated_audio: Received FSK-modulated audio
+
+        Returns:
+            Decoded PCM audio samples (numpy array or array.array)
+        """
+        # Demodulate
+        data, confidence = self.modem.demodulate(modulated_audio)
+
+        if confidence < 0.5:
+            print(f"{YELLOW}[VOICE]{RESET} Low demodulation confidence: {confidence:.2f}")
+            return None
+
+        # Remove FEC
+        frame_data = self._remove_fec(data)
+        if not frame_data:
+            return None
+
+        # Decrypt
+        compressed_frame = self._decrypt_voice_frame(frame_data)
+        if not compressed_frame:
+            return None
+
+        # Decompress
+        audio_samples = self.codec.decode(compressed_frame)
+
+        return audio_samples
+
+    def _encrypt_voice_frame(self, frame: Codec2Frame) -> bytes:
+        """Encrypt a voice frame using ChaCha20-CTR."""
+        if not self.protocol.hkdf_key:
+            raise ValueError("No encryption key available")
+
+        # Prepare frame data
+        frame_data = struct.pack('<BIH',
+            frame.mode,
+            frame.frame_number,
+            len(frame.bits)
+        ) + frame.bits
+
+        # Generate IV for this frame (ChaCha20 needs 16 bytes)
+        iv = struct.pack('<Q', self.voice_iv_counter) + b'\x00' * 8  # 8 + 8 = 16 bytes
+        self.voice_iv_counter += 1
+
+        # Encrypt using ChaCha20
+        from encryption import chacha20_encrypt
+        key = bytes.fromhex(self.protocol.hkdf_key)
+        encrypted = chacha20_encrypt(frame_data, key, iv)
+
+        # Add sequence number and IV hint
+        return struct.pack('<HQ', self.voice_sequence, self.voice_iv_counter) + encrypted
+
+    def _decrypt_voice_frame(self, data: bytes) -> Optional[Codec2Frame]:
+        """Decrypt a voice frame."""
+        if len(data) < 10:
+            return None
+
+        # Extract sequence and IV hint
+        sequence, iv_hint = struct.unpack('<HQ', data[:10])
+        encrypted = data[10:]
+
+        # Generate IV (16 bytes for ChaCha20)
+        iv = struct.pack('<Q', iv_hint) + b'\x00' * 8
+
+        # Decrypt
+        from encryption import chacha20_decrypt
+        key = bytes.fromhex(self.protocol.hkdf_key)
+
+        try:
+            decrypted = chacha20_decrypt(encrypted, key, iv)
+
+            # Parse frame
+            mode, frame_num, bits_len = struct.unpack('<BIH', decrypted[:7])
+            bits = decrypted[7:7+bits_len]
+
+            return Codec2Frame(
+                mode=Codec2Mode(mode),
+                bits=bits,
+                timestamp=0,  # Will be set by caller
+                frame_number=frame_num
+            )
+        except Exception as e:
+            print(f"{RED}[VOICE]{RESET} Decryption failed: {e}")
+            return None
+
+    def _add_fec(self, data: bytes) -> bytes:
+        """Add forward error correction."""
+        # Simple repetition code (3x) for testing
+        # In production: use convolutional code or LDPC
+        fec_data = bytearray()
+
+        for byte in data:
+            # Repeat each byte 3 times
+            fec_data.extend([byte, byte, byte])
+
+        return bytes(fec_data)
+
+    def _remove_fec(self, data: bytes) -> Optional[bytes]:
+        """Remove FEC and correct errors."""
+        if len(data) % 3 != 0:
+            return None
+
+        corrected = bytearray()
+
+        for i in range(0, len(data), 3):
+            # Majority voting
+            votes = [data[i], data[i+1], data[i+2]]
+            byte_value = max(set(votes), key=votes.count)
+            corrected.append(byte_value)
+
+        return bytes(corrected)
+
+
+# Example usage
+if __name__ == "__main__":
+    # Test Codec2 wrapper
+    print(f"\n{BLUE}=== Testing Codec2 Wrapper ==={RESET}")
+    codec = Codec2Wrapper(Codec2Mode.MODE_1200)
+
+    # Generate test audio
+    if HAS_NUMPY:
+        t = np.linspace(0, 0.04, 320)  # 40ms at 8kHz
+        test_audio = (np.sin(2 * np.pi * 440 * t) * 16384).astype(np.int16)
+    else:
+        test_audio = array.array('h')
+        for i in range(320):
+            t = i * 0.04 / 320
+            value = int(math.sin(2 * math.pi * 440 * t) * 16384)
+            test_audio.append(value)
+
+    # Encode
+    frame = codec.encode(test_audio)
+    print(f"Encoded frame: {len(frame.bits)} bytes")
+
+    # Decode
+    decoded = codec.decode(frame)
+    print(f"Decoded audio: {len(decoded)} samples")
+
+    # Test FSK modem
+    print(f"\n{BLUE}=== Testing FSK Modem ==={RESET}")
+    modem = FSKModem()
+
+    # Test data
+    test_data = b"Hello, secure voice!"
+
+    # Modulate
+    modulated = modem.modulate(test_data)
+    print(f"Modulated: {len(modulated)} samples ({len(modulated)/8000:.2f}s)")
+
+    # Demodulate
+    demodulated, confidence = modem.demodulate(modulated)
+    print(f"Demodulated: {demodulated}")
+    print(f"Confidence: {confidence:.2%}")
+    print(f"Match: {demodulated == test_data}")