From: will Date: Sun, 25 Aug 2024 03:48:42 +0000 (+0100) Subject: Optimization and notation X-Git-Url: https://git.ozva.co.uk/?a=commitdiff_plain;h=86bfb3d7f3f36f670c70016819c4b528d468a661;p=audio-over-stft Optimization and notation - added notes to fft.py - added lookup table for dB to pwr transformation for large speedup --- diff --git a/.gitignore b/.gitignore index db7f2ac..2bdcf07 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.npy *.jpg *.wav +__pycache__* diff --git a/camera.py b/camera.py index 2539baf..64c77d2 100644 --- a/camera.py +++ b/camera.py @@ -21,7 +21,7 @@ class VideoCapture: # read frames as soon as they are available, keeping only most recent one def reader(self): - while True: + while 1: ret, frame = self.camera.read() if not ret: break diff --git a/fft.py b/fft.py index c74c4dc..1685134 100644 --- a/fft.py +++ b/fft.py @@ -1,5 +1,16 @@ import numpy as np +import math +import time import cv2 as cv +import matplotlib.pyplot as plt + +""" +Notes: + The data is received in int16 format (limits -32768 to 32767). It is converted to a complex array from which we can work out power. The power ranges from 0 to over 10,000. It is converted to decibels [1]. + These decibel values can be bounded to some upper and lower limit of both volume and noise floor. This also makes our calculations more predictable. + +[1] We convert to decibels via the function 20*log10(power) followed by some scaling to the required limits. We convert BACK to power via a lookup table. This is not efficient for the first conversion as the lookup table would have to be huge and have floating-point indices.
+""" class fft(): def __init__( @@ -7,46 +18,66 @@ class fft(): window_size: int, hop_size: int ): + # calculate the window and hop size, use to calulate the cosine window self.window_size = window_size self.hop_size = hop_size self.window = np.hanning(window_size) - self.lower_limit = -40 - self.upper_limit = 100 + # set the max and min numerical values for amplitude and angle to allow for easier combinations of them both self.amplitude_max = 254 self.amplitude_min = 0 self.angle_max = 254 self.angle_min = 0 + # set the upper and lower limits (in dB) that are to be displayed on the screen + self.volume_max = 100 + self.volume_min = -40 + # calulate the range of each amplitude and angle self.amplitude_relative = self.amplitude_max - self.amplitude_min self.angle_relative = self.angle_max - self.angle_min + self.volume_relative = self.volume_max - self.volume_min + + # generate lookup table for the converstion from decibels to power + a = self.volume_min + b = self.volume_relative / self.amplitude_relative + # this is the parameterized inverted function of y = (20 * log10(x) - 40) * (255/140) + log_lookup = [10 ** (((x * b) + a) / 20) for x in range(0, 256)] + self.log_lookup = np.array(log_lookup) def stft( self, data: np.ndarray ) -> np.ndarray: + # apply window and perform the fft segment = data * self.window spectrum = np.fft.fft(segment) / self.window_size + # convert the vector length to decimals and confine amplitude = np.abs(spectrum) + amplitude = 20*np.log10(amplitude) - amplitude = np.clip(amplitude, self.lower_limit, self.upper_limit) - amplitude -= self.lower_limit - amplitude *= (self.amplitude_relative / self.upper_limit) + self.amplitude_min - amplitude = np.clip(amplitude, self.amplitude_min, self.amplitude_max) + amplitude = np.clip(amplitude, self.volume_min, self.volume_max) + + # confine the amplitude within the limits specified + a = self.volume_min + b = self.amplitude_relative / self.volume_relative + c = self.amplitude_min + amplitude = 
((amplitude - a) * b) + c + # convert x and y to the angle and confine angle = np.angle(spectrum) + + # confine the angle within the limits specified angle = ((angle + np.pi) * (self.angle_relative / (2 * np.pi))) + self.angle_min angle = np.clip(angle, self.angle_min, self.angle_max) + # rearrange to image format full = np.full(angle.shape, fill_value=60) - image = np.stack((full, angle, amplitude), axis=-1) image = np.array([image], dtype=np.uint8) - image = cv.cvtColor(image, cv.COLOR_HSV2BGR) return image @@ -56,24 +87,23 @@ class fft(): image: np.ndarray ) -> np.ndarray: + # split the image into constituant parts image = cv.cvtColor(image, cv.COLOR_BGR2HSV) - - amplitude = image[0][...,2].astype(np.float64) + amplitude = image[0][...,2].astype(np.uint8) angle = image[0][...,1].astype(np.float64) - amplitude -= self.amplitude_min - amplitude /= (self.amplitude_relative / self.upper_limit) - amplitude += self.lower_limit - amplitude = np.power(10, amplitude / 20) + # convert amplitude back into vector length + amplitude = self.log_lookup[amplitude] + # convert angle back into x and y angle = ((angle - self.angle_min) / (self.angle_relative / (2 * np.pi))) - np.pi + # rearrange back into fft result real = np.cos(angle) * amplitude imag = np.sin(angle) * amplitude segment = real + (1j * imag) - data = np.fft.ifft(segment * self.window_size).real + data = np.fft.ifft(segment * self.window_size).real.astype(np.int16) return data - diff --git a/loop.py b/loop.py index 90c5e27..865078c 100644 --- a/loop.py +++ b/loop.py @@ -40,15 +40,13 @@ camera = camera( (1840, 1000), device_id = 2, debug = False, - dummy = False, + dummy = True, use_lookup = False, use_files = True ) camera.calibrate() -camera.get_lookup() - transform = fft(window_size, hop_size) segment_samples = window_height * hop_size @@ -94,7 +92,6 @@ try: if segment_index == 10: cv.imwrite("sample.jpg", spectrum) camera.display(spectrum) - time.sleep(0.1) capture = camera.capture() rows = [np.array([i]) for i 
in capture] @@ -105,8 +102,6 @@ try: audio = np.zeros((hop_size,), dtype=np.int16) for row in recovered: - row = row.astype(np.int16) - audio[-hop_size:] += row[:hop_size] audio = np.append(audio, row[hop_size:]) @@ -114,6 +109,7 @@ try: if segment_index == segment_count: segment_index = 0 slept = 0 + time.sleep(0.1) while len(audio) > 5 * segment_samples: time.sleep(0.01) slept += 1