From: will <greenwoodw50@gmail.com>
Date: Sun, 25 Aug 2024 03:48:42 +0000 (+0100)
Subject: Optimization and notation
X-Git-Url: https://git.ozva.co.uk/?a=commitdiff_plain;h=86bfb3d7f3f36f670c70016819c4b528d468a661;p=audio-over-stft

Optimization and notation

- added notes to fft.py
- added lookup table for dB to pwr transformation for large speedup
---

diff --git a/.gitignore b/.gitignore
index db7f2ac..2bdcf07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.npy
 *.jpg
 *.wav
+__pycache__*
diff --git a/camera.py b/camera.py
index 2539baf..64c77d2 100644
--- a/camera.py
+++ b/camera.py
@@ -21,7 +21,7 @@ class VideoCapture:
 
 	# read frames as soon as they are available, keeping only most recent one
 	def reader(self):
-		while True:
+		while 1:
 			ret, frame = self.camera.read()
 			if not ret:
 				break
diff --git a/fft.py b/fft.py
index c74c4dc..1685134 100644
--- a/fft.py
+++ b/fft.py
@@ -1,5 +1,16 @@
 import numpy as np
+import math
+import time
 import cv2 as cv
+import matplotlib.pyplot as plt
+
+"""
+Notes:
+	The data is received in int16 format (limits -32768 to 32767). It is converted to a complex array from which we can work out power. This is from 0 to over 10,000. It is converted to decibels [1].
+	These decibel values can be bounded to some upper and lower limit of both volume and noise floor. This also makes our calculations more predictable.
+
+[1] We convert to decibels via the function 20*log10(power) followed by some scaling to the required limits. We convert BACK to power via a lookup table. A lookup table is not efficient for the first conversion, as it would have to be huge and have indices at floating-point values.
+"""
 
 class fft():
 	def __init__(
@@ -7,46 +18,66 @@ class fft():
 		window_size: int,
 		hop_size: int
 	):
+		# store the window and hop size; the window size is used to calculate the cosine window
 		self.window_size = window_size
 		self.hop_size = hop_size
 		self.window = np.hanning(window_size)
 
-		self.lower_limit = -40
-		self.upper_limit = 100
 
+		# set the max and min numerical values for amplitude and angle to allow for easier combinations of them both
 		self.amplitude_max = 254
 		self.amplitude_min = 0
 		self.angle_max = 254
 		self.angle_min = 0
+		# set the upper and lower limits (in dB) that are to be displayed on the screen
+		self.volume_max = 100
+		self.volume_min = -40
 
+		# calculate the range of each of amplitude, angle and volume
 		self.amplitude_relative = self.amplitude_max - self.amplitude_min
 		self.angle_relative = self.angle_max - self.angle_min
+		self.volume_relative = self.volume_max - self.volume_min
+
+		# generate lookup table for the conversion from decibels to power
+		a = self.volume_min
+		b = self.volume_relative / self.amplitude_relative
 
+		# this is the parameterized inverse of y = (20 * log10(x) + 40) * (254/140)
+		log_lookup = [10 ** (((x * b) + a) / 20) for x in range(0, 256)]
+		self.log_lookup = np.array(log_lookup)
 
 	def stft(
 		self,
 		data: np.ndarray
 	) -> np.ndarray:
 
+		# apply window and perform the fft
 		segment = data * self.window
 		spectrum = np.fft.fft(segment) / self.window_size
 
+		# convert the vector length to decibels and confine
 		amplitude = np.abs(spectrum)
+
 		amplitude = 20*np.log10(amplitude)
-		amplitude = np.clip(amplitude, self.lower_limit, self.upper_limit)
-		amplitude -= self.lower_limit
-		amplitude *= (self.amplitude_relative / self.upper_limit) + self.amplitude_min
-		amplitude = np.clip(amplitude, self.amplitude_min, self.amplitude_max)
+		amplitude = np.clip(amplitude, self.volume_min, self.volume_max)
+
+		# confine the amplitude within the limits specified
+		a = self.volume_min
+		b = self.amplitude_relative / self.volume_relative
+		c = self.amplitude_min
+		amplitude = ((amplitude - a) * b) + c
 
+		# convert x and y to the angle and confine
 		angle = np.angle(spectrum)
+
+		# confine the angle within the limits specified
 		angle = ((angle + np.pi) * (self.angle_relative / (2 * np.pi))) + self.angle_min
 		angle = np.clip(angle, self.angle_min, self.angle_max)
 
+		# rearrange to image format
 		full = np.full(angle.shape, fill_value=60)
-
 		image = np.stack((full, angle, amplitude), axis=-1)
 		image = np.array([image], dtype=np.uint8)
-
 		image = cv.cvtColor(image, cv.COLOR_HSV2BGR)
 
 		return image
@@ -56,24 +87,23 @@ class fft():
 		image: np.ndarray
 	) -> np.ndarray:
 
+		# split the image into constituent parts
 		image = cv.cvtColor(image, cv.COLOR_BGR2HSV)
-
-		amplitude = image[0][...,2].astype(np.float64)
+		amplitude = image[0][...,2].astype(np.uint8)
 		angle = image[0][...,1].astype(np.float64)
 
-		amplitude -= self.amplitude_min
-		amplitude /= (self.amplitude_relative / self.upper_limit)
-		amplitude += self.lower_limit
-		amplitude = np.power(10, amplitude / 20)
+		# convert amplitude back into vector length
+		amplitude = self.log_lookup[amplitude]
 
+		# convert angle back into x and y
 		angle = ((angle - self.angle_min) / (self.angle_relative / (2 * np.pi))) - np.pi
 
+		# rearrange back into fft result
 		real = np.cos(angle) * amplitude
 		imag = np.sin(angle) * amplitude
 		segment = real + (1j * imag)
 
-		data = np.fft.ifft(segment * self.window_size).real
+		data = np.fft.ifft(segment * self.window_size).real.astype(np.int16)
 
 		return data
 
-
diff --git a/loop.py b/loop.py
index 90c5e27..865078c 100644
--- a/loop.py
+++ b/loop.py
@@ -40,15 +40,13 @@ camera = camera(
 	(1840, 1000),
 	device_id = 2,
 	debug = False,
-	dummy = False,
+	dummy = True,
 	use_lookup = False,
 	use_files = True
 )
 
 camera.calibrate()
 
-camera.get_lookup()
-
 transform = fft(window_size, hop_size)
 
 segment_samples = window_height * hop_size
@@ -94,7 +92,6 @@ try:
 		if segment_index == 10: cv.imwrite("sample.jpg", spectrum)
 
 		camera.display(spectrum)
-		time.sleep(0.1)
 		capture = camera.capture()
 
 		rows = [np.array([i]) for i in capture]
@@ -105,8 +102,6 @@ try:
 			audio = np.zeros((hop_size,), dtype=np.int16)
 
 		for row in recovered:
-			row = row.astype(np.int16)
-
 			audio[-hop_size:] += row[:hop_size]
 			audio = np.append(audio, row[hop_size:])
 
@@ -114,6 +109,7 @@ try:
 		if segment_index == segment_count: segment_index = 0
 
 		slept = 0
+		time.sleep(0.1)
 		while len(audio) > 5 * segment_samples:
 			time.sleep(0.01)
 			slept += 1