]> OzVa Git service - audio-over-stft/commitdiff
Fixed buzzing
authorwill <greenwoodw50@gmail.com>
Sun, 1 Sep 2024 13:49:26 +0000 (14:49 +0100)
committerwill <greenwoodw50@gmail.com>
Sun, 1 Sep 2024 13:49:26 +0000 (14:49 +0100)
+ added brightness, contrast, temperature and tint bindings

camera.py
fft.py
loop.py

index f4259f988eba60b2f313ca7a18669a723cb5438a..325a2d250290a0864498068fffff7080a97cbf24 100644 (file)
--- a/camera.py
+++ b/camera.py
@@ -45,10 +45,14 @@ class camera():
                window_height: int,
                display_size: tuple,
                device_id: int = 0,
+               brightness: float = 1.,
+               contrast: float = 0.,
+               temperature: float = 0.,
+               tint: float = 0.,
                debug: bool = True,
                dummy: bool = False,
                use_lookup: bool = False,
-               use_files: bool = False
+               use_files: bool = False,
        ):
 
                self.window_size = window_size
@@ -118,49 +122,49 @@ class camera():
                        print("calibration failed")
                        quit()
 
-               if self.use_files == True:
+               if self.use_lookup == True:
                        self.get_lookup()
 
        def get_lookup(
                self
        ) -> None:
 
-               if self.use_lookup == True:
-                       return
-
                if self.use_files == True:
                        self.lookup = np.load("lookup.npy")
                        return
 
                lookup = None
 
-               for r in range(0, 255, self.lookup_compression):
-                       for g in range(0, 255, self.lookup_compression):
-                               for b in range(0, 255, self.lookup_compression):
-                                       pixel = np.array([[[b, g, r]]], dtype=np.int8)
-                                       pixel = cv.resize(pixel, self.display_size, interpolation=cv.INTER_NEAREST_EXACT)
-                                       self.display(pixel)
+               for (r, g, b) in [(0,0,0), (255,255,255)]:
+                       pixel = np.array([[[b, g, r]]], dtype=np.int8)
+                       pixel = cv.resize(pixel, self.display_size, interpolation=cv.INTER_NEAREST_EXACT)
+                       self.display(pixel)
 
-                                       for i in range(10): # silly hack
-                                               time.sleep(0.2)
-                                               recovered = self.capture()
+                       for i in range(100): # silly hack
+                               cv.waitKey(1)
+                               recovered = self.capture()
 
-                                       error = np.copy(recovered.astype(np.int16))
-                                       error[..., 0] -= b
-                                       error[..., 1] -= g
-                                       error[..., 2] -= r
+                       error = np.copy(recovered.astype(np.int16))
+                       error[..., 0] -= b
+                       error[..., 1] -= g
+                       error[..., 2] -= r
 
-                                       error = np.clip(error, -50, 255)
+                       error = np.clip(error, -50, 255)
 
-                                       if lookup is None:
-                                               lookup = error
-                                       else:
-                                               lookup += error
-                                               lookup  = lookup // 2
+                       if lookup is None:
+                               lookup = error
+                       else:
+                               lookup += error
+                               lookup  = lookup // 2
 
                self.lookup = lookup
                np.save("lookup.npy", lookup)
 
+               lookup += np.min(lookup)
+               lookup = np.round(lookup * (255 / np.min(lookup)))
+               cv.imwrite("lookup.jpg", lookup)
+
+
        def display(
                self,
                image: np.ndarray
@@ -168,6 +172,11 @@ class camera():
 
                self.last_display = image
                image = cv.resize(image, self.display_size, interpolation=cv.INTER_NEAREST_EXACT)
+               image = cv.convertScaleAbs(image, alpha=self.contrast, beta=self.brightness) # contrast / brightness correction
+               image[...,2] + self.temperature # color correction
+               image[...,1] + self.tint
+               image[...,0] - self.temperature
+
                cv.imshow("display", image)
                cv.waitKey(1)
 
@@ -194,6 +203,7 @@ class camera():
 
                else:
                        _, image = self.camera.read()
+
                        self.last_capture = image
                        if self.homography is not None:
                                image = cv.warpPerspective(image, self.homography, self.display_size)
diff --git a/fft.py b/fft.py
index 45af6c4a82c71eae5644c278b32838a344463083..bd0d1df3558c65944ca53817f70c3557ad4d9b58 100644 (file)
--- a/fft.py
+++ b/fft.py
@@ -1,16 +1,18 @@
 import numpy as np
-import math
-import time
 import cv2 as cv
 import matplotlib.pyplot as plt
-import cProfile
 
 """
 Notes:
        The data is receved in int16 format (limits -32768 to 32767). It is converted to a complex array from which we can work out power. This is from 0 to over 10,000. It is converted to decibels [1].
        This decibels can be bounded to some upper and lower limit of both volume and noise floor. This also makes our calculations more predictable.
 
+       For some reason, when the lookup table scales via the relative volume instead of the volume, it all breaks. I can't seem to work out why, even though I believe this is mathematically wrong. Either way, the program has been changed to allow this while I work out why this is happening. [2]
+
 [1] We convert to decibels via the function 20*log10(power) followed by some scaling to the required limits. We covert BACK to power via a lookup table. This is not efficient for the first conversion as the lookup table would have to be huge and have incies in the floating points.
+
+[2] The normal FFT has been changed to mirror the IFFT. This does seem to produce some peaking; if this becomes a problem it can be changed back at the expense of some volume.
+
 """
 
 class fft():
@@ -41,7 +43,7 @@ class fft():
 
                # generate lookup table for the converstion from decibels to power
                a = self.volume_min
-               b = self.volume_relative / self.amplitude_relative
+               b = self.volume_max / self.amplitude_relative
 
                # this is the parameterized inverted function of y = (20 * log10(x) - 40) * (255/140)
                log_lookup = [10 ** (((x * b) + a) / 20) for x in range(0, 256)]
@@ -64,7 +66,7 @@ class fft():
 
                # confine the amplitude within the limits specified
                a = self.volume_min
-               b = self.amplitude_relative / self.volume_relative
+               b = self.amplitude_relative / self.volume_relative # possibly change the vol_max to vol_rel ?? see [2]
                c = self.amplitude_min
                amplitude = ((amplitude - a) * b) + c
 
@@ -77,7 +79,7 @@ class fft():
 
                # rearrange to image format
                full = np.full(angle.shape, fill_value=60)
-               image = np.stack((full, angle, amplitude), axis=-1)
+               image = np.stack((amplitude * (180/255), angle, amplitude), axis=-1)
                image = np.array([image], dtype=np.uint8)
                image = cv.cvtColor(image, cv.COLOR_HSV2BGR)
 
@@ -90,8 +92,11 @@ class fft():
 
                # split the image into constituant parts
                image = cv.cvtColor(image, cv.COLOR_BGR2HSV)
-               amplitude = image[0][...,2].astype(np.uint8)
+               amplitude = image[0][...,2]
                angle = image[0][...,1].astype(np.float64)
+               hue = image[0][...,0].astype(np.float64) * (255/180)
+
+               amplitude = np.mean( np.array([ amplitude, hue ]), axis=0 ).astype(np.uint8)
 
                # convert amplitude back into vector length
                amplitude = self.log_lookup[amplitude]
@@ -104,12 +109,6 @@ class fft():
                imag = np.sin(angle) * amplitude
                segment = real + (1j * imag)
 
-               data = np.fft.ifft(segment * self.window_size).real.astype(np.int16)
-
-               return data
+               data = np.fft.ifft(segment * self.window_size).real
 
-if __name__ == "__main__":
-       with cProfile.Profile() as pr:
-               fft = fft(130, 65)
-               fft.istft(fft.stft(np.random.randint(-32768, 32767, (130,))))
-               pr.print_stats()
+               return data.astype(np.int16)
diff --git a/loop.py b/loop.py
index 0e6acdade1dd2d542358a7966c1a4fe407c2a683..ed974b8480fb7e879c7f8bb2cb0c48f35531cc68 100644 (file)
--- a/loop.py
+++ b/loop.py
@@ -11,7 +11,10 @@ import time
 import pyaudio
 import os
 import sys
+import wave
 import matplotlib.pyplot as plt
+import gi
+from gi.repository import Gtk
 
 """
 notes:
@@ -25,13 +28,13 @@ notes:
 sample_rate, data = wavfile.read("/home/will/Downloads/birdsong.wav")
 #data = data[...,0]
 
-new_rate = 10000.
+new_rate = 11025.
 sample_count = round(len(data) * new_rate / sample_rate)
 data = sps.resample(data, sample_count)
 sample_rate = int(new_rate)
 
-window_size = 130
-window_height = 70
+window_size = 170
+window_height = 80
 
 hop_size = window_size // 2
 camera = camera(
@@ -39,12 +42,26 @@ camera = camera(
        window_height,
        (1920, 1080),
        device_id = 2,
-       debug = False,
-       dummy = True,
+       brightness = 1.,
+       contrast = 0.,
+       temperature = 0.,
+       tint = 0.,
+       debug = True,
+       dummy = False,
        use_lookup = False,
-       use_files = True
+       use_files = False
 )
 
+file = wave.open("out.wav", "wb")
+file.setparams((
+       1,                      # channels
+       2,                      # sample width
+       sample_rate,
+       0,
+       "NONE",         # compression type
+       "NONE"          # compression name
+))
+
 camera.calibrate()
 
 transform = fft(window_size, hop_size)
@@ -94,6 +111,9 @@ try:
                camera.display(spectrum)
                capture = camera.capture()
 
+               # plt.clf()
+               # plt.plot(rows[0])
+
                rows = [np.array([i]) for i in capture]
                with Pool() as p:
                        recovered = np.array(p.map(transform.istft, rows))
@@ -101,24 +121,32 @@ try:
                if len(audio) < hop_size:
                        audio = np.zeros((hop_size,), dtype=np.int16)
 
+               # plt.plot(recovered[0])
+               # plt.pause(0.05)
+
                for row in recovered:
                        audio[-hop_size:] += row[:hop_size]
                        audio = np.append(audio, row[hop_size:])
 
+                       #file.writeframes(row[hop_size:])
+
                segment_index += 1
                if segment_index == segment_count: segment_index = 0
 
-               slept = 0
+               slept = False
                time.sleep(0.1)
                while len(audio) > 1 * segment_samples:
+                       slept = True
                        cv.waitKey(1)
-                       slept += 1
-               print(f"slept {slept} times")
+
+               if not slept:
+                       print("Dropped frames!")
 
 except KeyboardInterrupt:
        stream.stop_stream()
        stream.close()
        pyaudio_object.terminate()
+       file.close()
 
        try:
                sys.exit()