import os
import torch
import torchaudio
from IPython.display import Audio
import numpy as np
from scipy.fft import fft, ifft
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import random
from collections import Counter
from tqdm import tqdm
from matplotlib.pyplot import imshow
wav_path = '/Users/ginoprasad/autotune/m4a_files/grapevine.m4a'
wav, sample_rate = torchaudio.load(wav_path)
wav = wav[0]
wav = wav[int(sample_rate*20):int(sample_rate*25)]
Audio(wav, rate=sample_rate)
prelim_notes_octave = 4
prelim_notes = [("C", 261.63), ("C#", 277.18), ("D", 293.66), ("D#", 311.13),
                ("E", 329.63), ("F", 349.23), ("F#", 369.99), ("G", 392.00),
                ("G#", 415.30), ("A", 440.00), ("A#", 466.16), ("B", 493.88)]
# prelim_notes
notes = [(freq * (2 ** (octave - prelim_notes_octave)), f"{name}{octave}") for octave in range(8) for name, freq in prelim_notes]
pd.DataFrame(notes)
| | 0 | 1 |
|---|---|---|
| 0 | 16.351875 | C0 |
| 1 | 17.323750 | C#0 |
| 2 | 18.353750 | D0 |
| 3 | 19.445625 | D#0 |
| 4 | 20.601875 | E0 |
| ... | ... | ... |
| 91 | 3136.000000 | G7 |
| 92 | 3322.400000 | G#7 |
| 93 | 3520.000000 | A7 |
| 94 | 3729.280000 | A#7 |
| 95 | 3951.040000 | B7 |

96 rows × 2 columns
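For a quick sanity check, any frequency can be snapped to the nearest entry in this table. The helper below is not part of the original notebook, just a small sketch reused in later checks:

def nearest_note(frequency):
    # Return the name of the tabulated note closest to `frequency` (in Hz)
    note_freqs = np.array([f for f, _ in notes])
    note_names = [n for _, n in notes]
    return note_names[int(np.argmin(np.abs(note_freqs - frequency)))]

nearest_note(440.0)   # 'A4'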
min_frequency = 60 # hertz
max_frequency = 500 # hertz
W = 0.05 # seconds
precision = 1000
Ideally, for a signal with period $T$: $x_t - x_{t+T} = 0$ for all $t$
$d_t (\tau) = \sum_{j=t+1}^{t+W} (x_j - x_{j+\tau})^2$
$= r_t(0) + r_{t+\tau}(0) - 2r_t(\tau)$
Where $r_t(\tau) = \sum_{j=t+1}^{t+W} x_j x_{j+\tau}$
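The convolution-based function d below computes $d_t(\tau)$ for every lag at once. As a reference point, a direct (and much slower) translation of the definition would look like the following sketch; naive_d and max_tau are illustrative names, not part of the original notebook:

def naive_d(wav_slice, t, max_tau):
    # O(W * max_tau) evaluation of d_t(tau) straight from the definition
    x = np.asarray(wav_slice, dtype=np.float64)
    window = int(W * sample_rate)
    return np.array([np.sum((x[t:t+window] - x[t+tau:t+tau+window]) ** 2)
                     for tau in range(max_tau)])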
def randints(n, k):
return [random.randint(0, n-1) for _ in range(k)]
def d(wav_slice, t):
    if not isinstance(wav_slice, np.ndarray):
        wav_slice = wav_slice.numpy()
    # r_t(tau): correlation of the signal with the W-second window starting at t
    autocorrelation = scipy.signal.convolve(wav_slice[t:], wav_slice[t:t+int(W*sample_rate)][::-1], mode='valid')
    # r_{t+tau}(0): energy of the W-second window starting at t+tau
    energy = scipy.signal.convolve(wav_slice[t:] * wav_slice[t:], np.ones(int(W*sample_rate)), mode='valid')
    # d_t(tau) = r_t(0) + r_{t+tau}(0) - 2 r_t(tau)
    distance = (energy + energy[0]) - (2 * autocorrelation)
    assert len(distance) > 10
    return distance
def normalized_d(wav_slice, t):
    # Cumulative-mean normalization: divide d(tau) by the mean of the preceding d values
    sum_ = 0
    ret = []
    distance = d(wav_slice, t)
    for tau, dist in enumerate(distance):
        if tau == 0 or sum_ == 0:
            # d'(0) is defined as 1; this also guards against division by zero
            ret.append(1)
        else:
            ret.append(dist / ((1/tau) * sum_))
        sum_ += dist
    return np.array(ret), distance
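On a pure tone, the normalized difference should dip far below the threshold used later at the lag of the true period. A quick check (test_tone and expected_lag are illustrative names, not from the original notebook):

test_tone = np.sin(2 * np.pi * 220.0 * np.arange(sample_rate) / sample_rate)
d_prime, _ = normalized_d(test_tone, t=0)
expected_lag = int(round(sample_rate / 220.0))
print(d_prime[expected_lag])    # should be well below the 0.15 threshold used in pitch() below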
def parabolic_interpolation(y):
    # Least-squares fit of a parabola to the window; its vertex is the refined minimum
    if len(y) < 3:
        return np.argmin(y)
    x = np.array(range(len(y)))
    x_squared = x ** 2
    ones = np.ones(len(y))
    mat = np.transpose(np.array([x_squared, x, ones]))
    a, b, c = np.matmul(np.linalg.inv(np.matmul(np.transpose(mat), mat)), np.transpose(mat)).dot(y)
    if a == 0 or -(b / (2 * a)) < 0:
        # Degenerate fit: fall back to the raw sample minimum
        return np.argmin(y)
    return -(b / (2 * a))
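For example, sampling a parabola whose true minimum falls between two integers recovers the sub-sample location (illustrative values, not from the original notebook):

samples = (np.arange(7) - 2.3) ** 2      # discrete samples of a parabola with its minimum at x = 2.3
print(np.argmin(samples))                # 2, limited to sample resolution
print(parabolic_interpolation(samples))  # approximately 2.3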
def pitch(wav_slice, y_threshold=0.15, t=0, width=3, index=False):
    ls, dist = normalized_d(wav_slice, t)
    minimum = None
    # First lag within [min_frequency, max_frequency] whose normalized difference
    # drops below the threshold and stops decreasing
    for x, val in enumerate(ls):
        if x and min_frequency <= sample_rate / x <= max_frequency and val < y_threshold and x < len(ls) - 1 and ls[x+1] >= val:
            minimum = x
            break
    if minimum is None:
        return None
    # Refine the integer lag with parabolic interpolation on the raw difference function
    start = max(minimum - width, 0)
    minimum = start + parabolic_interpolation(dist[start:minimum+width+1])
    if index:
        return sample_rate / minimum, minimum
    return sample_rate / minimum
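Applied to the synthetic 220 Hz test_tone defined above (not part of the original notebook), the estimate should come out close to 220:

print(pitch(test_tone))    # approximately 220 Hz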
def get_median_pitch(pitch_candidates):
    if not isinstance(pitch_candidates, np.ndarray):
        pitch_candidates = np.array(pitch_candidates)
    notes_df = pd.DataFrame(notes)
    # np.abs so each candidate snaps to the nearest note rather than always the lowest one
    closest = np.vectorize(lambda x: notes_df[1][np.argmin(np.abs(notes_df[0] - x))])
    closest_notes = closest(pitch_candidates)
    # Most common nearest note, then the median of the candidates assigned to it
    mode = max(Counter(closest_notes).items(), key=lambda x: x[1])[0]
    return np.median(pitch_candidates[closest_notes == mode])
def pitch_predict(wav_slice, iterations=30):
    pitch_candidates = []
    # Leave room for the W-second window plus the longest lag of interest
    margin = int(W * sample_rate) + int(np.ceil(sample_rate / min_frequency))
    for t in randints(len(wav_slice) - margin, iterations):
        pitch_ = pitch(wav_slice, t=t)
        if pitch_ is not None:
            pitch_candidates.append(pitch_)
    if not pitch_candidates:
        return None
    # Return the observed candidate closest to the median estimate
    pitch_candidates = np.array(pitch_candidates)
    return pitch_candidates[np.argmin(np.abs(pitch_candidates - np.median(pitch_candidates)))]
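The full estimator can be sanity-checked the same way, and its output snapped to a note name with the nearest_note sketch from earlier (again illustrative, not from the original notebook):

estimate = pitch_predict(test_tone)
print(estimate, nearest_note(estimate))    # approximately 220 Hz, i.e. 'A3'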
amplitude = 1
def get_frequency(frequency, length):
    # Pure sine wave at `frequency` Hz, `length` seconds long
    base = np.arange(0, length*sample_rate).astype(np.float64)
    c = (frequency * 2 * np.pi) / sample_rate
    wavelet_ = amplitude * np.sin(c * base)
    return wavelet_
section = wav[int(0.5*sample_rate):]
Audio(section, rate=sample_rate)
pitch_predict(section)
393.8275575430257
Audio(get_frequency(399.7325834830941, length=5), rate=sample_rate)