|
|
|
"""tp3__1_-1.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1_Sjx5G1BW689ggZJAJ4P7kCZndOobNCp |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
import time |
|
|
|
import tensorflow as tf |
|
import tensorflow_hub as hub |
|
|
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import librosa |
|
from librosa import display as librosadisplay |
|
|
|
import logging |
|
import math |
|
import statistics |
|
import sys |
|
|
|
from IPython.display import Audio, Javascript |
|
from scipy.io import wavfile |
|
|
|
from base64 import b64decode |
|
|
|
import music21 |
|
from pydub import AudioSegment |
|
|
|
logger = logging.getLogger() |
|
logger.setLevel(logging.ERROR)
|
# SPICE was trained on 16 kHz mono audio, so all inputs are resampled to this rate.
EXPECTED_SAMPLE_RATE = 16000
|
|
|
|
|
def convert_audio_for_model(user_file, output_file='converted_audio_file.wav'):
    # Resample to 16 kHz and downmix to mono, the format SPICE expects.
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    audio.export(output_file, format="wav")
    return output_file
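
# Illustrative usage (hypothetical input file):
#   converted = convert_audio_for_model('recording.ogg')
#   # -> 'converted_audio_file.wav', a 16 kHz mono WAV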
|
|
|
MAX_ABS_INT16 = 32768.0 |
|
|
|
def plot_stft(x, sample_rate, show_black_and_white=False):
    # Log-frequency spectrogram of the waveform `x` (normalized float samples).
    x_stft = np.abs(librosa.stft(x, n_fft=2048))
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    x_stft_db = librosa.amplitude_to_db(x_stft, ref=np.max)

    if show_black_and_white:
        librosadisplay.specshow(data=x_stft_db, y_axis='log', sr=sample_rate,
                                cmap='gray_r')
    else:
        librosadisplay.specshow(data=x_stft_db, y_axis='log', sr=sample_rate)

    plt.colorbar(format='%+2.0f dB')
    return fig
|
|
|
|
|
|
|
|
|
|
|
|
model = hub.load("https://tfhub.dev/google/spice/2") |
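
# Per the model's TF Hub page, the serving signature maps a float32 waveform
# (16 kHz, samples in [-1, 1]) to per-frame "pitch" and "uncertainty" tensors,
# with roughly one estimate every 32 ms of audio.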
|
|
|
def plot_pitch_conf(pitch_outputs, confidence_outputs):
    # Raw model outputs: pitch and confidence (1 - uncertainty) per frame.
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    plt.plot(pitch_outputs, label='pitch')
    plt.plot(confidence_outputs, label='confidence')
    plt.legend(loc="lower right")
    return fig
|
|
|
def plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y):
    # Scatter plot of only the high-confidence pitch estimates.
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    ax.set_ylim([0, 1])
    plt.scatter(confident_pitch_outputs_x, confident_pitch_outputs_y, c="r")
    return fig
|
|
|
def output2hz(pitch_output):
    # Calibration constants from https://tfhub.dev/google/spice/2 for mapping
    # the model's [0, 1] pitch output to a CQT bin, and from there to Hz.
    PT_OFFSET = 25.58
    PT_SLOPE = 63.07
    FMIN = 10.0
    BINS_PER_OCTAVE = 12.0
    cqt_bin = pitch_output * PT_SLOPE + PT_OFFSET
    return FMIN * 2.0 ** (1.0 * cqt_bin / BINS_PER_OCTAVE)
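
# Sanity check (values rounded): output2hz(0.0) ≈ 43.8 Hz and
# output2hz(0.5) ≈ 271 Hz; the mapping is exponential, one octave per
# 12 CQT bins.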
|
|
|
def espectro_notas(audio_samples, sample_rate, confident_pitch_outputs_x, confident_pitch_values_hz):
    # "Spectrum + notes": overlay the confident pitch values (in Hz) on a
    # grayscale spectrogram. `audio_samples` must already be normalized to [-1, 1].
    fig = plot_stft(audio_samples, sample_rate=sample_rate, show_black_and_white=True)
    plt.scatter(confident_pitch_outputs_x, confident_pitch_values_hz, c="r")
    return fig
|
|
|
A4 = 440
C0 = A4 * pow(2, -4.75)
note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

def hz2offset(freq):
    # Measure the offset (in fractional semitones) of `freq` from the nearest
    # equal-tempered note. C0 and note_names live at module scope so that the
    # quantization helpers below can use them too.
    if freq == 0:  # rests carry no offset information
        return None
    h = round(12 * math.log2(freq / C0))
    return 12 * math.log2(freq / C0) - h
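
# For example, hz2offset(440.0) == 0.0, since A4 sits exactly 57 semitones
# above C0, while a singer averaging ~450 Hz would show an offset near +0.39.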
|
|
|
def quantize_predictions(group, ideal_offset):
    # Classify a group of per-frame pitch estimates as a single note or a rest.
    non_zero_values = [v for v in group if v != 0]
    zero_values_count = len(group) - len(non_zero_values)

    # Mostly silent frames: call it a rest, with a small penalty per voiced frame.
    if zero_values_count > 0.8 * len(group):
        return 0.51 * len(non_zero_values), "Rest"
    else:
        # Quantize to the nearest note, corrected by the singer's overall offset.
        h = round(
            statistics.mean([
                12 * math.log2(freq / C0) - ideal_offset for freq in non_zero_values
            ]))
        octave = h // 12
        n = h % 12
        note = note_names[n] + str(octave)
        # Quantization error: total semitone distance from the quantized note.
        error = sum([
            abs(12 * math.log2(freq / C0) - ideal_offset - h)
            for freq in non_zero_values
        ])
        return error, note
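
# Worked example: ten frames all at exactly 440 Hz with a zero ideal offset
# quantize cleanly: quantize_predictions([440.0] * 10, 0) -> (0.0, "A4").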
|
|
|
def get_quantization_and_error(pitch_outputs_and_rests, predictions_per_eighth,
                               prediction_start_offset, ideal_offset):
    # Pad so the first note starts at the given offset, then split the
    # predictions into groups of one eighth note each.
    pitch_outputs_and_rests = [0] * prediction_start_offset + \
        pitch_outputs_and_rests

    groups = [
        pitch_outputs_and_rests[i:i + predictions_per_eighth]
        for i in range(0, len(pitch_outputs_and_rests), predictions_per_eighth)
    ]

    quantization_error = 0
    notes_and_rests = []
    for group in groups:
        error, note_or_rest = quantize_predictions(group, ideal_offset)
        quantization_error += error
        notes_and_rests.append(note_or_rest)

    return quantization_error, notes_and_rests
|
|
|
def main(audio):
|
    # Convert whatever Gradio recorded into the 16 kHz mono WAV SPICE expects.
    converted_audio_file = convert_audio_for_model(audio)
|
|
|
|
|
    sample_rate, audio_samples = wavfile.read(converted_audio_file)

    # Normalize int16 samples to floats in [-1, 1].
    audio_samples = audio_samples / float(MAX_ABS_INT16)
|
|
|
|
|
|
|
model_output = model.signatures["serving_default"](tf.constant(audio_samples, tf.float32)) |
|
|
|
pitch_outputs = model_output["pitch"] |
|
uncertainty_outputs = model_output["uncertainty"] |
|
|
|
|
|
confidence_outputs = 1.0 - uncertainty_outputs |
|
|
|
|
|
    confidence_outputs = list(confidence_outputs)
    pitch_outputs = [float(x) for x in pitch_outputs]

    # Keep only the frames where the model is at least 90% confident.
    indices = range(len(pitch_outputs))
    confident_pitch_outputs = [(i, p)
                               for i, p, c in zip(indices, pitch_outputs, confidence_outputs)
                               if c >= 0.9]
    confident_pitch_outputs_x, confident_pitch_outputs_y = zip(*confident_pitch_outputs)

    confident_pitch_values_hz = [output2hz(p) for p in confident_pitch_outputs_y]
|
|
|
|
|
|
|
    # Waveform of the (already normalized) audio.
    fig1 = plt.figure()
    plt.plot(audio_samples)

    # Spectrogram; audio_samples is already in [-1, 1], so no extra scaling.
    fig2 = plot_stft(audio_samples, sample_rate=EXPECTED_SAMPLE_RATE)
|
|
|
|
|
    fig3 = plot_pitch_conf(pitch_outputs, confidence_outputs)

    fig4 = plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y)

    fig5 = espectro_notas(audio_samples, EXPECTED_SAMPLE_RATE,
                          confident_pitch_outputs_x, confident_pitch_values_hz)
|
    # Treat low-confidence frames as rests (0 Hz) for the transcription step.
    pitch_outputs_and_rests = [
        output2hz(p) if c >= 0.9 else 0
        for i, p, c in zip(indices, pitch_outputs, confidence_outputs)
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    # Estimate how far off the equal-tempered grid the singer is on average;
    # the quantizer corrects for this "ideal offset".
    offsets = [hz2offset(p) for p in pitch_outputs_and_rests if p != 0]
    ideal_offset = statistics.mean(offsets)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
best_error = float("inf") |
|
best_notes_and_rests = None |
|
best_predictions_per_note = None |
|
|
|
for predictions_per_note in range(20, 65, 1): |
|
for prediction_start_offset in range(predictions_per_note): |
|
|
|
error, notes_and_rests = get_quantization_and_error( |
|
pitch_outputs_and_rests, predictions_per_note, |
|
prediction_start_offset, ideal_offset) |
|
|
|
if error < best_error: |
|
best_error = error |
|
best_notes_and_rests = notes_and_rests |
|
best_predictions_per_note = predictions_per_note |
|
|
|
|
|
|
|
    # Trim leading and trailing rests (guarding against an all-rest result).
    while best_notes_and_rests and best_notes_and_rests[0] == 'Rest':
        best_notes_and_rests = best_notes_and_rests[1:]

    while best_notes_and_rests and best_notes_and_rests[-1] == 'Rest':
        best_notes_and_rests = best_notes_and_rests[:-1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    # Build a music21 score from the transcription.
    sc = music21.stream.Score()

    # Convert the learned note length (in prediction frames) into a tempo.
    bpm = 60 * 60 / best_predictions_per_note

    metronome = music21.tempo.MetronomeMark(number=bpm)
    sc.insert(0, metronome)

    # One half note (or half rest) per detected symbol.
    for snote in best_notes_and_rests:
        d = 'half'
        if snote == 'Rest':
            sc.append(music21.note.Rest(type=d))
        else:
            sc.append(music21.note.Note(snote, type=d))
|
|
|
|
|
|
|
|
|
    from IPython.core.display import display, HTML
    import json
|
|
|
    def showScore(score):
        # Export the score as MusicXML and render it with OpenSheetMusicDisplay.
        xml = open(score.write('musicxml')).read()
        return showMusicXML(xml)
|
|
|
    def showMusicXML(xml):
        # Inject OpenSheetMusicDisplay into the page and hand it the MusicXML.
        DIV_ID = "OSMD_div"
        a = display(HTML('<div id="' + DIV_ID + '">loading OpenSheetMusicDisplay</div>'))
        script = """
    var div_id = "{{DIV_ID}}";
|
function loadOSMD() { |
|
return new Promise(function(resolve, reject){ |
|
if (window.opensheetmusicdisplay) { |
|
return resolve(window.opensheetmusicdisplay) |
|
} |
|
// OSMD script has a 'define' call which conflicts with requirejs |
|
var _define = window.define // save the define object |
|
window.define = undefined // now the loaded script will ignore requirejs |
|
var s = document.createElement( 'script' ); |
|
s.setAttribute( 'src', "https://cdn.jsdelivr.net/npm/opensheetmusicdisplay@0.7.6/build/opensheetmusicdisplay.min.js" ); |
|
//s.setAttribute( 'src', "/custom/opensheetmusicdisplay.js" ); |
|
s.onload=function(){ |
|
window.define = _define |
|
resolve(opensheetmusicdisplay); |
|
}; |
|
document.body.appendChild( s ); // browser will try to load the new script tag |
|
}) |
|
} |
|
loadOSMD().then((OSMD)=>{ |
|
window.openSheetMusicDisplay = new OSMD.OpenSheetMusicDisplay(div_id, { |
|
drawingParameters: "compacttight" |
|
}); |
|
openSheetMusicDisplay |
|
.load({{data}}) |
|
.then( |
|
function() { |
|
openSheetMusicDisplay.render(); |
|
} |
|
); |
|
}) |
|
""".replace('{{DIV_ID}}',DIV_ID).replace('{{data}}',json.dumps(xml)) |
|
|
|
        display(Javascript(script))
        return a
|
|
|
|
|
    # Render the sheet music. NOTE: this relies on IPython's display machinery,
    # so it only draws inside a notebook front end; under Gradio it just
    # returns a display handle.
    partitura = showScore(sc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    # Export the transcription as MIDI next to the converted audio file.
    converted_audio_file_as_midi = converted_audio_file[:-4] + '.mid'
    fp = sc.write('midi', fp=converted_audio_file_as_midi)

    wav_from_created_midi = converted_audio_file_as_midi.replace(' ', '_') + "_midioutput.wav"
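
    # The Colab tutorial this app is based on renders the MIDI to WAV with the
    # timidity command-line tool; below is a minimal sketch of that step,
    # assuming timidity is installed on the host (if it is not, the WAV output
    # will simply be missing).
    import subprocess
    subprocess.run(['timidity', converted_audio_file_as_midi,
                    '-Ow', '-o', wav_from_created_midi], check=False)

    # One return value per output component declared in the Gradio interface
    # below; this ordering is an assumption matched to those labels.
    return (audio, fig1, fig2, fig3, fig4, fig5,
            str(bpm), str(best_notes_and_rests), str(partitura),
            wav_from_created_midi)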
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
link = "https://www.tensorflow.org/hub/tutorials/spice?hl=es-419&authuser=2" |
|
|
|
iface = gr.Interface(
    fn=main,
    title="Practical Assignment No. 3 - Pitch detection with SPICE",
    description="Model deployment with GitHub + Hugging Face🤗 -- 🔊✅ Based on: " + link,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath", label="Record audio")],
    outputs=[gr.outputs.Audio(label="Original audio"),
             gr.outputs.Plot(type="auto", label="Waveform"),
             gr.outputs.Plot(type="auto", label="Spectrogram"),
             gr.outputs.Plot(type="auto", label="Pitch and confidence"),
             gr.outputs.Plot(type="auto", label="Notes"),
             gr.outputs.Plot(type="auto", label="Spectrogram + notes"),
             gr.outputs.Textbox(label="BPM"),
             gr.outputs.Textbox(label="Notes and rests"),
             gr.outputs.Textbox(type="html", label="Score"),
             gr.outputs.Audio(label="MIDI rendered as audio")],
    interpretation="default",
)
|
|
|
iface.launch(debug=True) |
|
|