OpenSound's picture
Upload 211 files
9d3cb0a verified
raw
history blame
6.51 kB
import json
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Tuple

import ffmpy
import numpy as np
import torch
def r128stats(filepath: str, quiet: bool):
    """Measure EBU R128 loudness statistics of an audio file with FFMPEG.

    Runs ``ffmpeg`` with the ``ebur128`` filter on ``filepath``, discards the
    decoded audio (``-f null -``) and parses the ``Summary:`` section that
    ffmpeg writes to stderr.

    Parameters
    ----------
    filepath : str
        Path to compute loudness stats on.
    quiet : bool
        If True, ``-hide_banner`` is passed to ffmpeg. stderr is always
        captured for parsing (never echoed), so this only removes the
        version/configuration banner from the captured text.

    Returns
    -------
    dict
        Dictionary containing loudness stats as floats, with keys
        ``"I"``, ``"I Threshold"``, ``"LRA"``, ``"LRA Threshold"``,
        ``"LRA Low"`` and ``"LRA High"``.

    Raises
    ------
    ValueError
        If the ffmpeg output contains no ``Summary:`` section (for example
        because the input could not be opened or decoded), or if the
        summary cannot be parsed.
    """
    # Global options go before the input so ffmpeg sees them unambiguously.
    ffargs = ["ffmpeg", "-nostats"]
    if quiet:
        ffargs.append("-hide_banner")
    ffargs += ["-i", filepath, "-filter_complex", "ebur128", "-f", "null", "-"]

    proc = subprocess.Popen(ffargs, stderr=subprocess.PIPE, universal_newlines=True)
    stats = proc.communicate()[1]

    # The summary is the last thing printed, so search from the end.
    summary_index = stats.rfind("Summary:")
    if summary_index < 0:
        # Without this check the code below would parse the last character
        # of stderr and fail with an unrelated-looking "'I:' is not in list".
        tail = stats.strip()[-500:]
        raise ValueError(
            f"ffmpeg did not produce an ebur128 summary for {filepath!r} "
            f"(exit code {proc.returncode}): {tail}"
        )
    summary_list = stats[summary_index:].split()

    def value_after(label: str, offset: int) -> float:
        # Tokens look like: "I:", "-23.0", "LUFS", "Threshold:", "-33.0", ...
        # so the value sits 1 token after its label and the matching
        # threshold 4 tokens after it.
        return float(summary_list[summary_list.index(label) + offset])

    return {
        "I": value_after("I:", 1),
        "I Threshold": value_after("I:", 4),
        "LRA": value_after("LRA:", 1),
        "LRA Threshold": value_after("LRA:", 4),
        "LRA Low": value_after("low:", 1),
        "LRA High": value_after("high:", 1),
    }
def ffprobe_offset_and_codec(path: str) -> Tuple[float, Optional[str]]:
    """Return the start offset and codec of the first audio stream in a file.

    Runs ``ffprobe`` (through ``ffmpy``) with JSON output and inspects the
    reported streams in order.

    Parameters
    ----------
    path : str
        Path of the media file to probe.

    Returns
    -------
    Tuple[float, Optional[str]]
        ``(seconds_offset, codec)`` where ``seconds_offset`` is the
        ``start_time`` of the first audio stream (``0.0`` if the stream
        reports none) and ``codec`` is its ``codec_name``. If the file has
        no audio stream, ``(0.0, None)`` is returned.
    """
    ff = ffmpy.FFprobe(
        inputs={path: None},
        global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
    )
    streams = json.loads(ff.run(stdout=subprocess.PIPE)[0])["streams"]

    # Get the offset and codec of the first audio stream we find
    # and return its start time, if it has one.
    for stream in streams:
        # ``.get``: the JSON writer omits fields ffprobe cannot determine,
        # so a stream of unknown media type may have no "codec_type" key.
        if stream.get("codec_type") == "audio":
            return float(stream.get("start_time", 0.0)), stream.get("codec_name")
    return 0.0, None
class FFMPEGMixin:
    """Mixin adding FFMPEG-backed loudness, resampling and loading.

    The methods here rely on the host class providing ``batch_size``,
    ``sample_rate``, indexing (``self[i]``), ``write(path)`` and
    ``loudness()``, and on the class being constructible from a file path
    (``cls(path, **kwargs)``).
    """

    # Most recent loudness measurement; set by ``ffmpeg_loudness``.
    # NOTE(review): presumably read back by the host's ``loudness()`` - confirm.
    _loudness = None

    def ffmpeg_loudness(self, quiet: bool = True):
        """Computes loudness of audio file using FFMPEG.

        Each batch item is written to a temporary wav file and measured
        with ``r128stats``; the integrated loudness (``"I"``) of every item
        is stored in ``self._loudness`` as a float tensor.

        Parameters
        ----------
        quiet : bool, optional
            Whether to show FFMPEG output during computation,
            by default True

        Returns
        -------
        torch.Tensor
            Loudness of every item in the batch, computed via
            FFMPEG. This is the value returned by ``self.loudness()``
            after ``self._loudness`` has been set.
        """
        loudness = []

        # One temp file is reused for all items; each write overwrites it.
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            for i in range(self.batch_size):
                self[i].write(f.name)
                loudness_stats = r128stats(f.name, quiet=quiet)
                loudness.append(loudness_stats["I"])

        self._loudness = torch.from_numpy(np.array(loudness)).float()
        return self.loudness()

    def ffmpeg_resample(self, sample_rate: int, quiet: bool = True):
        """Resamples AudioSignal using FFMPEG. More memory-efficient
        than using julius.resample for long audio files.

        The signal is written to a temporary wav file, converted with
        ``ffmpeg -ar`` and loaded back as a new ``AudioSignal``.

        Parameters
        ----------
        sample_rate : int
            Sample rate to resample to.
        quiet : bool, optional
            Whether to show FFMPEG output during computation,
            by default True

        Returns
        -------
        AudioSignal
            Resampled AudioSignal. ``self`` is returned unchanged when it
            is already at ``sample_rate``.

        Raises
        ------
        subprocess.CalledProcessError
            If ffmpeg exits with a non-zero status.
        """
        if sample_rate == self.sample_rate:
            return self

        # Imported lazily, and only when actually resampling.
        from audiotools import AudioSignal

        # A private directory gives both files fixed, collision-free names
        # and guarantees they are removed even if ffmpeg or loading fails.
        with tempfile.TemporaryDirectory() as d:
            wav_in = str(Path(d) / "input.wav")
            wav_out = str(Path(d) / "resampled.wav")
            self.write(wav_in)

            # Argument list (not a shell-style string) so paths containing
            # spaces or quotes are passed through intact.
            command = ["ffmpeg"]
            if quiet:
                command += ["-hide_banner", "-loglevel", "error"]
            command += ["-i", wav_in, "-ar", str(sample_rate), wav_out]
            subprocess.check_call(command)

            # Load before the directory (and the file) is deleted.
            resampled = AudioSignal(wav_out)
        return resampled

    @classmethod
    def load_from_file_with_ffmpeg(cls, audio_path: str, quiet: bool = True, **kwargs):
        """Loads AudioSignal object after decoding it to a wav file using FFMPEG.
        Useful for loading audio that isn't covered by librosa's loading mechanism. Also
        useful for loading mp3 files, without any offset.

        Parameters
        ----------
        audio_path : str
            Path to load AudioSignal from.
        quiet : bool, optional
            Whether to show FFMPEG output during computation,
            by default True
        **kwargs
            Forwarded to the class constructor together with the path of
            the decoded wav file.

        Returns
        -------
        AudioSignal
            AudioSignal loaded from file with FFMPEG.
        """
        audio_path = str(audio_path)
        with tempfile.TemporaryDirectory() as d:
            wav_file = str(Path(d) / "extracted.wav")
            padded_wav = str(Path(d) / "padded.wav")

            global_options = "-y"
            if quiet:
                global_options += " -loglevel error"

            # First pass: decode the input to a plain wav file.
            ff = ffmpy.FFmpeg(
                inputs={audio_path: None},
                outputs={wav_file: None},
                global_options=global_options,
            )
            ff.run()

            # We pad the file using the start time offset in case it's an audio
            # stream starting at some offset in a video container.
            pad, codec = ffprobe_offset_and_codec(audio_path)

            # For mp3s, don't pad files with discrepancies less than 0.027s -
            # it's likely due to codec latency. The latency introduced by mp3
            # is 1152 samples, which is ~0.0261s at 44.1kHz. So we set the
            # threshold here slightly above that.
            # Source: https://lame.sourceforge.io/tech-FAQ.txt.
            if codec == "mp3" and pad < 0.027:
                pad = 0.0

            # Second pass: prepend ``pad`` seconds of delay (adelay takes
            # milliseconds) to every channel.
            ff = ffmpy.FFmpeg(
                inputs={wav_file: None},
                outputs={padded_wav: f"-af 'adelay={pad*1000}:all=true'"},
                global_options=global_options,
            )
            ff.run()

            # Construct the signal while the temporary file still exists.
            signal = cls(padded_wav, **kwargs)

        return signal