# 音频数据增强 (audio data augmentation)
import json
import os
import random
import sys
from typing import List
import librosa
import numpy as np
import soundfile
from torch.utils.data import Dataset
from tqdm import tqdm
# 分割读取音频
@staticmethod
def slice_from_file(file, start, end):
    """Read a time slice of an audio file as float32 samples.

    Args:
        file: Path or file-like object accepted by ``soundfile.SoundFile``.
        start: Slice start in seconds. A negative value is counted back
            from the end of the file.
        end: Slice end in seconds. A negative value is counted back from
            the end of the file.

    Returns:
        A ``(sample, sample_rate)`` tuple: the samples read between the two
        positions and the file's native sample rate.

    Raises:
        ValueError: If ``end`` is still negative after being offset by the
            file duration, or if ``start`` lies after ``end``.
    """
    # The context manager closes the file handle on every path, including
    # the ValueError exits below; the original left the handle open.
    with soundfile.SoundFile(file) as sndfile:
        sample_rate = sndfile.samplerate
        duration = round(float(len(sndfile)) / sample_rate, 3)
        start = round(start, 3)
        end = round(end, 3)
        # Negative positions are measured from the end of the file.
        if start < 0.0:
            start += duration
        if end < 0.0:
            end += duration
        # Clamp the slice to the file boundaries.
        if start < 0.0:
            start = 0.0
        if end > duration:
            end = duration
        if end < 0.0:
            raise ValueError("切片结束位置(%f s)越界" % end)
        if start > end:
            raise ValueError("切片开始位置(%f s)晚于切片结束位置(%f s)" % (start, end))
        start_frame = int(start * sample_rate)
        end_frame = int(end * sample_rate)
        sndfile.seek(start_frame)
        sample = sndfile.read(frames=end_frame - start_frame, dtype='float32')
    return sample, sample_rate
# 数据增强
def augment(self, sample, sample_rate):
    """Apply the configured random augmentations to one audio clip.

    Iterates ``self.augment_configs`` in order. Each entry is a dict with a
    ``type`` ('speed', 'shift', 'volume', 'resample' or 'noise'), a ``prob``
    and a ``params`` dict. A step runs when ``random.random() < prob``.
    Entries with any other ``type`` are ignored.

    ``self.speed_rates`` and ``self.noises_path`` are filled in lazily the
    first time the matching step runs and are reused afterwards.

    Args:
        sample: Audio samples to augment.
        sample_rate: Sample rate of ``sample``.

    Returns:
        A ``(sample, sample_rate)`` tuple. The rate differs from the input
        only when a 'resample' step ran.
    """
    known_types = ('speed', 'shift', 'volume', 'resample', 'noise')
    for config in self.augment_configs:
        kind = config['type']
        if kind not in known_types:
            continue
        # One probability draw per recognised entry, as in the original.
        if not random.random() < config['prob']:
            continue
        params = config['params']

        if kind == 'speed':
            if self.speed_rates is None:
                self.speed_rates = np.linspace(params['min_speed_rate'],
                                               params['max_speed_rate'],
                                               params['num_rates'],
                                               endpoint=True)
            rate = random.choice(self.speed_rates)
            sample = self.change_speed(sample, speed_rate=rate)
        elif kind == 'shift':
            shift_ms = random.randint(params['min_shift_ms'], params['max_shift_ms'])
            sample = self.shift(sample, sample_rate, shift_ms=shift_ms)
        elif kind == 'volume':
            gain = random.randint(params['min_gain_dBFS'], params['max_gain_dBFS'])
            sample = self.volume(sample, gain=gain)
        elif kind == 'resample':
            new_sample_rate = np.random.choice(params['new_sample_rates'])
            sample = self.resample(sample, orig_sr=sample_rate, target_sr=new_sample_rate)
            # Later steps must see the rate the samples now have.
            sample_rate = new_sample_rate
        else:  # kind == 'noise'
            min_snr_dB, max_snr_dB = params['min_snr_dB'], params['max_snr_dB']
            if self.noises_path is None:
                self.noises_path = []
                noise_dir = params['noise_dir']
                if os.path.exists(noise_dir):
                    self.noises_path = [os.path.join(noise_dir, name)
                                        for name in os.listdir(noise_dir)]
            # A missing or empty noise directory leaves nothing to mix in;
            # skip the step instead of letting random.choice([]) raise.
            if not self.noises_path:
                continue
            noise_path = random.choice(self.noises_path)
            snr_dB = random.randint(min_snr_dB, max_snr_dB)
            sample = self.add_noise(sample, sample_rate, noise_path=noise_path, snr_dB=snr_dB)
    return sample, sample_rate
# 改变语速
@staticmethod
def change_speed(sample, speed_rate):
    """Change playback speed by linear-interpolation resampling.

    Args:
        sample: 1-D array of audio samples.
        speed_rate: Speed factor. Values above 1.0 shorten the audio and
            values below 1.0 lengthen it. 1.0 returns ``sample`` unchanged.

    Returns:
        A float32 array of length ``int(len(sample) / speed_rate)``, or the
        input itself when ``speed_rate == 1.0`` or the input is empty.

    Raises:
        ValueError: If ``speed_rate`` is not greater than zero.
    """
    if speed_rate == 1.0:
        return sample
    if speed_rate <= 0:
        raise ValueError("速度速率应大于零")
    old_length = sample.shape[0]
    if old_length == 0:
        # Nothing to interpolate; np.interp rejects an empty source array.
        return sample
    new_length = int(old_length / speed_rate)
    old_indices = np.arange(old_length)
    # The last valid source index is old_length - 1. Stopping at old_length
    # would push the final query points past the data, where np.interp
    # clamps them to the last sample and distorts the tail.
    new_indices = np.linspace(start=0, stop=old_length - 1, num=new_length)
    sample = np.interp(new_indices, old_indices, sample).astype(np.float32)
    return sample
# 音频偏移
@staticmethod
def shift(sample, sample_rate, shift_ms):
    """Shift the audio in time, in place, zero-filling the vacated region.

    A positive ``shift_ms`` moves the content towards the start and zeroes
    the tail. A negative ``shift_ms`` moves the content towards the end and
    zeroes the head.

    Args:
        sample: 1-D array of audio samples; it is modified in place.
        sample_rate: Sample rate of ``sample``.
        shift_ms: Shift amount in milliseconds (signed).

    Returns:
        The same ``sample`` array after shifting.

    Raises:
        ValueError: If ``abs(shift_ms)`` exceeds the audio duration.
    """
    total_seconds = sample.shape[0] / sample_rate
    if abs(shift_ms) / 1000.0 > total_seconds:
        raise ValueError("shift_ms的绝对值应该小于音频持续时间")

    offset = int(shift_ms * sample_rate / 1000)
    if offset == 0:
        return sample

    if offset > 0:
        # Advance: pull later samples forward, then silence the tail.
        sample[:-offset] = sample[offset:]
        sample[-offset:] = 0
        return sample

    # Delay: push earlier samples back, then silence the head.
    lag = -offset
    sample[lag:] = sample[:-lag]
    sample[:lag] = 0
    return sample
# 改变音量
@staticmethod
def volume(sample, gain):
    """Scale the audio in place by a gain given in decibels.

    Args:
        sample: Array of audio samples; it is modified in place.
        gain: Gain in dB, converted to a linear factor ``10 ** (gain / 20)``.

    Returns:
        The same ``sample`` object after scaling.
    """
    linear_factor = 10. ** (gain / 20.)
    sample *= linear_factor
    return sample
# 声音重采样
@staticmethod
def resample(sample, orig_sr, target_sr):
    """Resample audio from ``orig_sr`` to ``target_sr`` using librosa.

    Args:
        sample: Array of audio samples.
        orig_sr: Current sample rate of ``sample``.
        target_sr: Sample rate to convert to.

    Returns:
        The resampled audio as returned by ``librosa.resample``.
    """
    return librosa.resample(sample, orig_sr=orig_sr, target_sr=target_sr)
# 添加噪声
def add_noise(self, sample, sample_rate, noise_path, snr_dB, max_gain_db=300.0):
    """Mix a noise recording into the audio at a requested SNR.

    The audio is first normalised towards -20 dB. The noise is then scaled
    so that the audio level minus the noise level equals ``snr_dB``. Both
    gains are capped at ``max_gain_db``. The noise is finally wrapped or
    randomly cropped to the audio length and added.

    Args:
        sample: 1-D array of audio samples; it is modified in place.
        sample_rate: Sample rate the noise file is loaded at.
        noise_path: Path of the noise file, read with ``librosa.load``.
        snr_dB: Target signal-to-noise ratio in dB.
        max_gain_db: Upper bound for either applied gain, in dB.

    Returns:
        The same ``sample`` array with the noise added.
    """
    noise, _ = librosa.load(noise_path, sr=sample_rate)

    # Normalise the audio level first so the mixed-in noise is not too loud.
    normalise_gain = min(max_gain_db, -20 - self.rms_db(sample))
    sample *= 10. ** (normalise_gain / 20.)

    # Scale the whole noise clip to reach the requested SNR.
    speech_db = self.rms_db(sample)
    noise_db = self.rms_db(noise)
    noise_gain = min(speech_db - noise_db - snr_dB, max_gain_db)
    noise *= 10. ** (noise_gain / 20.)

    # Make the noise exactly as long as the audio.
    speech_len = sample.shape[0]
    noise_len = noise.shape[0]
    if noise_len < speech_len:
        # Too short: repeat the noise from its start to fill the gap.
        noise = np.pad(noise, (0, speech_len - noise_len), 'wrap')
    elif noise_len > speech_len:
        # Too long: take a random window of the right length.
        offset = random.randint(0, noise_len - speech_len)
        noise = noise[offset:offset + speech_len]

    sample += noise
    return sample
@staticmethod
def rms_db(sample):
    """Return the level of the audio in decibels.

    Computed as ``10 * log10(mean(sample ** 2))``, which is the RMS level
    expressed in dB.

    Args:
        sample: Array of audio samples.

    Returns:
        The level in dB.
    """
    power = np.mean(sample ** 2)
    decibels = 10 * np.log10(power)
    return decibels