# micmon/micmon/utils/datagen.py
#
# 118 lines
# 5.5 KiB
# Python

import argparse
import logging
import os
import sys
from micmon.audio import AudioDirectory, AudioFile, AudioSegment
from micmon.dataset import DatasetWriter
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Fallback values shared by the ``create_dataset`` signature and the
# command-line options defined in ``main``.
defaults = dict(
    sample_duration=2.0,   # length of each analysed audio segment, in seconds
    sample_rate=44100,     # audio sample rate, in Hz
    channels=1,            # number of destination audio channels
    ffmpeg_bin='ffmpeg',   # ffmpeg executable
)
def create_dataset(audio_dir: str, dataset_dir: str,
                   low_freq: int = AudioSegment.default_low_freq,
                   high_freq: int = AudioSegment.default_high_freq,
                   bins: int = AudioSegment.default_bins,
                   sample_duration: float = defaults['sample_duration'],
                   sample_rate: int = defaults['sample_rate'],
                   channels: int = defaults['channels'],
                   ffmpeg_bin: str = defaults['ffmpeg_bin']) -> None:
    """
    Generate one ``.npz`` dataset file for each labelled audio sample directory found
    under ``audio_dir``.

    Every entry returned by ``AudioDirectory.scan`` is read through an ``AudioFile`` and
    each sample it yields is appended to a ``DatasetWriter`` targeting
    ``<dataset_dir>/<name of the sample directory>.npz``.

    :param audio_dir: Directory containing the raw audio sample directories to be scanned
        (``~`` is expanded).
    :param dataset_dir: Destination directory for the generated ``.npz`` files (``~`` is
        expanded). It is created if it does not exist yet.
    :param low_freq: Lowest frequency to be considered in the generated spectrum.
    :param high_freq: Highest frequency to be considered in the generated spectrum.
    :param bins: Number of frequency bins to be used for the spectrum analysis.
    :param sample_duration: Length, in seconds, of the audio segments whose spectrum is
        calculated.
    :param sample_rate: Audio sample rate, in Hz.
    :param channels: Number of destination audio channels.
    :param ffmpeg_bin: Path to the ffmpeg executable (``~`` is expanded).
    """
    audio_dir = os.path.abspath(os.path.expanduser(audio_dir))
    dataset_dir = os.path.abspath(os.path.expanduser(dataset_dir))
    # Loop-invariant: resolve the ffmpeg path once instead of once per sample.
    ffmpeg_bin = os.path.expanduser(ffmpeg_bin)

    sample_dirs = AudioDirectory.scan(audio_dir)

    # Make sure the destination exists before any (potentially slow) audio processing
    # starts. This is a no-op if the directory is already there.
    os.makedirs(dataset_dir, exist_ok=True)

    # A dedicated loop name is used so that the ``audio_dir`` string parameter is not
    # rebound to a scanned-directory object.
    for sample_dir in sample_dirs:
        dataset_file = os.path.join(dataset_dir, os.path.basename(sample_dir.path) + '.npz')
        logger.info(f'Processing audio sample {sample_dir.path}')

        with AudioFile(sample_dir.audio_file, sample_dir.labels_file,
                       sample_duration=sample_duration, sample_rate=sample_rate, channels=channels,
                       ffmpeg_bin=ffmpeg_bin) as reader, \
                DatasetWriter(dataset_file, low_freq=low_freq, high_freq=high_freq, bins=bins) as writer:
            # Each sample yielded by the reader is appended to the writer in place.
            for sample in reader:
                writer += sample
def main():
    """
    Command-line entry point: parse the options and run ``create_dataset`` with them.

    Two positional arguments (``audio_dir`` and ``dataset_dir``) are expected, plus optional
    flags for the frequency range, the number of bins, the sample duration, the sample rate,
    the number of channels and the ffmpeg executable. Arguments are parsed with
    ``parse_known_args``, so unrecognised extra arguments are ignored rather than rejected.

    :return: Whatever ``create_dataset`` returns.
    """
    description = '''
Tool to create numpy dataset files with audio spectrum data from a set of labelled raw audio files.'''

    epilog = '''
- audio_dir should contain a list of sub-directories, each of which represents a labelled audio sample.
audio_dir should have the following structure:
audio_dir/
-> train_sample_1
-> audio.mp3
-> labels.json
-> train_sample_2
-> audio.mp3
-> labels.json
...
- labels.json is a key-value JSON file that contains the labels for each audio segment. Example:
{
"00:00": "negative",
"02:13": "positive",
"04:57": "negative",
"15:41": "positive",
"18:24": "negative"
}
Each entry indicates that all the audio samples between the specified timestamp and the next entry or
the end of the audio file should be applied the specified label.
- dataset_dir is the directory where the generated labelled spectrum dataset in .npz format will be saved.
Each dataset file will be named like its associated audio samples directory.'''

    # The raw formatter keeps the line breaks of the description/epilog in the help output.
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Positional arguments.
    parser.add_argument(
        'audio_dir',
        help='Directory containing the raw audio samples directories to be scanned.',
    )
    parser.add_argument(
        'dataset_dir',
        help='Destination directory for the compressed .npz files containing the '
             'frequency spectrum datasets.',
    )

    # Spectrum options.
    parser.add_argument(
        '--low', dest='low_freq', type=int, default=AudioSegment.default_low_freq,
        help='Specify the lowest frequency to be considered in the generated frequency '
             'spectrum. Default: 20 Hz (lowest possible frequency audible to a human ear).',
    )
    parser.add_argument(
        '--high', dest='high_freq', type=int, default=AudioSegment.default_high_freq,
        help='Specify the highest frequency to be considered in the generated frequency '
             'spectrum. Default: 20 kHz (highest possible frequency audible to a human ear).',
    )
    parser.add_argument(
        '-b', '--bins', dest='bins', type=int, default=AudioSegment.default_bins,
        help=f'Specify the number of frequency bins to be used for the spectrum '
             f'analysis (default: {AudioSegment.default_bins})',
    )

    # Audio decoding options.
    parser.add_argument(
        '-d', '--sample-duration', dest='sample_duration', type=float,
        default=defaults['sample_duration'],
        help=f'The script will calculate the spectrum of audio segments of '
             f'this specified length in seconds (default: '
             f'{defaults["sample_duration"]}).',
    )
    parser.add_argument(
        '-r', '--sample-rate', dest='sample_rate', type=int,
        default=defaults['sample_rate'],
        help=f'Audio sample rate (default: {defaults["sample_rate"]} Hz)',
    )
    parser.add_argument(
        '-c', '--channels', dest='channels', type=int,
        default=defaults['channels'],
        help=f'Number of destination audio channels (default: '
             f'{defaults["channels"]})',
    )
    parser.add_argument(
        '--ffmpeg', dest='ffmpeg_bin', type=str,
        default=defaults['ffmpeg_bin'],
        help=f'Absolute path to the ffmpeg executable (default: {defaults["ffmpeg_bin"]})',
    )

    # Unknown arguments are returned separately by parse_known_args and discarded here.
    opts, _ = parser.parse_known_args(sys.argv[1:])

    return create_dataset(
        audio_dir=opts.audio_dir,
        dataset_dir=opts.dataset_dir,
        low_freq=opts.low_freq,
        high_freq=opts.high_freq,
        bins=opts.bins,
        sample_duration=opts.sample_duration,
        sample_rate=opts.sample_rate,
        channels=opts.channels,
        ffmpeg_bin=opts.ffmpeg_bin,
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()