"""Command-line tool that builds ``.npz`` spectrum datasets from labelled audio sample directories."""

import argparse
import logging
import os
import sys

from micmon.audio import AudioDirectory, AudioFile, AudioSegment
from micmon.dataset import DatasetWriter

logger = logging.getLogger(__name__)

# Fallback values shared by the ``create_dataset`` signature and the CLI options.
defaults = {
    'sample_duration': 2.0,
    'sample_rate': 44100,
    'channels': 1,
    'ffmpeg_bin': 'ffmpeg',
}


def create_dataset(audio_dir: str, dataset_dir: str,
                   low_freq: int = AudioSegment.default_low_freq,
                   high_freq: int = AudioSegment.default_high_freq,
                   bins: int = AudioSegment.default_bins,
                   sample_duration: float = defaults['sample_duration'],
                   sample_rate: int = defaults['sample_rate'],
                   channels: int = defaults['channels'],
                   ffmpeg_bin: str = defaults['ffmpeg_bin']) -> None:
    """
    Generate one ``.npz`` dataset file for each sample directory found under ``audio_dir``.

    Every entry returned by ``AudioDirectory.scan`` is read through an ``AudioFile`` and each
    sample it yields is appended to a ``DatasetWriter`` whose output file is
    ``<dataset_dir>/<sample directory name>.npz``.

    :param audio_dir: Directory to be scanned for labelled audio sample directories (``~`` is expanded).
    :param dataset_dir: Destination directory for the ``.npz`` files (``~`` is expanded). It is created
        if it does not exist.
    :param low_freq: Forwarded to ``DatasetWriter`` as ``low_freq``.
    :param high_freq: Forwarded to ``DatasetWriter`` as ``high_freq``.
    :param bins: Forwarded to ``DatasetWriter`` as ``bins``.
    :param sample_duration: Forwarded to ``AudioFile`` as ``sample_duration``.
    :param sample_rate: Forwarded to ``AudioFile`` as ``sample_rate``.
    :param channels: Forwarded to ``AudioFile`` as ``channels``.
    :param ffmpeg_bin: Path of the ffmpeg executable (``~`` is expanded), forwarded to ``AudioFile``.
    """
    audio_dir = os.path.abspath(os.path.expanduser(audio_dir))
    dataset_dir = os.path.abspath(os.path.expanduser(dataset_dir))
    # Loop-invariant, so resolve it once instead of once per sample directory.
    ffmpeg_bin = os.path.expanduser(ffmpeg_bin)

    # Make sure the destination exists before any writer tries to save into it
    # (a no-op when the directory is already there).
    os.makedirs(dataset_dir, exist_ok=True)

    # A dedicated loop name avoids rebinding the ``audio_dir`` parameter.
    for sample_dir in AudioDirectory.scan(audio_dir):
        dataset_file = os.path.join(dataset_dir, os.path.basename(sample_dir.path) + '.npz')
        logger.info('Processing audio sample %s', sample_dir.path)

        with AudioFile(sample_dir.audio_file, sample_dir.labels_file,
                       sample_duration=sample_duration,
                       sample_rate=sample_rate,
                       channels=channels,
                       ffmpeg_bin=ffmpeg_bin) as reader, \
                DatasetWriter(dataset_file,
                              low_freq=low_freq,
                              high_freq=high_freq,
                              bins=bins) as writer:
            for sample in reader:
                writer += sample


def main():
    """
    Command-line entry point: parse the arguments in ``sys.argv`` and run :func:`create_dataset`.

    Unrecognised arguments do not abort the run; they are reported through a warning and ignored.
    """
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description='''
Tool to create numpy dataset files with audio spectrum data from a set of labelled raw audio files.''',

        epilog='''
- audio_dir should contain a list of sub-directories, each of which represents a labelled audio sample.
  audio_dir should have the following structure:

  audio_dir/
    -> train_sample_1
      -> audio.mp3
      -> labels.json
    -> train_sample_2
      -> audio.mp3
      -> labels.json
    ...

- labels.json is a key-value JSON file that contains the labels for each audio segment. Example:

  {
    "00:00": "negative",
    "02:13": "positive",
    "04:57": "negative",
    "15:41": "positive",
    "18:24": "negative"
  }

  Each entry indicates that all the audio samples between the specified timestamp and the next entry or
  the end of the audio file should be applied the specified label.

- dataset_dir is the directory where the generated labelled spectrum dataset in .npz format will be saved.
  Each dataset file will be named like its associated audio samples directory.''',

        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument('audio_dir', help='Directory containing the raw audio samples directories to be scanned.')
    parser.add_argument('dataset_dir', help='Destination directory for the compressed .npz files containing the '
                                            'frequency spectrum datasets.')
    parser.add_argument('--low', help='Specify the lowest frequency to be considered in the generated frequency '
                                      'spectrum. Default: 20 Hz (lowest possible frequency audible to a human ear).',
                        required=False, default=AudioSegment.default_low_freq, dest='low_freq', type=int)
    parser.add_argument('--high', help='Specify the highest frequency to be considered in the generated frequency '
                                       'spectrum. Default: 20 kHz (highest possible frequency audible to a human ear).',
                        required=False, default=AudioSegment.default_high_freq, dest='high_freq', type=int)
    parser.add_argument('-b', '--bins', help=f'Specify the number of frequency bins to be used for the spectrum '
                                             f'analysis (default: {AudioSegment.default_bins})',
                        required=False, default=AudioSegment.default_bins, dest='bins', type=int)
    parser.add_argument('-d', '--sample-duration', help=f'The script will calculate the spectrum of audio segments of '
                                                        f'this specified length in seconds (default: '
                                                        f'{defaults["sample_duration"]}).',
                        required=False, default=defaults['sample_duration'], dest='sample_duration', type=float)
    parser.add_argument('-r', '--sample-rate', help=f'Audio sample rate (default: {defaults["sample_rate"]} Hz)',
                        required=False, default=defaults['sample_rate'], dest='sample_rate', type=int)
    parser.add_argument('-c', '--channels', help=f'Number of destination audio channels (default: '
                                                 f'{defaults["channels"]})',
                        required=False, default=defaults['channels'], dest='channels', type=int)
    parser.add_argument('--ffmpeg', help=f'Absolute path to the ffmpeg executable (default: {defaults["ffmpeg_bin"]})',
                        required=False, default=defaults['ffmpeg_bin'], dest='ffmpeg_bin', type=str)

    opts, unknown_args = parser.parse_known_args(sys.argv[1:])

    # Without a configured handler the INFO progress messages would never be shown.
    # basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    # Unknown arguments are still tolerated, but a mistyped option should not pass unnoticed.
    if unknown_args:
        logger.warning('Ignoring unrecognized arguments: %s', ' '.join(unknown_args))

    return create_dataset(audio_dir=opts.audio_dir,
                          dataset_dir=opts.dataset_dir,
                          low_freq=opts.low_freq,
                          high_freq=opts.high_freq,
                          bins=opts.bins,
                          sample_duration=opts.sample_duration,
                          sample_rate=opts.sample_rate,
                          channels=opts.channels,
                          ffmpeg_bin=opts.ffmpeg_bin)


if __name__ == '__main__':
    main()