Environmental Audio AI
Example



Toni Heittola
toni.heittola@tuni.fi


Introduction¶

This presentation covers the use of pretrained models for sound classification and how to adapt them to specific classification tasks

Outline¶

  • Audio tagging with pretrained models:
    • PANNs
    • YAMNet
  • Transfer learning with pretrained models
  • Sound event detection

PANNs¶

Large-scale pretrained audio neural networks for audio pattern recognition

  • State-of-the-art pretrained audio tagging model released in 2019
  • Model architecture is inspired by VGG; the main model (CNN14) has 80M parameters
  • Trained with AudioSet dataset
    • Data collected from YouTube
    • 2M human-labeled 10-second clips organized into an ontology with 632 sound classes
  • Designed for single-channel audio with a 32kHz sampling rate
  • Model outputs:
    • Score for 527 sound classes
    • Per-analysis-frame embedding (2048 values)
  • Two modes: Audio tagger and Sound event detector

Qiuqiang Kong, Yin Cao, Turab Iqbal, Yuxuan Wang, Wenwu Wang, and Mark D. Plumbley. "PANNs: Large-scale pretrained audio neural networks for audio pattern recognition." IEEE/ACM Transactions on Audio, Speech, and Language Processing 28 (2020): 2880-2894.

Example audio¶

[Embedded audio example]

Usage as an audio tagger¶

# Import
from panns_inference import AudioTagging, labels

# Create audio tagger 
audio_tagger = AudioTagging(
    checkpoint_path=model_storage_filename, 
    device='cuda'
)
# Apply to test audio
(clipwise_output, embeddings) = audio_tagger.inference(
    test_audio_32k.data[None, :] # (batch_size, audio_samples)
)
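
The strongest tags can be listed by ranking the clip-level scores; a minimal sketch, assuming clipwise_output has shape (batch_size, 527):

import numpy as np

# Rank classes by clip-level score and print the five strongest tags
top_indices = np.argsort(clipwise_output[0])[::-1][:5]
for index in top_indices:
    print(f'{clipwise_output[0][index]:.3f}  {labels[index]}')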

Top 5 tags based on the output scores:

Score Tag
----- -------
0.724 Whistle
0.453 Whistling
0.334 Speech
0.079 Music
0.053 Inside, small room

Usage as a sound event detector¶

# Import
from panns_inference import SoundEventDetection, labels

# Create sound event detector
detector = SoundEventDetection(
    checkpoint_path=model_storage_filename, 
    device='cuda'
)
# Apply to test audio
framewise_output = detector.inference(test_audio_32k.data[None, :])
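
The frame-wise scores can be visualized per class; a minimal sketch, assuming framewise_output has shape (batch_size, frames, 527):

import numpy as np
import matplotlib.pyplot as plt

# Plot frame-wise scores for the three strongest classes
frame_scores = framewise_output[0]
top_indices = np.argsort(frame_scores.max(axis=0))[::-1][:3]
for index in top_indices:
    plt.plot(frame_scores[:, index], label=labels[index])
plt.xlabel('Analysis frame')
plt.ylabel('Score')
plt.legend()
plt.show()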

YAMNet¶

Pretrained neural network for sound recognition

  • Released by Google in 2019
  • Model uses the MobileNetV1 architecture
    • Depthwise-separable convolution architecture with 3.7M parameters
  • Trained with AudioSet dataset
    • Data collected from YouTube
    • 2M human-labeled 10-second clips organized into an ontology with 632 sound classes
  • Designed for single-channel audio with a 16kHz sampling rate
    • Internally, audio is framed into 0.96-second analysis windows with a 0.48-second hop
  • Model outputs:
    • Score for 521 sound classes
    • Per-analysis-frame embedding (1024 values)

Usage¶

# Import
import pandas as pd
import tensorflow_hub as hub

# Load model
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

# Load class names
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
class_names = list(pd.read_csv(class_map_path)['display_name'])

# Run the model, check the output.
scores, embeddings, spectrogram = yamnet_model(test_audio_16k.data)
scores matrix shape (analysis segments, class-wise scores): (10, 521)

Example audio¶

[Embedded audio example]

Analyze content¶

# Run the model
scores, embeddings, spectrogram_data = yamnet_model(test_audio_16k.data)

# Get the top class (highest mean score across analysis segments)
inferred_class = class_names[scores.numpy().mean(axis=0).argmax()]
inferred_class: Whistle

Scores of the top 5 classes across the sample¶
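
The original slide shows this as a figure; a minimal matplotlib sketch to produce such a plot from the scores tensor above:

import numpy as np
import matplotlib.pyplot as plt

# Plot segment-wise scores for the five classes with the highest mean score
scores_np = scores.numpy()
top_indices = np.argsort(scores_np.mean(axis=0))[::-1][:5]
for index in top_indices:
    plt.plot(scores_np[:, index], label=class_names[index])
plt.xlabel('Analysis segment')
plt.ylabel('Score')
plt.legend()
plt.show()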

Transfer Learning¶

We need to build a robust classifier; however, we only have a small number of examples per class

Transfer Learning¶

Advantage: Many pretrained models are available, which enables bringing a large amount of prior knowledge into the learning process with minimal computational power

Disadvantage: There is no guarantee that it works; in some cases it can make the learning process even harder (negative transfer)

Application example¶

Target task: Animal vocalization classifier with 10 classes

Source task: Generic sound classifier YAMNet

A new model will be trained, and YAMNet embeddings will be used as input features

Application¶

The ESC-50 dataset will be used as the target data

  • 5-second environmental audio recordings collected from FreeSound.org
  • Recordings are organized into 50 classes, each having 40 examples

target_classes = [
    'cat',       # YAMNet class
    'cow', 
    'crickets',
    'crow',      # YAMNet class
    'dog',       # YAMNet class
    'frog',      # YAMNet class
    'hen', 
    'pig',       # YAMNet class
    'rooster',
    'sheep'      # YAMNet class
]

Only six of the target classes could be recognized directly by YAMNet.

Select material from ESC-50 dataset¶

Identify suitable items in the dataset

# Load dataset metadata
dataset_meta = pd.read_csv(meta_csv)

# Map target class names to integer ids
map_class_to_id = {name: index for index, name in enumerate(target_classes)}

# Filter data to the target classes and add numeric targets
dataset_meta_filtered = dataset_meta[dataset_meta.category.isin(target_classes)]
class_id = dataset_meta_filtered['category'].apply(
    lambda name: map_class_to_id[name]
)
dataset_meta_filtered = dataset_meta_filtered.assign(target=class_id)
    filename                                            category
0   data/datasets/ESC-50-master/audio16k/1-100032-...  dog
8   data/datasets/ESC-50-master/audio16k/1-103298-...  crow
14  data/datasets/ESC-50-master/audio16k/1-110389-...  dog
29  data/datasets/ESC-50-master/audio16k/1-121951-...  sheep
45  data/datasets/ESC-50-master/audio16k/1-15689-A...  frog

Extract embeddings¶

import tensorflow as tf

main_ds = tf.data.Dataset.from_tensor_slices(
    (dataset_meta_filtered['filename'],
     dataset_meta_filtered['target'],
     dataset_meta_filtered['fold'])
)

def load_audio_for_map_function(filename, label, fold):
  return load_audio(filename), label, fold

main_ds = main_ds.map(load_audio_for_map_function)

# Function to extract embeddings
def extract_embedding(wav_data, label, fold):
  scores, embeddings, spectrogram = yamnet_model(wav_data)
  num_embeddings = tf.shape(embeddings)[0]
  return (embeddings,
          tf.repeat(label, num_embeddings),
          tf.repeat(fold, num_embeddings))

# Extract embedding
main_ds = main_ds.map(extract_embedding).unbatch()
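
The load_audio helper used above and the train_ds / val_ds / test_ds splits used below are not shown in the slides; a minimal sketch, assuming 16 kHz mono WAV files and the standard ESC-50 folds (1-3 for training, 4 for validation, 5 for testing):

# Hypothetical helper: load a WAV file as a mono float waveform
# (assumes files have already been resampled to 16 kHz)
def load_audio(filename):
    file_contents = tf.io.read_file(filename)
    wav, _ = tf.audio.decode_wav(file_contents, desired_channels=1)
    return tf.squeeze(wav, axis=-1)

# Split by fold, then drop the fold column and batch for training
cached_ds = main_ds.cache()
remove_fold = lambda embedding, label, fold: (embedding, label)

train_ds = (cached_ds.filter(lambda e, la, fold: fold < 4)
            .map(remove_fold).shuffle(1000).batch(32)
            .prefetch(tf.data.AUTOTUNE))
val_ds = (cached_ds.filter(lambda e, la, fold: fold == 4)
          .map(remove_fold).batch(32).prefetch(tf.data.AUTOTUNE))
test_ds = (cached_ds.filter(lambda e, la, fold: fold == 5)
           .map(remove_fold).batch(32).prefetch(tf.data.AUTOTUNE))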

Create neural network¶

finetuned_model = tf.keras.Sequential([
    tf.keras.layers.Input(
        shape=(1024,),
        dtype=tf.float32,
        name='input_embedding'
    ),
    tf.keras.layers.Dense(512, activation='relu', name='fully-connected'),
    tf.keras.layers.Dense(len(target_classes), name='output')
], name='my_model')
Model summary

Layer name       Layer type  Output shape  Parameters
fully-connected  Dense       (None, 512)   524,800
output           Dense       (None, 10)    5,130

Training¶

# Import
import os
import time
from codecarbon import EmissionsTracker

finetuned_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy']
)

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

# Track power consumption and time
tracker = EmissionsTracker("Transfer learning example", output_dir=os.path.join('data', 'training_codecarbon'))
tracker.start()
start_time = time.time()

history = finetuned_model.fit(train_ds, epochs=100, validation_data=val_ds, callbacks=[callback], verbose=0)

# Stop tracking
stop_time = time.time()
tracker.stop()
Time used for training: 28.55 sec
Total energy consumed during training: 0.00083 kWh

Evaluation¶

Class label  Accuracy (%)
cat                 100.0
cow                 100.0
crickets             87.5
crow                100.0
dog                 100.0
frog                 62.5
hen                 100.0
pig                  62.5
rooster             100.0
sheep               100.0
Average              91.2
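
These per-class accuracies can be computed from the model predictions; a sketch, assuming the held-out test_ds split of (embedding, label) pairs defined earlier:

import numpy as np

# Collect frame-level predictions and reference labels
y_true, y_pred = [], []
for embeddings, labels_batch in test_ds:
    logits = finetuned_model(embeddings)
    y_true.extend(labels_batch.numpy())
    y_pred.extend(tf.argmax(logits, axis=1).numpy())
y_true, y_pred = np.array(y_true), np.array(y_pred)

# Per-class accuracy and overall average
for class_index, name in enumerate(target_classes):
    mask = y_true == class_index
    print(f'{name:10s} {100 * (y_pred[mask] == class_index).mean():.1f}')
print(f'{"Average":10s} {100 * (y_pred == y_true).mean():.1f}')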

Confusion matrix¶
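
A confusion matrix figure appears here in the original slides; it can be produced from the same predictions, for example:

import matplotlib.pyplot as plt

# Confusion matrix from the collected predictions
cm = tf.math.confusion_matrix(y_true, y_pred).numpy()
plt.imshow(cm, cmap='Blues')
plt.xticks(range(len(target_classes)), target_classes, rotation=90)
plt.yticks(range(len(target_classes)), target_classes)
plt.xlabel('Predicted class')
plt.ylabel('Reference class')
plt.colorbar()
plt.show()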

Sound Event Detection¶

The task of simultaneously estimating what is happening and when it is happening

System structure¶

A simple sound event detector can be implemented by applying a sound classifier or audio tagger to consecutive segments and joining the activity information into sound events with start and end times.

  • The model trained in the transfer learning section will be used
  • Analysis is done in segments (0.96 sec) that are moved through the signal with 0.5-sec hops
  • In each segment, the model is used to get class-wise scores
  • The class with the highest score is output if its score exceeds the threshold (0.99); see the sketch after this list
  • This system can detect only a single sound event at a time, with a time resolution of 0.5 sec
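
A minimal sketch of this detector, assuming a 16 kHz mono signal audio and the yamnet_model and finetuned_model defined above:

import numpy as np
import tensorflow as tf

sample_rate = 16000
segment_length = int(0.96 * sample_rate)  # 0.96-second analysis segment
hop_length = int(0.5 * sample_rate)       # 0.5-second hop
threshold = 0.99

# Classify each segment; keep the top class only if it exceeds the threshold
frame_labels = []
for start in range(0, len(audio) - segment_length + 1, hop_length):
    segment = audio[start:start + segment_length]
    _, embeddings, _ = yamnet_model(segment)
    logits = tf.reduce_mean(finetuned_model(embeddings), axis=0)
    probs = tf.nn.softmax(logits).numpy()
    label = target_classes[probs.argmax()] if probs.max() > threshold else None
    frame_labels.append((start / sample_rate, label))

# Merge consecutive identical labels into events with onset and offset times
events = []
for onset, label in frame_labels:
    if label is None:
        continue
    if events and events[-1][2] == label and abs(events[-1][1] - onset) < 1e-6:
        events[-1][1] = onset + 0.5  # extend the ongoing event
    else:
        events.append([onset, onset + 0.5, label])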

Example input signal¶

Signal generated by mixing dog, cat, rooster, and cow samples into a metro station background

[Embedded audio example]

System output versus reference¶