Before the class¶

Set up a Jupyter notebook on the Discovery HPC cluster by following the instructions here: https://services.dartmouth.edu/TDClient/1806/Portal/KB/ArticleDet?ID=124517.

Install PyTorch (CPU build) on Discovery:

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
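A quick way to confirm the install succeeded is to import torch and print its version (assuming `python3` on your path points at the environment you installed into):

```shell
python3 -c "import torch; print(torch.__version__)"
```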

Download simulated data: sequences and their corresponding labels.

  • sequences
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests

SEQUENCES_URL = 'https://raw.githubusercontent.com/abidlabs/deep-learning-genomics-primer/master/sequences.txt'

sequences = requests.get(SEQUENCES_URL).text.split('\n')
sequences = list(filter(None, sequences))  # This removes empty sequences.

# Let's print the first few sequences.
pd.DataFrame(sequences, index=np.arange(1, len(sequences)+1), 
             columns=['Sequences']).head()
Out[1]:
Sequences
1 CCGAGGGCTATGGTTTGGAAGTTAGAACCCTGGGGCTTCTCGCGGA...
2 GAGTTTATATGGCGCGAGCCTAGTGGTTTTTGTACTTGTTTGTCGC...
3 GATCAGTAGGGAAACAAACAGAGGGCCCAGCCACATCTAGCAGGTA...
4 GTCCACGACCGAACTCCCACCTTGACCGCAGAGGTACCACCAGAGC...
5 GGCGACCGAACTCCAACTAGAACCTGCATAACTGGCCTGGGAGATA...
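Before modeling, it is worth sanity-checking the downloaded data: every sequence should be exactly 50 bases long and use only the alphabet A/C/G/T. A minimal sketch of such a check (the `check_sequences` helper is illustrative; run the same call on the `sequences` list above):

```python
# Sanity-check a list of DNA sequences: uniform length and valid alphabet.
VALID_BASES = set('ACGT')

def check_sequences(seqs, expected_length=50):
    lengths = {len(s) for s in seqs}
    alphabet_ok = all(set(s) <= VALID_BASES for s in seqs)
    return lengths == {expected_length} and alphabet_ok

# Demonstrated on a small hardcoded list of two 50-base sequences.
example = ['ACGT' * 12 + 'AC',
           'TTTT' * 12 + 'GG']
print(check_sequences(example))  # expected: True
```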
In [2]:
len(sequences)
Out[2]:
2000
  • labels
In [3]:
LABELS_URL = 'https://raw.githubusercontent.com/abidlabs/deep-learning-genomics-primer/master/labels.txt'

labels = requests.get(LABELS_URL).text.split('\n')
labels = list(filter(None, labels))  # This removes empty labels.

# Let's print the first few labels.
pd.DataFrame(labels, index=np.arange(1, len(labels)+1), 
             columns=['labels']).head()
Out[3]:
labels
1 0
2 0
3 0
4 1
5 1
In [4]:
# Count the number of 0s and 1s
print("Number of '0's: {}".format(labels.count('0')))
print("Number of '1's: {}".format(labels.count('1')))
Number of '0's: 1013
Number of '1's: 987

Train a convolutional neural network to predict protein binding¶

The following tutorial is adapted from https://github.com/abidlabs/deep-learning-genomics-primer.

See accompanying paper: Zou, J. et al. A primer on deep learning in genomics. Nat. Genet. 51, 12–18 (2019).

Overview: we will design a neural network that can discover binding motifs in DNA based on the results of an assay that determines whether a longer DNA sequence binds to a protein of interest. Here, the longer DNA sequences are our independent variables (or predictors), while the positive or negative result of the assay is the dependent variable (or response).


Data: We will use simulated data consisting of DNA sequences of length 50 bases (chosen to be artificially short so that the data is easy to play around with), each labeled 0 or 1 according to the result of the assay. Our goal is to build a classifier that predicts whether a particular sequence binds to the protein, and to discover the short motif that serves as the binding site in the bound sequences.

Data processing¶

We first need to transform our data to a format that can be used by the deep learning algorithm.

  • We use one hot encoding for DNA sequences.
In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
In [6]:
# One-hot encoding function for DNA sequences
def one_hot_encode_sequence(sequence):
    mapping = {'A': [1, 0, 0, 0],
               'C': [0, 1, 0, 0],
               'G': [0, 0, 1, 0],
               'T': [0, 0, 0, 1]}
    return np.array([mapping[base] for base in sequence])

# Encode all sequences
encoded_sequences = np.array([one_hot_encode_sequence(seq) for seq in sequences])

# Convert labels from strings to integers
encoded_labels = np.array([int(label) for label in labels])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, encoded_labels, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Print to verify
print("One-hot encoded sequences (training):")
print(X_train_tensor)
print("Encoded binary labels (training):")
print(y_train_tensor)
One-hot encoded sequences (training):
tensor([[[0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         ...,
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.]],

        [[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.],
         ...,
         [0., 0., 0., 1.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]],

        [[0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.],
         ...,
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.]],

        ...,

        [[0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         ...,
         [1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]],

        [[0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         ...,
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.]],

        [[0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.],
         ...,
         [1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [1., 0., 0., 0.]]])
Encoded binary labels (training):
tensor([0, 1, 0,  ..., 0, 0, 1])
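As a quick check that the one-hot encoding is invertible, each row can be decoded back to its base with an argmax. A small sketch (the `decode_sequence` helper is illustrative, not part of the tutorial, and uses the same base-to-vector mapping as above):

```python
import numpy as np

BASES = ['A', 'C', 'G', 'T']

def decode_sequence(one_hot):
    """Map each one-hot row back to its base via argmax over the 4 channels."""
    return ''.join(BASES[i] for i in np.argmax(one_hot, axis=1))

# Round-trip a short example through the same mapping used above.
mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0],
           'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
encoded = np.array([mapping[b] for b in 'GATTACA'])
print(decode_sequence(encoded))  # expected: GATTACA
```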

Model training¶

Model architecture summary:

Input: (batch_size, 50, 4) # 50 bases, 4 channels (A, C, G, T)

→ Conv1D(4 → 32 filters, kernel_size=12)

→ MaxPool1D(kernel_size=4)

→ Flatten

→ Dense(16 units, ReLU)

→ Dense(2 units, Softmax)
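The flattened size the first dense layer needs can be worked out by hand: for length-50 inputs, the valid convolution gives 50 − 12 + 1 = 39 positions, max pooling gives 39 // 4 = 9, and flattening 32 filters gives 32 × 9 = 288. A small sketch of that arithmetic:

```python
def flattened_size(input_length, kernel_size=12, pool_size=4, n_filters=32):
    """Feature count after Conv1d (no padding, stride 1), MaxPool1d, and Flatten."""
    conv_len = input_length - kernel_size + 1   # valid convolution
    pooled_len = conv_len // pool_size          # floor division, as in MaxPool1d
    return n_filters * pooled_len

print(flattened_size(50))  # expected: 288
```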

In [7]:
# Define the 1D CNN model
class Conv1DModel(nn.Module):
    def __init__(self, input_length):
        super(Conv1DModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=32, kernel_size=12)
        self.pool = nn.MaxPool1d(kernel_size=4)
        conv_output_length = input_length - 12 + 1
        pooled_length = conv_output_length // 4
        self.flattened_size = 32 * pooled_length
        self.fc1 = nn.Linear(self.flattened_size, 16)
        self.fc2 = nn.Linear(16, 2)
        
    def forward(self, x):
        x = x.permute(0, 2, 1) # (batch_size, channels, sequence_length)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = x.view(-1, self.flattened_size)
        x = F.relu(self.fc1(x))
        # Note: nn.CrossEntropyLoss expects raw logits and applies log-softmax
        # internally, so applying softmax here is non-standard (it raises the
        # loss floor). It is kept so the model outputs class probabilities.
        x = F.softmax(self.fc2(x), dim=1)
        return x
In [8]:
from torch.utils.data import TensorDataset, DataLoader, random_split
import matplotlib.pyplot as plt


# Create dataset and split into training and validation sets
dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_size = int(0.25 * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Initialize model
model = Conv1DModel(input_length=X_train_tensor.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Initialize lists to store history
train_losses = []
val_losses = []
val_accuracies = []

# Training loop
for epoch in range(20):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    train_losses.append(running_loss / len(train_loader))

    # Validation loop
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for val_X, val_y in val_loader:
            val_outputs = model(val_X)
            val_loss += criterion(val_outputs, val_y).item()
            pred = val_outputs.argmax(dim=1)
            correct += (pred == val_y).sum().item()
    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(correct / len(val_dataset))
    print(f"Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, Val Accuracy: {val_accuracies[-1]:.4f}")

# Plotting the training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')

plt.subplot(1, 2, 2)
plt.plot(val_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Validation Accuracy')

plt.show()
Epoch 1, Train Loss: 0.6800, Val Loss: 0.6467, Val Accuracy: 0.7775
Epoch 2, Train Loss: 0.5928, Val Loss: 0.5238, Val Accuracy: 0.8250
Epoch 3, Train Loss: 0.4766, Val Loss: 0.4487, Val Accuracy: 0.8800
Epoch 4, Train Loss: 0.4156, Val Loss: 0.4090, Val Accuracy: 0.9275
Epoch 5, Train Loss: 0.3873, Val Loss: 0.3750, Val Accuracy: 0.9650
Epoch 6, Train Loss: 0.3623, Val Loss: 0.3633, Val Accuracy: 0.9650
Epoch 7, Train Loss: 0.3505, Val Loss: 0.3549, Val Accuracy: 0.9725
Epoch 8, Train Loss: 0.3417, Val Loss: 0.3498, Val Accuracy: 0.9700
Epoch 9, Train Loss: 0.3352, Val Loss: 0.3500, Val Accuracy: 0.9800
Epoch 10, Train Loss: 0.3311, Val Loss: 0.3489, Val Accuracy: 0.9800
Epoch 11, Train Loss: 0.3286, Val Loss: 0.3441, Val Accuracy: 0.9825
Epoch 12, Train Loss: 0.3260, Val Loss: 0.3365, Val Accuracy: 0.9825
Epoch 13, Train Loss: 0.3233, Val Loss: 0.3387, Val Accuracy: 0.9775
Epoch 14, Train Loss: 0.3205, Val Loss: 0.3333, Val Accuracy: 0.9875
Epoch 15, Train Loss: 0.3192, Val Loss: 0.3324, Val Accuracy: 0.9875
Epoch 16, Train Loss: 0.3183, Val Loss: 0.3315, Val Accuracy: 0.9875
Epoch 17, Train Loss: 0.3178, Val Loss: 0.3310, Val Accuracy: 0.9875
Epoch 18, Train Loss: 0.3174, Val Loss: 0.3307, Val Accuracy: 0.9875
Epoch 19, Train Loss: 0.3170, Val Loss: 0.3299, Val Accuracy: 0.9875
Epoch 20, Train Loss: 0.3167, Val Loss: 0.3295, Val Accuracy: 0.9875

Prediction for new data¶

In [9]:
# predict for the first sequence in test set
model(X_test_tensor[0].unsqueeze(0))
Out[9]:
tensor([[1.0086e-04, 9.9990e-01]], grad_fn=<SoftmaxBackward0>)

Predict for all sequences in the test set and evaluate performance with a confusion matrix.

In [10]:
from sklearn.metrics import confusion_matrix
import itertools

# Column 1 of the softmax output is the probability of the positive class,
# so take the argmax over classes to get predicted labels.
predicted_labels = model(X_test_tensor).detach().numpy().argmax(axis=1)
cm = confusion_matrix(y_test_tensor.detach().numpy(), predicted_labels)
print('Confusion matrix:\n', cm)

cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.imshow(cm, cmap=plt.cm.Blues)
plt.title('Normalized confusion matrix')
plt.colorbar()
# sklearn's confusion_matrix puts true labels on rows, predictions on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.xticks([0, 1]); plt.yticks([0, 1])
plt.grid(False)
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], '.2f'),
             horizontalalignment='center',
             color='white' if cm[i, j] > 0.5 else 'black')
Confusion matrix:
 [[204   1]
 [  0 195]]
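Summary metrics can be read directly off a 2×2 confusion matrix. A sketch on a hypothetical matrix (rows = true labels, columns = predicted labels; the `cm_metrics` helper is illustrative):

```python
import numpy as np

def cm_metrics(cm):
    """Accuracy, precision, and recall for the positive class (label 1)."""
    tn, fp, fn, tp = cm.ravel()   # row-major unravel of [[tn, fp], [fn, tp]]
    accuracy = (tp + tn) / cm.sum()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return accuracy, precision, recall

# Hypothetical near-perfect classifier on a 400-sequence test set.
cm_example = np.array([[204, 1],
                       [0, 195]])
print(cm_metrics(cm_example))
```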