Supervised learning

Binary classification of protein sequences

Proteins are polymers built from 20 different amino acids. Based on sequence similarity, proteins have been classified into families. Positive and negative datasets corresponding to one of these protein families are available here.

Extract features from the protein sequences

Amino Acid Composition (AAC) refers to the frequency of each amino acid within a protein sequence. For example, if a protein has the sequence ‘MSAARQTTRKAE’, its amino acid composition can be represented as a vector of length 20:

'A':3, 'C':0, 'D':0, 'E':1, 'F':0, 'G':0, 'H':0, 'I':0, 'K':1, 'L':0, 'M':1, 'N':0, 'P':0, 'Q':1, 'R':2, 'S':1, 'T':2, 'V':0, 'W':0, 'Y':0

In this way, any protein sequence can be represented as a feature vector of fixed length. This matters because protein sequences, even within the same class, can have different numbers of amino acids.
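As a quick illustration, here is a minimal sketch (standard library only) that computes the count vector above from a raw sequence string:

from collections import Counter

seq = 'MSAARQTTRKAE'
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
counts = Counter(seq)
## One entry per amino acid, zero if the residue is absent
aac = {aa: counts[aa] for aa in amino_acids}
print(aac)  # {'A': 3, 'C': 0, 'D': 0, 'E': 1, ..., 'R': 2, ...}

Dividing each count by len(seq) turns the counts into the frequencies used below.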

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from sklearn import model_selection

## Read the positive and negative sequences into dictionaries keyed by record ID
positive_dict = SeqIO.to_dict(SeqIO.parse("positive-aars.fasta", "fasta"))
negative_dict = SeqIO.to_dict(SeqIO.parse("negative-aars.fasta", "fasta"))

## Amino acid composition calculation ##
#c1 = ProteinAnalysis("AAAASTRRRTRRAWEQWERQW").count_amino_acids()
## DataFrame.append is deprecated; collect the rows in a list and build the frame once
rows = []
for key, record in positive_dict.items():
    rows.append(pd.Series(ProteinAnalysis(str(record.seq)).get_amino_acids_percent(), name='1'))
for key, record in negative_dict.items():
    rows.append(pd.Series(ProteinAnalysis(str(record.seq)).get_amino_acids_percent(), name='-1'))
df1 = pd.DataFrame(rows)

print("Number of positive samples: ", len(positive_dict))
print("Number of negative samples: ", len(negative_dict))

df1
Number of positive samples:  117
Number of negative samples:  117
A C D E F G H I K L M N P Q R S T V W Y
1 0.035928 0.007984 0.041916 0.131737 0.055888 0.049900 0.017964 0.081836 0.119760 0.107784 0.015968 0.037924 0.037924 0.011976 0.037924 0.055888 0.049900 0.057884 0.013972 0.029940
1 0.076233 0.004484 0.049327 0.065770 0.029895 0.082212 0.020927 0.055306 0.082212 0.074738 0.040359 0.037369 0.043348 0.056801 0.058296 0.055306 0.061286 0.053812 0.017937 0.034380
1 0.093333 0.009524 0.068571 0.080000 0.026667 0.120000 0.024762 0.059048 0.055238 0.095238 0.028571 0.015238 0.041905 0.019048 0.074286 0.060952 0.020952 0.053333 0.009524 0.043810
1 0.079470 0.036424 0.052980 0.067881 0.046358 0.072848 0.023179 0.046358 0.021523 0.117550 0.019868 0.016556 0.046358 0.039735 0.081126 0.072848 0.049669 0.072848 0.008278 0.028146
1 0.084158 0.003300 0.046205 0.094059 0.034653 0.077558 0.018152 0.061056 0.089109 0.099010 0.036304 0.057756 0.031353 0.029703 0.026403 0.047855 0.042904 0.061056 0.011551 0.047855
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
-1 0.087221 0.004057 0.050710 0.060852 0.034483 0.068966 0.020284 0.056795 0.070994 0.089249 0.020284 0.036511 0.050710 0.042596 0.054767 0.079108 0.052738 0.075051 0.004057 0.040568
-1 0.067901 0.009259 0.049383 0.030864 0.027778 0.055556 0.024691 0.037037 0.030864 0.080247 0.015432 0.040123 0.111111 0.080247 0.104938 0.101852 0.058642 0.030864 0.018519 0.024691
-1 0.053521 0.010329 0.056338 0.101408 0.037559 0.037559 0.014085 0.059155 0.082629 0.110798 0.025352 0.059155 0.029108 0.047887 0.061033 0.078873 0.048826 0.046948 0.005634 0.033803
-1 0.038462 0.022869 0.070686 0.062370 0.062370 0.024948 0.014553 0.066528 0.060291 0.121622 0.016632 0.069647 0.027027 0.040541 0.050936 0.096674 0.061331 0.058212 0.009356 0.024948
-1 0.042791 0.014884 0.055814 0.071628 0.058605 0.032558 0.020465 0.053953 0.071628 0.110698 0.010233 0.054884 0.044651 0.042791 0.059535 0.102326 0.040930 0.054884 0.013023 0.043721

234 rows × 20 columns

# Correlation between the different features
sns.heatmap(df1.corr())

[Figure: heatmap of pairwise correlations between the 20 amino acid composition features]

Model building

Create data and label matrices.

df1_matrix = df1.iloc[:, :20].values  ## X (feature) matrix
labels = df1.index.values  ## y (label) vector

Create training and testing datasets, build the model and check prediction accuracy.

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Split out a validation dataset
validation_size = 0.20  # 20% of the data for testing
seed = None  # change to an int for reproducibility
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(df1_matrix, labels, \
                                                                                test_size=validation_size, \
                                                                                random_state=seed)

new_clf = SVC()
model1 = new_clf.fit(X_train, Y_train)
Y_predicted = model1.predict(X_validation)
score1 = accuracy_score(Y_predicted, Y_validation)
print(score1)
0.8085106382978723
df1_matrix.shape
(234, 20)

Cross-validation

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Test options and evaluation metric
scoring = 'accuracy'

kfold = model_selection.KFold(n_splits=5)
cv_results = model_selection.cross_val_score(SVC(), df1_matrix, labels, cv=kfold, scoring=scoring)

print(cv_results)
print(cv_results.mean(), cv_results.std())
[0.38297872 0.34042553 0.65957447 0.34042553 0.30434783]
0.40555041628122107 0.12943119067958725
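Note the much lower scores here compared to the single train/test split. The rows of df1_matrix are ordered (all positive samples first, then all negatives), so an unshuffled KFold produces folds with a heavily skewed class balance: the model is trained mostly on one class and tested mostly on the other. A minimal fix, sketched below with the same df1_matrix and labels, is to shuffle the samples when creating the folds (StratifiedKFold, which also preserves the class proportions in each fold, is another option):

kfold_shuffled = model_selection.KFold(n_splits=5, shuffle=True, random_state=7)  # shuffle before splitting
cv_results_shuffled = model_selection.cross_val_score(SVC(), df1_matrix, labels,
                                                      cv=kfold_shuffled, scoring=scoring)
print(cv_results_shuffled.mean(), cv_results_shuffled.std())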
SVC().get_params()
{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

Hyperparameters

clf_poly = SVC(kernel='poly', degree=3)
clf_rbf = SVC(kernel='rbf', C=10)

kfold = model_selection.KFold(n_splits=5)
cv_SVM_poly_results = model_selection.cross_val_score(clf_poly, df1_matrix, labels, cv=kfold, scoring=scoring)
cv_SVM_rbf_results = model_selection.cross_val_score(clf_rbf, df1_matrix, labels, cv=kfold, scoring=scoring)
print(cv_SVM_rbf_results)
print (cv_SVM_poly_results.mean(), cv_SVM_rbf_results.mean())
[0.70212766 0.65957447 0.78723404 0.5106383  0.65217391]
0.6281221091581869 0.6623496762257168
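Instead of comparing a few hand-picked settings, scikit-learn's GridSearchCV can search a whole parameter grid with cross-validation. The sketch below is illustrative; the grid values are assumptions, not the only sensible choices:

from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['rbf', 'poly'],
              'C': [0.1, 1, 10, 100],
              'gamma': ['scale', 'auto']}
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring=scoring)  # 5-fold CV per parameter combination
grid.fit(df1_matrix, labels)
print(grid.best_params_, grid.best_score_)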

Confusion matrix

from collections import Counter
from sklearn.metrics import confusion_matrix
clf_new = SVC(kernel='poly',degree=3)

## Fit model to the data ##
clf_new.fit(df1_matrix,labels)
print(Counter(labels))

## Predict labels for the original data ##
clf_new_predict = clf_new.predict(df1_matrix)
print(Counter(clf_new_predict))
Counter({'1': 117, '-1': 117})
Counter({'-1': 121, '1': 113})
cm = confusion_matrix(labels,clf_new_predict)
print(cm)

fig,ax= plt.subplots()
sns.heatmap(cm, square=True, annot=True, fmt='g', cbar=False, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.xaxis.set_ticklabels(['negative', 'positive'])  ## confusion_matrix sorts the labels, so '-1' comes first
ax.yaxis.set_ticklabels(['negative', 'positive'])
plt.show()
[[108   9]
 [ 13 104]]
[Figure: annotated heatmap of the confusion matrix]
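Keep in mind that the model above is fitted and evaluated on the same data, so the matrix reflects training performance, which is optimistic. A minimal sketch of the same evaluation on held-out data, reusing the X_train/X_validation split created earlier:

## Fit on the training split only and evaluate on the untouched validation split
clf_holdout = SVC(kernel='poly', degree=3)
clf_holdout.fit(X_train, Y_train)
cm_val = confusion_matrix(Y_validation, clf_holdout.predict(X_validation))
print(cm_val)  ## rows: true '-1', '1'; columns: predicted '-1', '1'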

Exercise

Extract another feature - Di-Peptide Composition (DPC)

For each sequence, calculate the frequency of pairwise (dipeptide) occurrences of amino acids. The feature vector for each sequence then has length 400 (20 × 20). Construct a classification model using DPC as the feature set and compare the results with the classification based on amino acid composition.

import itertools
aa_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
## Template series with all 400 dipeptides initialised to zero
dipeptides = [''.join(x) for x in itertools.product(aa_list, aa_list)]
dpc_series = pd.Series(0.0, index=dipeptides, name='1')
rows = []
for key, record in positive_dict.items():
    dpc_series_copy = dpc_series.copy()
    ## Count every overlapping dipeptide in the sequence
    for i in range(len(record.seq) - 1):
        dpc_series_copy[str(record.seq[i:i+2])] += 1
    ## Express the counts as percentages of the sequence length
    rows.append(dpc_series_copy / len(record.seq) * 100)
df_dpc = pd.DataFrame(rows)
df_dpc
#dpc_series_copy
df_dpc
AA AC AD AE AF AG AH AI AK AL ... YM YN YP YQ YR YS YT YV YW YY
1 0.399202 0.000000 0.000000 0.399202 0.399202 0.399202 0.199601 0.199601 0.598802 0.199601 ... 0.199601 0.000000 0.598802 0.199601 0.000000 0.199601 0.000000 0.000000 0.0 0.000000
1 1.345291 0.000000 0.149477 0.747384 0.149477 0.597907 0.149477 0.298954 0.896861 0.448430 ... 0.000000 0.000000 0.298954 0.448430 0.000000 0.000000 0.298954 0.597907 0.0 0.000000
1 0.571429 0.000000 0.761905 0.761905 0.380952 1.142857 0.190476 0.380952 0.571429 1.142857 ... 0.000000 0.000000 0.000000 0.000000 0.761905 0.190476 0.190476 0.190476 0.0 0.000000
1 0.331126 0.165563 0.331126 0.496689 0.662252 0.662252 0.331126 0.000000 0.165563 0.496689 ... 0.000000 0.165563 0.165563 0.331126 0.827815 0.331126 0.000000 0.165563 0.0 0.000000
1 0.495050 0.000000 0.660066 0.990099 0.165017 0.660066 0.000000 0.165017 0.825083 0.825083 ... 0.165017 0.495050 0.165017 0.165017 0.165017 0.165017 0.165017 0.330033 0.0 0.330033
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1 0.790514 0.000000 0.592885 0.395257 0.592885 0.592885 0.197628 0.197628 1.185771 0.395257 ... 0.000000 0.000000 0.197628 0.000000 0.395257 0.197628 0.000000 0.000000 0.0 0.000000
1 0.401606 0.000000 0.401606 0.200803 1.004016 0.200803 0.401606 0.000000 0.803213 0.200803 ... 0.000000 0.000000 0.000000 0.401606 0.200803 0.602410 0.200803 0.803213 0.0 0.401606
1 2.923977 0.000000 1.754386 1.949318 1.169591 0.779727 0.194932 0.389864 0.000000 1.559454 ... 0.000000 0.000000 0.194932 0.000000 0.584795 0.000000 0.000000 0.000000 0.0 0.000000
1 0.000000 0.000000 0.114416 0.114416 0.114416 0.114416 0.114416 0.114416 0.343249 0.114416 ... 0.000000 0.228833 0.228833 0.000000 0.228833 0.228833 0.228833 0.228833 0.0 0.228833
1 0.000000 0.142653 0.285307 0.000000 0.570613 0.000000 0.285307 0.855920 0.570613 1.711840 ... 0.000000 0.000000 0.142653 0.427960 0.000000 0.000000 0.142653 0.000000 0.0 0.285307

117 rows × 400 columns
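To complete the exercise, the same loop can be run over negative_dict (with name='-1') and the two frames combined before cross-validation. A hedged sketch, assuming the negative DPC features have been collected into a hypothetical frame df_dpc_neg:

## df_dpc_neg is assumed to hold the DPC features of the negative sequences
df_dpc_all = pd.concat([df_dpc, df_dpc_neg])
dpc_matrix = df_dpc_all.values
dpc_labels = df_dpc_all.index.values

kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=7)
print('DPC:', model_selection.cross_val_score(SVC(), dpc_matrix, dpc_labels, cv=kfold, scoring='accuracy').mean())
print('AAC:', model_selection.cross_val_score(SVC(), df1_matrix, labels, cv=kfold, scoring='accuracy').mean())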