Data cleaning

For Part I, the task is binary classification, and the data should first be cleaned by removing stray whitespace, null values, and irrelevant blank entries. Loading the first five rows is a quick way to inspect the structure of the dataset.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# load .csv file as pandas dataframe
data = pd.read_csv("data_subset.csv")
# show first 5 rows
data.head(5)
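
The cleaning itself depends on how the raw file looks; a minimal sketch, assuming stray whitespace in the column names and possible null values (an assumption, not confirmed for this dataset), could be:

data.columns = data.columns.str.strip() # remove leading/trailing spaces from column names
data["label"] = data["label"].str.strip() # normalize the label strings
data = data.dropna() # drop rows containing null values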

Then the rows whose label is “normal” or “disease” are selected using Boolean indexing.

data_binary = data[(data["label"]=="normal") | (data["label"]=="disease")].copy()
data_binary.groupby("label").count() # use groupby() to count the "normal" and "disease" samples

Then slice out the feature columns and convert them to a NumPy array.

X = data_binary.iloc[:,2:-1] # all rows, feature columns only
X = X.to_numpy() # use X.shape (an attribute, not a method) to inspect the dimensions

Do the same selection for the label column, using .shape to check the result.

labels = data_binary.iloc[:,-1] # the label column
y = (labels=="disease").astype(int) # encode "disease" as 1 and "normal" as 0
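
For example, a quick consistency check that the feature matrix and label vector line up:

print(X.shape) # (n_samples, n_features)
print(y.shape) # (n_samples,)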

Splitting the dataset

Then, after importing the required libraries, build a pipeline that runs each step in sequence. cross_val_score is used to perform 5-fold cross-validation with the ‘f1’ scoring metric, and the mean of the fold scores is computed at the end.

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay # plot_confusion_matrix was removed in scikit-learn 1.2
from sklearn.metrics import f1_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Then choose a supervised learning algorithm to fit the data. Here, logistic regression and a support vector machine (SVM) are tried, each preceded by principal component analysis (PCA) for dimensionality reduction, since both are common choices for high-dimensional data of this kind. The code is as follows:

## Logistic Regression as supervised learning algorithm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

scaler = StandardScaler()
pca = PCA(n_components=20, random_state=42)
logreg = LogisticRegression()
pipe = Pipeline([('scaler', scaler), ('pca', pca), ('logreg', logreg)])
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='f1')
print("Mean CV F1: %.3f" % scores.mean()) # average the 5 fold scores
## Support Vector Machine as an alternative (train_test_split, PCA, and Pipeline are already imported above)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

scaler = StandardScaler()
pca = PCA(n_components=20, random_state=42)
svm = SVC()

pipe = Pipeline([('scaler', scaler), ('pca', pca), ('svm', svm)])
param_grid = {'svm__C': [0.01, 1, 100], 'svm__gamma': [1e-04, 1e-03, 1e-02]} 

search = GridSearchCV(pipe, param_grid, cv=5, scoring='f1')
search.fit(X_train, y_train)

## use search.best_score_ to show the best cross-validation score and search.best_params_ to show the best parameters
print("Best parameters (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Additionally, the F1 score is used as the evaluation metric: \(F_1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\). Below, the training and testing accuracy are reported for the fitted pipeline, with the F1 score on each split as a check on its reliability.

from sklearn.metrics import f1_score

scaler = StandardScaler()
pca = PCA(n_components=20, random_state=42)
svm = SVC(C=100, gamma=0.001)

pipe = Pipeline([('scaler', scaler), ('pca', pca), ('svm', svm)])
pipe.fit(X_train,y_train)

train_accuracy = pipe.score(X_train,y_train)
test_accuracy  = pipe.score(X_test,y_test)

f1_train = f1_score(y_train, pipe.predict(X_train))
f1_test  = f1_score(y_test, pipe.predict(X_test))

print(train_accuracy, test_accuracy)
print(f1_train, f1_test)

ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test, normalize='true')
plt.show() 

Multiclass Classification

For Part II, the task is multiclass classification over four labels: ‘cell line’, ‘disease’, ‘neoplasm’, ‘normal’. Here no rows are removed, since all four classes are kept.

# extract the gene-expression features, converting from DataFrame to NumPy array
X = data.iloc[:,2:-1]
X = X.to_numpy()
y = data.iloc[:,-1]
# use X.shape or y.shape to check the structure

Then convert the four text labels into numeric codes.

set(y) # {'cell line', 'disease', 'neoplasm', 'normal'}

Then use the preprocessing module to encode the four labels.

from sklearn import preprocessing

fourLabels = preprocessing.LabelEncoder()
fourLabels.fit(['cell line', 'disease', 'neoplasm', 'normal'])
y = fourLabels.transform(y) # assign the result back: y becomes a numeric array ([0, 0, ..., 3, 3, 3]), which also allows positional indexing later
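
The mapping between the text labels and the numeric codes can be inspected via the encoder's classes_ attribute, and inverse_transform recovers the original strings:

print(fourLabels.classes_) # ['cell line' 'disease' 'neoplasm' 'normal']
print(fourLabels.inverse_transform([0, 1, 2, 3])) # recover the text labels from the codes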

Then use a grid search with cross-validation to tune the hyperparameters.

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import time

# split data with test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

# scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

f1_table = np.zeros((4,3*3)) # store the per-class F1 scores for every parameter combination
cols = [] # store the parameter values of each combination

start_time = time.time() # track execution time
count = 0 # counter for number of params combination
for gamma in [1e-04, 1e-03, 1e-02]:
    for C in [0.001, 1, 100]:
        f1_fold = np.zeros((3,4)) # array to store f1-score for each fold (3 folds) for each class (4 classes)
        fold=0 # indexing f1_fold array
        # cross-validation
        for train_ind, val_ind  in skf.split(X_train, y_train):       
            # PCA 
            pca = PCA(n_components=20, random_state=42).fit(X_train[train_ind])
            Z_train = pca.transform(X_train[train_ind])
            Z_val   = pca.transform(X_train[val_ind])
            # SVM
            svm = SVC(gamma=gamma, C=C, random_state=42).fit(Z_train, y_train[train_ind])
            # get evaluation metrics on the validation set
            y_pred = svm.predict(Z_val)
            f1 = f1_score(y_train[val_ind], y_pred, average=None)
            # store f1-score
            f1_fold[fold]=f1
            fold+=1
        f1_table[:,count] = np.around(np.mean(f1_fold,axis=0),decimals=2)
        cols.append(str(gamma)+"; "+str(C))
        count+=1
        
print("--- %s seconds ---" % (time.time() - start_time))
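
To pick a single setting from this table, one option (not in the original notebook) is to macro-average the per-class F1 scores in each column and take the best one:

mean_f1 = f1_table.mean(axis=0) # macro-average over the four classes
best = int(np.argmax(mean_f1)) # column index of the best gamma/C combination
print("best (gamma; C):", cols[best], "mean F1:", mean_f1[best])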

Then, as in the binary case, plot the cross-validated F1 scores as a table.

fig, ax = plt.subplots() 
ax.set_axis_off() 
rows=['cell line', 'disease', 'neoplasm', 'normal']
the_table = ax.table(cellText=f1_table, 
                     rowLabels=rows, 
                     colLabels=cols, 
                     cellLoc ='center',
                     loc ='center')
the_table.auto_set_font_size(False)
the_table.set_fontsize(24)
the_table.scale(6, 4)
ax.set_title("F1 scores", fontsize=34, fontweight ="bold", pad=30)
plt.show()

Finally, refit the model with the best parameters and evaluate it on the test set.

# f1_score is already imported, and X_train/X_test were standardized above,
# so the data can be used directly without scaling a second time

pca = PCA(n_components=20, random_state=42).fit(X_train)
Z_train = pca.transform(X_train)
Z_test  = pca.transform(X_test)

svm = SVC(C=100, gamma=0.001).fit(Z_train, y_train)

f1_train = f1_score(y_train, svm.predict(Z_train),average=None)
f1_test  = f1_score(y_test, svm.predict(Z_test),average=None)

print(f1_train, f1_test)

ConfusionMatrixDisplay.from_estimator(svm, Z_test, y_test, normalize='true')
plt.show()
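
If a single summary number is preferred over the per-class arrays, a macro-averaged F1 score can also be reported (a small addition beyond the original code):

print("macro F1 (test): %.3f" % f1_score(y_test, svm.predict(Z_test), average='macro'))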