Required Packages: “dendextend”,“ggplot2”,“ggdendro”,“caret”,“boot”,“randomForest”,“statip”,“sm”,“reshape”,“pROC”,“stats”. Their dependencies may also be required.
This part of the code reads in the name files and pre-processes the experimental dataset. The first pre-processing function will remove redundant columns, inactive compounds, technical replicates of controls and media.
data_names <- read.csv("./Reference_ER_Activity.csv", header = T)
feature_names <- read.csv("./Feature_Names.csv", header = T)
ERData_process <- function(data_set) {
# Order Data Sets based on compound ID
data_set <- data_set[order(data_set$compound, decreasing = F), ]
# Drop columns with channel 00
data_set <- data_set[, -grep("ch00", colnames(data_set))]
# Separate out Negative Control - Media
media <- data_set[which(data_set$compound %in% "DMSO"), ]
data_set <- data_set[-which(data_set$compound %in% "DMSO"), ]
# Separate out Positive Control - E2 (Estradiol)
e2 <- data_set[which(data_set$compound %in% "E2"), ]
data_set <- data_set[-which(data_set$compound %in% "E2"), ]
# Separate out 4HT
ht4 <- data_set[which(data_set$compound %in% "4HT"), ]
data_set <- data_set[-which(data_set$compound %in% "4HT"), ]
# Remove Inactive Compounds
a02 <- data_set[which(data_set$compound %in% "A02"), ]
data_set <- data_set[-which(data_set$compound %in% "A02"), ]
a03 <- data_set[which(data_set$compound %in% "A03"), ]
data_set <- data_set[-which(data_set$compound %in% "A03"), ]
b06 <- data_set[which(data_set$compound %in% "B06"), ]
data_set <- data_set[-which(data_set$compound %in% "B06"), ]
c02 <- data_set[which(data_set$compound %in% "C02"), ]
data_set <- data_set[-which(data_set$compound %in% "C02"), ]
c05 <- data_set[which(data_set$compound %in% "C05"), ]
data_set <- data_set[-which(data_set$compound %in% "C05"), ]
c06 <- data_set[which(data_set$compound %in% "C06"), ]
data_set <- data_set[-which(data_set$compound %in% "C06"), ]
d03 <- data_set[which(data_set$compound %in% "D03"), ]
data_set <- data_set[-which(data_set$compound %in% "D03"), ]
d06 <- data_set[which(data_set$compound %in% "D06"), ]
data_set <- data_set[-which(data_set$compound %in% "D06"), ]
e03 <- data_set[which(data_set$compound %in% "E03"), ]
data_set <- data_set[-which(data_set$compound %in% "E03"), ]
f01 <- data_set[which(data_set$compound %in% "F01"), ]
data_set <- data_set[-which(data_set$compound %in% "F01"), ]
f02 <- data_set[which(data_set$compound %in% "F02"), ]
data_set <- data_set[-which(data_set$compound %in% "F02"), ]
h05 <- data_set[which(data_set$compound %in% "H05"), ]
data_set <- data_set[-which(data_set$compound %in% "H05"), ]
g02 <- data_set[which(data_set$compound %in% "G02"), ]
data_set <- data_set[-which(data_set$compound %in% "G02"), ]
# Merge masked compund IDs with correct names
expand_names <- data.frame(matrix(NA, ncol = 2, nrow = dim(data_set)[1]))
for (j in 1:length(data_names$Short_ID)) {
for (i in 1:dim(data_set)[1]) {
if (data_names$Short_ID[j] == data_set$compound[i]) {
expand_names[i, 1] <- toString(data_names$Preferred_Name[j])
expand_names[i, 2] <- toString(data_names$ER.Activity[j])
}
}
}
colnames(expand_names) <- c("Preferred.Name", "ER.Activity")
# Create a new data frame with experimental features only
clean_set <- data_set[, -(1:6)]
media_set <- media[, -(1:6)]
e2_set <- e2[, -(1:6)]
for (k in 1:length(colnames(clean_set))) {
if (colnames(clean_set)[k] == feature_names$Original.Feature.Names[k]) {
colnames(clean_set)[k] <- as.character(feature_names$Even.Reduced.Names[k])
}
}
return(list(clean_set = clean_set, expand_names = expand_names, e2_set = e2_set, media_set = media_set))
}
This function is used to normalize the experimental datasets before model building.
# Data Normalization with respect to mean absolute deviation
normalize_mad <- function(clean_set, e2_set) {
mad_e2 <- NULL
bm_e2 <- data.frame(matrix(NA, nrow = dim(e2_set)[1], ncol = dim(clean_set)[2]))
for (i in 1:dim(e2_set)[1]) {
for (j in 1:dim(e2_set)[2]) {
bm_e2[i, j] <- abs(e2_set[i, j] - mean(e2_set[, j]))
mad_e2[j] <- mean(bm_e2[, j])
}
}
df_mad <- data.frame(matrix(NA, nrow = dim(clean_set)[1], ncol = dim(clean_set)[2]))
for (i in 1:dim(clean_set)[1]) {
for (j in 1:dim(clean_set)[2]) {
df_mad[i, j] <- (clean_set[i, j] - median(e2_set[, j]))/mad_e2[j]
}
}
colnames(df_mad) <- colnames(clean_set)
return(df_mad)
}
This function is used in the confidence interval calculations of the logistic regression model parameters.
bs <- function(formula, data, indices) {
d <- data[indices, ] # allows boot to select sample
fit <- glm(formula, family = binomial(link = "logit"), data = d)
return(coef(fit))
}
This function is used to calculate model performance metrics; accuracy, sensitivity, specificity, balanced accuracy and the 95% confidence intervals for the prediction accuracy.
mperform <- function(fit_model, valid_data_numeric, valid_data_class, type = "logit") {
if (type == "logit") {
predict_test <- round(predict(fit_model, valid_data_numeric, type = "response"), 3)
conf_valid <- confusionMatrix(factor(round(predict_test)), valid_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
Accuracy <- round(as.numeric(conf_valid$overall[1]), 2)
Sensitivity <- round(as.numeric(conf_valid$byClass[1]), 2)
Specificity <- round(as.numeric(conf_valid$byClass[2]), 2)
BA <- round(as.numeric(conf_valid$byClass[11]), 2)
CI_Lower <- round(as.numeric(conf_valid$overall[3]), 2)
CI_Upper <- round(as.numeric(conf_valid$overall[4]), 2)
} else if (type == "rf") {
predict_test <- predict(fit_model, valid_data_numeric)
conf_valid <- confusionMatrix(predict_test, valid_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
Accuracy <- round(as.numeric(conf_valid$overall[1]), 2)
Sensitivity <- round(as.numeric(conf_valid$byClass[1]), 2)
Specificity <- round(as.numeric(conf_valid$byClass[2]), 2)
BA <- round(as.numeric(conf_valid$byClass[11]), 2)
CI_Lower <- round(as.numeric(conf_valid$overall[3]), 2)
CI_Upper <- round(as.numeric(conf_valid$overall[4]), 2)
}
summary.res <- data.frame(Accuracy = Accuracy, CI.Lower = CI_Lower, CI.Upper = CI_Upper, Sensitivity = Sensitivity, Specificity = Specificity, Balanced.Accuracy = BA)
return(summary.res)
}
data_sets <- NULL
for (j in 1:6) {
data_sets[[j]] <- read.csv(paste("./EPA_Reps_CSV/20190531A_Rep_", (j + 3), "_Well_Data_updated_cmpnd.csv", sep = ""), header = T)
}
for (j in 7:12) {
data_sets[[j]] <- read.csv(paste("./EPA_Reps_CSV/20190531B_Rep_", (j + 3), "_Well_Data_updated_cmpnd.csv", sep = ""), header = T)
}
for (j in 13:18) {
data_sets[[j]] <- read.csv(paste("./EPA_Reps_CSV/20190531C_Rep_", (j + 3), "_Well_Data_updated_cmpnd.csv", sep = ""), header = T)
}
for (i in 1:length(data_sets)) {
print(sum(is.na(data_sets[[i]]$clean_set)))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
Call the pre-processing function for data cleaning.
processed_data <- NULL
for (i in 1:length(data_sets)) {
processed_data[[i]] <- ERData_process(data_sets[[i]])
}
# Replicate 8 (index =5) is selected for analysis
new_frame <- data.frame(processed_data[[5]]$clean_set, Preferred.Name = processed_data[[5]]$expand_names$Preferred.Name)
Cell.Area | Cell.Circularity | Nucleus.Area | Nucleus.Circularity | Fraction.Localized.In.Nucleus |
---|---|---|---|---|
7591.96 | 0.953436 | 2096.64 | 1.13147 | 0.832444 |
7570.25 | 0.949877 | 2031.14 | 1.13205 | 0.824702 |
6815.69 | 0.957420 | 2008.79 | 1.14522 | 0.834381 |
6944.24 | 0.956784 | 2009.98 | 1.14160 | 0.844161 |
7222.53 | 0.946303 | 2117.25 | 1.13752 | 0.811180 |
6582.49 | 0.937830 | 2077.13 | 1.13671 | 0.824058 |
6742.16 | 0.944760 | 2202.86 | 1.14404 | 0.835256 |
6920.76 | 0.939669 | 2124.64 | 1.14398 | 0.819195 |
6848.19 | 0.949418 | 2019.10 | 1.15171 | 0.784999 |
7118.88 | 0.947879 | 2140.21 | 1.15036 | 0.776418 |
# Average biological replicates raw biological replicates for outlier detection
aggregated_res <- aggregate(new_frame[, -dim(new_frame)[2]], by = list(new_frame$Preferred.Name), FUN = mean)
rownames(aggregated_res) <- aggregated_res$Group.1
aggregate_numeric <- aggregated_res[, -1]
# Cluster results and visualize it with a dendrogram
hc <- hclust(dist(aggregate_numeric, method = "euclidean"), method = "complete")
dd.row <- as.dendrogram(hc)
ddata_x <- dendro_data(dd.row, type = "rectangle")
# Vertical Dendrogram
ggplot(segment(ddata_x)) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + geom_text(data = label(ddata_x), aes(label = label, x = x, y = 0),
hjust = 0, nudge_y = max(segment(ddata_x)$y) * 0.01, size = 5) + labs(x = "", y = "") + coord_flip() + scale_y_reverse(expand = c(1, 1)) + theme(legend.position = "none",
axis.text.y = element_blank(), axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), panel.background = element_rect(fill = "white"),
panel.grid = element_blank(), axis.text.x = element_blank(), axis.title.x = element_blank())
Call the data normalization function to normalize the active chemical compounds prior to classification analysis.
scale_to_mad <- NULL
clear_active <- NULL
for (i in 1:length(data_sets)) {
clear_active[[i]] <- normalize_mad(processed_data[[i]]$clean_set, processed_data[[i]]$e2_set)
}
Cell Area | Cell Circularity | Nucleus Area | Nucleus Circularity | Fraction Localized In Nucleus |
---|---|---|---|---|
5.0832966 | 2.2867407 | 2.5587224 | -11.660448 | 1.7918457 |
4.9826639 | 1.3568909 | -0.3469003 | -11.227612 | 0.7760685 |
1.4850395 | 3.3276290 | -1.3383609 | -1.399254 | 2.0459868 |
2.0809094 | 3.1614631 | -1.2855717 | -4.100746 | 3.3291567 |
3.3708717 | 0.4231221 | 3.4729955 | -7.145522 | -0.9980647 |
0.4040837 | -1.7905944 | 1.6932461 | -7.750000 | 0.6915735 |
1.1442047 | 0.0199869 | 7.2707109 | -2.279851 | 2.1607898 |
1.9720722 | -1.3101241 | 3.8008207 | -2.324627 | 0.0535310 |
1.6356873 | 1.2369693 | -0.8810026 | 3.444030 | -4.4331026 |
2.8904211 | 0.8348792 | 4.4915160 | 2.436567 | -5.5589596 |
similarity_pearson <- data.frame(matrix(NA, nrow = dim(clear_active[[5]])[2], ncol = dim(clear_active[[5]])[2]))
for (i in 1:dim(clear_active[[5]])[2]) {
for (j in 1:dim(clear_active[[5]])[2]) {
similarity_pearson[i, j] <- cor(as.numeric(as.vector(clear_active[[5]][, i])), as.numeric(as.vector(clear_active[[5]][, j])), method = "pearson")
}
}
rownames(similarity_pearson) <- colnames(similarity_pearson) <- colnames(clear_active[[5]])
hc1 <- hclust(as.dist(1 - abs(similarity_pearson)), method = "complete")
dend <- as.dendrogram(hc1)
par(mar = c(5, 5, 5, 15))
plot(dend, horiz = TRUE, xlab = "Height")
abline(v = max(hc1$height) * 0.05, col = "red")
# Cut tree to 5% similarity
feat_sel <- cutree(hc1, h = max(hc1$height) * 0.05)
# Select the topmost bilogically relevant features from independent clusters and reduce the data matrix size
col_select <- c("Array Area", "Array PI Variance", "Array Mean PI", "Array Total PI", "Array to Nucleoplasm Intensity Ratio")
reduced_data <- NULL
temp_class <- NULL
antagonists <- NULL
for (i in 1:length(data_sets)) {
reduced_data[[i]] <- cbind(clear_active[[i]][, col_select], processed_data[[i]]$expand_names) # Append compound names and ER Activity
temp_class[[i]] <- data.frame(reduced_data[[i]], Class.Info = 0) # Append Class info 0 Agonist, +1 Antagonist
antagonists[[i]] <- temp_class[[i]][which(temp_class[[i]]$ER.Activity %in% "Antagonist"), ]
temp_class[[i]]$Class.Info[as.numeric(rownames(antagonists[[i]]))] <- 1
temp_class[[i]]$Class.Info <- as.factor(temp_class[[i]]$Class.Info) # Make class info factor
}
class_data <- temp_class[[5]]
# Setting the seed for reproducibility
set.seed(1)
# Split Training Compounds
subset_data <- rbind(class_data[which(class_data$ER.Activity %in% "Antagonist"), ], class_data[which(class_data$Preferred.Name %in% "Dicofol"), ], class_data[which(class_data$Preferred.Name %in%
"Diethylstilbestrol"), ], class_data[which(class_data$Preferred.Name %in% "Estrone"), ], class_data[which(class_data$Preferred.Name %in% "Fenarimol"),
], class_data[which(class_data$Preferred.Name %in% "o,p'-DDT"), ])
# Reduce Training Data to Numeric input-output
train_data <- data.frame(subset_data[, 1:5], Class.Info = subset_data$Class.Info)
# Assign remaining compunds to test/validation set
test_data <- class_data[-as.numeric(rownames(subset_data)), ]
test_data_numeric <- test_data[, 1:5]
test_data_class <- data.frame(test_data[, 1:5], Class.Info = test_data$Class.Info)
# Show the PCA analysis and the distribution of training and test sets over 2 principal components For this part we will be using the processed
# unscaled data and we will center and scale during within the PCA function.
pca_data <- processed_data[[5]]$clean_set[, col_select]
# Call PCA function using prcomp
experiment.pca <- prcomp(pca_data, center = TRUE, scale = TRUE)
# Calculating percentage proportion of each PC
percentage <- round(experiment.pca$sdev^2/sum(experiment.pca$sdev^2) * 100, 2)
# Save results into a data frame with class information
df_out <- as.data.frame(experiment.pca$x)
df_out$group2 <- rep("Agonist", times = dim(df_out)[1])
df_out$group2[as.numeric(rownames(class_data[which(class_data$ER.Activity %in% "Antagonist"), ]))] <- "Antagonist"
# For second type of visualization, mark the training and testing sets
df_out$group <- rep("Testing", times = dim(df_out)[1])
# Use training indices to remark the training rows
df_out$group[as.numeric(rownames(train_data))] <- "Training"
percentage <- paste(colnames(df_out), "(", paste(as.character(percentage), "% explained var.", ")", sep = ""))
# Visualizing PCA Analysis with biplots
ggplot(df_out, aes(x = PC1, y = PC2, color = group)) + geom_point() + stat_ellipse(level = 0.95) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
theme_bw() + xlab(percentage[1]) + ylab(percentage[2]) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = c(0.1,
0.1), legend.title = element_blank(), legend.background = element_blank()) + scale_color_manual(values = c("#E69F00", "dodgerblue3")) + xlim(-7, 5) +
ylim(-3.5, 4.5)
ggplot(df_out, aes(x = PC1, y = PC2, color = group2)) + geom_point() + stat_ellipse(level = 0.95) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
theme_bw() + xlab(percentage[1]) + ylab(percentage[2]) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = c(0.12,
0.1), legend.title = element_blank(), legend.background = element_blank()) + scale_color_manual(values = c("seagreen4", "mediumpurple3")) + xlim(-7,
5) + ylim(-3.5, 4.5)
# Create combinations of 4 chemicals out of 5. This is done to get equal sizes of agonist and antagonist vectors for the wilcoxon test. There are 5
# different combinations.
antagonist_group <- train_data[train_data$Class.Info == 1, ] # all antagonists are selected
agonist_Maingroup <- train_data[train_data$Class.Info == 0, ]
wilcox_p <- data.frame(pvalue = matrix(NA, nrow = 5, ncol = 1))
for (j in 1:(dim(train_data)[2] - 1)) {
wil_res <- wilcox.test(antagonist_group[, j], agonist_Maingroup[, j], paired = FALSE)
wilcox_p[j, ] <- wil_res$p.value
rownames(wilcox_p)[j] <- colnames(agonist_Maingroup)[j]
}
wilcox_p_round <- round(wilcox_p, 3)
pvalue | |
---|---|
Array.Area | 0.000 |
Array.PI.Variance | 0.000 |
Array.Mean.PI | 0.000 |
Array.Total.PI | 0.067 |
Array.to.Nucleoplasm.Intensity.Ratio | 0.000 |
# Fit Logistic Regression for Single Features (Total of 5 models)
cname_train <- colnames(train_data)[1:(length(colnames(train_data)) - 1)]
# Initialization
model_param <- NULL
train_accuracy <- NULL
AIC_logit <- NULL
cv_error <- NULL
cv_accuracy <- NULL
test_accuracy <- NULL
conf_interval <- NULL
confusion_test <- NULL
for (i in 1:length(cname_train)) {
fmla <- paste("Class.Info ~", cname_train[i])
fmla <- as.formula(fmla)
fit_logit <- glm(fmla, family = binomial(link = "logit"), data = train_data)
# Calculate Training Accuracy
predict_logit <- predict(fit_logit, type = "response")
confusion_train <- confusionMatrix(factor(round(predict_logit)), train_data$Class.Info, positive = "0", dnn = c("Predicted", "True"))
train_accuracy[i] <- round(as.numeric(confusion_train$overall[1]), 2)
AIC_logit[i] <- round(AIC(fit_logit), 2)
model_param[[i]] <- coef(fit_logit)
# Calculate 5-fold Cross Validation Error and Accuracy
cv_error[i] <- cv.glm(train_data, fit_logit, K = 5)$delta[1]
cv_accuracy[i] <- round(1 - cv_error[i], 2)
# Calculate Testing Accuracy
predict_test <- round(predict(fit_logit, test_data_numeric, type = "response"), 3)
confusion_test[[i]] <- confusionMatrix(factor(round(predict_test)), test_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
test_accuracy[i] <- round(as.numeric(confusion_test[[i]]$overall[1]), 2)
# Calculate the Confidence Intervals via Bootstrapping
out_boot <- boot(data = train_data, statistic = bs, R = 400, formula = fmla)
conf_interval[[i]] <- round(boot.ci(out_boot, type = "bca", index = 1 + 1)$bca, 2)
# Plotting the ROC curves for the training data set
color_list <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00", "#6A3D9A")
if (i == 1) {
# par(pty='s', mar=c(2,2,2,2))
roc(train_data$Class.Info, predict_logit, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Rate (1 - Specificity)", ylab = "True Positive Rate (Sensitivity)",
col = color_list[i], lwd = 4, print.auc = TRUE, asp = NA)
} else {
plot.roc(train_data$Class.Info, predict_logit, col = color_list[i], lwd = 4, print.auc = TRUE, add = TRUE, print.auc.y = 0.5 - 0.05 * (i - 1),
asp = NA)
}
legend("bottomright", legend = col_select, col = color_list, lwd = 2, cex = 0.65, box.lty = 0)
}
# Bind results into 1 data frame and sort with respect to AIC Values
results_logit <- as.data.frame(cbind(cname_train, AIC_logit, cv_accuracy = cv_accuracy, test_accuracy = test_accuracy))
results_logit <- results_logit[order(test_accuracy, decreasing = TRUE), ]
Training Results
Feature.Name | Beta.1 | CI.Lower | CI.Upper | AIC | CV.Accuracy | Testing.Accuracy |
---|---|---|---|---|---|---|
Array.to.Nucleoplasm.Intensity.Ratio | 7.12 | 5.49 | 7.59 | 4 | 1 | 0.96 |
Array.PI.Variance | 8.25 | 4.48 | 8.75 | 4 | 1 | 0.87 |
Array.Area | -0.65 | -0.70 | -0.59 | 4 | 1 | 0.87 |
Array.Mean.PI | 0.20 | 0.12 | 0.44 | 27.92 | 0.87 | 0.85 |
Array.Total.PI | -0.11 | -0.21 | -0.03 | 46.9 | 0.78 | 0.7 |
# Validate the best performing logistic regression model using other experimental replicates
validation_set <- temp_class
# Remove train/test replicate (index = 5)
validation_set[[5]] <- NULL
# Remove trained compounds from the set for unbiased estimate of prediction performance
subset_validation <- NULL
split_calculate <- NULL
valid_data_numeric <- NULL
valid_data_class <- NULL
for (i in 1:length(validation_set)) {
subset_validation[[i]] <- rbind(validation_set[[i]][which(validation_set[[i]]$ER.Activity %in% "Antagonist"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in%
"Dicofol"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in% "Diethylstilbestrol"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in%
"Estrone"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in% "Fenarimol"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in%
"o,p'-DDT"), ])
# Assign remaining compunds to test/validation set
split_calculate[[i]] <- validation_set[[i]][-as.numeric(rownames(subset_validation[[i]])), ]
# Separate numerics and categorical outputs in validation data
valid_data_numeric[[i]] <- split_calculate[[i]][, 1:5]
valid_data_class[[i]] <- data.frame(split_calculate[[i]][, 1:5], Class.Info = split_calculate[[i]]$Class.Info)
}
# Validate also with full data
valid_full_numeric <- NULL
valid_full_class <- NULL
for (i in 1:length(validation_set)) {
# Separate numerics and categorical outputs in validation data
valid_full_numeric[[i]] <- validation_set[[i]][, 1:5]
valid_full_class[[i]] <- data.frame(validation_set[[i]][, 1:5], Class.Info = validation_set[[i]]$Class.Info)
}
# Call fit and predict functions Array PI Variance
fmla <- paste("Class.Info ~", cname_train[2])
fmla <- as.formula(fmla)
fit_logit_PI_Var <- glm(fmla, family = binomial(link = "logit"), data = train_data)
temp_valid <- NULL
blind_PI_Var <- NULL
temp_valid2 <- NULL
full_PI_Var <- NULL
for (i in 1:length(validation_set)) {
temp_valid <- mperform(fit_logit_PI_Var, valid_data_numeric[[i]], valid_data_class[[i]], type = "logit")
blind_PI_Var <- rbind(blind_PI_Var, temp_valid)
temp_valid2 <- mperform(fit_logit_PI_Var, valid_full_numeric[[i]], valid_full_class[[i]], type = "logit")
full_PI_Var <- rbind(full_PI_Var, temp_valid2)
}
# Save results to csv file
write.csv(blind_PI_Var, file = "BlindValidation_Results_Array_PI_Variance_Logit.csv", row.names = FALSE)
write.csv(full_PI_Var, file = "FullValidation_Results_Array_PI_Variance_Logit.csv", row.names = FALSE)
# Array to Nucleoplasm Intensity Ratio
fmla <- paste("Class.Info ~", cname_train[5])
fmla <- as.formula(fmla)
fit_logit_NIR <- glm(fmla, family = binomial(link = "logit"), data = train_data)
temp_valid <- NULL
blind_NIR <- NULL
temp_valid2 <- NULL
full_NIR <- NULL
for (i in 1:length(validation_set)) {
temp_valid <- mperform(fit_logit_NIR, valid_data_numeric[[i]], valid_data_class[[i]], type = "logit")
blind_NIR <- rbind(blind_NIR, temp_valid)
temp_valid2 <- mperform(fit_logit_NIR, valid_full_numeric[[i]], valid_full_class[[i]], type = "logit")
full_NIR <- rbind(full_NIR, temp_valid2)
}
# Save results to csv file
write.csv(blind_NIR, file = "BlindValidation_Results_NIR_Logit.csv", row.names = FALSE)
write.csv(full_NIR, file = "FullValidation_Results_NIR_Logit.csv", row.names = FALSE)
Model validation results with 24 unseen agonist compounds for the Logistic Regression Model as a function of Array PI Variance.
Experiment.Number | Accuracy | CI.Lower | CI.Upper | Sensitivity |
---|---|---|---|---|
1 | 0.80 | 0.71 | 0.88 | 0.80 |
2 | 0.97 | 0.91 | 0.99 | 0.97 |
3 | 0.75 | 0.65 | 0.83 | 0.75 |
4 | 0.98 | 0.92 | 1.00 | 0.98 |
5 | 0.97 | 0.91 | 0.99 | 0.97 |
6 | 0.88 | 0.80 | 0.94 | 0.88 |
7 | 0.97 | 0.91 | 0.99 | 0.97 |
8 | 0.75 | 0.65 | 0.83 | 0.75 |
9 | 0.98 | 0.92 | 1.00 | 0.98 |
10 | 0.88 | 0.80 | 0.94 | 0.88 |
11 | 0.89 | 0.81 | 0.95 | 0.89 |
12 | 0.78 | 0.68 | 0.86 | 0.78 |
13 | 0.97 | 0.91 | 0.99 | 0.97 |
14 | 0.78 | 0.68 | 0.86 | 0.78 |
15 | 0.97 | 0.91 | 0.99 | 0.97 |
16 | 0.90 | 0.82 | 0.95 | 0.90 |
17 | 1.00 | 0.96 | 1.00 | 1.00 |
Experiment.Number | Accuracy | CI.Lower | CI.Upper | Sensitivity | Specificity | Balanced.Accuracy |
---|---|---|---|---|---|---|
1 | 0.84 | 0.77 | 0.90 | 0.82 | 1.00 | 0.91 |
2 | 0.93 | 0.87 | 0.97 | 0.97 | 0.62 | 0.80 |
3 | 0.80 | 0.72 | 0.86 | 0.77 | 1.00 | 0.88 |
4 | 0.94 | 0.88 | 0.97 | 0.98 | 0.62 | 0.80 |
5 | 0.95 | 0.89 | 0.98 | 0.97 | 0.75 | 0.86 |
6 | 0.91 | 0.85 | 0.96 | 0.90 | 1.00 | 0.95 |
7 | 0.92 | 0.86 | 0.96 | 0.97 | 0.56 | 0.77 |
8 | 0.80 | 0.72 | 0.86 | 0.77 | 1.00 | 0.88 |
9 | 0.94 | 0.88 | 0.97 | 0.98 | 0.62 | 0.80 |
10 | 0.91 | 0.84 | 0.95 | 0.90 | 0.94 | 0.92 |
11 | 0.89 | 0.82 | 0.94 | 0.91 | 0.75 | 0.83 |
12 | 0.82 | 0.74 | 0.88 | 0.79 | 1.00 | 0.90 |
13 | 0.93 | 0.87 | 0.97 | 0.97 | 0.62 | 0.80 |
14 | 0.82 | 0.74 | 0.88 | 0.79 | 1.00 | 0.90 |
15 | 0.93 | 0.87 | 0.97 | 0.97 | 0.62 | 0.80 |
16 | 0.91 | 0.85 | 0.96 | 0.92 | 0.88 | 0.90 |
17 | 0.97 | 0.92 | 0.99 | 1.00 | 0.75 | 0.88 |
set.seed(123)
fit_rf <- randomForest(formula = Class.Info ~ ., data = train_data, importance = TRUE, ntree = 130)
# Calculate Training Accuracy
predict_rf <- predict(fit_rf)
confusion_RF_train <- confusionMatrix(predict_rf, train_data$Class.Info, positive = "0", dnn = c("Predicted", "True"))
train_RF_accuracy <- round(as.numeric(confusion_RF_train$overall[1]), 2)
print(train_RF_accuracy)
## [1] 1
# Calculate Testing Accuracy
predict_rf_test <- predict(fit_rf, test_data_numeric)
confusion_RF_test <- confusionMatrix(predict_rf_test, test_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
test_RF_accuracy <- round(as.numeric(confusion_RF_test$overall[1]), 2)
print(test_RF_accuracy)
## [1] 0.93
# Calculate Mean Decrease in Gini Index
feature_imp <- as.data.frame(fit_rf$importance)
feature_rank <- as.matrix(feature_imp[order(feature_imp$MeanDecreaseGini, decreasing = T), 4])
rownames(feature_rank) <- rownames(feature_imp)[order(feature_imp$MeanDecreaseGini, decreasing = T)]
colnames(feature_rank) <- c("Mean Decrease in Gini Index")
write.csv(feature_rank, file = "Feauture_Ranking_RF.csv", row.names = TRUE)
Feature ranking as a result of Random Forest Classifier training
Mean Decrease in Gini Index | |
---|---|
Array.to.Nucleoplasm.Intensity.Ratio | 5.7874176 |
Array.Area | 5.1726496 |
Array.PI.Variance | 5.1491282 |
Array.Mean.PI | 0.9671319 |
Array.Total.PI | 0.1899120 |
temp_valid <- NULL
blind_RF <- NULL
temp_valid2 <- NULL
full_RF <- NULL
for (i in 1:length(validation_set)) {
temp_valid <- mperform(fit_rf, valid_data_numeric[[i]], valid_data_class[[i]], type = "rf")
blind_RF <- rbind(blind_RF, temp_valid)
temp_valid2 <- mperform(fit_rf, valid_full_numeric[[i]], valid_full_class[[i]], type = "rf")
full_RF <- rbind(full_RF, temp_valid2)
}
# Save results to csv file
write.csv(blind_RF, file = "BlindValidation_Results_RF.csv", row.names = FALSE)
write.csv(full_RF, file = "FullValidation_Results_RF.csv", row.names = FALSE)
Model validation results with 24 unseen agonist compounds
Experiment.Number | Accuracy | CI.Lower | CI.Upper | Sensitivity |
---|---|---|---|---|
1 | 0.95 | 0.88 | 0.98 | 0.95 |
2 | 1.00 | 0.96 | 1.00 | 1.00 |
3 | 0.96 | 0.89 | 0.99 | 0.96 |
4 | 1.00 | 0.96 | 1.00 | 1.00 |
5 | 1.00 | 0.96 | 1.00 | 1.00 |
6 | 0.97 | 0.91 | 0.99 | 0.97 |
7 | 1.00 | 0.96 | 1.00 | 1.00 |
8 | 0.96 | 0.89 | 0.99 | 0.96 |
9 | 1.00 | 0.96 | 1.00 | 1.00 |
10 | 0.97 | 0.91 | 0.99 | 0.97 |
11 | 1.00 | 0.96 | 1.00 | 1.00 |
12 | 1.00 | 0.96 | 1.00 | 1.00 |
13 | 1.00 | 0.96 | 1.00 | 1.00 |
14 | 0.97 | 0.91 | 0.99 | 0.97 |
15 | 1.00 | 0.96 | 1.00 | 1.00 |
16 | 0.96 | 0.89 | 0.99 | 0.96 |
17 | 1.00 | 0.96 | 1.00 | 1.00 |
Model validation results with all active compounds
Experiment.Number | Accuracy | CI.Lower | CI.Upper | Sensitivity | Specificity | Balanced.Accuracy |
---|---|---|---|---|---|---|
1 | 0.95 | 0.89 | 0.98 | 0.94 | 1.00 | 0.97 |
2 | 0.88 | 0.80 | 0.93 | 1.00 | 0.00 | 0.50 |
3 | 0.95 | 0.90 | 0.98 | 0.95 | 1.00 | 0.97 |
4 | 0.88 | 0.80 | 0.93 | 1.00 | 0.00 | 0.50 |
5 | 0.95 | 0.89 | 0.98 | 1.00 | 0.56 | 0.78 |
6 | 0.98 | 0.93 | 1.00 | 0.97 | 1.00 | 0.99 |
7 | 0.88 | 0.80 | 0.93 | 1.00 | 0.00 | 0.50 |
8 | 0.95 | 0.90 | 0.98 | 0.96 | 0.88 | 0.92 |
9 | 0.88 | 0.80 | 0.93 | 1.00 | 0.00 | 0.50 |
10 | 0.97 | 0.92 | 0.99 | 0.97 | 0.94 | 0.96 |
11 | 0.95 | 0.90 | 0.98 | 1.00 | 0.62 | 0.81 |
12 | 0.98 | 0.94 | 1.00 | 1.00 | 0.88 | 0.94 |
13 | 0.88 | 0.80 | 0.93 | 1.00 | 0.00 | 0.50 |
14 | 0.95 | 0.90 | 0.98 | 0.96 | 0.94 | 0.95 |
15 | 0.89 | 0.82 | 0.94 | 1.00 | 0.12 | 0.56 |
16 | 0.95 | 0.90 | 0.98 | 0.96 | 0.88 | 0.92 |
17 | 0.91 | 0.85 | 0.96 | 1.00 | 0.31 | 0.66 |
Visualizing model performance metrics with all active compounds on a boxplot.
# Append model names to data frames
full_NIR$Model <- rep("Logistic Regression 'Array to Nucleoplasm Intensity Ratio'", times = dim(full_NIR)[1])
full_PI_Var$Model <- rep("Logistic Regression 'Array PI Variance'", times = dim(full_PI_Var)[1])
full_RF$Model <- rep("Random Forest", times = dim(full_RF)[1])
# Merge all results
merged_data <- rbind(full_PI_Var, full_NIR, full_RF)
# Drop confidence intervals
merged_data$CI.Lower <- NULL
merged_data$CI.Upper <- NULL
colnames(merged_data)[4] <- "Balanced Accuracy"
# Melt data for plotting with respect to model type
melted_data <- melt(merged_data, id.vars = "Model")
ggplot(melted_data, aes(x = variable, y = value, fill = Model)) + geom_boxplot() + theme_bw() + theme(panel.grid.minor = element_blank(), axis.title.x = element_blank(),
legend.justification = c("left", "bottom"), legend.position = c(0.03, 0.05), legend.text = element_text(size = 7.5), legend.title = element_blank()) +
scale_fill_brewer(palette = "Blues") + labs(y = "Model Performance")
Model performance metrics after data quality monitoring.
# Removing high-density experimental replicates from the analysis
full_NIR_red <- full_NIR[-c(2, 4, 5, 7, 9, 11, 12, 13, 15, 17), ]
full_PI_Var_red <- full_PI_Var[-c(2, 4, 5, 7, 9, 11, 12, 13, 15, 17), ]
full_RF_red <- full_RF[-c(2, 4, 5, 7, 9, 11, 12, 13, 15, 17), ]
# Merge all results
merged_data2 <- rbind(full_PI_Var_red, full_NIR_red, full_RF_red)
# Drop confidence intervals
merged_data2$CI.Lower <- NULL
merged_data2$CI.Upper <- NULL
colnames(merged_data2)[4] <- "Balanced Accuracy"
# Melt data for plotting with respect to model type
melted_data2 <- melt(merged_data2, id.vars = "Model")
ggplot(melted_data2, aes(x = variable, y = value, fill = Model)) + geom_boxplot() + theme_bw() + theme(panel.grid.minor = element_blank(), axis.title.x = element_blank(),
legend.justification = c("left", "bottom"), legend.position = c(0.03, 0.05), legend.text = element_text(size = 7.5), legend.title = element_blank()) +
scale_fill_brewer(palette = "Reds") + ylim(0, 1) + labs(y = "Model Performance")
agonists <- NULL
for (i in 1:length(temp_class)) {
agonists[[i]] <- temp_class[[i]][which(temp_class[[i]]$Class.Info == 0), ]
}
# Calculate Hellinger Distance between Agonists and Antagonists for Array PI Variance
hdist_PI_Var <- NULL
par(mfrow = c(3, 6), cex.axis = 2, cex.lab = 2, mar = c(5, 5, 5, 3))
for (i in 1:length(agonists)) {
hdist_PI_Var[[i]] <- round(hellinger(agonists[[i]]$Array.PI.Variance, antagonists[[i]]$Array.PI.Variance, -Inf, Inf, method = 1), 2)
sm.density.compare(temp_class[[i]]$Array.PI.Variance, temp_class[[i]]$Class.Info, xlab = paste("Exp", i, ", HD=", hdist_PI_Var[[i]], sep = ""), lty = c(1,
1), lwd = c(2, 2), col = c("dodgerblue3", "red3"))
}
# Calculate Hellinger Distance between Agonists and Antagonists for Array to Nucleoplasm Intensity Ratio
hdist_NIR <- NULL
par(mfrow = c(3, 6), cex.axis = 2, cex.lab = 2, mar = c(5, 5, 5, 3))
for (i in 1:length(agonists)) {
hdist_NIR[[i]] <- round(hellinger(agonists[[i]]$Array.to.Nucleoplasm.Intensity.Ratio, antagonists[[i]]$Array.to.Nucleoplasm.Intensity.Ratio, -Inf,
Inf, method = 1), 2)
sm.density.compare(temp_class[[i]]$Array.to.Nucleoplasm.Intensity.Ratio, temp_class[[i]]$Class.Info, xlab = paste("Exp", i, ", HD=", hdist_NIR[[i]],
sep = ""), lty = c(1, 1), lwd = c(2, 2), col = c("dodgerblue3", "red3"))
}
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pROC_1.16.2 reshape_0.8.8 formatR_1.7 kableExtra_1.1.0 sm_2.2-5.6 statip_0.2.3 boot_1.3-24 caret_6.0-86 lattice_0.20-41 randomForest_4.6-14
## [11] ggplot2_3.3.0 ggdendro_0.1-20 knitr_1.28 dendextend_1.13.4
##
## loaded via a namespace (and not attached):
## [1] viridis_0.5.1 httr_1.4.1 viridisLite_0.3.0 splines_4.0.0 foreach_1.5.0 prodlim_2019.11.13 assertthat_0.2.1 highr_0.8 stats4_4.0.0 yaml_2.2.1
## [11] ipred_0.9-9 pillar_1.4.4 glue_1.4.0 digest_0.6.25 RColorBrewer_1.1-2 rvest_0.3.5 colorspace_1.4-1 recipes_0.1.12 htmltools_0.4.0 Matrix_1.2-18
## [21] plyr_1.8.6 timeDate_3043.102 pkgconfig_2.0.3 purrr_0.3.4 scales_1.1.0 webshot_0.5.2 gower_0.2.1 lava_1.6.7 tibble_3.0.1 farver_2.0.3
## [31] generics_0.0.2 ellipsis_0.3.0 withr_2.2.0 nnet_7.3-13 survival_3.1-12 magrittr_1.5 crayon_1.3.4 evaluate_0.14 nlme_3.1-147 MASS_7.3-51.5
## [41] xml2_1.3.2 class_7.3-16 tools_4.0.0 data.table_1.12.8 hms_0.5.3 lifecycle_0.2.0 stringr_1.4.0 munsell_0.5.0 cluster_2.1.0 e1071_1.7-3
## [51] compiler_4.0.0 rlang_0.4.6 grid_4.0.0 iterators_1.0.12 rstudioapi_0.11 labeling_0.3 tcltk_4.0.0 rmarkdown_2.1 gtable_0.3.0 ModelMetrics_1.2.2.2
## [61] codetools_0.2-16 reshape2_1.4.4 R6_2.4.1 gridExtra_2.3 lubridate_1.7.8 dplyr_0.8.5 clue_0.3-57 readr_1.3.1 stringi_1.4.6 Rcpp_1.0.4.6
## [71] vctrs_0.2.4 rpart_4.1-15 tidyselect_1.0.0 xfun_0.13