Load Required Libraries

Required Packages: “dendextend”,“ggplot2”,“ggdendro”,“caret”,“boot”,“randomForest”,“statip”,“sm”,“reshape”,“pROC”,“stats”. Their dependencies may also be required.

Pre-Processing Functions

This part of the code reads in the name files and pre-processes the experimental dataset. The first pre-processing function will remove redundant columns, inactive compounds, technical replicates of controls and media.

data_names <- read.csv("./Reference_ER_Activity.csv", header = T)
feature_names <- read.csv("./Feature_Names.csv", header = T)

ERData_process <- function(data_set) {
    # Order Data Sets based on compound ID
    data_set <- data_set[order(data_set$compound, decreasing = F), ]
    
    # Drop columns with channel 00
    data_set <- data_set[, -grep("ch00", colnames(data_set))]
    
    # Separate out Negative Control - Media
    media <- data_set[which(data_set$compound %in% "DMSO"), ]
    data_set <- data_set[-which(data_set$compound %in% "DMSO"), ]
    
    # Separate out Positive Control - E2 (Estradiol)
    e2 <- data_set[which(data_set$compound %in% "E2"), ]
    data_set <- data_set[-which(data_set$compound %in% "E2"), ]
    
    # Separate out 4HT
    ht4 <- data_set[which(data_set$compound %in% "4HT"), ]
    data_set <- data_set[-which(data_set$compound %in% "4HT"), ]
    
    # Remove Inactive Compounds
    a02 <- data_set[which(data_set$compound %in% "A02"), ]
    data_set <- data_set[-which(data_set$compound %in% "A02"), ]
    
    a03 <- data_set[which(data_set$compound %in% "A03"), ]
    data_set <- data_set[-which(data_set$compound %in% "A03"), ]
    
    b06 <- data_set[which(data_set$compound %in% "B06"), ]
    data_set <- data_set[-which(data_set$compound %in% "B06"), ]
    
    c02 <- data_set[which(data_set$compound %in% "C02"), ]
    data_set <- data_set[-which(data_set$compound %in% "C02"), ]
    
    c05 <- data_set[which(data_set$compound %in% "C05"), ]
    data_set <- data_set[-which(data_set$compound %in% "C05"), ]
    
    c06 <- data_set[which(data_set$compound %in% "C06"), ]
    data_set <- data_set[-which(data_set$compound %in% "C06"), ]
    
    d03 <- data_set[which(data_set$compound %in% "D03"), ]
    data_set <- data_set[-which(data_set$compound %in% "D03"), ]
    
    d06 <- data_set[which(data_set$compound %in% "D06"), ]
    data_set <- data_set[-which(data_set$compound %in% "D06"), ]
    
    e03 <- data_set[which(data_set$compound %in% "E03"), ]
    data_set <- data_set[-which(data_set$compound %in% "E03"), ]
    
    f01 <- data_set[which(data_set$compound %in% "F01"), ]
    data_set <- data_set[-which(data_set$compound %in% "F01"), ]
    
    f02 <- data_set[which(data_set$compound %in% "F02"), ]
    data_set <- data_set[-which(data_set$compound %in% "F02"), ]
    
    h05 <- data_set[which(data_set$compound %in% "H05"), ]
    data_set <- data_set[-which(data_set$compound %in% "H05"), ]
    
    g02 <- data_set[which(data_set$compound %in% "G02"), ]
    data_set <- data_set[-which(data_set$compound %in% "G02"), ]
    
    # Merge masked compund IDs with correct names
    expand_names <- data.frame(matrix(NA, ncol = 2, nrow = dim(data_set)[1]))
    for (j in 1:length(data_names$Short_ID)) {
        for (i in 1:dim(data_set)[1]) {
            if (data_names$Short_ID[j] == data_set$compound[i]) {
                expand_names[i, 1] <- toString(data_names$Preferred_Name[j])
                expand_names[i, 2] <- toString(data_names$ER.Activity[j])
            }
        }
    }
    colnames(expand_names) <- c("Preferred.Name", "ER.Activity")
    
    # Create a new data frame with experimental features only
    clean_set <- data_set[, -(1:6)]
    media_set <- media[, -(1:6)]
    e2_set <- e2[, -(1:6)]
    
    for (k in 1:length(colnames(clean_set))) {
        if (colnames(clean_set)[k] == feature_names$Original.Feature.Names[k]) {
            colnames(clean_set)[k] <- as.character(feature_names$Even.Reduced.Names[k])
        }
    }
    
    return(list(clean_set = clean_set, expand_names = expand_names, e2_set = e2_set, media_set = media_set))
}

Data Normalization

This function is used to normalize the experimental datasets before model building.

# Data Normalization with respect to mean absolute deviation
normalize_mad <- function(clean_set, e2_set) {
    mad_e2 <- NULL
    bm_e2 <- data.frame(matrix(NA, nrow = dim(e2_set)[1], ncol = dim(clean_set)[2]))
    for (i in 1:dim(e2_set)[1]) {
        for (j in 1:dim(e2_set)[2]) {
            bm_e2[i, j] <- abs(e2_set[i, j] - mean(e2_set[, j]))
            mad_e2[j] <- mean(bm_e2[, j])
        }
    }
    
    df_mad <- data.frame(matrix(NA, nrow = dim(clean_set)[1], ncol = dim(clean_set)[2]))
    for (i in 1:dim(clean_set)[1]) {
        for (j in 1:dim(clean_set)[2]) {
            df_mad[i, j] <- (clean_set[i, j] - median(e2_set[, j]))/mad_e2[j]
        }
    }
    colnames(df_mad) <- colnames(clean_set)
    return(df_mad)
}

Confidence Interval Calculations in Logistic Regression

This function is used in the confidence interval calculations of the logistic regression model parameters.

bs <- function(formula, data, indices) {
    d <- data[indices, ]  # allows boot to select sample
    fit <- glm(formula, family = binomial(link = "logit"), data = d)
    return(coef(fit))
}

Model Performance Calculations

This function is used to calculate model performance metrics; accuracy, sensitivity, specificity, balanced accuracy and the 95% confidence intervals for the prediction accuracy.

mperform <- function(fit_model, valid_data_numeric, valid_data_class, type = "logit") {
    if (type == "logit") {
        predict_test <- round(predict(fit_model, valid_data_numeric, type = "response"), 3)
        conf_valid <- confusionMatrix(factor(round(predict_test)), valid_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
        Accuracy <- round(as.numeric(conf_valid$overall[1]), 2)
        Sensitivity <- round(as.numeric(conf_valid$byClass[1]), 2)
        Specificity <- round(as.numeric(conf_valid$byClass[2]), 2)
        BA <- round(as.numeric(conf_valid$byClass[11]), 2)
        CI_Lower <- round(as.numeric(conf_valid$overall[3]), 2)
        CI_Upper <- round(as.numeric(conf_valid$overall[4]), 2)
        
    } else if (type == "rf") {
        predict_test <- predict(fit_model, valid_data_numeric)
        conf_valid <- confusionMatrix(predict_test, valid_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
        Accuracy <- round(as.numeric(conf_valid$overall[1]), 2)
        Sensitivity <- round(as.numeric(conf_valid$byClass[1]), 2)
        Specificity <- round(as.numeric(conf_valid$byClass[2]), 2)
        BA <- round(as.numeric(conf_valid$byClass[11]), 2)
        CI_Lower <- round(as.numeric(conf_valid$overall[3]), 2)
        CI_Upper <- round(as.numeric(conf_valid$overall[4]), 2)
    }
    summary.res <- data.frame(Accuracy = Accuracy, CI.Lower = CI_Lower, CI.Upper = CI_Upper, Sensitivity = Sensitivity, Specificity = Specificity, Balanced.Accuracy = BA)
    return(summary.res)
}

Reading Experimental Datasets

data_sets <- NULL
for (j in 1:6) {
    data_sets[[j]] <- read.csv(paste("./EPA_Reps_CSV/20190531A_Rep_", (j + 3), "_Well_Data_updated_cmpnd.csv", sep = ""), header = T)
}

for (j in 7:12) {
    data_sets[[j]] <- read.csv(paste("./EPA_Reps_CSV/20190531B_Rep_", (j + 3), "_Well_Data_updated_cmpnd.csv", sep = ""), header = T)
}

for (j in 13:18) {
    data_sets[[j]] <- read.csv(paste("./EPA_Reps_CSV/20190531C_Rep_", (j + 3), "_Well_Data_updated_cmpnd.csv", sep = ""), header = T)
}

Data Pre-processing

Check Missing Data

for (i in 1:length(data_sets)) {
    print(sum(is.na(data_sets[[i]]$clean_set)))
}

## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0

Data Cleaning

Call the pre-processing function for data cleaning.

processed_data <- NULL
for (i in 1:length(data_sets)) {
    processed_data[[i]] <- ERData_process(data_sets[[i]])
}
# Replicate 8 (index =5) is selected for analysis
new_frame <- data.frame(processed_data[[5]]$clean_set, Preferred.Name = processed_data[[5]]$expand_names$Preferred.Name)

First 10 rows and 5 columns of the processed experimental data used for analysis
Cell.Area	Cell.Circularity	Nucleus.Area	Nucleus.Circularity	Fraction.Localized.In.Nucleus
7591.96	0.953436	2096.64	1.13147	0.832444
7570.25	0.949877	2031.14	1.13205	0.824702
6815.69	0.957420	2008.79	1.14522	0.834381
6944.24	0.956784	2009.98	1.14160	0.844161
7222.53	0.946303	2117.25	1.13752	0.811180
6582.49	0.937830	2077.13	1.13671	0.824058
6742.16	0.944760	2202.86	1.14404	0.835256
6920.76	0.939669	2124.64	1.14398	0.819195
6848.19	0.949418	2019.10	1.15171	0.784999
7118.88	0.947879	2140.21	1.15036	0.776418

Outlier Detection

# Average biological replicates raw biological replicates for outlier detection
aggregated_res <- aggregate(new_frame[, -dim(new_frame)[2]], by = list(new_frame$Preferred.Name), FUN = mean)
rownames(aggregated_res) <- aggregated_res$Group.1

aggregate_numeric <- aggregated_res[, -1]
# Cluster results and visualize it with a dendrogram
hc <- hclust(dist(aggregate_numeric, method = "euclidean"), method = "complete")
dd.row <- as.dendrogram(hc)
ddata_x <- dendro_data(dd.row, type = "rectangle")

# Vertical Dendrogram
ggplot(segment(ddata_x)) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + geom_text(data = label(ddata_x), aes(label = label, x = x, y = 0), 
    hjust = 0, nudge_y = max(segment(ddata_x)$y) * 0.01, size = 5) + labs(x = "", y = "") + coord_flip() + scale_y_reverse(expand = c(1, 1)) + theme(legend.position = "none", 
    axis.text.y = element_blank(), axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), panel.background = element_rect(fill = "white"), 
    panel.grid = element_blank(), axis.text.x = element_blank(), axis.title.x = element_blank())

Data Normalization

Call the data normalization function to normalize the active chemical compounds prior to classification analysis.

scale_to_mad <- NULL
clear_active <- NULL
for (i in 1:length(data_sets)) {
    clear_active[[i]] <- normalize_mad(processed_data[[i]]$clean_set, processed_data[[i]]$e2_set)
}

First 10 rows and 5 columns of the normalized clean experimental data used for analysis
Cell Area	Cell Circularity	Nucleus Area	Nucleus Circularity	Fraction Localized In Nucleus
5.0832966	2.2867407	2.5587224	-11.660448	1.7918457
4.9826639	1.3568909	-0.3469003	-11.227612	0.7760685
1.4850395	3.3276290	-1.3383609	-1.399254	2.0459868
2.0809094	3.1614631	-1.2855717	-4.100746	3.3291567
3.3708717	0.4231221	3.4729955	-7.145522	-0.9980647
0.4040837	-1.7905944	1.6932461	-7.750000	0.6915735
1.1442047	0.0199869	7.2707109	-2.279851	2.1607898
1.9720722	-1.3101241	3.8008207	-2.324627	0.0535310
1.6356873	1.2369693	-0.8810026	3.444030	-4.4331026
2.8904211	0.8348792	4.4915160	2.436567	-5.5589596

Feature Selection

similarity_pearson <- data.frame(matrix(NA, nrow = dim(clear_active[[5]])[2], ncol = dim(clear_active[[5]])[2]))
for (i in 1:dim(clear_active[[5]])[2]) {
    for (j in 1:dim(clear_active[[5]])[2]) {
        similarity_pearson[i, j] <- cor(as.numeric(as.vector(clear_active[[5]][, i])), as.numeric(as.vector(clear_active[[5]][, j])), method = "pearson")
    }
}
rownames(similarity_pearson) <- colnames(similarity_pearson) <- colnames(clear_active[[5]])

hc1 <- hclust(as.dist(1 - abs(similarity_pearson)), method = "complete")
dend <- as.dendrogram(hc1)
par(mar = c(5, 5, 5, 15))
plot(dend, horiz = TRUE, xlab = "Height")
abline(v = max(hc1$height) * 0.05, col = "red")

# Cut tree to 5% similarity
feat_sel <- cutree(hc1, h = max(hc1$height) * 0.05)

# Select the topmost bilogically relevant features from independent clusters and reduce the data matrix size
col_select <- c("Array Area", "Array PI Variance", "Array Mean PI", "Array Total PI", "Array to Nucleoplasm Intensity Ratio")
reduced_data <- NULL
temp_class <- NULL
antagonists <- NULL

for (i in 1:length(data_sets)) {
    reduced_data[[i]] <- cbind(clear_active[[i]][, col_select], processed_data[[i]]$expand_names)  # Append compound names and ER Activity
    temp_class[[i]] <- data.frame(reduced_data[[i]], Class.Info = 0)  # Append Class info 0 Agonist, +1 Antagonist
    antagonists[[i]] <- temp_class[[i]][which(temp_class[[i]]$ER.Activity %in% "Antagonist"), ]
    temp_class[[i]]$Class.Info[as.numeric(rownames(antagonists[[i]]))] <- 1
    temp_class[[i]]$Class.Info <- as.factor(temp_class[[i]]$Class.Info)  # Make class info factor
}
class_data <- temp_class[[5]]

Classification Model Training

# Setting the seed for reproducibility
set.seed(1)
# Split Training Compounds
subset_data <- rbind(class_data[which(class_data$ER.Activity %in% "Antagonist"), ], class_data[which(class_data$Preferred.Name %in% "Dicofol"), ], class_data[which(class_data$Preferred.Name %in% 
    "Diethylstilbestrol"), ], class_data[which(class_data$Preferred.Name %in% "Estrone"), ], class_data[which(class_data$Preferred.Name %in% "Fenarimol"), 
    ], class_data[which(class_data$Preferred.Name %in% "o,p'-DDT"), ])

# Reduce Training Data to Numeric input-output
train_data <- data.frame(subset_data[, 1:5], Class.Info = subset_data$Class.Info)

# Assign remaining compunds to test/validation set
test_data <- class_data[-as.numeric(rownames(subset_data)), ]

test_data_numeric <- test_data[, 1:5]
test_data_class <- data.frame(test_data[, 1:5], Class.Info = test_data$Class.Info)

Data Visualization with PCA

# Show the PCA analysis and the distribution of training and test sets over 2 principal components For this part we will be using the processed
# unscaled data and we will center and scale during within the PCA function.

pca_data <- processed_data[[5]]$clean_set[, col_select]

# Call PCA function using prcomp
experiment.pca <- prcomp(pca_data, center = TRUE, scale = TRUE)

# Calculating percentage proportion of each PC
percentage <- round(experiment.pca$sdev^2/sum(experiment.pca$sdev^2) * 100, 2)
# Save results into a data frame with class information
df_out <- as.data.frame(experiment.pca$x)
df_out$group2 <- rep("Agonist", times = dim(df_out)[1])
df_out$group2[as.numeric(rownames(class_data[which(class_data$ER.Activity %in% "Antagonist"), ]))] <- "Antagonist"

# For second type of visualization, mark the training and testing sets
df_out$group <- rep("Testing", times = dim(df_out)[1])

# Use training indices to remark the training rows
df_out$group[as.numeric(rownames(train_data))] <- "Training"

percentage <- paste(colnames(df_out), "(", paste(as.character(percentage), "% explained var.", ")", sep = ""))

# Visualizing PCA Analysis with biplots
ggplot(df_out, aes(x = PC1, y = PC2, color = group)) + geom_point() + stat_ellipse(level = 0.95) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + 
    theme_bw() + xlab(percentage[1]) + ylab(percentage[2]) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = c(0.1, 
    0.1), legend.title = element_blank(), legend.background = element_blank()) + scale_color_manual(values = c("#E69F00", "dodgerblue3")) + xlim(-7, 5) + 
    ylim(-3.5, 4.5)

ggplot(df_out, aes(x = PC1, y = PC2, color = group2)) + geom_point() + stat_ellipse(level = 0.95) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + 
    theme_bw() + xlab(percentage[1]) + ylab(percentage[2]) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = c(0.12, 
    0.1), legend.title = element_blank(), legend.background = element_blank()) + scale_color_manual(values = c("seagreen4", "mediumpurple3")) + xlim(-7, 
    5) + ylim(-3.5, 4.5)

Wilcoxon Rank Sum Test

# Create combinations of 4 chemicals out of 5. This is done to get equal sizes of agonist and antagonist vectors for the wilcoxon test. There are 5
# different combinations.
antagonist_group <- train_data[train_data$Class.Info == 1, ]  # all antagonists are selected
agonist_Maingroup <- train_data[train_data$Class.Info == 0, ]

wilcox_p <- data.frame(pvalue = matrix(NA, nrow = 5, ncol = 1))
for (j in 1:(dim(train_data)[2] - 1)) {
    wil_res <- wilcox.test(antagonist_group[, j], agonist_Maingroup[, j], paired = FALSE)
    wilcox_p[j, ] <- wil_res$p.value
    rownames(wilcox_p)[j] <- colnames(agonist_Maingroup)[j]
}

wilcox_p_round <- round(wilcox_p, 3)

P-values reported from the Wilcoxon rank sum test
	pvalue
Array.Area	0.000
Array.PI.Variance	0.000
Array.Mean.PI	0.000
Array.Total.PI	0.067
Array.to.Nucleoplasm.Intensity.Ratio	0.000

Logistic Regression

Model Training

# Fit Logistic Regression for Single Features (Total of 5 models)
cname_train <- colnames(train_data)[1:(length(colnames(train_data)) - 1)]

# Initialization
model_param <- NULL
train_accuracy <- NULL
AIC_logit <- NULL
cv_error <- NULL
cv_accuracy <- NULL
test_accuracy <- NULL
conf_interval <- NULL
confusion_test <- NULL

for (i in 1:length(cname_train)) {
    fmla <- paste("Class.Info ~", cname_train[i])
    fmla <- as.formula(fmla)
    fit_logit <- glm(fmla, family = binomial(link = "logit"), data = train_data)
    
    # Calculate Training Accuracy
    predict_logit <- predict(fit_logit, type = "response")
    confusion_train <- confusionMatrix(factor(round(predict_logit)), train_data$Class.Info, positive = "0", dnn = c("Predicted", "True"))
    train_accuracy[i] <- round(as.numeric(confusion_train$overall[1]), 2)
    AIC_logit[i] <- round(AIC(fit_logit), 2)
    model_param[[i]] <- coef(fit_logit)
    
    # Calculate 5-fold Cross Validation Error and Accuracy
    cv_error[i] <- cv.glm(train_data, fit_logit, K = 5)$delta[1]
    cv_accuracy[i] <- round(1 - cv_error[i], 2)
    
    # Calculate Testing Accuracy
    predict_test <- round(predict(fit_logit, test_data_numeric, type = "response"), 3)
    confusion_test[[i]] <- confusionMatrix(factor(round(predict_test)), test_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
    test_accuracy[i] <- round(as.numeric(confusion_test[[i]]$overall[1]), 2)
    
    # Calculate the Confidence Intervals via Bootstrapping
    out_boot <- boot(data = train_data, statistic = bs, R = 400, formula = fmla)
    conf_interval[[i]] <- round(boot.ci(out_boot, type = "bca", index = 1 + 1)$bca, 2)
    
    # Plotting the ROC curves for the training data set
    color_list <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00", "#6A3D9A")
    if (i == 1) {
        # par(pty='s', mar=c(2,2,2,2))
        roc(train_data$Class.Info, predict_logit, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Rate (1 - Specificity)", ylab = "True Positive Rate (Sensitivity)", 
            col = color_list[i], lwd = 4, print.auc = TRUE, asp = NA)
    } else {
        plot.roc(train_data$Class.Info, predict_logit, col = color_list[i], lwd = 4, print.auc = TRUE, add = TRUE, print.auc.y = 0.5 - 0.05 * (i - 1), 
            asp = NA)
    }
    legend("bottomright", legend = col_select, col = color_list, lwd = 2, cex = 0.65, box.lty = 0)
}

# Bind results into 1 data frame and sort with respect to AIC Values
results_logit <- as.data.frame(cbind(cname_train, AIC_logit, cv_accuracy = cv_accuracy, test_accuracy = test_accuracy))
results_logit <- results_logit[order(test_accuracy, decreasing = TRUE), ]

Training Results

Feature.Name	Beta.1	CI.Lower	CI.Upper	AIC	CV.Accuracy	Testing.Accuracy
Array.to.Nucleoplasm.Intensity.Ratio	7.12	5.49	7.59	4	1	0.96
Array.PI.Variance	8.25	4.48	8.75	4	1	0.87
Array.Area	-0.65	-0.70	-0.59	4	1	0.87
Array.Mean.PI	0.20	0.12	0.44	27.92	0.87	0.85
Array.Total.PI	-0.11	-0.21	-0.03	46.9	0.78	0.7

Model Validation

# Validate the best performing logistic regression model using other experimental replicates
validation_set <- temp_class
# Remove train/test replicate (index = 5)
validation_set[[5]] <- NULL

# Remove trained compounds from the set for unbiased estimate of prediction performance
subset_validation <- NULL
split_calculate <- NULL
valid_data_numeric <- NULL
valid_data_class <- NULL

for (i in 1:length(validation_set)) {
    subset_validation[[i]] <- rbind(validation_set[[i]][which(validation_set[[i]]$ER.Activity %in% "Antagonist"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in% 
        "Dicofol"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in% "Diethylstilbestrol"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in% 
        "Estrone"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in% "Fenarimol"), ], validation_set[[i]][which(validation_set[[i]]$Preferred.Name %in% 
        "o,p'-DDT"), ])
    
    # Assign remaining compunds to test/validation set
    split_calculate[[i]] <- validation_set[[i]][-as.numeric(rownames(subset_validation[[i]])), ]
    
    # Separate numerics and categorical outputs in validation data
    valid_data_numeric[[i]] <- split_calculate[[i]][, 1:5]
    valid_data_class[[i]] <- data.frame(split_calculate[[i]][, 1:5], Class.Info = split_calculate[[i]]$Class.Info)
}

# Validate also with full data
valid_full_numeric <- NULL
valid_full_class <- NULL

for (i in 1:length(validation_set)) {
    # Separate numerics and categorical outputs in validation data
    valid_full_numeric[[i]] <- validation_set[[i]][, 1:5]
    valid_full_class[[i]] <- data.frame(validation_set[[i]][, 1:5], Class.Info = validation_set[[i]]$Class.Info)
}

# Call fit and predict functions Array PI Variance

fmla <- paste("Class.Info ~", cname_train[2])
fmla <- as.formula(fmla)
fit_logit_PI_Var <- glm(fmla, family = binomial(link = "logit"), data = train_data)

temp_valid <- NULL
blind_PI_Var <- NULL
temp_valid2 <- NULL
full_PI_Var <- NULL
for (i in 1:length(validation_set)) {
    temp_valid <- mperform(fit_logit_PI_Var, valid_data_numeric[[i]], valid_data_class[[i]], type = "logit")
    blind_PI_Var <- rbind(blind_PI_Var, temp_valid)
    
    temp_valid2 <- mperform(fit_logit_PI_Var, valid_full_numeric[[i]], valid_full_class[[i]], type = "logit")
    full_PI_Var <- rbind(full_PI_Var, temp_valid2)
}

# Save results to csv file
write.csv(blind_PI_Var, file = "BlindValidation_Results_Array_PI_Variance_Logit.csv", row.names = FALSE)
write.csv(full_PI_Var, file = "FullValidation_Results_Array_PI_Variance_Logit.csv", row.names = FALSE)

# Array to Nucleoplasm Intensity Ratio
fmla <- paste("Class.Info ~", cname_train[5])
fmla <- as.formula(fmla)
fit_logit_NIR <- glm(fmla, family = binomial(link = "logit"), data = train_data)

temp_valid <- NULL
blind_NIR <- NULL
temp_valid2 <- NULL
full_NIR <- NULL

for (i in 1:length(validation_set)) {
    temp_valid <- mperform(fit_logit_NIR, valid_data_numeric[[i]], valid_data_class[[i]], type = "logit")
    blind_NIR <- rbind(blind_NIR, temp_valid)
    
    temp_valid2 <- mperform(fit_logit_NIR, valid_full_numeric[[i]], valid_full_class[[i]], type = "logit")
    full_NIR <- rbind(full_NIR, temp_valid2)
}

# Save results to csv file
write.csv(blind_NIR, file = "BlindValidation_Results_NIR_Logit.csv", row.names = FALSE)
write.csv(full_NIR, file = "FullValidation_Results_NIR_Logit.csv", row.names = FALSE)

Model validation results with 24 unseen agonist compounds for the Logistic Regression Model as a function of Array PI Variance.

Experiment.Number	Accuracy	CI.Lower	CI.Upper	Sensitivity
1	0.80	0.71	0.88	0.80
2	0.97	0.91	0.99	0.97
3	0.75	0.65	0.83	0.75
4	0.98	0.92	1.00	0.98
5	0.97	0.91	0.99	0.97
6	0.88	0.80	0.94	0.88
7	0.97	0.91	0.99	0.97
8	0.75	0.65	0.83	0.75
9	0.98	0.92	1.00	0.98
10	0.88	0.80	0.94	0.88
11	0.89	0.81	0.95	0.89
12	0.78	0.68	0.86	0.78
13	0.97	0.91	0.99	0.97
14	0.78	0.68	0.86	0.78
15	0.97	0.91	0.99	0.97
16	0.90	0.82	0.95	0.90
17	1.00	0.96	1.00	1.00

Model validation results with all active compounds for the Logistic Regression Model as a function of Array PI Variance.

Experiment.Number	Accuracy	CI.Lower	CI.Upper	Sensitivity	Specificity	Balanced.Accuracy
1	0.84	0.77	0.90	0.82	1.00	0.91
2	0.93	0.87	0.97	0.97	0.62	0.80
3	0.80	0.72	0.86	0.77	1.00	0.88
4	0.94	0.88	0.97	0.98	0.62	0.80
5	0.95	0.89	0.98	0.97	0.75	0.86
6	0.91	0.85	0.96	0.90	1.00	0.95
7	0.92	0.86	0.96	0.97	0.56	0.77
8	0.80	0.72	0.86	0.77	1.00	0.88
9	0.94	0.88	0.97	0.98	0.62	0.80
10	0.91	0.84	0.95	0.90	0.94	0.92
11	0.89	0.82	0.94	0.91	0.75	0.83
12	0.82	0.74	0.88	0.79	1.00	0.90
13	0.93	0.87	0.97	0.97	0.62	0.80
14	0.82	0.74	0.88	0.79	1.00	0.90
15	0.93	0.87	0.97	0.97	0.62	0.80
16	0.91	0.85	0.96	0.92	0.88	0.90
17	0.97	0.92	0.99	1.00	0.75	0.88

Random Forest Classifier

Model Training

set.seed(123)

fit_rf <- randomForest(formula = Class.Info ~ ., data = train_data, importance = TRUE, ntree = 130)

# Calculate Training Accuracy
predict_rf <- predict(fit_rf)
confusion_RF_train <- confusionMatrix(predict_rf, train_data$Class.Info, positive = "0", dnn = c("Predicted", "True"))
train_RF_accuracy <- round(as.numeric(confusion_RF_train$overall[1]), 2)
print(train_RF_accuracy)

## [1] 1

# Calculate Testing Accuracy
predict_rf_test <- predict(fit_rf, test_data_numeric)
confusion_RF_test <- confusionMatrix(predict_rf_test, test_data_class$Class.Info, positive = "0", dnn = c("Predicted", "True"))
test_RF_accuracy <- round(as.numeric(confusion_RF_test$overall[1]), 2)
print(test_RF_accuracy)

## [1] 0.93

# Calculate Mean Decrease in Gini Index
feature_imp <- as.data.frame(fit_rf$importance)
feature_rank <- as.matrix(feature_imp[order(feature_imp$MeanDecreaseGini, decreasing = T), 4])
rownames(feature_rank) <- rownames(feature_imp)[order(feature_imp$MeanDecreaseGini, decreasing = T)]
colnames(feature_rank) <- c("Mean Decrease in Gini Index")

write.csv(feature_rank, file = "Feauture_Ranking_RF.csv", row.names = TRUE)

Feature ranking as a result of Random Forest Classifier training

	Mean Decrease in Gini Index
Array.to.Nucleoplasm.Intensity.Ratio	5.7874176
Array.Area	5.1726496
Array.PI.Variance	5.1491282
Array.Mean.PI	0.9671319
Array.Total.PI	0.1899120

Model Validation

temp_valid <- NULL
blind_RF <- NULL
temp_valid2 <- NULL
full_RF <- NULL

for (i in 1:length(validation_set)) {
    temp_valid <- mperform(fit_rf, valid_data_numeric[[i]], valid_data_class[[i]], type = "rf")
    blind_RF <- rbind(blind_RF, temp_valid)
    
    temp_valid2 <- mperform(fit_rf, valid_full_numeric[[i]], valid_full_class[[i]], type = "rf")
    full_RF <- rbind(full_RF, temp_valid2)
}

# Save results to csv file
write.csv(blind_RF, file = "BlindValidation_Results_RF.csv", row.names = FALSE)
write.csv(full_RF, file = "FullValidation_Results_RF.csv", row.names = FALSE)

Model validation results with 24 unseen agonist compounds

Experiment.Number	Accuracy	CI.Lower	CI.Upper	Sensitivity
1	0.95	0.88	0.98	0.95
2	1.00	0.96	1.00	1.00
3	0.96	0.89	0.99	0.96
4	1.00	0.96	1.00	1.00
5	1.00	0.96	1.00	1.00
6	0.97	0.91	0.99	0.97
7	1.00	0.96	1.00	1.00
8	0.96	0.89	0.99	0.96
9	1.00	0.96	1.00	1.00
10	0.97	0.91	0.99	0.97
11	1.00	0.96	1.00	1.00
12	1.00	0.96	1.00	1.00
13	1.00	0.96	1.00	1.00
14	0.97	0.91	0.99	0.97
15	1.00	0.96	1.00	1.00
16	0.96	0.89	0.99	0.96
17	1.00	0.96	1.00	1.00

Model validation results with all active compounds

Experiment.Number	Accuracy	CI.Lower	CI.Upper	Sensitivity	Specificity	Balanced.Accuracy
1	0.95	0.89	0.98	0.94	1.00	0.97
2	0.88	0.80	0.93	1.00	0.00	0.50
3	0.95	0.90	0.98	0.95	1.00	0.97
4	0.88	0.80	0.93	1.00	0.00	0.50
5	0.95	0.89	0.98	1.00	0.56	0.78
6	0.98	0.93	1.00	0.97	1.00	0.99
7	0.88	0.80	0.93	1.00	0.00	0.50
8	0.95	0.90	0.98	0.96	0.88	0.92
9	0.88	0.80	0.93	1.00	0.00	0.50
10	0.97	0.92	0.99	0.97	0.94	0.96
11	0.95	0.90	0.98	1.00	0.62	0.81
12	0.98	0.94	1.00	1.00	0.88	0.94
13	0.88	0.80	0.93	1.00	0.00	0.50
14	0.95	0.90	0.98	0.96	0.94	0.95
15	0.89	0.82	0.94	1.00	0.12	0.56
16	0.95	0.90	0.98	0.96	0.88	0.92
17	0.91	0.85	0.96	1.00	0.31	0.66

Model Performance Metrics

Visualizing model performance metrics with all active compounds on a boxplot.

# Append model names to data frames
full_NIR$Model <- rep("Logistic Regression 'Array to Nucleoplasm Intensity Ratio'", times = dim(full_NIR)[1])
full_PI_Var$Model <- rep("Logistic Regression 'Array PI Variance'", times = dim(full_PI_Var)[1])
full_RF$Model <- rep("Random Forest", times = dim(full_RF)[1])

# Merge all results
merged_data <- rbind(full_PI_Var, full_NIR, full_RF)

# Drop confidence intervals
merged_data$CI.Lower <- NULL
merged_data$CI.Upper <- NULL
colnames(merged_data)[4] <- "Balanced Accuracy"
# Melt data for plotting with respect to model type
melted_data <- melt(merged_data, id.vars = "Model")

ggplot(melted_data, aes(x = variable, y = value, fill = Model)) + geom_boxplot() + theme_bw() + theme(panel.grid.minor = element_blank(), axis.title.x = element_blank(), 
    legend.justification = c("left", "bottom"), legend.position = c(0.03, 0.05), legend.text = element_text(size = 7.5), legend.title = element_blank()) + 
    scale_fill_brewer(palette = "Blues") + labs(y = "Model Performance")

Model performance metrics after data quality monitoring.

# Removing high-density experimental replicates from the analysis
full_NIR_red <- full_NIR[-c(2, 4, 5, 7, 9, 11, 12, 13, 15, 17), ]
full_PI_Var_red <- full_PI_Var[-c(2, 4, 5, 7, 9, 11, 12, 13, 15, 17), ]
full_RF_red <- full_RF[-c(2, 4, 5, 7, 9, 11, 12, 13, 15, 17), ]

# Merge all results
merged_data2 <- rbind(full_PI_Var_red, full_NIR_red, full_RF_red)

# Drop confidence intervals
merged_data2$CI.Lower <- NULL
merged_data2$CI.Upper <- NULL
colnames(merged_data2)[4] <- "Balanced Accuracy"
# Melt data for plotting with respect to model type
melted_data2 <- melt(merged_data2, id.vars = "Model")

ggplot(melted_data2, aes(x = variable, y = value, fill = Model)) + geom_boxplot() + theme_bw() + theme(panel.grid.minor = element_blank(), axis.title.x = element_blank(), 
    legend.justification = c("left", "bottom"), legend.position = c(0.03, 0.05), legend.text = element_text(size = 7.5), legend.title = element_blank()) + 
    scale_fill_brewer(palette = "Reds") + ylim(0, 1) + labs(y = "Model Performance")

Feature Density Visualization

agonists <- NULL
for (i in 1:length(temp_class)) {
    agonists[[i]] <- temp_class[[i]][which(temp_class[[i]]$Class.Info == 0), ]
}

# Calculate Hellinger Distance between Agonists and Antagonists for Array PI Variance
hdist_PI_Var <- NULL
par(mfrow = c(3, 6), cex.axis = 2, cex.lab = 2, mar = c(5, 5, 5, 3))

for (i in 1:length(agonists)) {
    hdist_PI_Var[[i]] <- round(hellinger(agonists[[i]]$Array.PI.Variance, antagonists[[i]]$Array.PI.Variance, -Inf, Inf, method = 1), 2)
    sm.density.compare(temp_class[[i]]$Array.PI.Variance, temp_class[[i]]$Class.Info, xlab = paste("Exp", i, ", HD=", hdist_PI_Var[[i]], sep = ""), lty = c(1, 
        1), lwd = c(2, 2), col = c("dodgerblue3", "red3"))
}

# Calculate Hellinger Distance between Agonists and Antagonists for Array to Nucleoplasm Intensity Ratio
hdist_NIR <- NULL
par(mfrow = c(3, 6), cex.axis = 2, cex.lab = 2, mar = c(5, 5, 5, 3))

for (i in 1:length(agonists)) {
    hdist_NIR[[i]] <- round(hellinger(agonists[[i]]$Array.to.Nucleoplasm.Intensity.Ratio, antagonists[[i]]$Array.to.Nucleoplasm.Intensity.Ratio, -Inf, 
        Inf, method = 1), 2)
    sm.density.compare(temp_class[[i]]$Array.to.Nucleoplasm.Intensity.Ratio, temp_class[[i]]$Class.Info, xlab = paste("Exp", i, ", HD=", hdist_NIR[[i]], 
        sep = ""), lty = c(1, 1), lwd = c(2, 2), col = c("dodgerblue3", "red3"))
}

Session Information

sessionInfo()

## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] pROC_1.16.2         reshape_0.8.8       formatR_1.7         kableExtra_1.1.0    sm_2.2-5.6          statip_0.2.3        boot_1.3-24         caret_6.0-86        lattice_0.20-41     randomForest_4.6-14
## [11] ggplot2_3.3.0       ggdendro_0.1-20     knitr_1.28          dendextend_1.13.4  
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.5.1        httr_1.4.1           viridisLite_0.3.0    splines_4.0.0        foreach_1.5.0        prodlim_2019.11.13   assertthat_0.2.1     highr_0.8            stats4_4.0.0         yaml_2.2.1          
## [11] ipred_0.9-9          pillar_1.4.4         glue_1.4.0           digest_0.6.25        RColorBrewer_1.1-2   rvest_0.3.5          colorspace_1.4-1     recipes_0.1.12       htmltools_0.4.0      Matrix_1.2-18       
## [21] plyr_1.8.6           timeDate_3043.102    pkgconfig_2.0.3      purrr_0.3.4          scales_1.1.0         webshot_0.5.2        gower_0.2.1          lava_1.6.7           tibble_3.0.1         farver_2.0.3        
## [31] generics_0.0.2       ellipsis_0.3.0       withr_2.2.0          nnet_7.3-13          survival_3.1-12      magrittr_1.5         crayon_1.3.4         evaluate_0.14        nlme_3.1-147         MASS_7.3-51.5       
## [41] xml2_1.3.2           class_7.3-16         tools_4.0.0          data.table_1.12.8    hms_0.5.3            lifecycle_0.2.0      stringr_1.4.0        munsell_0.5.0        cluster_2.1.0        e1071_1.7-3         
## [51] compiler_4.0.0       rlang_0.4.6          grid_4.0.0           iterators_1.0.12     rstudioapi_0.11      labeling_0.3         tcltk_4.0.0          rmarkdown_2.1        gtable_0.3.0         ModelMetrics_1.2.2.2
## [61] codetools_0.2-16     reshape2_1.4.4       R6_2.4.1             gridExtra_2.3        lubridate_1.7.8      dplyr_0.8.5          clue_0.3-57          readr_1.3.1          stringi_1.4.6        Rcpp_1.0.4.6        
## [71] vctrs_0.2.4          rpart_4.1-15         tidyselect_1.0.0     xfun_0.13

Classification of Estrogenic Compounds by Coupling High Content Analysis and Machine Learning Algorithms

Rajib Mukherjee, Burcu Beykal, Adam T. Szafran, Melis Onel, Fabio Stossi, Maureen G. Mancini, Dillon Lloyd, Fred A. Wright, Lan Zhou, Michael A. Mancini, Efstratios N. Pistikopoulos

June 2020

Load Required Libraries

Pre-Processing Functions

Data Normalization

Confidence Interval Calculations in Logistic Regression

Model Performance Calculations

Reading Experimental Datasets

Data Pre-processing

Check Missing Data

Data Cleaning

Outlier Detection

Data Normalization

Feature Selection

Classification Model Training

Data Visualization with PCA

Wilcoxon Rank Sum Test

Logistic Regression

Model Training

Model Validation

Random Forest Classifier

Model Training

Model Validation

Model Performance Metrics

Feature Density Visualization

Session Information