ab1 <- read.csv ("VerifiedApterae20170418.csv") ## with first 3 lines of file below ## Specimen,species,hostorder,hostfamily,host,Depositedat,ant-1,ant-2,ant-3,ant-4,ant-5,ant-6,ant-pt,head,rost-l,rost-w,fem-l,tib-l,dist-l,siph-l,caud-l,abdset-l,bl,siph-w,rost-l.rost-w,rost-l.dist-l,ant-pt.ant-6,siph-l.siph-w,siph-l.caud-l,siph-l.ant-3,rost-l.caud-l,bl-head,ant3-4fused ## 0143-1,chusqueae,Poales,Poaceae,Chusquea,CZULE,93.7,94.9,334.7,194.7,200.3,109.2,460.3,506.3,132.7,79.5,725.2,1262.5,101.0,419.6,187.1,117.6,2345.3,94.8,1.67,1.31,4.22,4.43,2.24,1.25,0.71,4.63,0 ## 0144-1,chusqueae,Poales,Poaceae,Chusquea,CZULE,NA,94.5,464.0,255.1,253.3,128.5,493.0,NA,139.0,95.3,800.7,1426.5,132.0,441.7,209.1,118.6,2639.0,104.6,1.46,1.05,3.84,4.22,2.11,0.95,0.66,NA,0 ab2 <- read.csv ("SynonymizedApterae20170418.csv") ## with first 3 lines of file below ## Specimen,species,hostorder,hostfamily,hostgenus,Depositedat,ant-1,ant-2,ant-3,ant-4,ant-5,ant-6,ant-pt,head,rost-l,rost-w,fem-l,tib-l,dist-l,siph-l,caud-l,abdset-l,bl,siph-w,rost-l.rost-w,rost-l.dist-l,ant-pt.ant-6,siph-l.siph-w,siph-l.caud-l,siph-l.ant-3,rost-l.caud-l,bl-head,ant3-4fused ## 0716-1,Aphisafricana,NA,NA,NA,NA,56.1,42.2,267.4,0.0,142.1,81.7,304.5,358.9,NA,NA,NA,656.3,70.7,119.2,153.8,NA,1377.7,50.9,NA,NA,3.73,2.34,0.78,0.45,NA,3.84,1 ## 0716-2,Aphisafricana,NA,NA,NA,NA,53.6,42.9,299.2,0.0,153.2,88.5,308.8,359.1,NA,NA,388.7,665.2,77.8,110.8,138.2,NA,1586.8,61.7,NA,NA,3.49,1.80,0.80,0.37,NA,4.42,1 names (ab2) <- names (ab1) ab1$status <- "V" ab2$status <- "S" ab3 <- rbind (ab1, ab2) ## this creates one data set with both 'V' and 'S' taxa ###---------- create a size variable for each species, scale all data ---------------### ## these records have no values for the three measures used to create 'size' tmp <- is.na (ab3[,c(14, 17, 23)]) ## which records have missing columns for the size variables? tmp2 <- rowSums (tmp) tl <- tmp2 > 2 ## all have at least one usable body size measure ## first deal with size variable tmp <- is.na (ab3[,c(14, 17, 23)]) ## need to redo these if some have all missing values tmp2 <- rowSums (tmp) tl <- tmp2 > 0 pc3 <- prcomp (ab3[!tl,c(14,17,23)], retx=TRUE, center=TRUE, scale.=TRUE) pr3 <- predict (pc3) ab3$relsize <- numeric (nrow(ab3)) ab3$relsize [!tl] <- pr3[,1] ## fill in missing values of relsize fit3a <- lm (pr3 [,1] ~ ab3[!tl, 14]) fit3b <- lm (pr3 [,1] ~ ab3[!tl, 17]) fit3c <- lm (pr3 [,1] ~ ab3[!tl, 23]) for (i in 1:nrow(ab3)) { if (tmp2 [i] > 0) { if ((tmp [i, 1] == FALSE) & (tmp [i, 2] == TRUE) & (tmp [i, 3] == TRUE)) { # has only var. 14 ab3$relsize [i] <- coef (fit3a) %*% c(1, ab3[i,14]) } if ((tmp [i, 1] == TRUE) & (tmp [i, 2] == FALSE) & (tmp [i, 3] == TRUE)) { # has only var. 17 ab3$relsize [i] <- coef (fit3b) %*% c(1, ab3[i,17]) } if ((tmp [i, 1] == TRUE) & (tmp [i, 2] == TRUE) & (tmp [i, 3] == FALSE)) { # has only var. 23 ab3$relsize [i] <- coef (fit3c) %*% c(1, ab3[i,23]) } if ((tmp [i, 1] == FALSE) & (tmp [i, 2] == FALSE) & (tmp [i, 3] == TRUE)) { # has var. 14 & 17 ab3$relsize [i] <- (coef (fit3a) %*% c(1, ab3[i,14]) + coef (fit3b) %*% c(1, ab3[i,17]))/2 } if ((tmp [i, 1] == FALSE) & (tmp [i, 2] == TRUE) & (tmp [i, 3] == FALSE)) { # has var. 14 & 23 ab3$relsize [i] <- (coef (fit3a) %*% c(1, ab3[i,14]) + coef (fit3c) %*% c(1, ab3[i,23]))/2 } if ((tmp [i, 1] == TRUE) & (tmp [i, 2] == FALSE) & (tmp [i, 3] == FALSE)) { # has var. 17 & 23 ab3$relsize [i] <- (coef (fit3b) %*% c(1, ab3[i,17]) + coef (fit3c) %*% c(1, ab3[i,23]))/2 } } } ## create a new data set with the scaled data ab4 <- ab3 ## change insertum to "S" because there is only one sample (should really have been done earlier) ## now two, so can use ## tl <- ab4$species == "insertum" ## ab4$status [tl] <- "S" ## adjust for size for (i in 7:24) { fit0 <- lm (ab3[,i] ~ ab3$relsize) tl <- !is.na(ab3[,i]) ab4 [tl, i] <- residuals (fit0) ab4 [tl, i] <- scale(ab4 [tl, i]) ## variables are now scaled and size adjusted } ###---------- transform and rescale data ------------#### ## logs seem a little better, so make that tx for (i in c(25:32)) { ab4[,i] <- log(ab4[,i]) } ## scale all variables to mean 0 var 1 for (i in c(7:32)) { ab4[,i] <- scale(ab4[,i]) } ab4$relsize <- scale (ab4$relsize) ##-------- impute missing values and take care of aberrant values ----------## ## 'imputing' is done by sampling from other specimens and using that value for (i in 1:length(unique(ab4$species))) { tl <- ab4$species == unique(ab4$species)[i] for (j in c(7:33)) { tl2 <- tl & is.na(ab4[,j]) tl3 <- tl & !is.na(ab4[,j]) if (sum (tl2) > 0) { if (sum (tl3) > 0) { if (sum (tl2) < sum (tl3)) { ## can sample without replacement ab4[tl2,j] <- sample (ab4[tl3,j], sum (tl2)) } else { ## sample with replacement if (sum (tl3) > 1) { ab4[tl2,j] <- sample (ab4[tl3,j], sum (tl2), replace = TRUE) cat ("sampling with replacement: for", as.character(unique(ab4$species)[i]), colnames (ab4)[j], sum(tl2), sum(tl3), "\n") } else { ## this is due to R's peculiar behavior with n = 1 ab4[tl2,j] <- rep (ab4[tl3,j], sum (tl2)) cat ("sampling with replacement: for", as.character(unique(ab4$species)[i]), colnames (ab4)[j], sum(tl2), sum(tl3), "\n") } } } else { ## nothing to sample from, so leave as NA cat ("no action taken for", as.character(unique(ab4$species)[i]), colnames (ab4)[j], sum(tl2), sum(tl3), "\n") } } } } ## set remaining missing values to zero, which should be the overall mean since the data are scaled for (j in c(7:33)) { tl <- is.na (ab4[,j]) if (sum (tl) > 0) { ab4 [tl,j] <- 0 } } tl <- is.na(ab4[,7:33]) sum(tl) ## 0 so no missing values ###---------- create the boxplots to get an idea of species differences ----------## pdf ("Verified-ValuesBySpecies20170418.pdf", height=12, width=16) par (mfrow=c(3,1), mar=c(8,4,2,1)) for (i in c(7:32,35)) { yrange1 <- range (as.numeric(ab4[,i]), na.rm=TRUE) plot (ab4$species, ab4[,i], xaxt="n", main = names(ab4)[i]) ## this first sorts the species axis(1, at = 1:length(unique(ab4$species)), labels = FALSE) inc1 <- (yrange1[2] - yrange1[1])/20 text(1:length(unique(ab4$species)), par("usr")[3] - inc1, srt = 45, adj = 1, labels = sort(unique(ab4$species)), xpd = TRUE) lcl1 <- mean(ab4[,i], na.rm=TRUE) - 4*sd(ab4[,i], na.rm=TRUE) ucl1 <- mean(ab4[,i], na.rm=TRUE) + 4*sd(ab4[,i], na.rm=TRUE) abline (v = 12.5, lty=3, col="black") tl <- (ab4[,i] < lcl1) | (ab4[,i] > ucl1) tl [is.na(tl)] <- FALSE print (ab4[tl,c(1,2,i)]) } dev.off() ###-------------- create linear discriminant axes -------------### tlstatus <- ab4$status == "V" ab4b <- ab4 [tlstatus,] ab4b$species <- factor (ab4b$species, exclude=NULL) ab4c <- ab4 [!tlstatus,] ab4c$species <- factor (ab4c$species, exclude=NULL) require (MASS) ldab4 <- lda (species ~ ant.1+ant.2+ ## this does the LDA (in MASS package) ant.3+ant.4+ant.5+ant.6+ ant.pt+head+rost.l+rost.w+ fem.l+tib.l+dist.l+siph.l+ caud.l+abdset.l+bl+siph.w+ rost.l.rost.w+rost.l.dist.l+ant.pt.ant.6+siph.l.siph.w+ siph.l.caud.l+siph.l.ant.3+rost.l.caud.l+bl.head+ ant3.4fused+relsize, data=ab4b) cumsum ((ldab4$svd)^2/(sum((ldab4$svd)^2))) ## 3 axes are needed to get to 80% ## [1] 0.5248453 0.7105815 0.8482519 0.9272767 0.9623786 0.9819411 0.9890850 ## [8] 0.9943142 0.9974110 0.9991588 1.0000000 ## get an idea of which features contribute most to the axes for (i in 1:4) { tl <- abs(ldab4$scaling [,i]) > 0.9 cat (dimnames (ldab4$scaling) [[2]] [i], "\n") print (ldab4$scaling [tl,i]) } ## LD1 ## rost.l rost.l.dist.l ant.pt.ant.6 siph.l.siph.w ## 1.632692 1.001594 1.515555 0.998667 ## LD2 ## rost.l ant.pt.ant.6 ## 1.736839 -1.447860 ## LD3 ## rost.l abdset.l ## -1.687624 -1.222202 ## LD4 ## rost.l.dist.l ant3.4fused ## -1.5050392 -0.9055455 ## multiply out to put 'S' species on axes prldab4 <- as.matrix(ab4b[,c(7:33,35)]) %*% ldab4$scaling [,1:4] rownames (prldab4) <- paste (ab4b$Specimen, ab4b$species, sep=".") prldab4s <- as.matrix(ab4c[,c(7:33,35)]) %*% ldab4$scaling [,1:4] rownames (prldab4s) <- paste (ab4c$Specimen, ab4c$species, sep=".") ###---------- make the plots -------------### ## set up for plotting range1 <- range (prldab4[,1], prldab4s[,1]) range2 <- range (prldab4[,2], prldab4s[,2]) range3 <- range (prldab4[,3], prldab4s[,3]) range4 <- range (prldab4[,4], prldab4s[,4]) rangey <- matrix (c(range1, range2, range3, range4), 4, 2, byrow=TRUE) ## colors used (these are pretty well separated) vc2 <- c("#ff0000", "#b00000", "#870000", "#550000", "#e4e400", "#baba00", "#878700", "#545400", "#00ff00", "#00b000", "#008700", "#005500", "#00ffff", "#00b0b0", "#008787", "#005555", "#b0b0ff", "#8484ff", "#4949ff", "#0000ff", "#ff00ff", "#b000b0", "#870087", "#550055", "#e4e4e4", "#bababa", "#878787", "#545454") vc1 <- vc2 [seq (1, 28, 2) + 1] length(unique(ab4b$species)) ## 12 col1 <- vc1 [1:12] col2 <- character (length(ab4b$species)) speciesnumber <- character (length(ab4b$species)) for (i in 1:length(unique(ab4b$species))) { tl <- ab4b$species == unique(ab4b$species)[i] col2 [tl] <- col1[i] speciesnumber [tl] <- LETTERS[i] } symb1 <- letters [1:length(unique(ab4c$species))] symb2 <- character (length(ab4c$species)) for (i in 1:length(unique(ab4c$species))) { tl <- ab4c$species == unique(ab4c$species)[i] symb2 [tl] <- symb1[i] } col3 <- rep ("red", length(ab4c$species)) ## assing colors and symbols tl <- ab4c$species == "Aphisafricana"; col3[tl] <- "#baba00"; symb2 [tl] <- "C.a" tl <- ab4c$species == "Hyadaphissparganii"; col3[tl] <- "#005500"; symb2 [tl] <- "F.d" tl <- ab4c$species == "prunifoliae"; col3[tl] <- "#00b0b0"; symb2 [tl] <- "G.e" tl <- ab4c$species == "pseudoavenae"; col3[tl] <- "#00b0b0"; symb2 [tl] <- "G.f" tl <- ab4c$species == "scirpifolii"; col3[tl] <- "#545400"; symb2 [tl] <- "D.g" tl <- ab4c$species == "laconae"; col3[tl] <- "#550000"; symb2 [tl] <- "B.j" tl <- ab4c$species == "splendens"; col3[tl] <- "#0000ff"; symb2 [tl] <- "J.b" tl <- ab4c$species == "gnaphalii"; col3[tl] <- "#0000ff"; symb2 [tl] <- "J.c" tl <- ab4c$species == "subterraneum"; col3[tl] <- "#0000ff"; symb2 [tl] <- "J.h" ## create a data.frame for 'V' and another for 'S' species ver1 <- data.frame (species = unique(as.character(ab4b$species)), code = unique (speciesnumber), color = col1, symbol = rep(16, 12)) syn1 <- data.frame (species = c("Aphis africana", "Siphocoryne splendens", "gnaphalii", "Hyadaphis sparganii", "Aphis prunifoliae", "Aphis pseudoavenae", "scirpifolii", "subterraneum", "Aphis tahasa", "laconae"), code = unique (symb2), color = c("#baba00", "#0000ff", "#0000ff", "#005500", "#00b0b0", "#00b0b0", "#545400", "#0000ff", "red", "#550000"), symbol = rep(17, 10)) require (scales) # needed for the alpha function, better solution to get transparent colors ## plot LDA1 vs. LDA2 j <- 2 filename1 <- "Apterae20170519-a.pdf" pdf (filename1, height=14, width=14) plot (prldab4[,1], prldab4[,j], pch="", main = "Apterae", xlab="LDA 1", ylab= paste ("LDA", j), xlim = rangey[1,], ylim = rangey[j,]) points (prldab4[,1], prldab4[,j], pch = 16, col=alpha(col2,0.6), cex=3) text (prldab4[,1], prldab4[,j], speciesnumber, col="white", cex=0.6) points (prldab4s[,1], prldab4s[,j], pch = 17, col = alpha(col3,0.6), cex=3) text (prldab4s[,1], prldab4s[,j], symb2, col = "white", cex = 0.6) ## plot the legend text (-10.5, -3, "Verified", pos = 4, font = 2) for (i in 1:12) { points (-11, -3 - i/2.5, pch = ver1 [i, 4], col = alpha (as.character(ver1 [i, 3]),0.6), cex = 3) text (-11, -3 - i/2.5, ver1 [i, 2], col="white", cex=0.6) text (-10.5, -3 - i/2.5, ver1 [i, 1], pos = 4, font = 3) } text (-6.5, -3, "Synonymized", pos = 4, font = 2) for (i in 1:10) { points (-7, -3 - i/2.5, pch = syn1 [i, 4], col = alpha (as.character(syn1 [i, 3]), 0.6), cex = 3) text (-7, -3 - i/2.5, syn1 [i, 2], col="white", cex=0.6) text (-6.5, -3 - i/2.5, syn1 [i, 1], pos = 4, font = 3) } dev.off() ## plot LDA1 vs. LDA3 j <- 3 filename1 <- "Apterae20170519-b.pdf" pdf (filename1, height=14, width=14) plot (prldab4[,1], prldab4[,j], pch="", main = "Apterae", xlab="LDA 1", ylab= paste ("LDA", j), xlim = rangey[1,], ylim = rangey[j,]) points (prldab4[,1], prldab4[,j], pch = 16, col=alpha(col2,0.6), cex=3) text (prldab4[,1], prldab4[,j], speciesnumber, col="white", cex=0.6) points (prldab4s[,1], prldab4s[,j], pch = 17, col = alpha(col3,0.6), cex=3) text (prldab4s[,1], prldab4s[,j], symb2, col = "white", cex = 0.6) ## plot the legend text (-10.5, -4, "Verified", pos = 4, font = 2) for (i in 1:12) { points (-11, -4 - i/2.25, pch = ver1 [i, 4], col = alpha (as.character(ver1 [i, 3]),0.6), cex = 3) text (-11, -4 - i/2.25, ver1 [i, 2], col="white", cex=0.6) text (-10.5, -4 - i/2.25, ver1 [i, 1], pos = 4, font = 3) } text (-6.5, -4, "Synonymized", pos = 4, font = 2) for (i in 1:10) { points (-7, -4 - i/2.25, pch = syn1 [i, 4], col = alpha (as.character(syn1 [i, 3]), 0.6), cex = 3) text (-7, -4 - i/2.25, syn1 [i, 2], col="white", cex=0.6) text (-6.5, -4 - i/2.25, syn1 [i, 1], pos = 4, font = 3) } dev.off() j <- 4 filename1 <- "Apterae20170519-c.pdf" ## this one is not probably not needed pdf (filename1, height=14, width=14) plot (prldab4[,1], prldab4[,j], pch="", main = "Apterae", xlab="LDA 1", ylab= paste ("LDA", j), xlim = rangey[1,], ylim = rangey[j,]) ## points (prldab4[,1], prldab4[,j], pch = 16, col=col2, cex=3) points (prldab4[,1], prldab4[,j], pch = 16, col=alpha(col2,0.6), cex=3) text (prldab4[,1], prldab4[,j], speciesnumber, col="white", cex=0.6) ## points (prldab4s[,1], prldab4s[,j], pch = 17, col = col3, cex=3) points (prldab4s[,1], prldab4s[,j], pch = 17, col = alpha(col3,0.6), cex=3) text (prldab4s[,1], prldab4s[,j], symb2, col = "white", cex = 0.6) text (-10.5, -3, "Verified", pos = 4, font = 2) for (i in 1:12) { points (-11, -3 - i/2.75, pch = ver1 [i, 4], col = alpha (as.character(ver1 [i, 3]),0.6), cex = 3) text (-11, -3 - i/2.75, ver1 [i, 2], col="white", cex=0.6) text (-10.5, -3 - i/2.75, ver1 [i, 1], pos = 4, font = 3) } text (-6.5, -3, "Synonymized", pos = 4, font = 2) for (i in 1:10) { points (-7, -3 - i/2.75, pch = syn1 [i, 4], col = alpha (as.character(syn1 [i, 3]), 0.6), cex = 3) text (-7, -3 - i/2.75, syn1 [i, 2], col="white", cex=0.6) text (-6.5, -3 - i/2.75, syn1 [i, 1], pos = 4, font = 3) } dev.off()