ab1 <- read.csv ("VerifiedApterae20170418.csv") ## with first 3 lines of file below
## Specimen,species,hostorder,hostfamily,host,Depositedat,ant-1,ant-2,ant-3,ant-4,ant-5,ant-6,ant-pt,head,rost-l,rost-w,fem-l,tib-l,dist-l,siph-l,caud-l,abdset-l,bl,siph-w,rost-l.rost-w,rost-l.dist-l,ant-pt.ant-6,siph-l.siph-w,siph-l.caud-l,siph-l.ant-3,rost-l.caud-l,bl-head,ant3-4fused
## 0143-1,chusqueae,Poales,Poaceae,Chusquea,CZULE,93.7,94.9,334.7,194.7,200.3,109.2,460.3,506.3,132.7,79.5,725.2,1262.5,101.0,419.6,187.1,117.6,2345.3,94.8,1.67,1.31,4.22,4.43,2.24,1.25,0.71,4.63,0
## 0144-1,chusqueae,Poales,Poaceae,Chusquea,CZULE,NA,94.5,464.0,255.1,253.3,128.5,493.0,NA,139.0,95.3,800.7,1426.5,132.0,441.7,209.1,118.6,2639.0,104.6,1.46,1.05,3.84,4.22,2.11,0.95,0.66,NA,0

ab2 <- read.csv ("SynonymizedApterae20170418.csv") ## with first 3 lines of file below
## Specimen,species,hostorder,hostfamily,hostgenus,Depositedat,ant-1,ant-2,ant-3,ant-4,ant-5,ant-6,ant-pt,head,rost-l,rost-w,fem-l,tib-l,dist-l,siph-l,caud-l,abdset-l,bl,siph-w,rost-l.rost-w,rost-l.dist-l,ant-pt.ant-6,siph-l.siph-w,siph-l.caud-l,siph-l.ant-3,rost-l.caud-l,bl-head,ant3-4fused
## 0716-1,Aphisafricana,NA,NA,NA,NA,56.1,42.2,267.4,0.0,142.1,81.7,304.5,358.9,NA,NA,NA,656.3,70.7,119.2,153.8,NA,1377.7,50.9,NA,NA,3.73,2.34,0.78,0.45,NA,3.84,1
## 0716-2,Aphisafricana,NA,NA,NA,NA,53.6,42.9,299.2,0.0,153.2,88.5,308.8,359.1,NA,NA,388.7,665.2,77.8,110.8,138.2,NA,1586.8,61.7,NA,NA,3.49,1.80,0.80,0.37,NA,4.42,1

names (ab2) <- names (ab1)
ab1$status <- "V"
ab2$status <- "S"
ab3 <- rbind (ab1, ab2) ## this creates one data set with both 'V' and 'S' taxa

###---------- create a size variable for each species, scale all data ---------------###
## these records have no values for the three measures used to create 'size'
tmp <- is.na (ab3[,c(14, 17, 23)])  ## which records have missing columns for the size variables?
tmp2 <- rowSums (tmp)
tl <- tmp2 > 2  ## all have at least one usable body size measure
## first deal with size variable
tmp <- is.na (ab3[,c(14, 17, 23)])  ## need to redo these if some have all missing values
tmp2 <- rowSums (tmp)
tl <- tmp2 > 0 
pc3 <- prcomp (ab3[!tl,c(14,17,23)], retx=TRUE, center=TRUE, scale.=TRUE)
pr3 <- predict (pc3)
ab3$relsize <- numeric (nrow(ab3))
ab3$relsize [!tl] <- pr3[,1]
## fill in missing values of relsize
fit3a <- lm (pr3 [,1] ~ ab3[!tl, 14])
fit3b <- lm (pr3 [,1] ~ ab3[!tl, 17])
fit3c <- lm (pr3 [,1] ~ ab3[!tl, 23])
for (i in 1:nrow(ab3)) {
    if (tmp2 [i] > 0) {
        if ((tmp [i, 1] == FALSE) & (tmp [i, 2] == TRUE) & (tmp [i, 3] == TRUE)) { # has only var. 14
            ab3$relsize [i] <- coef (fit3a) %*% c(1, ab3[i,14]) }
        if ((tmp [i, 1] == TRUE) & (tmp [i, 2] == FALSE) & (tmp [i, 3] == TRUE)) { # has only var. 17
            ab3$relsize [i] <- coef (fit3b) %*% c(1, ab3[i,17]) }
        if ((tmp [i, 1] == TRUE) & (tmp [i, 2] == TRUE) & (tmp [i, 3] == FALSE)) { # has only var. 23
            ab3$relsize [i] <- coef (fit3c) %*% c(1, ab3[i,23]) }
        if ((tmp [i, 1] == FALSE) & (tmp [i, 2] == FALSE) & (tmp [i, 3] == TRUE)) { # has var. 14 & 17
            ab3$relsize [i] <- (coef (fit3a) %*% c(1, ab3[i,14]) + coef (fit3b) %*% c(1, ab3[i,17]))/2 }
        if ((tmp [i, 1] == FALSE) & (tmp [i, 2] == TRUE) & (tmp [i, 3] == FALSE)) { # has var. 14 & 23
            ab3$relsize [i] <- (coef (fit3a) %*% c(1, ab3[i,14]) + coef (fit3c) %*% c(1, ab3[i,23]))/2 }
        if ((tmp [i, 1] == TRUE) & (tmp [i, 2] == FALSE) & (tmp [i, 3] == FALSE)) { # has var. 17 & 23
            ab3$relsize [i] <- (coef (fit3b) %*% c(1, ab3[i,17]) + coef (fit3c) %*% c(1, ab3[i,23]))/2 }
    }
}
## create a new data set with the scaled data
ab4 <- ab3
## change insertum to "S" because there is only one sample (should really have been done earlier)
## now two, so can use
## tl <- ab4$species == "insertum"
## ab4$status [tl] <- "S"
## adjust for size
for (i in 7:24) {
    fit0 <- lm (ab3[,i] ~ ab3$relsize)
    tl <- !is.na(ab3[,i])
    ab4 [tl, i] <- residuals (fit0)
    ab4 [tl, i] <- scale(ab4 [tl, i])  ## variables are now scaled and size adjusted
}

###---------- transform and rescale data ------------####
## logs seem a little better, so make that tx
for (i in c(25:32)) {
    ab4[,i] <- log(ab4[,i])
}

## scale all variables to mean 0 var 1
for (i in c(7:32)) {
    ab4[,i] <- scale(ab4[,i])
}
ab4$relsize <- scale (ab4$relsize)

##-------- impute missing values and take care of aberrant values ----------##
## 'imputing' is done by sampling from other specimens and using that value
for (i in 1:length(unique(ab4$species))) {
    tl <- ab4$species == unique(ab4$species)[i]
    for (j in c(7:33)) {
        tl2 <- tl & is.na(ab4[,j]) 
        tl3 <- tl & !is.na(ab4[,j])
        if (sum (tl2) > 0) {
            if (sum (tl3) > 0) { 
                if (sum (tl2) < sum (tl3)) {  ## can sample without replacement
                    ab4[tl2,j] <- sample (ab4[tl3,j], sum (tl2))   
                } else {            ## sample with replacement
                    if (sum (tl3) > 1) {
                        ab4[tl2,j] <- sample (ab4[tl3,j], sum (tl2), replace = TRUE)
                        cat ("sampling with replacement: for", as.character(unique(ab4$species)[i]),
                             colnames (ab4)[j], sum(tl2), sum(tl3), "\n")
                    } else {  ## this is due to R's peculiar behavior with n = 1
                        ab4[tl2,j] <- rep (ab4[tl3,j], sum (tl2))
                        cat ("sampling with replacement: for", as.character(unique(ab4$species)[i]),
                             colnames (ab4)[j], sum(tl2), sum(tl3), "\n")
                    }
                }
            } else {   ## nothing to sample from, so leave as NA
                cat ("no action taken for", as.character(unique(ab4$species)[i]), colnames (ab4)[j],
                     sum(tl2), sum(tl3), "\n")
            }
        }
    }
}

## set remaining missing values to zero, which should be the overall mean since the data are scaled
for (j in c(7:33)) {
    tl <- is.na (ab4[,j])
    if (sum (tl) > 0) {
        ab4 [tl,j] <- 0
    }
}
tl <- is.na(ab4[,7:33])
sum(tl)  ## 0 so no missing values

###---------- create the boxplots to get an idea of species differences ----------##
pdf ("Verified-ValuesBySpecies20170418.pdf", height=12, width=16)
par (mfrow=c(3,1), mar=c(8,4,2,1))
for (i in c(7:32,35)) {
    yrange1 <- range (as.numeric(ab4[,i]), na.rm=TRUE)
    plot (ab4$species, ab4[,i], xaxt="n", main = names(ab4)[i]) ## this first sorts the species
    axis(1, at = 1:length(unique(ab4$species)), labels = FALSE)
    inc1 <- (yrange1[2] - yrange1[1])/20
    text(1:length(unique(ab4$species)), par("usr")[3] - inc1, srt = 45, adj = 1,
         labels = sort(unique(ab4$species)), xpd = TRUE)
    lcl1 <- mean(ab4[,i], na.rm=TRUE) - 4*sd(ab4[,i], na.rm=TRUE)
    ucl1 <- mean(ab4[,i], na.rm=TRUE) + 4*sd(ab4[,i], na.rm=TRUE)
    abline (v = 12.5, lty=3, col="black")
    tl <- (ab4[,i] < lcl1) | (ab4[,i] > ucl1)
    tl [is.na(tl)] <- FALSE
    print (ab4[tl,c(1,2,i)])
}
dev.off()

###-------------- create linear discriminant axes -------------###
tlstatus <- ab4$status == "V"
ab4b <- ab4 [tlstatus,]
ab4b$species <- factor (ab4b$species, exclude=NULL)
ab4c <- ab4 [!tlstatus,]
ab4c$species <- factor (ab4c$species, exclude=NULL)
require (MASS)
ldab4 <- lda (species ~ ant.1+ant.2+    ## this does the LDA (in MASS package)
                   ant.3+ant.4+ant.5+ant.6+
                   ant.pt+head+rost.l+rost.w+
                   fem.l+tib.l+dist.l+siph.l+
                   caud.l+abdset.l+bl+siph.w+
                   rost.l.rost.w+rost.l.dist.l+ant.pt.ant.6+siph.l.siph.w+
                   siph.l.caud.l+siph.l.ant.3+rost.l.caud.l+bl.head+
                   ant3.4fused+relsize,
               data=ab4b)

cumsum ((ldab4$svd)^2/(sum((ldab4$svd)^2))) ## 3 axes are needed to get to 80%
 ## [1] 0.5248453 0.7105815 0.8482519 0.9272767 0.9623786 0.9819411 0.9890850
 ## [8] 0.9943142 0.9974110 0.9991588 1.0000000

## get an idea of which features contribute most to the axes
for (i in 1:4) {
    tl <- abs(ldab4$scaling [,i]) > 0.9
    cat (dimnames (ldab4$scaling) [[2]] [i], "\n")
    print (ldab4$scaling [tl,i])
}
## LD1 
##        rost.l rost.l.dist.l  ant.pt.ant.6 siph.l.siph.w 
##      1.632692      1.001594      1.515555      0.998667 
## LD2 
##       rost.l ant.pt.ant.6 
##     1.736839    -1.447860 
## LD3 
##    rost.l  abdset.l 
## -1.687624 -1.222202 
## LD4 
## rost.l.dist.l   ant3.4fused 
##    -1.5050392    -0.9055455 

## multiply out to put 'S' species on axes
prldab4 <- as.matrix(ab4b[,c(7:33,35)]) %*% ldab4$scaling [,1:4] 
rownames (prldab4) <- paste (ab4b$Specimen, ab4b$species, sep=".")
prldab4s <- as.matrix(ab4c[,c(7:33,35)]) %*% ldab4$scaling [,1:4] 
rownames (prldab4s) <- paste (ab4c$Specimen, ab4c$species, sep=".")

###---------- make the plots -------------###
## set up for plotting
range1 <- range (prldab4[,1], prldab4s[,1])
range2 <- range (prldab4[,2], prldab4s[,2])
range3 <- range (prldab4[,3], prldab4s[,3])
range4 <- range (prldab4[,4], prldab4s[,4])
rangey <- matrix (c(range1, range2, range3, range4), 4, 2, byrow=TRUE)

## colors used (these are pretty well separated)
vc2 <- c("#ff0000", "#b00000", "#870000", "#550000", "#e4e400", "#baba00", "#878700", "#545400", "#00ff00", "#00b000", "#008700", "#005500", "#00ffff", "#00b0b0", "#008787", "#005555", "#b0b0ff", "#8484ff", "#4949ff", "#0000ff", "#ff00ff", "#b000b0", "#870087", "#550055", "#e4e4e4", "#bababa", "#878787", "#545454")
vc1 <- vc2 [seq (1, 28, 2) + 1]

length(unique(ab4b$species)) ## 12
col1 <- vc1 [1:12] 
col2 <- character (length(ab4b$species))
speciesnumber <- character (length(ab4b$species))
for (i in 1:length(unique(ab4b$species))) {
    tl <- ab4b$species == unique(ab4b$species)[i]
    col2 [tl] <- col1[i]
    speciesnumber [tl] <- LETTERS[i]
}
symb1 <- letters [1:length(unique(ab4c$species))]
symb2 <- character (length(ab4c$species))

for (i in 1:length(unique(ab4c$species))) {
    tl <- ab4c$species == unique(ab4c$species)[i]
    symb2 [tl] <- symb1[i]
}
col3 <- rep ("red", length(ab4c$species))
## assing colors and symbols
tl <- ab4c$species == "Aphisafricana"; col3[tl] <- "#baba00"; symb2 [tl] <- "C.a"
tl <- ab4c$species == "Hyadaphissparganii"; col3[tl] <- "#005500"; symb2 [tl] <- "F.d"
tl <- ab4c$species == "prunifoliae"; col3[tl] <- "#00b0b0"; symb2 [tl] <- "G.e"
tl <- ab4c$species == "pseudoavenae"; col3[tl] <- "#00b0b0"; symb2 [tl] <- "G.f"
tl <- ab4c$species == "scirpifolii"; col3[tl] <- "#545400"; symb2 [tl] <- "D.g"
tl <- ab4c$species == "laconae"; col3[tl] <- "#550000"; symb2 [tl] <- "B.j"
tl <- ab4c$species == "splendens"; col3[tl] <- "#0000ff"; symb2 [tl] <- "J.b"
tl <- ab4c$species == "gnaphalii"; col3[tl] <- "#0000ff"; symb2 [tl] <- "J.c"
tl <- ab4c$species == "subterraneum"; col3[tl] <- "#0000ff"; symb2 [tl] <- "J.h"

## create a data.frame for 'V' and another for 'S' species
ver1 <- data.frame (species = unique(as.character(ab4b$species)), code = unique (speciesnumber), color = col1, symbol = rep(16, 12))
syn1 <- data.frame (species = c("Aphis africana", "Siphocoryne splendens", "gnaphalii", "Hyadaphis sparganii", "Aphis prunifoliae", "Aphis pseudoavenae", "scirpifolii", "subterraneum", "Aphis tahasa", "laconae"),
                    code = unique (symb2),
                    color = c("#baba00", "#0000ff", "#0000ff", "#005500", "#00b0b0", "#00b0b0", "#545400", "#0000ff", "red", "#550000"), symbol = rep(17, 10))

require (scales)  # needed for the alpha function, better solution to get transparent colors

## plot LDA1 vs. LDA2
j <- 2
filename1 <- "Apterae20170519-a.pdf"  
pdf (filename1, height=14, width=14)
plot (prldab4[,1], prldab4[,j], pch="", main = "Apterae",
      xlab="LDA 1", ylab= paste ("LDA", j), xlim = rangey[1,], ylim = rangey[j,])
points (prldab4[,1], prldab4[,j], pch = 16, col=alpha(col2,0.6), cex=3)
text (prldab4[,1], prldab4[,j], speciesnumber, col="white", cex=0.6)
points (prldab4s[,1], prldab4s[,j], pch = 17, col = alpha(col3,0.6), cex=3)
text (prldab4s[,1], prldab4s[,j], symb2, col = "white", cex = 0.6)
## plot the legend
text (-10.5, -3, "Verified", pos = 4, font = 2)
for (i in 1:12) {
    points (-11, -3 - i/2.5, pch = ver1 [i, 4], col = alpha (as.character(ver1 [i, 3]),0.6), cex = 3)
    text (-11, -3 - i/2.5, ver1 [i, 2], col="white", cex=0.6)
    text (-10.5, -3 - i/2.5, ver1 [i, 1], pos = 4, font = 3)
}
text (-6.5, -3, "Synonymized", pos = 4, font = 2)
for (i in 1:10) {
    points (-7, -3 - i/2.5, pch = syn1 [i, 4], col = alpha (as.character(syn1 [i, 3]), 0.6), cex = 3)
    text (-7, -3 - i/2.5, syn1 [i, 2], col="white", cex=0.6)
    text (-6.5, -3 - i/2.5, syn1 [i, 1], pos = 4, font = 3)
}

dev.off()

## plot LDA1 vs. LDA3
j <- 3
filename1 <- "Apterae20170519-b.pdf"  
pdf (filename1, height=14, width=14)
plot (prldab4[,1], prldab4[,j], pch="", main = "Apterae",
      xlab="LDA 1", ylab= paste ("LDA", j), xlim = rangey[1,], ylim = rangey[j,])
points (prldab4[,1], prldab4[,j], pch = 16, col=alpha(col2,0.6), cex=3)
text (prldab4[,1], prldab4[,j], speciesnumber, col="white", cex=0.6)
points (prldab4s[,1], prldab4s[,j], pch = 17, col = alpha(col3,0.6), cex=3)
text (prldab4s[,1], prldab4s[,j], symb2, col = "white", cex = 0.6)
## plot the legend
text (-10.5, -4, "Verified", pos = 4, font = 2)
for (i in 1:12) {
    points (-11, -4 - i/2.25, pch = ver1 [i, 4], col = alpha (as.character(ver1 [i, 3]),0.6), cex = 3)
    text (-11, -4 - i/2.25, ver1 [i, 2], col="white", cex=0.6)
    text (-10.5, -4 - i/2.25, ver1 [i, 1], pos = 4, font = 3)
}
text (-6.5, -4, "Synonymized", pos = 4, font = 2)
for (i in 1:10) {
    points (-7, -4 - i/2.25, pch = syn1 [i, 4], col = alpha (as.character(syn1 [i, 3]), 0.6), cex = 3)
    text (-7, -4 - i/2.25, syn1 [i, 2], col="white", cex=0.6)
    text (-6.5, -4 - i/2.25, syn1 [i, 1], pos = 4, font = 3)
}

dev.off()

j <- 4
filename1 <- "Apterae20170519-c.pdf"  ## this one is not probably not needed 
pdf (filename1, height=14, width=14)
plot (prldab4[,1], prldab4[,j], pch="", main = "Apterae",
      xlab="LDA 1", ylab= paste ("LDA", j), xlim = rangey[1,], ylim = rangey[j,])
## points (prldab4[,1], prldab4[,j], pch = 16, col=col2, cex=3)
points (prldab4[,1], prldab4[,j], pch = 16, col=alpha(col2,0.6), cex=3)
text (prldab4[,1], prldab4[,j], speciesnumber, col="white", cex=0.6)
## points (prldab4s[,1], prldab4s[,j], pch = 17, col = col3, cex=3)
points (prldab4s[,1], prldab4s[,j], pch = 17, col = alpha(col3,0.6), cex=3)
text (prldab4s[,1], prldab4s[,j], symb2, col = "white", cex = 0.6)
text (-10.5, -3, "Verified", pos = 4, font = 2)
for (i in 1:12) {
    points (-11, -3 - i/2.75, pch = ver1 [i, 4], col = alpha (as.character(ver1 [i, 3]),0.6), cex = 3)
    text (-11, -3 - i/2.75, ver1 [i, 2], col="white", cex=0.6)
    text (-10.5, -3 - i/2.75, ver1 [i, 1], pos = 4, font = 3)
}
text (-6.5, -3, "Synonymized", pos = 4, font = 2)
for (i in 1:10) {
    points (-7, -3 - i/2.75, pch = syn1 [i, 4], col = alpha (as.character(syn1 [i, 3]), 0.6), cex = 3)
    text (-7, -3 - i/2.75, syn1 [i, 2], col="white", cex=0.6)
    text (-6.5, -3 - i/2.75, syn1 [i, 1], pos = 4, font = 3)
}

dev.off()