> ## this script takes alignments, concatenates sequences (i.e., makes super-matrices) and outputs some statistics > > library(shipunov) package 'shipunov', version 1.5-1 > ## function to calculate number of taxonomically informative characters > Is.tax.inform.char <- function(vec) sum(table(vec, useNA="no") > 1) > 1 > ## we will need DNA database again to access all species and IDs > sequences <- read.table("_kubricks_dna.txt", sep="\t", h=TRUE, as.is=TRUE) > ## we will need "treesp" to determine outgroups > treesp <- read.table("_kubricks_treesp.txt", sep="\t", h=TRUE, as.is=TRUE) > ## for subsequent analyses, outgroups are better to place in the beginning of file so we need to know what are our outgroups > outgroups <- treesp$SPECIES.NEW[treesp$TYPE == "outgroup"] > ## now load the actual DNA data > ## define fragments > fragments <- c("abcd", "efgh") # fragile (this means that if you must control this when e.g. fragments change) > ## load alignments per fragment > dna <- structure(vector("list", length(fragments)), names=fragments) # fragile > for (f in fragments) { + dna[[f]] <- Read.fasta(paste0("32_alignments_trimmed_gapcoded/", f, ".fasta")) + sp.id <- do.call(rbind, strsplit(dna[[f]]$name, "__")) # we use here that species name reparated with everything else by _double_ underscore + dna[[f]] <- data.frame(SP=sp.id[, 1], ID=sp.id[, 2], SEQ=dna[[f]][, 2], stringsAsFactors=FALSE) + dna[[f]] <- dna[[f]][order(nchar(gsub("-", "", dna[[f]]$SEQ)), decreasing=TRUE), ] # longest first---it is useful for concatenation + dna[[f]] <- dna[[f]][order(dna[[f]]$SP), ] # by species, whithin species longest still first because order() does not rearrange ties + } > ## now make the list of all IDs with their taxonomic names > ## the main problem now is that some IDs are _compound_ and must be split > sp.ids <- do.call(rbind, c(lapply(dna, "[", 1:2), make.row.names=FALSE)) > tmp1 <- suppressWarnings(do.call(rbind, strsplit(sp.ids$ID, ", "))) # as some IDs are compound, and some not, strsplit() will issue warning and it is better to suppress it > tmp2 <- unique(sort(paste(c(tmp1), sp.ids$SP, sep="__"))) # for simpler sort / unique, IDs and names join with "__" again > ids.sp <- do.call(rbind, strsplit(tmp2, "__")) # then we split them back again. At the end, "ids.sp" will contain the unique IDs and their (species) names > ## === > ## === > ## === > > ## STEP 1. STRICT concatenation: fragments merge on "our" ids. In other words, only our (locally made) sequences with the same ID will be concatenated. > > ## determine which IDs are "ours" > tmp1 <- sequences[sequences$SOURCE == "MISU", "SEQUENCE.ID"] > tmp2 <- suppressWarnings(do.call(rbind, strsplit(tmp1, ", "))) > ourids.all <- unique(sort(tmp2)) > ourids <- ids.sp[ids.sp[, 1] %in% ourids.all, ] > ## make the dataframe to keep concatenated > strict <- data.frame(ourids, stringsAsFactors=FALSE) > names(strict) <- c("ID", "SP") > ## reorder to place outgroups first, other sorted > tmp1 <- strict[strict$SP %in% outgroups, ] > tmp2 <- strict[!strict$SP %in% outgroups, ] > tmp2 <- tmp2[order(tmp2$SP), ] > strict <- rbind(tmp1, tmp2) > ## concatenate! > for (n in fragments) { # per each fragment + strict[[n]] <- NA # empty column, filled with NAs + for (i in 1:nrow(strict)) { + ii <- dna[[n]]$SEQ[grepl(strict$ID[i], dna[[n]]$ID)] # search for sequence which ID corresponds with ID in the current row of "strict" + strict[[n]][i] <- ifelse(length(ii) == 0, NA, ii) # if by any chance "ii" has length > 1, returns the first element; if "ii" is empty, make NA + } + } > ## at the end, "strict" has columns for ID, SP(ecies) and one column for each fragment. To finalize contatenation, we need to join fragment columns (see below) > > ## === > ## === > ## === > > ## STEP 2. SEMISTRICT concatenation: first, add new species which were not in "our" ids; then fill gaps with sequences from the same species and keep these IDs > > ## make a copy > semistrict <- strict > ## add species which are not "ours" > ssp <- sort(unique(semistrict$SP)) > allsp <- sort(unique(sp.ids$SP)) > addsp <- allsp[!allsp %in% ssp] > if (length(addsp) > 0) { # precaution for the case when all species are locally sequenced + semistrict[(nrow(semistrict) + 1) : (nrow(semistrict) + length(addsp)), ] <- NA # fill new rows with NAs + semistrict[nrow(semistrict) : (nrow(semistrict) - (length(addsp) - 1)), "SP"] <- addsp # number of rows changed, hence new calculation; fill new part of SP column with "addsp" + } > ## reorder again to place outgroups first, other sorted > tmp1 <- semistrict[semistrict$SP %in% outgroups, ] > tmp2 <- semistrict[!semistrict$SP %in% outgroups, ] > tmp2 <- tmp2[order(tmp2$SP), ] > semistrict <- rbind(tmp1, tmp2) > ## make new columns to keep IDs of will-be-added sequences > names.tmp3 <- c(paste0(fragments, ".ID")) > tmp3 <- structure(data.frame(matrix(NA, nrow=nrow(semistrict), ncol=length(names.tmp3))), names=names.tmp3) > semistrict <- cbind(semistrict, tmp3) > semistrict <- semistrict[, c("SP", "ID", paste0(fragments, ".ID"), fragments)] > ## concatenate! > for(n in fragments) { # per fragment + d <- dna[[n]] + f.id <- paste0(n, ".ID") # make the fragment ID to determine where to place it + for (i in 1:nrow(semistrict)) { + if (is.na(semistrict[i, n])) { # if there is a gap + dd <- d$SEQ[grepl(semistrict$SP[i], d$SP)] # search for sequence of this fragment with same species name + ii <- d$ID[grepl(semistrict$SP[i], d$SP)] # analogously, search for ID + if (length(dd) > 0 & length(ii) > 0) { # if anything found + semistrict[i, n] <- dd[1] # then select the first (i.e., longest because they were sorted this way) element + semistrict[i, f.id] <- ii[1] # and select its ID + } else { # if nothing found + semistrict[i, f.id] <- "-" # put there the dash + } + }} + } > ## at the end, "semistrict" has columns for our SP, our ID, then columns for added IDs (one per fragment), then columns for each fragment. If, for example, there are 3 fragments, "semistrict" will have 8 columns. > > ## === > ## === > ## === > > ## STEP 3. Output statistics and write files > ## we will output only semistrict concatenation > > ## create new IDs which will hshow the origin of tree terminal > semistrict.f.ids <- c(paste0(fragments, ".ID")) > ids.tmp0 <- semistrict[, c("SP", "ID", semistrict.f.ids)] > ids.tmp1 <- apply(ids.tmp0[, c("SP", "ID")], 1, function(.x) paste(.x, collapse="__")) # as usual, species separated from ID with "__" > for (i in semistrict.f.ids) ids.tmp0[, i] <- ifelse(is.na(ids.tmp0[, i]), "", paste(sub(".ID", "", i), ids.tmp0[, i])) # add name of fragment to sequence ID > ids.tmp2 <- apply(ids.tmp0[, semistrict.f.ids], 1, function(.x) paste(.x, collapse=" ")) # join them all together > ids.tmp2 <- gsub("(^ +)|( +$)", "", ids.tmp2) # then clean ... > ids.tmp2 <- gsub("( +)", " ", ids.tmp2) > ids.tmp <- paste(ids.tmp1, ids.tmp2) > ids.tmp <- gsub(" +", "_", ids.tmp) > ids.tmp <- gsub(",", "", ids.tmp) > ids.tmp <- gsub("_$", "", ids.tmp) > ids.tmp <- gsub("_NA_", "_", ids.tmp) # ... cleaning done > ## at the end, new ID will start with species name, then "our" ID (if any), then name of fragment for added "alien" sequence, then its ID or "-" if fragment absent (please look on trees to understand this system better) > > ## remove duplicated sequences, they appear partly because some sequences in DNA database had compound IDs > semistrict.f <- fragments > groups <- as.numeric(as.factor(apply(semistrict[, c("SP", semistrict.f)], 1, paste, collapse=""))) > ## since some sequences were essentially removed (even if they are full duplicates), it is good to kee their IDs > comb.ids <- aggregate(ids.tmp, by=list(groups), paste, collapse=" + ") > semistrict <- semistrict[!duplicated(groups), ] # remove duplicates in main table > ids <- ids.tmp[!duplicated(groups)] # remove duplicates in joint ids > comb.ids <- comb.ids[groups[!duplicated(groups)], 2] # the same order as in deduplicated data > ## this table contains combined IDs and is useful to understand which IDs are omitted in resulted trees > write.table(data.frame(ids, comb.ids), file="40_concatenated/semistrict_comb_ids.txt", sep="\t", quote=FALSE, row.names=FALSE) > ## output statistics: > semistrict.m <- data.frame(semistrict[, 1:2], apply(semistrict[, semistrict.f], 2, function(.y) ifelse(is.na(.y), 0, 1))) > semistrict.m$COUNT <- rowSums(semistrict.m[, semistrict.f]) > ## what missed after concatenation > semistrict.m[order(semistrict.m$COUNT, decreasing=TRUE), ] SP ID abcd efgh COUNT 8 Kubrickus heus K-008 1 1 2 4 Kubrickus deus K-004 1 1 2 5 Kubrickus eus K-005 1 1 2 6 Kubrickus feus K-006 1 1 2 10 Kubrickus jeus K-010 1 1 2 11 Kubrickus keus K-011 1 1 2 12 Kubrickus leus K-012 1 1 2 1 Kubrickus aus K-001 1 0 1 2 Kubrickus beus K-002 1 0 1 3 Kubrickus ceus K-003 1 0 1 7 Kubrickus geus K-007 1 0 1 9 Kubrickus ieus K-009 1 0 1 > ## how many ids in each missing state > table(apply(semistrict[, semistrict.f], 1, function(.y) sum(!is.na(.y)))) 1 2 5 7 > ## how many ids missed in each fragment > apply(semistrict[, semistrict.f], 2, function(.x) paste0(sum(is.na(.x)), "/", length(.x))) abcd efgh "0/12" "5/12" > ## how many sequences our and total > all.ids <- unlist(semistrict[, c("ID", paste0(semistrict.f, ".ID"))]) > ## our > sum(all.ids %in% ourids.all & !is.na(all.ids) & all.ids != "-") [1] 12 > ## total > sum(!is.na(all.ids) & all.ids != "-") [1] 13 > ## number of potentially informative characters and total number of characters per fragment > for (f in semistrict.f) { + cat(f) + tmp <- semistrict[, f] + tmp <- tmp[!is.na(tmp)] + tmp.table <- do.call(rbind, strsplit(tmp, split="")) + tmp.table[tmp.table == "-" | tmp.table == "N"] <- NA + cat("\tall chars ", ncol(tmp.table)) + tmp.itic <- apply(tmp.table, 2, Is.tax.inform.char) + cat("\tpotentially taxonomically informative chars ", sum(tmp.itic), "\n") + } abcd all chars 747 potentially taxonomically informative chars 5 efgh all chars 372 potentially taxonomically informative chars 45 > ## to make FASTA, we need to replace NA with long strings filled with "N" > for (i in semistrict.f) semistrict[, i] <- ifelse(is.na(semistrict[, i]), paste(rep("N", max(nchar(semistrict[, i]), na.rm=TRUE)), collapse=""), semistrict[, i]) > ## now join all sequences columns into one > seq <- apply(semistrict[, semistrict.f], 1, function(.x) paste(.x, collapse="")) > semistrict.new <- data.frame(ID=ids, SEQ=seq, stringsAsFactors=FALSE) > Write.fasta(semistrict.new, file=paste0("40_concatenated/", "semistrict.fasta"))