> ## this sctipt takes DNA database and for each fragment, make separate FASTA file > ## all "our" sequences used > ## only those external source sequences used which (a) belong to species / fragments absent in "our" and (b) longest > > library(shipunov) package 'shipunov', version 1.3-1 > set <- read.table("_kubricks_dna.txt", sep="\t", h=TRUE, as.is=TRUE) > set <- set[order(set$FRAGMENT, set$SPECIES.NEW), ] > ## do not use deselected sequences > set <- set[set$SELECT == 1 | is.na(set$SELECT), ] > ## split data frame into list > sets <- split(set[, c("SOURCE", "SPECIES.NEW", "SEQUENCE.ID", "SEQUENCE")], set$FRAGMENT) > for (s in names(sets)) { + subs <- sets[[s]] + subs.misu <- subs[subs$SOURCE == "MISU", ] # keep ours + subs.misu.sp <- unique(sort(subs.misu$SPECIES.NEW)) + subs.genbank <- subs[subs$SOURCE != "MISU", ] # not ours + subs.genbank <- subs.genbank[!subs.genbank$SPECIES %in% subs.misu.sp, ] # remove species which we already have + subs.genbank <- subs.genbank[order(subs.genbank$SPECIES.NEW, nchar(subs.genbank$SEQUENCE), decreasing=TRUE), ] # sort by length, longer first + subs.genbank <- subs.genbank[!duplicated(subs.genbank$SPECIES), ] # take the first + subs <- rbind(subs.misu, subs.genbank) # join back + subs$ID <- apply(subs[, c("SPECIES.NEW", "SEQUENCE.ID")], 1, function(.x) paste0(.x, collapse="__")) # separator between species and everything else is "__", the rest is unchanged + subs[, c("SOURCE", "SPECIES.NEW", "SEQUENCE.ID")] <- NULL # remove columns except 2 + subs <- subs[, c("ID", "SEQUENCE")] # ID first + Write.fasta(subs, file=paste0("20_sets/", s, ".fasta")) + }