Wednesday, August 30, 2006

R script for KEGG analysis

###########################################################
### R Script to Map Arabidopsis AGI Keys to KEGG Pathways ###
###########################################################
# Author: Thomas Girke, Mar 2006
# (A) Usage
# (A.1) Assign your locus IDs (AGI Keys) to vector
# (A.2) Load kegg_stat_fct as shown under demo (A.4)
# (A.3) Run kegg_stat_fct like this:
# kegg_stat_fct(locus_ids, type=2)
# argument 'type':
# 2 = Pathway
# 4 = PathwayClass
# (A.4) To demo what the script does, run it like this:
# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/KEGG_ATH.txt")
# (B) Dependencies
# (B.1) Pathway category file "KEGG_ATH.annot" created from these pages:
# http://www.genome.jp/kegg/KGML/KGML_v0.6/ath/index.html
# http://www.genome.jp/kegg/KGML/KGML_v0.6/ath/index2.html
# (B.2) Script can be used for other organisms after adjusting the regular expressions in part (1),
# and creating the corresponding pathway category file (KEGG_ATH.annot)

# (C) Script
# (C.1) Import ATH KEGG annotations: the following commands are only necessary to update the info in kegg data frame (see below)
# Import pathway category file "KEGG_ATH.annot"
# kegg <- read.table("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/KEGG_ATH.annot", h=T, sep="\t")
# If an object "mylist" is present then remove it, since the following loop will append to it.
# if(exists("mylist")) { rm(mylist) }
# for loop to import and reformat remote pathway xml files one-by-one by 'PathID' in 'kegg' (103 iterations)
# for(i in 1:length(kegg[,3])) {
# zzz <- readLines(paste("http://www.genome.jp/kegg/KGML/KGML_v0.6/ath/", kegg[i,3], ".xml", sep="")) # read lines into vector
# zzz <- zzz[grep("at\\dg\\d\\d\\d\\d\\d", as.character(zzz), ignore.case = T, perl=T)] # obtain fields with AGI keys
# zzz <- zzz[-grep("link=", as.character(zzz), ignore.case = T, perl=T)] # remove duplications
# zzz <- gsub(".*name=\\\"|\".*", "", as.character(zzz), ignore.case = T, perl=T) # remove xml tags
# zzz <- gsub("ath:", "", as.character(zzz), ignore.case = T, perl=T) # obtain clean AGI keys
# zzz <- unlist(strsplit(zzz, " ")) # split fields with several keys; strsplit generates list of vectors and unlist converts it into one vector
# zzz <- unique(zzz) # makes AGIs unique within each PathID
# if(!exists("mylist")) # if "mylist" does not exist then create it in next line, otherwise append to "mylist" in second line
# { mylist <- list(zzz) } else
# { mylist <- c(mylist, list(zzz)) }
# names(mylist)[i] <- as.vector(kegg[i,3]) # assign names by 'PathID'
# cat(i, as.vector(kegg[i,3]), "\n", sep = " ") # print iteration number and 'PathID' to monitor loop status
# }
# keggat <- data.frame(unlist(mylist)) # transform list of vectors into data frame, tracking of one-to-many relationship is resolved by appending counts to PathIDs
# keggat <- data.frame(PathID=gsub("(ath\\d\\d\\d\\d\\d).*", "\\1", row.names(keggat), perl=T), AGI=keggat[,1]) # remove from PathIDs appended counts
# keggat <- data.frame(PathID=keggat$PathID, AGI=gsub("([A-Z])", "\\U\\1", as.character(keggat$AGI), perl=T, ignore.case=T)) # make AGI keys uniform by setting them to upper case
# kegg <- merge(keggat, kegg, by.x="PathID", by.y="PathID", all.x=T) # merge data frames 'keggat' and 'kegg' (KEGG_ATH.annot) by PathIDs

# If no update is required, then read the Arab KEGG mappings from this file:
# Import the precomputed AGI-to-KEGG mapping table (tab-delimited, first row contains column titles).
# Fix: spell out 'header=TRUE' instead of relying on partial argument matching ('h=T').
kegg <- read.table("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/kegg_ath", header=TRUE, sep="\t")

# (C.2) Define function: "kegg_stat_fct"
kegg_stat_fct <- function(locus_ids, type=2) { # argument 'type': 2 = Pathway, 4 = PathwayClass (see kegg)
    # Summarize a vector of AGI locus IDs by KEGG category.
    # Arguments:
    #   locus_ids: character vector of AGI keys (e.g. "AT1G23800")
    #   type: column of the merged table to count on; 2 = Pathway, 4 = PathwayClass
    # Returns: data frame with columns Cat, Freq, Percent plus the re-merged kegg annotation columns.
    # Depends on the global data frame 'kegg' created above.
    # Obtain kegg data frame that contains only rows for AGI test keys (locus_ids);
    # all.y=TRUE keeps unannotated query IDs so they can be counted as "no_annot".
    my_select <- merge(kegg, as.data.frame(locus_ids), by.x="AGI", by.y=1, all.y=TRUE)
    x <- as.character(my_select[,type]) # vector with Pathway (2) or PathwayClass (4) strings in my_select
    x[is.na(x)] <- "no_annot" # fill NA fields (IDs without KEGG annotation) with "no_annot"
    my_select <- data.frame(count_col=x, my_select[,-type]) # combine x and my_select
    kegg_stat <- data.frame(table(my_select$count_col)) # obtain counts for count_col
    kegg_stat <- data.frame(kegg_stat, Percent=round(kegg_stat$Freq/sum(kegg_stat$Freq)*100,2)) # append percent calculation for counts
    names(kegg_stat)[1] <- "Cat" # give first column a descriptive title
    # Re-attach the kegg annotation columns: merge on column 'type-1' for type==2, on column 'type' for type==4
    if(type==2) {
        kegg_stat <- merge(kegg_stat, kegg[!duplicated(kegg[,type-1]),], by.x="Cat", by.y=type-1, all.x=TRUE)
    } else {
        kegg_stat <- merge(kegg_stat, kegg[!duplicated(kegg[,type]),], by.x="Cat", by.y=type, all.x=TRUE)
    }
    kegg_stat
}

# (C.3) Use function: "kegg_stat_fct"
# Demo section: announces the imported function and runs it on a small AGI example set.
cat("\n", "You have imported the function 'kegg_stat_fct'.", "\n\n", "Here is a short demo for AGI keys:", "\n") # print usage of function
# Example Arabidopsis AGI locus identifiers used for the demo
AGI_vec <- c("AT1G23800", "AT5G63620", "AT1G30120", "AT2G07050", "AT2G22830", "AT2G38700", "AT1G29910")
print(AGI_vec)
# Count the demo IDs per KEGG pathway class (type=4); requires the 'kegg' table loaded above
kegg_stat <- kegg_stat_fct(locus_ids=AGI_vec, type=4)
print(kegg_stat)
# Pie chart of all categories with at least one match; column 2 = Freq, column 3 = Percent
pie(kegg_stat[kegg_stat[,2]>0 ,3], labels=as.vector(kegg_stat[kegg_stat[,2]>0 ,1]), main="KEGG") # plot final result

cat("\n", "Usage:", "\n", "\t kegg_stat <- kegg_stat_fct(locus_ids=AGI_vec, type=4)", "\n", "\t Arguments: 'locus_ids', vector of locus keys; 'type=2', Pathway; 'type=4', PathwayClass", "\n", "\t pie(kegg_stat[kegg_stat[,2]>0 ,3], labels=as.vector(kegg_stat[kegg_stat[,2]>0 ,1]), main=\"KEGG\")", "\n")

R code for gene-to-GO assignment

######################################################################################################
### GOHyperGAll: Global Hypergeometric Test Using Custom Gene-to-GO Mappings Plus GO Slim Analysis ###
######################################################################################################
# Author: Thomas Girke
# Last update: May 26, 2006
# Utility: To test a sample population of genes for over-representation of GO terms, the
# function 'GOHyperGAll' computes for all GO nodes a hypergeometric distribution test and
# returns the corresponding p-values. A subsequent filter function performs a GO Slim
# analysis using default or custom GO Slim categories.
# Note: GOHyperGAll provides similar utilities as the GOHyperG function in the GOstats package
# from BioConductor. The main difference is that GOHyperGAll simplifies the usage of custom
# chip-to-gene and gene-to-GO mappings.
#
# How it works:
# (A) Generate the required data objects (slow, but needs to be done only once)
# (B) Define GOhyperG_All function
# (C) Subsetting and plotting of results by assigned nodes or goSlim categories
# To demo the script and import all required functions, run the following source() command:
# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/GOHyperGAll.txt")
# HTML Instructions:
# http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/R_BioCondManual.html#GOHyperGAll

########################################
# (A) Generate the required data objects
########################################
# (A.1) Generate sample data frames with assigned gene-to-GO mappings,
# one for MF, one for BP and one for CC mappings
# custom mappings can be used here, but need to have the same format as GO_XX_DFs in the following examples
# (A.1.1) Obtain mappings from geneontology.org
# (A.1.1) Obtain mappings from geneontology.org.
# Reads a gene_association table, reduces it to GOID/GeneID/GOCAT and writes the three
# ontology-specific data frames GO_MF_DF, GO_BP_DF and GO_CC_DF to the global workspace and to disk.
# Fix: the BP and CC status messages previously said "gene-to-MFGO" (copy-paste error).
readGOorg <- function(myfile = "gene_association.tair", colno = c(5,3,9), org = "Arabidopsis") {
    # '!' lines are comments in GO annotation files
    go_org <- read.delim(myfile, na.strings = "", header=F, comment.char = "!", sep="\t")
    if(org == "Arabidopsis") {
        # Arabidopsis-specific clean-up: extract AGI keys and remove duplicated GOID/GeneID pairs
        go_org <- read.delim("gene_association.tair", na.strings = "", header=F, comment.char = "!", sep="\t")
        id_vec <- gsub(".*(AT\\dG\\d\\d\\d\\d\\d).*", "\\1", as.character(go_org[,11]), perl=T)
        go_org <- data.frame(go_org, GeneID=id_vec)
        go_org <- go_org[grep("^AT\\dG\\d\\d\\d\\d\\d", as.character(go_org$GeneID), perl=T),]
        go_org <- go_org[!duplicated(paste(go_org[,5], gsub("\\.\\d{1,}", "", as.character(go_org[,16]), perl=T), sep="_")),]
        go_org <- go_org[,c(1:2,16,4:15,3)]
    }
    go_org <- go_org[ ,colno]
    go_org <- na.omit(go_org)
    names(go_org) <- c("GOID", "GeneID", "GOCAT")
    GO_MF_DF <<- go_org[go_org[,3]=="F",]
    write.table(GO_MF_DF, file="GO_MF_DF", quote=T, sep="\t")
    cat("\n", "Object 'GO_MF_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")
    GO_BP_DF <<- go_org[go_org[,3]=="P",]
    write.table(GO_BP_DF, file="GO_BP_DF", quote=T, sep="\t")
    cat("\n", "Object 'GO_BP_DF' created containing assigned gene-to-BPGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")
    GO_CC_DF <<- go_org[go_org[,3]=="C",]
    write.table(GO_CC_DF, file="GO_CC_DF", quote=T, sep="\t")
    cat("\n", "Object 'GO_CC_DF' created containing assigned gene-to-CCGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")
}

# (A.1.2) Obtain mappings from BioC.
# Builds the same three GO_XX_DF data frames from a BioC annotation library (default: ath1121501).
sampleDFgene2GO <- function(lib="ath1121501") {
    require(GOstats); require(lib, character.only=T)
    # generates list with GeneID components containing MFGOs
    affyGOMF <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "MF")
    GO_MF_DF <<- data.frame(GOID=unlist(affyGOMF), GeneID=rep(names(affyGOMF), as.vector(sapply(affyGOMF, length))), Count=rep(as.vector(sapply(affyGOMF, length)), as.vector(sapply(affyGOMF, length))))
    write.table(GO_MF_DF, file="GO_MF_DF", quote=F, sep="\t")
    cat("\n", "Object 'GO_MF_DF' created containing assigned gene-to-MFGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")
    # generates list with GeneID components containing BPGOs
    affyGOBP <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "BP")
    GO_BP_DF <<- data.frame(GOID=unlist(affyGOBP), GeneID=rep(names(affyGOBP), as.vector(sapply(affyGOBP, length))), Count=rep(as.vector(sapply(affyGOBP, length)), as.vector(sapply(affyGOBP, length))))
    write.table(GO_BP_DF, file="GO_BP_DF", quote=F, sep="\t")
    cat("\n", "Object 'GO_BP_DF' created containing assigned gene-to-BPGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")
    # generates list with GeneID components containing CCGOs
    affyGOCC <- eapply(get(paste(lib, "GO", sep=""), mode = "environment"), getOntology, "CC")
    GO_CC_DF <<- data.frame(GOID=unlist(affyGOCC), GeneID=rep(names(affyGOCC), as.vector(sapply(affyGOCC, length))), Count=rep(as.vector(sapply(affyGOCC, length)), as.vector(sapply(affyGOCC, length))))
    write.table(GO_CC_DF, file="GO_CC_DF", quote=F, sep="\t")
    cat("\n", "Object 'GO_CC_DF' created containing assigned gene-to-CCGO mappings. To use custom mappings, generate data frame with the same structure in col 1-2.", "\n")
}

# Usage message for (A.1); fix: the quoted colno default now matches the actual signature 'c(5,3,9)'.
cat("\n", "(A.1) To use the GOHyperGAll() function, one needs 3 data frames containing the gene-to-GO mappings MF, BP and CC. \n Demo data sets from GO.org or BioC can be created with one of these commands: \n \t (A.1.1) For annotations from geneontology.org: \n \t readGOorg(myfile = \"gene_association.tair\", colno = c(5,3,9), org = \"Arabidopsis\") \n \t \t myfile: download annotation table from geneontology.org and unzip it. Then point function to file name. \n \t \t colno: required column numbers; default 'c(5,3,9)' should work in most cases \n \t \t org: needs to be specified only for Arabidopsis mappings \n \t (A.1.2) For annotations from BioC: \n \t sampleDFgene2GO(lib=\"ath1121501\") \n \t \t lib: defines annotation library, default is \"ath1121501\"", "\n")

# (A.2) Generate list containing gene-to-GO-OFFSPRING associations including assigned nodes
# This is very slow (3x3 minutes), but needs to be done only once!
# Build, for each ontology (MF/BP/CC), a list mapping every GO node to the unique gene/affy IDs
# assigned to the node itself or to any of its OFFSPRING nodes; results are exported to the
# global workspace and saved to disk. Requires GO_MF_DF/GO_BP_DF/GO_CC_DF (see A.1) and GOstats.
gene2GOlist <- function() {
    require(GOstats)
    for(i in c("MF","BP","CC")) {
        # Fetch the GOID-to-OFFSPRING mapping for the current ontology
        if(i=="MF") { go_offspr_list <- as.list(GOMFOFFSPRING) }
        if(i=="BP") { go_offspr_list <- as.list(GOBPOFFSPRING) }
        if(i=="CC") { go_offspr_list <- as.list(GOCCOFFSPRING) }
        # Clean-up step for the list
        go_offspr_list <- lapply(go_offspr_list, unlist)
        go_offspr_list <- lapply(go_offspr_list, as.vector)
        # Include each list component's own GOID name in its (GOID) vector
        go_offspr_list_temp <- lapply(names(go_offspr_list), function(x) c(x, go_offspr_list[[x]]))
        names(go_offspr_list_temp) <- names(go_offspr_list)
        go_offspr_list <- go_offspr_list_temp
        # Remove NAs in vectors
        go_offspr_list <- lapply(go_offspr_list, function(x) x[!is.na(x)])
        # Retrieve the gene/affy IDs for every GOID vector, then save the result object
        if(i=="MF") {
            MF_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_MF_DF[GO_MF_DF$GOID %in% x, 2])))
            save(MF_node_affy_list, file="MF_node_affy_list")
        }
        if(i=="BP") {
            BP_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_BP_DF[GO_BP_DF$GOID %in% x, 2])))
            save(BP_node_affy_list, file="BP_node_affy_list")
        }
        if(i=="CC") {
            CC_node_affy_list <<- lapply(go_offspr_list, function(x) unique(as.vector(GO_CC_DF[GO_CC_DF$GOID %in% x, 2])))
            save(CC_node_affy_list, file="CC_node_affy_list")
        }
        cat("\n", paste("Object '", i, "_node_affy_list'", sep=""), "with gene-to-GO-OFFSPRING associations created and saved in your working directory.", "\n")
    }
}
cat("\n", "(A.2) The corresponding gene-to-GO-OFFSPRING associations are created from the three data frames with the following \n command. This creates 3 list objects with the required MF, BP and CC associations. \n \t gene2GOlist()", "\n")
# (A.3) Generate AffyID-to-GeneID mappings when working with chip feature IDs
# This function creates a AffyID-to-GeneID mapping data frame using by default the TAIR mappings for the Arabidopsis ATH1 chip.
# Once the decoding data frame 'affy2locusDF' is created, the function returns for a query set
# of AffyIDs the corresponding GeneIDs. To use the function for the mappings of other chips,
# one needs to create the corresponding decoding data frame 'affy2locusDF'.
AffyID2GeneID <- function(map = "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2006-07-14.txt", affyIDs) {
    # Build (once) the global AffyID-to-GeneID decoding table, then translate 'affyIDs' if given.
    if(!exists("affy2locusDF")) {
        cat("\n", "Downloading AffyID-to-GeneID mappings, creating object 'affy2locusDF' and saving it in your working directory", "\n")
        affy2locus <- read.delim(map, na.strings = "", fill=TRUE, header=T, sep="\t")[,-c(2:4,7:9)]
        names(affy2locus) <- c("AffyID", "AGI", "Desc")
        row.names(affy2locus) <- as.vector(affy2locus[,1])
        # Expand ';'-separated AGI fields into one AffyID-to-GeneID row each
        my_list <- apply(affy2locus[,-c(3)], 1, list)
        my_list <- lapply(my_list, unlist)
        my_list <- lapply(my_list, function(x) as.vector(x[-1]))
        my_list <- lapply(my_list, strsplit, ";")
        my_list <- lapply(my_list, unlist)
        affy2locusDF <- data.frame(unlist(my_list))
        affy2locusDF <- data.frame(rep(names(unlist(lapply(my_list, length))), as.vector(unlist(lapply(my_list, length)))), affy2locusDF)
        names(affy2locusDF) <- c("AffyID", "GeneID")
        affy2locusDF <<- affy2locusDF
        write.table(affy2locusDF, file="affy2locusDF", quote=F, sep="\t")
    }
    if(!missing(affyIDs)) {
        GeneIDs <- unique(as.vector(affy2locusDF[affy2locusDF[,1] %in% affyIDs, 2]))
        return(GeneIDs)
    }
}
cat("\n", "(A.3) To work with AffyIDs, the function AffyID2GeneID() can be used to import custom AffyID-to-GeneID mappings. \n \t AffyID2GeneID(map = \"ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2006-07-14.txt\") \n \t \t map: location of custom AffyID-to-GeneID mappings", "\n")

# (A.4) Next time things are much faster by reading the 6 data objects from file
loadData <- function() {
    # Re-import the data frames written by readGOorg()/sampleDFgene2GO()/AffyID2GeneID();
    # 'affy2locusDF' is optional and only loaded when present in the working directory.
    need_affy2gene <- dir()
    if(length(need_affy2gene[need_affy2gene=="affy2locusDF"]) > 0) {
        affy2locusDF <<- read.table(file="affy2locusDF", header=T, colClasses = "character")
    }
    GO_MF_DF <<- read.table(file="GO_MF_DF", header=T, colClasses = "character")
    GO_BP_DF <<- read.table(file="GO_BP_DF", header=T, colClasses = "character")
    GO_CC_DF <<- read.table(file="GO_CC_DF", header=T, colClasses = "character")
}
# Fix: message previously read "one can can omit".
cat("\n", "(A.4) In future R sessions one can omit the previous 3 steps (A.1-A.3) by importing all 6 (7) data objects like this: \n \t loadData(); load(file=\"MF_node_affy_list\"); load(file=\"BP_node_affy_list\"); load(file=\"CC_node_affy_list\")", "\n")

###########################
# (B) GOhyperG_All function
###########################
# (B.1) Define GOhyperG_All function
GOHyperGAll <- function(gocat="MF", sample) {
    # Hypergeometric over-representation test of 'sample' (gene/chip IDs) against ALL nodes
    # of the chosen GO ontology. Requires the *_node_affy_list and GO_*_DF objects (see A).
    # (go_df): generates data frame containing the commonly used components for all GO nodes:
    # GOID, GO Term and Ontology Type (created once per session).
    require(GOstats)
    if(!exists("go_df")) {
        go_df <- data.frame(GOID=unlist(eapply(GOTERM, function(x) x@GOID)), Term=unlist(eapply(GOTERM, function(x) x@Term)), Ont=unlist(eapply(GOTERM, function(x) x@Ontology)))
        go_df <<- na.omit(go_df)
        cat("\n", "Object 'go_df' created containing for all GO nodes the commonly used components: GOID, GO Term and Ontology Type", "\n")
    }
    # (m): obtain for every node in GO tree their number of associated genes or chip features
    if(gocat=="MF") { node_affy_list <- MF_node_affy_list }
    if(gocat=="BP") { node_affy_list <- BP_node_affy_list }
    if(gocat=="CC") { node_affy_list <- CC_node_affy_list }
    node_stats_df <- data.frame(NodeSize=sapply(node_affy_list, length))
    node_stats_df <- data.frame(GOID=row.names(node_stats_df), node_stats_df)
    row.names(node_stats_df) <- 1:length(node_stats_df[,1])
    m <- as.vector(node_stats_df$NodeSize)
    # (x): obtain for every node in GO tree the number of matching genes in sample set
    node_sample_stats <- sapply(node_affy_list, function(x) { sum(unlist(x) %in% sample) } )
    node_sample_stats <- as.vector(node_sample_stats)
    x <- node_sample_stats
    # (n): obtain the number of unique genes associated at all GO nodes
    if(gocat=="MF") { GO_DF <- GO_MF_DF }
    if(gocat=="BP") { GO_DF <- GO_BP_DF }
    if(gocat=="CC") { GO_DF <- GO_CC_DF }
    n <- length(unique(GO_DF[, 2]))
    # (k): obtain number of unique genes in test sample that have GO mappings
    k <- length(unique(GO_DF[GO_DF[,2] %in% sample, 2]))
    # Obtain gene/chip keys matching at GO nodes
    match_key <- sapply(node_affy_list, function(x) { x[unlist(x) %in% sample] } )
    match_key <- sapply(match_key, function(x) { paste(x, collapse=" ") } )
    match_key <- as.vector(match_key)
    key <- match_key; key[key==""] <- "NA"
    # Apply phyper function across all nodes at once
    phyp_v <- phyper(x-1, m, n-m , k, lower.tail = FALSE)
    result_df <- data.frame(node_stats_df, SampleMatch=x, Phyper=phyp_v, SampleKeys=key)
    # Fix: the original passed 'x.by='/'y.by=' -- invalid merge() argument names that were
    # silently ignored (it merged on the shared column only by accident); use by.x/by.y.
    result_df <- merge(result_df, go_df, by.x="GOID", by.y="GOID", all.x=T)
    result_df <- result_df[order(result_df$Phyper), ]
    result_df <- result_df[,c(1:4,6:7,5)]
    result_df
}
cat("\n", "(B.1) The function GOHyperGAll() runs the phyper test against all nodes in the GO network. \n Usage: \n \t GOHyperGAll(gocat=\"MF\", sample=test_sample)[1:20,] \n \t \t gocat: \"MF\", \"BP\", \"CC\" \n \t \t test_sample <- unique(as.vector(GO_MF_DF[1:40,2])) # for GeneIDs\n \t \t test_sample <- AffyID2GeneID(affyIDs=affy_sample) # for AffyIDs \n \t \t affy_sample <- c(\"266592_at\", \"266703_at\", \"266199_at\", \"246949_at\", \"267370_at\", \"267115_s_at\", \"266489_at\", \"259845_at\", \"266295_at\", \"262632_at\")", "\n")

###################################################################################
# (C) Subsetting of results from GOHyperGAll by assigned nodes or goSlim categories
###################################################################################
# (C.1) Define subsetting function
# type: "goSlim" or "assigned"; optional argument "myslimv" to provide custom goSlim vector
GOHyperGAll_Subset <- function(GOHyperGAll_result, sample=test_sample, type="goSlim", myslimv) {
    if(type=="goSlim") {
        if(missing(myslimv)) {
            # Default GO Slim categories
            slimv <- c("GO:0030246","GO:0008289","GO:0003676","GO:0000166","GO:0019825","GO:0005515","GO:0003824","GO:0030234","GO:0005554","GO:0003774","GO:0004871","GO:0005198","GO:0030528","GO:0045182","GO:0005215","GO:0000004","GO:0006519","GO:0007154","GO:0007049","GO:0016043","GO:0006412","GO:0006464","GO:0006810","GO:0007275","GO:0007049","GO:0006519","GO:0005975","GO:0006629","GO:0006139","GO:0019748","GO:0015979","GO:0005618","GO:0005829","GO:0005783","GO:0005768","GO:0005794","GO:0005739","GO:0005777","GO:0009536","GO:0005840","GO:0005773","GO:0005764","GO:0005856","GO:0005634","GO:0005886","GO:0008372","GO:0005576")
        } else {
            slimv <- myslimv
        }
        GOHyperGAll_subset <- GOHyperGAll_result[GOHyperGAll_result[,1] %in% slimv, ]
    }
    if(type=="assigned") {
        # Fix: this branch previously referenced the global 'test_sample' instead of the
        # 'sample' argument, so custom sample vectors were silently ignored here.
        termGO <- c(as.vector(GO_MF_DF[GO_MF_DF$GeneID %in% sample, 1]), as.vector(GO_BP_DF[GO_BP_DF$GeneID %in% sample, 1]), as.vector(GO_CC_DF[GO_CC_DF$GeneID %in% sample, 1]))
        subset_v <- unique(termGO)
        GOHyperGAll_subset <- GOHyperGAll_result[GOHyperGAll_result[,1] %in% subset_v, ]
    }
    GOHyperGAll_subset
}
cat("\n", "(C.1) The function GOHyperGAll_Subset() allows subsetting of the GOHyperGAll() results by assigned GO nodes or custom goSlim categories.", "\n", "Usage:", "\n", "\t GOHyperGAll_result <- GOHyperGAll(gocat=\"MF\", sample=test_sample)", "\n", "\t GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type=\"goSlim\")", "\n", "\t \t type: \"goSlim\" or \"assigned\"", "\n", "\t \t myslimv: optional argument allows usage of a custom goSlim vector", "\n")
# Apply subsetting function
# GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type="goSlim")
# (C.2) Plotting of subsetted results
# subset <- GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type="goSlim")
# pie(subset[subset$SampleMatch>0 ,3], labels=as.vector(subset[subset$SampleMatch>0 ,1]), main=unique(as.vector(subset[subset$SampleMatch>0 ,6])))
# Print the (C.2) usage example: pie chart of the subsetted GOHyperGAll results
cat("\n", "(C.2) Plot pie chart of subsetted results: \n \t subset <- GOHyperGAll_Subset(GOHyperGAll_result, sample=test_sample, type=\"goSlim\") \n \t pie(subset[subset$SampleMatch>0 ,3], labels=as.vector(subset[subset$SampleMatch>0 ,1]), main=unique(as.vector(subset[subset$SampleMatch>0 ,6])))", "\n")

########################################################
# (D) Reduce GO Term Redundancy in 'GOHyperGAll_results'
########################################################
# (D.1) The function 'GOHyperGAll_Simplify' subsets the data frame 'GOHyperGAll_result' by a user
# specified p-value cutoff and removes from it all GO nodes with overlapping children sets
# (OFFSPRING). Only the best scoring nodes remain in the data frame.
# The argument 'correct' is experimental. It aims to favor the selection of distal (information rich)
# GO terms that have at the same time a large number of sample matches. The following calculation is used
# for this adjustment: phyper x Number_of_children / SampleMatch
# Define GOHyperGAll_Simplify()
GOHyperGAll_Simplify <- function(GOHyperGAll_result, gocat="MF", cutoff=0.001, correct=T) { # gocat: "MF", "BP" or "CC"; cutoff: p-value cutoff; correct: TRUE or FALSE
    # Remove GO-term redundancy: keep only the best-scoring node of every group of nodes
    # with overlapping OFFSPRING sets. Returns data frame with columns GOID and GO_OL_Match.
    # The GO category of the result table has to match the requested 'gocat'.
    if(gocat!=as.vector(GOHyperGAll_result$Ont[!is.na(GOHyperGAll_result$Ont)])[1]) {
        stop("The GO categories in GOHyperGAll_Simplify() and GOHyperGAll_result need to match")
    }
    # Candidate set: nodes passing the p-value cutoff, with a work column 'test'
    testDF <- GOHyperGAll_result[GOHyperGAll_result$Phyper<=cutoff,]
    testDF <- data.frame(testDF, test=rep(0, times=length(testDF[,1])))
    testDF <- testDF[!is.na(testDF$Ont),]
    GOIDv <- NULL
    GO_OL_Matchv <- NULL
    while(sum(testDF$test==0)>0) {
        clusterv <- NULL
        test <- as.vector(testDF[,1])
        for(j in 1:length(test)) {
            # Overlap between candidate j's offspring set and the top node's offspring set;
            # an overlap of exactly 1 is recoded to candidate j's own offspring count
            if(gocat=="MF") {
                mymatch <- sum(unique(na.omit(c(test[j], as.list(GOMFOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOMFOFFSPRING)[[test[1]]]))))
                if(mymatch==1) { mymatch <- length(as.list(GOMFOFFSPRING)[[test[j]]]) }
            }
            if(gocat=="BP") {
                mymatch <- sum(unique(na.omit(c(test[j], as.list(GOBPOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOBPOFFSPRING)[[test[1]]]))))
                if(mymatch==1) { mymatch <- length(as.list(GOBPOFFSPRING)[[test[j]]]) }
            }
            if(gocat=="CC") {
                mymatch <- sum(unique(na.omit(c(test[j], as.list(GOCCOFFSPRING)[[test[j]]])) %in% na.omit(c(test[1], as.list(GOCCOFFSPRING)[[test[1]]]))))
                if(mymatch==1) { mymatch <- length(as.list(GOCCOFFSPRING)[[test[j]]]) }
            }
            clusterv <- c(clusterv, mymatch)
        }
        clusterv[clusterv==0] <- NA
        testDF <- data.frame(testDF[,-8], test=clusterv)
        # Experimental 'correct' adjustment: phyper x Number_of_children / SampleMatch
        if(correct==T) {
            testDF <- data.frame(testDF, decide=testDF$Phyper * (testDF$test/testDF$SampleMatch))
        } else {
            testDF <- data.frame(testDF, decide=testDF$Phyper)
        }
        # Record the winning node and the number of unique sample keys covered by the overlap group
        GOIDv <- c(GOIDv, as.vector(testDF[order(testDF[,9]),][1,1]))
        GO_OL_Matchv <- c(GO_OL_Matchv, length(unique(unlist(strsplit(as.vector(testDF[!is.na(testDF$test),7]), " ")))))
        # Continue with the non-overlapping remainder
        testDF <- testDF[is.na(testDF$test),]
        testDF <- testDF[order(testDF[,4]),-c(8,9)]
        testDF <- data.frame(testDF, test=rep(0, times=length(testDF[,1])))
        cat(GOIDv, "\n")
    }
    simplifyDF <- data.frame(GOID=GOIDv, GO_OL_Match=GO_OL_Matchv)
    simplifyDF
}
# Apply GOHyperGAll_Simplify
# simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result, gocat="MF", cutoff=0.001, correct=T)
# data.frame(GOHyperGAll_result[GOHyperGAll_result[,1] %in% simplifyDF[,1], -7], GO_OL_Match=simplifyDF[,2])
#######################################
# (D.2) Batch Analysis of Gene Clusters
#######################################
# The function 'GOCluster_Report' performs a 'GOHyperGAll_Simplify' analysis in batch mode.
# It processes many groups of genes (e.g.
# from gene expression clusters) and organizes the results in a single data frame.
# The gene sets or clusters need to be provided in a data frame of this structure:
# probeID/geneID ClusterID ClusterSize
# id1 CL1 2
# id2 CL1 2
# id3 CL2 1
# ... ... ...
#
# Define 'GOCluster_Report()'
GOCluster_Report <- function(CL_DF=CL_DF, CLSZ=10, cutoff=0.001, gocat=c("MF", "BP", "CC"), recordSpecGO=NULL) {
    # CLSZ: cluster size cutoff; gocat: "MF", "BP" or "CC"; cutoff: p-value cutoff;
    # recordSpecGO: include one specific GOID in each of the 3 ontologies,
    # e.g.: recordSpecGO=c("GO:0005554", "GO:0000004", "GO:0008372")
    cluster_loop <- unique(as.vector(CL_DF[CL_DF[,3]>CLSZ,2]))
    if(length(cluster_loop[grep("CL", cluster_loop)])>0) {
        # Order "CL<number>" cluster IDs numerically and prepare the result container
        cluster_loop <- paste("CL", sort(as.numeric(gsub("CL","", as.character(cluster_loop)))), sep="")
        containerDF <- data.frame(CLID=NULL, CLSZ=NULL, GOID=NULL, NodeSize=NULL, SampleMatch=NULL, Phyper=NULL, Term=NULL, Ont=NULL, GO_OL_Match=NULL)
    }
    for(i in cluster_loop) {
        cat("\n", "Processing cluster no", i, "\n")
        affy_sample <- CL_DF[CL_DF[,2]==i, 1]
        test_sample <- AffyID2GeneID(affyIDs=affy_sample)
        containerDF2 <- data.frame(GOID=NULL, NodeSize=NULL, SampleMatch=NULL, Phyper=NULL, Term=NULL, Ont=NULL, GO_OL_Match=NULL)
        count <- 0
        for(j in c("MF", "BP", "CC")) {
            count <- count+1
            GOHyperGAll_result <- GOHyperGAll(gocat=j, sample=test_sample)
            simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result, gocat=j, cutoff=cutoff, correct=T)
            # Fall back to the two best nodes when nothing passes the cutoff
            if(length(simplifyDF)==0) {
                simplifyDF <- GOHyperGAll_Simplify(GOHyperGAll_result[1:2,], gocat=j, cutoff=1, correct=T)
            }
            tempDF <- data.frame(GOHyperGAll_result[GOHyperGAll_result[,1] %in% simplifyDF[,1], -7], GO_OL_Match=simplifyDF[,2])
            containerDF2 <- rbind(containerDF2, tempDF)
            # Optionally record one user-specified GOID per ontology
            if(length(recordSpecGO)>0) {
                containerDF2 <- rbind(containerDF2, data.frame(GOHyperGAll_result[GOHyperGAll_result[,1]==recordSpecGO[count],-7], GO_OL_Match=GOHyperGAll_result[GOHyperGAll_result[,1]==recordSpecGO[count],3]))
            }
            # Count sample members without any annotation in the current ontology
            no_annot <- test_sample[!test_sample %in% eval(parse(text=paste("GO_", j, "_DF", sep="")))[,2]]
            no_annot <- length(no_annot[no_annot!="no_match"])
            containerDF2 <- rbind(containerDF2, data.frame(GOID=paste("no_annot_", j, sep=""), NodeSize=NA, SampleMatch=no_annot, Phyper=NA, Term=NA, Ont=j, GO_OL_Match=no_annot))
        }
        tempDF2 <- data.frame(CLID=rep(i, times=length(containerDF2[,1])), CLSZ=rep(unique(as.vector(CL_DF[CL_DF[,2]==i,3])), times=length(containerDF2[,1])), containerDF2)
        containerDF <- rbind(containerDF, tempDF2)
    }
    containerDF
}
# Apply GOCluster_Report
# BatchResult <- GOCluster_Report(CL_DF=CL_DF, CLSZ=10, cutoff=0.001, gocat=c("MF", "BP", "CC"), recordSpecGO=c("GO:0005554", "GO:0000004", "GO:0008372"))

R code for vector and Phred score trimming

####################################################################################################
### R & BioConductor Script for Trimming Adaptor and Low Quality Positions of 454 sRNA Sequences ###
####################################################################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Utility: This script brings 454 sequences into the proper orientation, trims off the
# adaptor segments and tags the low quality calls (default score <14) with Ns. The
# current draft implementation is not very fast. The processing of 50,000 sequences
# will take 15-25 minutes.
# Requirements: cross_match program from the Phred/Phrap/Consed package
# Code tag where one can adjust filter settings: '### adjust'
# How it works:
# (A) Some format editing on the command-line
# Requirements:
# clean directory containing
# (a) 454.seq and 454.seq.qual files
# (b) The 2 adaptor files are expected to be in parent directory under the names
# adaptfor.txt
# adaptrev.txt
# Make sure the extension of *.qual matches the one expected by cross_match, e.g.
# $ mv 454.qual 454.seq.qual
# For the import of the *.qual data into R, the *.qual file needs to have spaces at end of lines:
# $ perl -p -i -w -e 's/(^.*)/$1 /g' 454.seq.qual
# (B) Processing in R
# Run the following script from the directory with your 454 sequences like this:
# source("454_sRNA_Trim.R")
# or
# R CMD BATCH --no-save 454_sRNA_Trim.R

# Start of R script
# (1) Split sequence batch into smaller sets of 50,000 per file
# Define Sequence Import Function 'seq_imp_fct'
seq_imp_fct <- function(fileloc) {
    # Import a FASTA(-like) file and return a data frame with one row per record:
    #   Desc   - the '>' header line
    #   Length - number of characters of the concatenated sequence
    #   seq    - sequence with line breaks removed
    my_fasta <- readLines(fileloc) # reads file line-wise into vector
    y <- regexpr("^[^>]", my_fasta, perl=TRUE) # identifies all fields that do not start with a '>' sign
    y <- as.vector(y); y[y==-1] <- 0 # 0 marks description/header lines
    index <- which(y==0) # positions of the header lines
    # Start/end line of each record. Fix: the original 'index[1:(length(index)-1)]'
    # construct failed for files containing a single record; guard that case explicitly.
    if(length(index) > 1) {
        distance <- data.frame(start=index[seq_len(length(index)-1)], end=index[2:length(index)])
        distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry
    } else {
        distance <- data.frame(start=index, end=length(y)+1)
    }
    distance <- data.frame(distance, dist=distance[,2]-distance[,1])
    seq_no <- 1:length(y[y==0])
    index <- rep(seq_no, as.vector(distance[,3])) # record number for every input line
    my_fasta <- data.frame(index, y, my_fasta)
    my_fasta[my_fasta[,2]==0,1] <- 0 # group all header lines under dummy record '0'
    # Concatenate the sequence lines of each record
    seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=FALSE)
    # Fix: tapply(..., simplify=FALSE) returns a list; 'as.vector()' kept it a list, which
    # breaks nchar()/data.frame() in modern R. Flatten to a character vector instead,
    # dropping the header group '0'.
    seq <- unlist(seq[2:length(seq)], use.names=FALSE)
    Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
    my_fasta <- data.frame(Desc, Length=nchar(seq), seq)
    my_fasta
}
# Generate smaller sequence batches of max 50,000 sequences per file. This is required for cross_match
# NOTE(review): this section expects '454.seq' and '454.seq.qual' in the working directory
# (see header instructions) and writes paired '<i>.seq' / '<i>.seq.qual' slice files.
seq <- seq_imp_fct(fileloc="454.seq")
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
# Slice boundaries: 50,000 records per output file; the last slice is trimmed to the actual record count
slice <- data.frame(FROM=seq(1,length(seq[,1]), by=50000), TO=seq(1,length(seq[,1]), by=50000)+49999)
slice[length(slice[,1]),2] <- length(seq[,1])
for (i in 1:length(slice[,1])) {
# Interleave Desc/seq rows (t() + as.vector) so the output file is plain FASTA-style text
write.table(as.vector(as.character(t(export_seq[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
}
# Same slicing for the quality file; 'slice' is reused so seq and qual files stay in sync
qual <- seq_imp_fct(fileloc="454.seq.qual")
export_qual <- data.frame(Desc=qual[,1], seq=qual[,3])
for (i in 1:length(slice[,1])) {
write.table(as.vector(as.character(t(export_qual[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq.qual", sep=""), quote=F, row.names=F, col.names=F)
}

# (2) Run cross_match in 2 steps to tag forward adaptors with Ns and reverse adaptors with Xs
# Run cross_match on all generated *.seq files:
# NOTE(review): this loop shells out to the external 'cross_match' binary and Unix tools
# (rm, mv, cat) — confirm they are on PATH before running.
for (i in 1:length(slice[,1])) {
if(i==1) { system(paste("rm -f ", "all.seq.screen", sep="")) } # removes in first loop an already existing "all.seq.screen" file, since data slices will be appended to a file of this name
# next import steps protect Ns that are inserted by 454.com for positions with score=0
seq <- seq_imp_fct(fileloc=paste(i, ".seq", sep=""))
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
# Recode pre-existing Ns to Z so cross_match's own N-tagging stays distinguishable
export_seq$seq <- gsub("N", "Z", as.character(seq$seq))
write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
### adjust
system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptfor.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for forward adaptor only
# next import steps tag forward adaptor with N's
seq <- seq_imp_fct(fileloc=paste(i, ".seq.screen", sep=""))
export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
# cross_match marked forward-adaptor bases with X; convert them to N
export_seq$seq <- gsub("X", "N", as.character(seq$seq))
write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq.screen", sep=""), quote=F, row.names=F, col.names=F)
system(paste("mv ", paste(i, ".seq", sep=""), ".screen ", paste(i, ".seq", sep=""), sep=""))
### adjust
system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptrev.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for reverse adaptor only, gets tagged with X's
# Append this slice to the combined output, then clean up the per-slice files
system(paste("cat ", paste(i, ".seq", sep=""), ".screen ", ">> ", "all.seq.screen", sep=""))
system(paste("rm ", "-f ", paste(i, ".seq ", sep=""), paste(i, ".seq.qual ", sep=""), paste(i, ".seq.screen ", sep=""), paste(i, ".seq.log", sep=""), sep=""))
}

# (3) Import of cross_matched sequences into R
# Import of 'all.seq.screen'
seq <- seq_imp_fct(fileloc="all.seq.screen")
cat("\n", "Sequence batch imported containing ", length(seq[,1]), " entries", "\n")
# Tag forward adapter with "F" and reverse adapter with "R", and revert Zs (454 Ns) back to Ns
# (forward adaptors were masked as Ns, reverse adaptors as Xs by the cross_match runs above)
seqedit <- gsub("N", "F", as.character(seq$seq))
seqedit <- gsub("X", "R", as.character(seqedit))
seqedit <- gsub("Z", "N", as.character(seqedit))
seq <- data.frame(seq[,-3], seq=seqedit)
# Shorten the description line to its first whitespace-delimited token (the read ID)
seq <- data.frame(Desc=gsub(" .*", "", as.character(seq[,1]), perl=T), seq[,-1])
qual <- seq_imp_fct(fileloc="454.seq.qual")
cat("\n", "Quality batch imported containing ", length(qual[,1]), " entries", "\n")
# Collapse runs of multiple spaces in the quality strings to single spaces
qual <- data.frame(qual[,-3], seq=gsub("[ ]{2,}", " ", as.character(qual$seq)))
# qual[qual[,3]=="N",3] <- 0 # removed May 08, 06
qual <- data.frame(Desc=gsub(" .*", "", as.character(qual[,1]), perl=T), qual[,-1])

# (4) For larger data sets, loop over slices of 10,000 sequences
# In this loop the sequences are (1) vector trimmed, (2) quality trimmed and (3) orientation adjusted.
# FROM/TO row coordinates of the 10,000-sequence slices; the last TO is the total count
slicedf <- data.frame(FROM=seq(1, length(seq[,1]), by=10000),
TO=c(seq(0, length(seq[,1]), by=10000)[2:((as.integer(length(seq[,1])/10000))+1)],
length(seq[,1])))
appendDF <- data.frame(Seq=NULL, Phred=NULL) # collects the per-slice results
for(i in 1:length(slicedf[,1])){
cat("\n", "Start of 10K slice no: ", i, "\n")
seqslice <- seq[slicedf[i,1]:slicedf[i,2],]
qualslice <- qual[slicedf[i,1]:slicedf[i,2],]
# Write sequences and qual data in vectorized format into list files
seqslicel <- strsplit(as.character(seqslice[,3]),"") # one character vector per sequence
qualslicel <- strsplit(as.character(qualslice[,3])," ") # one score vector per sequence
names(seqslicel) <- seqslice[,1]
names(qualslicel) <- qualslice[,1]
qualslicel <- lapply(qualslicel, as.numeric)
# Replaces all NTs with Phred < 14 by Ns; this includes vector/adaptor positions
### adjust quality cutoff below if necessary
seqtriml <- lapply(names(seqslicel), function(x) replace(seqslicel[[x]], which(qualslicel[[x]]<14), "N"))
names(seqtriml) <- names(seqslicel)
# Convert all Ns that fall into adaptor regions back to Fs and Rs
seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="F"), "F"))
names(seqtriml) <- names(seqslicel)
seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="R"), "R"))
names(seqtriml) <- names(seqslicel)

# To determine orientation of sequences, generate data frame with adaptor positions
zzz <- lapply(seqtriml, paste, collapse="") # collapse back to strings for the regex searches
# Locate the first F-run and first R-run per sequence (gregexpr returns -1 when absent)
posF <- lapply(names(zzz), function(x) gregexpr("(F){1,}", as.character(zzz[[x]]), perl=T))
posR <- lapply(names(zzz), function(x) gregexpr("(R){1,}", as.character(zzz[[x]]), perl=T))
names(posF) <- names(seqtriml); names(posR) <- names(seqtriml)
myposF <- unlist(lapply(names(posF), function(x) as.vector(unlist(posF[[x]][[1]]))[1]))
myposR <- unlist(lapply(names(posR), function(x) as.vector(unlist(posR[[x]][[1]]))[1]))
myposF[myposF==-1] <- 0; myposR[myposR==-1] <- 0 # 0 = adaptor not found
mylengthF <- lapply(names(posF), function(x) as.vector(unlist(attributes(posF[[x]][[1]])))[1]) # match.length of the first F-run
mylengthF <- unlist(mylengthF)
mylengthF[mylengthF==-1] <- 0
mylengthR <- lapply(names(posR), function(x) as.vector(unlist(attributes(posR[[x]][[1]])))[1]) # match.length of the first R-run
mylengthR <- unlist(mylengthR)
mylengthR[mylengthR==-1] <- 0
# F1/F2 and R1/R2 are start/end of the forward/reverse adaptor runs; a missing
# adaptor yields end == -1 (pos 0 + length 0 - 1), which the rules below rely on
orientationDF <- data.frame(F1=myposF, F2=myposF+mylengthF-1, R1=myposR, R2=myposR+mylengthR-1, Length=seqslice[,2])
# Orient = +1 when the R adaptor starts downstream of the F adaptor end, -1 otherwise
orientationDF <- data.frame(orientationDF,
Orient=(orientationDF[,3]-orientationDF[,2])/abs(orientationDF[,3]-orientationDF[,2]))
# The following steps are for sequences that contain only one adaptor. This approximation seems to give fairly good results.
orientationDF[orientationDF[,2]==-1 & orientationDF[,3] < (orientationDF[,5]/2), 6] <- -1
orientationDF[orientationDF[,4]==-1 & orientationDF[,1] < ((orientationDF[,5]/2)*0.4), 6] <- 1
orientationDF <- data.frame(ID=names(seqtriml), orientationDF)

# Generate data frame with insert parsing positions
zzz <- lapply(seqtriml, paste, collapse="")
# fix for gregexpr version change between R 2.2.0 and 2.3.0
# R version 2.0.0:# pos <- lapply(names(zzz), function(x) gregexpr("[ATGCN]{1,}", as.character(zzz[[x]]), perl=T))
# A leading space is prepended so every insert is preceded by one non-ATGCN character
pos <- lapply(names(zzz), function(x) gregexpr("[^ATGCN][ATGCN]+", paste("", as.character(zzz[[x]])), perl=T))
names(pos) <- names(seqtriml)
mypos <- as.vector(unlist(pos))
mypos[mypos==-1] <- 0
mylength <- lapply(names(pos), function(x) as.vector(unlist(attributes(pos[[x]][[1]]))))
mylength <- unlist(mylength)
# fix for gregexpr version change between R 2.2.0 and 2.3.0
mylength[mylength==-1] <- 0
mylength <- mylength-1 # subtract the leading non-ATGCN character from each match length
mylength[mylength==-1] <- 0
count <- lapply(names(pos), function(x) length(pos[[x]][[1]])) # number of inserts per sequence
count <- as.vector(unlist(count))
IDs <- rep(names(pos), count)
# UniID appends a running insert number to the read ID (ID.1, ID.2, ...)
InsertFrame <- data.frame(ID=IDs, UniID=paste(IDs, unlist(sapply(count, function(x) seq(1,x))), sep="."),
InsertCount=rep(count,count), Pos=mypos, Length=mylength)
InsertFrame <- merge(InsertFrame, orientationDF, by.x="ID", by.y="ID", all.x=T)

# Position parsing of inserts using InsertFrame
InsertFrame <- data.frame(InsertFrame, End=InsertFrame[,4]+InsertFrame[,5]-1)
InsertFrame <- InsertFrame[,c(1:5, length(InsertFrame), 6:(length(InsertFrame)-1))] # move 'End' next to 'Pos'/'Length'
InsertFrame <- InsertFrame[!InsertFrame$Pos==0, ] # added May 8, 06 to remove no-insert sequences
mylist <- apply(InsertFrame[,c(4,6)], 1, list) # per insert: c(Pos, End)
mylist <- lapply(mylist, function(x) as.vector(as.matrix(unlist(x))))
names(mylist) <- as.vector(InsertFrame$UniID)
myindex <- as.vector(InsertFrame$ID)
seqtriml_long <- seqtriml[myindex]; names(seqtriml_long) <- as.vector(InsertFrame$UniID) # creates long version of 'seqtriml' that contains entries with several inserts in duplicates
quall_long <- qualslicel[myindex]; names(quall_long) <- as.vector(InsertFrame$UniID) # creates long version of 'quall' that contains entries with several inserts in duplicates
insert_list <- lapply(names(mylist), function(x) seqtriml_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])]) # cut out each insert by its Pos:End range
insert_qual_list <- lapply(names(mylist), function(x) quall_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])]) # matching quality values
names(insert_list) <- as.vector(InsertFrame$UniID)
names(insert_qual_list) <- as.vector(InsertFrame$UniID)

# Reverse and complement of antisense sequences
insert_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_list[[x]]))
insert_list_rev <- lapply(insert_list_rev, paste, collapse="")
names(insert_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
# Complement via digit placeholders so already-swapped bases are not swapped back
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("A", "1", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("T", "2", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("C", "3", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("G", "4", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("1", "T", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("2", "A", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("3", "G", x))
insert_list_rev <- lapply(insert_list_rev, function(x) gsub("4", "C", x))
insert_qual_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_qual_list[[x]]))
names(insert_qual_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
insert_qual_list_rev <- lapply(insert_qual_list_rev, paste, collapse=" ")

# Combine everything in final sequence object
insert_list_for <- insert_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])] # sense inserts stay as they are
insert_list_for <- lapply(insert_list_for, paste, collapse="")
insert_list_final <- c(insert_list_rev, insert_list_for)
insert_list_final <- insert_list_final[names(insert_list)] # restore the original insert order
insert_qual_list_for <- insert_qual_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
insert_qual_list_for <- lapply(insert_qual_list_for, paste, collapse=" ")
insert_qual_list_final <- c(insert_qual_list_rev, insert_qual_list_for)
insert_qual_list_final <- insert_qual_list_final[names(insert_list)]

# Remove terminal Ns. Most of the code here is to adjust qual data set.
tempDF <- data.frame(as.data.frame(unlist(insert_list_final)), as.data.frame(unlist(insert_qual_list_final)))
names(tempDF)[1:2] <- c("Seq", "Phred")
# Number of Ns stripped from each end = length difference before/after trimming
remove_start <- nchar(as.character(tempDF$Seq)) - nchar(gsub("^N{1,}", "", as.character(tempDF$Seq), perl=T))
remove_end <- nchar(as.character(tempDF$Seq)) - nchar(gsub("N{1,}$", "", as.character(tempDF$Seq), perl=T))
NremoveDF <- data.frame(remove_start, remove_end)
tempDF2 <- data.frame(row.names=row.names(tempDF), Seq=gsub("^N{1,}|N{1,}$", "", as.character(tempDF$Seq), perl=T), tempDF[,2])
qual_removel <- strsplit(as.character(tempDF2[,2]), " ")
names(qual_removel) <- row.names(tempDF2)
NremoveDF <- data.frame(remove_start, remove_end)
Nremovel <- apply(NremoveDF, 1, list)
Nremovel <- lapply(Nremovel, function(x) as.vector(as.matrix(unlist(x))))
names(Nremovel) <- row.names(tempDF2)
# Trim the same number of leading/trailing values from the quality vectors
qual_removel <- lapply(names(qual_removel), function(x) qual_removel[[x]][(1+Nremovel[[x]][1]):(length(as.vector(qual_removel[[x]]))-Nremovel[[x]][2])])
qual_removel <- lapply(qual_removel, paste, collapse=" ")
tempDF <- data.frame(row.names=row.names(tempDF2), Seq=tempDF2[,1], Phred=unlist(qual_removel))

appendDF <- rbind(appendDF, tempDF)
cat("\n", "End of 10K slice no: ", i, "\n")
}
# (5) Create final result data frame
finalDF <- appendDF
# Locate all Ns per insert; 'zzz' holds the match positions (-1 when there is no N)
zzz <- gregexpr("N", as.character(finalDF[,1]), perl=T)
Ncount <- lapply(zzz, function(x) length(unlist(x))); Ncount <- unlist(Ncount)
# Obtain Ncount: for_loop here necessary for very large sequence sets (>100,000)
# Inserts without any N still report one match value (-1) above, so 'Ncountfix'
# collects the first match value per insert and those counts are reset to zero below
names(zzz) <- rownames(finalDF)
incDF <- data.frame(FROM=seq(1, length(zzz), by=10000),
TO=c(seq(0, length(zzz), by=10000)[2:((as.integer(length(zzz)/10000))+1)], length(zzz)))
Ncountfix <- NULL
for(i in 1:length(incDF[,1])) {
Ncountfix <- c(Ncountfix, unlist(lapply(names(zzz[incDF[i,1]:incDF[i,2]]), function(x) as.vector(unlist(zzz[[x]][[1]]))[1])))
cat("\n", "N count loop no: ", i, "\n")
}
Ncount[which(Ncountfix==-1)] <- 0

# Add insert length, mean Phred score per insert and N count as columns
finalDF <- data.frame(finalDF,
InsertLength=nchar(as.character(finalDF[,1])),
mPhred=as.vector(apply(finalDF, 1, function(x) mean(as.numeric(unlist(strsplit((as.character(x[2]))," ")))))),
Ncount=Ncount
)
names(finalDF)[1:2] <- c("Seq", "Phred")
finalDF <- data.frame(ID=rownames(finalDF), finalDF)

# (6) Filtering steps
# Remove all sequences that have insert lengths of less than 15bp
### adjust minimum insert length below if necessary
finalDF <- finalDF[finalDF$InsertLength>=15,]
# Remove inserts with >=20% Ns
### adjust maximum N fraction below if necessary
finalDF <- finalDF[(finalDF$Ncount/finalDF$InsertLength)<=0.2,]

# Adjust name extensions to filter results
# After filtering, the '.n' insert suffixes are renumbered so they are consecutive
# again within each read; the temporary 'sort' column restores the original row order
namev <- gsub("\\..*$", "", as.character(rownames(finalDF)), perl=T) # base read ID without insert suffix
namev <- paste(sort(namev), unlist(lapply(as.vector(table(sort(namev))), function(x) 1:x)), sep=".")
finalDF <- data.frame(finalDF, sort=1:length(finalDF[,1]))
finalDF <- data.frame(ID=sort(namev), finalDF[order(rownames(finalDF)),-1])
finalDF <- finalDF[order(finalDF$sort),]

# Export sequences and qual data to two fasta batch files
# (interleaved description/sequence rows, written as plain text)
export_seq <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,2])
write.table(as.vector(as.character(t(export_seq))), file="final_seq.txt", quote=F, row.names=F, col.names=F)
export_qual <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,3])
write.table(as.vector(as.character(t(export_qual))), file="final_qual.txt", quote=F, row.names=F, col.names=F)

# # Optional steps
# # Average Phred scores of inserts
# mylist <- strsplit(as.character(finalDF[,3]), " "); mylist <- lapply(mylist, as.numeric); mean(unlist(lapply(mylist, mean)))


Parsing sequences by positional coordinates using R

##############################################
# Sequence Parsing by Positional Coordinates #
##############################################
# Utility:
# Script parses sequences based on positional information provided
# by a coordinate matrix like the one below. The parsing is performed
# on vectorized sequences in a list container.
# Format of coordinate matrix
# ID start end
# ID001 20 60
# ID001 120 190
# ID002 20 60
# ID002 120 190
#
# To demo what the script does, run it like this:
# source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/positionParsing.txt")

# Create a sample set of sequences (here random sequences)
fx <- function(test) { # the argument is ignored; it only serves the loop interface below
  # Draw 200 uniform values in [1, 5), truncate to the integers 1..4 and
  # translate each integer into a nucleotide via a lookup vector.
  draws <- as.integer(runif(200, min=1, max=5)) # sets length of each random sequence to '200'
  paste(c("A", "T", "G", "C")[draws], sep = "", collapse = "")
}
z1 <- character(0)
for(i in 1:5) { # sets number of generated sequences to '5'
  z1 <- c(fx(i), z1) # prepend, so entries end up in reverse call order
}
SeqID <- c("ID001","ID002","ID003","ID004","ID005")
z1 <- data.frame(SeqID=SeqID, Seq=z1)
cat("Source sequences:", "\n")
print(z1)

# Write the sequence data frame into a list object.
# Lists are more versatile than data frames for sets of vectorized sequences of
# variable length, so each sequence is stored as a character vector of single bases.
seq_list <- strsplit(as.character(z1[,2]), "") # one character vector per sequence
names(seq_list) <- SeqID # names sequence entries by their IDs

# Generate coordinate data frame with positions to be parsed from sequences
# (two fragments per sequence: positions 20-60 and 120-190).
pos_matrix <- data.frame(ID=rep(SeqID, each=2), start=rep(c(20,120),5), end=rep(c(60,190),5))
cat("Position matrix:", "\n")
print(pos_matrix)

# Define sequence parse function
seq_parse_fct <- function(seq_list, pos_matrix) {
  # Parses fragments out of vectorized sequences by positional coordinates.
  # seq_list:   named list; each component is a character vector of single bases
  # pos_matrix: data frame with columns ID, start, end (one row per fragment);
  #             several rows per ID extract several fragments from one sequence
  # Returns:    named list (one component per sequence, named after seq_list)
  #             of lists of fragment character vectors
  # Every sequence ID must have coordinates in pos_matrix
  # (the original check used '> 1', silently tolerating one missing ID)
  if (length(setdiff(names(seq_list), as.vector(pos_matrix[,1]))) > 0)
    stop("The number of sequences and coordinate groups differ")
  parse_list_all <- vector("list", length(seq_list)) # preallocated result container
  names(parse_list_all) <- names(seq_list) # name by the input IDs, not by a global variable
  for(i in names(seq_list)) { # loop over each sequence ID
    sub_pos_matrix <- pos_matrix[pos_matrix[,1]==i,]
    parse_list <- vector("list", length(sub_pos_matrix[,1])) # fragments of this sequence
    for(j in 1:length(sub_pos_matrix[,1])) { # loop over all fragments of a sequence
      # direct start:end indexing replaces the original eval(parse(text=...))
      index <- sub_pos_matrix[j,2]:sub_pos_matrix[j,3]
      parse_list[[j]] <- seq_list[[i]][index]
    }
    parse_list_all[[i]] <- parse_list
  }
  parse_list_all
}
# Parse the demo sequences and show the fragment set of the first entry
parse_list_all <- seq_parse_fct(seq_list, pos_matrix)
cat("Vectorized sequence fragments are managed in the list object 'parse_list_all'.", "\n", "Print first fragment set as sample:", "\n")
print(parse_list_all[[1]])

# Collapse sequence vectors into strings:
# for each ID, join its fragment vectors and collapse the bases into one string
parse_seq_list <- lapply(parse_list_all, function(fragments) paste(unlist(fragments), collapse=""))
cat("Concatenated sequence fragments:", "\n")
parsed_result <- data.frame(SeqID, Parsed_Seq=unlist(parse_seq_list))
print(parsed_result)

Role of R in Sequence Analysis

######################################
# (A) Sequence Batch Import Function #
######################################
# This script provides the following functionality for sequence management and analysis:
# (1) Batch sequence import into R data frame
# (2) Motif searching with hit statistics
# (3) Analysis of sequence composition
# (4) All-against-all sequence comparisons
# (5) Generation of phylogenetic trees

# Download sample proteome from NCBI in fasta format. The chosen example is from Halobacterium sp. which contains 2058 proteins:
# NOTE(review): this FTP URL dates from 2006 and may no longer resolve; point it at a current mirror if the download fails.
myfileloc <- "ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp/AE004437.faa"
# Larger example from Arabidopsis with 30690 proteins:
# ftp://ftp.arabidopsis.org/home/tair/home/tair/Sequences/blast_datasets/TAIR6_pep_20051108

# (A1) Define Sequence Import Function 'seq_imp_fct'
# This function does the following:
# (a) reads fasta batch file into a single vector
# (b) writes each sequence into a single field using a grouping vector
# (c) provides length information for each sequence
# (d) organizes description lines and sequences in a single data frame

seq_imp_fct <- function(fileloc) {
# Batch fasta import: reads a fasta file and returns a data frame with one row
# per sequence and the columns Desc (fasta header line), Length (sequence
# length in characters) and seq (full sequence collapsed to one string).
my_fasta <- readLines(fileloc) # reads file line-wise into vector
y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign
y <- as.vector(y); y[y==-1] <- 0 # 1 = sequence line, 0 = description line
# NOTE(review): an empty line would also get y == 0 and be treated as a record
# start -- assumes the fasta file contains no blank lines; confirm for new inputs.

# The 'rep' function is used here to generate a grouping vector/column for writing each sequence into a single field
# This 'indirect' approach runs much faster than an element-wise 'for loop'.
index <- which(y==0) # line numbers of the description lines
# For each record: start = its header line, end = the next header line (or one
# past the last file line for the final record). This formulation also works
# for single-sequence files, where the original 'index[1:(length(index)-1)]'
# construct produced a corrupt frame because 1:0 counts down.
distance <- data.frame(start=index, end=c(index[-1], length(y)+1))
distance <- data.frame(distance, dist=distance[,2]-distance[,1]) # lines per record (header included)
seq_no <- 1:length(y[y==0])
index <- rep(seq_no, as.vector(distance[,3])) # grouping vector: record number for every file line
my_fasta <- data.frame(index, y, my_fasta)
my_fasta[my_fasta[,2]==0,1] <- 0 # all description lines form their own group '0'

# Use the 'paste' function in a 'tapply' loop to write each sequence into a single field
seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
# Drop group '0' (the headers) and flatten to a character vector; tapply with
# simplify=F returns a list, and passing that list straight into data.frame()
# below mangles the result in current R versions.
seq <- unlist(seq[2:length(seq)], use.names=FALSE)

Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
my_fasta <- data.frame(Desc, Length=nchar(seq), seq)
my_fasta
}
# (A2) Apply 'seq_imp_fct' to import Halobacterium proteome
Halobact <- seq_imp_fct(fileloc=myfileloc)
# Generate separate column with accession numbers. This regular expression needs to be adjusted for other title line formats
# (it captures the token between 'gb|' and the following '|' in the NCBI header)
Acc <- gsub(".*gb\\|(.*)\\|.*" , "\\1", as.character(Halobact[,1]), perl=T) # regular expression specific for this example
Halobact <- data.frame(Acc, Halobact)

#################################
# (B) Pattern Matching Function #
#################################
# (B1) Define pattern function
# This function counts the matches of a user defined pattern (reg expr) in a sequence data frame generated by above 'seq_imp_fct'.
# In addition to the pattern counts, the function provides the positions of the pattern matches.
pattern_fct <- function(pattern, sequences) { # expects sequences in fourth column of 'sequence' data frame
  # Count matches of a user-defined pattern (regular expression) per sequence
  # and record where they occur. Returns the input data frame with two extra
  # columns inserted before the sequence column: Position (comma-separated
  # match start positions, '0' when there is no match) and Hits (match count).
  match_pos <- gregexpr(pattern, as.character(sequences[,4]), perl=T) # one vector of start positions per sequence
  pos_strings <- vapply(match_pos, paste, character(1), collapse=", ")
  pos_strings[pos_strings==-1] <- 0 # gregexpr flags 'no match' as -1
  hit_counts <- unlist(lapply(match_pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # hits per sequence
  data.frame(sequences[,1:3], Position=as.vector(pos_strings), Hits=hit_counts, sequences[,4])
}
# (B2) Apply pattern function
patterndf <- pattern_fct(pattern="H..H{1,2}", sequences=Halobact)
# Print a title message to explain the following output
cat("These sequences in the Halobacterium proteome contain the motif \"H..H{1,2}\" more than two times:","\n")
print(patterndf[patterndf[,5]>2, 1:5])
# Tag pattern in sequence by setting pattern to upper case and remaining sequence to lower case
# (\\L and \\U are Perl-style case conversions applied to the captured group)
tag <- gsub("([A-Z])", "\\L\\1", as.character(patterndf[patterndf[,5]>2, 6]), perl=T, ignore.case=T)
tag <- gsub("(H..H{1,2})", "\\U\\1", as.character(tag), perl=T, ignore.case=T)
export <- data.frame(patterndf[patterndf[,5]>2,-6], tag)
export <- data.frame(Acc=paste(">", export[,1], sep=""), seq=export[,6])
# Possibility to export any sequence subset in fasta format to an external file
# (interleaved header/sequence rows form a plain fasta file)
write.table(as.vector(as.character(t(export))), file="export.txt", quote=F, row.names=F, col.names=F)
# Print a title message to explain the output
cat("The sequences with the \"H..H{1,2}\" pattern in upper case have been written to a file 'export.txt' in your current working directory", "\n", "\n")

# (B3) Check AA distribution in sequences
# Compute the amino acid composition (in percent) for every sequence that
# contains the motif more than twice, then summarize with a per-AA mean.
AA <- data.frame(AA=c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"))
# Per-sequence residue counts (column 6 holds the sequence, column 5 the hit count)
AAstat <- lapply(strsplit(as.character(patterndf[patterndf[,5]>2, 6]),""), table)
for(i in 1:length(AAstat)) {
AAperc <- AAstat[[i]] / patterndf[patterndf[,5]>2, 3][i] *100 # calculates AA content in percent using length information (column 3)
AA <- merge(AA, as.data.frame(AAperc), by.x="AA", by.y="Var1", all=T) # one percentage column per sequence; absent AAs become NA
}
names(AA)[2:length(AA)] <- as.vector(patterndf[patterndf[,5]>2, 1]) # label columns with sequence accessions
AASummary <- data.frame(AA, Mean=apply(AA[,2:length(AA)], 1, mean, na.rm=T))
# (A no-op 'for' loop that merely subsetted 'patterndf' without assigning or
# printing anything was removed here; it had no effect on the results.)
# Print a title message to explain the following output
cat("The identified sequences with two or more \"H..H{1,2}\" motifs have the following AA composition:", "\n")
print(AASummary)
cat("If you encounter in the next steps error messages, then Needle and/or Phylip may not be available on your system!!!!!!!", "\n")

######################################
# Run Needle in All-Against-All Mode #
######################################
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# This part is for UNIX-like OSs that have EMBOSS installed
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# The following code runs the Needle program from EMBOSS in all-against-all mode
# to generate a distance matrix based on scores obtained in each pairwise alignment.
# The same code can be used to run Water, BLAST2 or other pairwise sequence analysis
# programs in all-against-all mode.
# In the last step the obtained distance matrix is used to plot a similarity tree in R.
# The same matrix can also be used to calculate a phylogenetic tree with the Phylip
# software (see below).
patterndf <- patterndf[patterndf[,5]>2,] # start with above patterndf
system("rm -f my_needle_file") # remove results from any previous run (scores are appended below)
for(i in 1:length(patterndf[,1])) {
cat(as.character(paste(">", as.vector(patterndf[i,1]),sep="")), as.character(as.vector(patterndf[i,6])), file="file1", sep="\n") # write sequence i in fasta format
for(j in 1:length(patterndf[,1])) {
cat(as.character(paste(">", as.vector(patterndf[j,1]),sep="")), as.character(as.vector(patterndf[j,6])), file="file2", sep="\n") # write sequence j in fasta format
system("needle file1 file2 stdout -gapopen 10.0 -gapextend 0.5 >> my_needle_file") # append pairwise alignment to result file
}
}

# Import needle results and plot them as tree
score <- readLines("my_needle_file")
score <- score[grep("^# Score", score, perl=T)] # keep only the score lines of the needle report
score <- gsub(".* " , "", as.character(score), perl=T) # strip everything except the numeric score
score <- as.numeric(score)
# Scores arrive with j varying fastest, so the alignments of sequence i fill column i
scorem <- matrix(score, length(patterndf[,1]), length(patterndf[,1]), dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1])))
scorem <- as.dist(1/scorem) # invert scores so higher similarity means smaller distance
hc <- hclust(scorem, method="complete")
plot(hc, hang = -1, main="Distance Tree Based on Needle All-Against-All Comparison")
# Print a title message to explain the generated output
cat("The created all-against-all distance matrix ('scorem') of Needle scores is plotted in form of a distance tree.", "\n")

###################
# PHYLIP Analysis #
###################
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# This part is for UNIX-like OSs that have PHYLIP installed
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# In this step the obtained distance matrix is exported in Phylip format to a file named 'infile'.
# This file is used to generate a phylogenetic tree with the 'neighbor' program in Phylip. After this
# step the retree module is used to view and edit the tree.
# Rebuild the full (non-dist) score matrix; the alignments of sequence i fill column i.
n_seq <- length(patterndf[,1])
scorem <- matrix(score, n_seq, n_seq, dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1])))
z <- 1/scorem # invert scores so higher similarity means smaller distance
zv <- as.vector(z)
# Zero the diagonal (self-comparison distances). The original hard-coded
# 'seq(1, 64, by=9)' and 'matrix(zv, 8, 8, ...)', which was only correct for
# exactly 8 sequences; this version works for any number of sequences.
zv[seq(1, n_seq*n_seq, by=n_seq+1)] <- 0
z <- matrix(zv, n_seq, n_seq, dimnames=list(names(as.data.frame(scorem)), names(as.data.frame(scorem))))
z <- round(z, 7)
write.table(as.data.frame(z), file="infile", quote=F, col.names=F, sep="\t")
# Print a title message to explain the generated output
cat("The distance matrix was exported in Phylip format to a file named 'infile'. Follow the instructions in the next comment lines to continue with the Phylip anaylsis.", "\n")
# system("vim infile") # add number of sequences in first line
# system("phylip neighbor") # choose: r, n, y
# system("mv -f outtree intree")
# system("phylip retree") # choose: y, m, x, y, r

genomics: June 2006

genomics: June 2006