#' --- #' title: "Generate mRNA isoform level functional labels for mouse" #' authors: "Gaurav Kandoi, Julie A. Dickerson" #' date: "November 13, 2018" #' affiliation: "Iowa State University of Science and Technology" #' license: "CC BY 4.0" #' --- #' ## Load libraries. #' #' Load the required libraries and define the *tuples* function required to create the pairs and *make_table* function to re-format BioCyc results. #' #install.packages("data.table") #install.packages("gtools") #install.packages("plyr") #source("http://bioconductor.org/biocLite.R") #biocLite("GO.db") #install.packages("splitstackshape") #install.packages("tidyverse") #install.packages("ontologyIndex") library(data.table) library(gtools) library(plyr) library(GO.db) library(splitstackshape) library(tidyverse) library(ontologyIndex) options(stringsAsFactors = F) tuples <- function(x) { return(data.frame( permutations(length(x), 2, x, repeats.allowed = T, set = F), stringsAsFactors = F )) } make_table <- function(infile) { example <- cSplit(infile, "Genes.of.pathway", sep = " // ") row.names(example) <- row.names(infile) infile1 <- as.matrix(example) row.names(infile1) <- row.names(example) class(infile1) <- "table" new <- as.data.frame(infile1) new1 <- new[complete.cases(new), ] new1$Var2 <- NULL colnames(new1) <- c("Pathway", "Gene_ID") new1 } #' #' ## Define all files to read #' #' + Download the latest GO data from: http://geneontology.org/page/download-annotations #' + Download the Gene to Transcript mapping from: ftp://ftp.ncbi.nih.gov/genomes/M_musculus/GFF/ #' + Download the latest Gene Ontology structure (Core) in obo format from: http://www.geneontology.org/page/download-ontology #' + Download the current gene-pathway data for Mouse from KEGG: http://rest.kegg.jp/link/mmu/pathway #' + Download mapview file to convert EntrezID in Kegg to GeneName for Genome version: GRCm38.p4: ftp://ftp.ncbi.nlm.nih.gov/genomes/M_musculus/mapview/seq_gene.md.gz #' + Download Encode meta data: https://www.encodeproject.org/metadata/type=Experiment&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&assay_title%21=ChIP-seq&assay_title%21=DNase-seq&assay_title%21=ATAC-seq&files.file_type=fastq&assay_title%21=WGBS/metadata.tsv #' ## ------------------------------------------------------------------------ GO_Annotations <- "../Data/MMU_GO_23Oct17.gz" Gene_to_Trans <- "../Data/ref_GRCm38.p4_top_level.gff3.gz" GOstruct <- "../Data/go_25Oct17.obo.gz" KeggPath <- "../Data/MMU_Kegg_25Sept17.txt.gz" CycPath <- "../Data/MouseCyc_GeneName_25Sept17.txt.gz" mapview <- "../Data/seq_gene.md.gz" APID <- "../Data/MMU_APID_L2_25Sept17.txt.gz" BioGRID <- "../Data/MMU_BioGrid_3.4.152_25Sept17.txt.gz" IID <- "../Data/MMU_IID_042017_25Sept17.txt.gz" mentha <- "../Data/MMU_Mentha_25Sept17.txt.gz" IntAct <- "../Data/MMU_IntAct_Web_25Sept17.txt.gz" #' #' ## Format GO data #' #' The gene to transcript mapping was obtained using the gff3 files (GRCm38.p4) downloaded from [NCBI](ftp://ftp.ncbi.nih.gov/genomes/M_musculus/GFF/). We will use this to classify genes into Single/Multi isoform genes. #' We read the mapping file and select only those entries which correspond to an mRNA and then extract the gene and transcript IDs. ## ------------------------------------------------------------------------ MGI_GeneName <- read.table( Gene_to_Trans, header = F, sep = "\t", quote = "\"", stringsAsFactors = F ) MGI_GeneName <- MGI_GeneName %>% filter(MGI_GeneName$V3 == "mRNA") MGI_GeneName <- droplevels(unique(MGI_GeneName)) MGI_GeneName <- MGI_GeneName %>% mutate(Trans_ID = sapply(MGI_GeneName$V9, function(x) sub(".*ID= *(.*?) *;.*", "\\1", x))) %>% mutate(Transcript_ID = sapply(MGI_GeneName$V9, function(x) sub(".*transcript_id= *(.*?) *", "\\1", x))) %>% mutate(Gene_ID = sapply(MGI_GeneName$V9, function(x) sub(".*gene= *(.*?) *;.*", "\\1", x))) %>% dplyr::select(Transcript_ID, Gene_ID, Trans_ID) #' #' Finding GO term ancestors using the OntologyIndex package. #' ## ------------------------------------------------------------------------ GO <- get_ontology( GOstruct, propagate_relationships = c("is_a", "part_of"), extract_tags = "everything" ) GO_Type <- data.frame( GO_ID = rep(names(GO$namespace), lapply(GO$namespace, length)), Type = unlist(GO$namespace), row.names = NULL ) %>% filter(Type == "biological_process") GO_Anc <- data.frame( GO_ID = rep(names(GO$ancestors), lapply(GO$ancestors, length)), Ancestors = unlist(GO$ancestors), row.names = NULL ) BP_Anc_Table1 <- unique(droplevels(GO_Anc[GO_Anc$GO_ID %in% GO_Type$GO_ID, ])) %>% filter(GO_ID != Ancestors) BP_Child_Table1 <- BP_Anc_Table1 %>% rename(Children = GO_ID, GO_ID = Ancestors) rm(GO) rm(GO_Type) rm(GO_Anc) gc() gc() #' #' Read the gene ontology annotations #' ## ------------------------------------------------------------------------ MMU_GO <- read.table( GO_Annotations, header = F, sep = "\t", comment.char = "!", quote = "\"" ) #' #' Remove annotations with evidence codes, *IEA: Inferred from Electronic Annotation*, *NAS: Non-traceable Author Statement* and *ND:No biological Data available*. For now, we are only interested in the **Biological Process** ontology. #' ## ------------------------------------------------------------------------ MMU_GOP <- MMU_GO %>% filter( MMU_GO$V9 == "P" & MMU_GO$V4 == "" & MMU_GO$V7 != "IEA" & MMU_GO$V7 != "ND" & MMU_GO$V7 != "NAS" & MMU_GO$V13 == "taxon:10090" ) %>% select(2, 3, 5) %>% unique() colnames(MMU_GOP) <- c("Gene_ID", "GeneName", "GO_ID") #' #' Finding single and multiple isoform coding genes. #' ## ------------------------------------------------------------------------ Counts <- plyr::count(unique(MGI_GeneName[, -3]), vars = "Gene_ID") MMU_SIG <- Counts %>% dplyr::filter(freq == 1) %>% dplyr::select(Gene_ID) rm(Counts) #' #' Now, we can merge the biological process annotations with the gene-transcript id mapping to generate transcript level annotations. #' ## ------------------------------------------------------------------------ MMU_GOP1 <- MMU_GOP %>% left_join(MGI_GeneName, by = c("GeneName" = "Gene_ID")) %>% select(1, 2, 3, 4) %>% unique() rm(MMU_GOP) #' #' Propagating the GO term labels to all it's ancestor GO terms. #' ## ------------------------------------------------------------------------ MMU_GOP_Propagated <- MMU_GOP1 %>% left_join(BP_Anc_Table1, by = "GO_ID") %>% unique() colnames(MMU_GOP_Propagated) <- c("GOTerm", "GeneName", "Gene_ID", "Transcript_ID", "GO_ID") MMU_GOP_Prop <- rbind(MMU_GOP1, MMU_GOP_Propagated[, -1]) %>% unique() rm(MMU_GOP_Propagated) rm(MMU_GOP1) rm(BP_Anc_Table1) gc() #' #' There are several GO terms which are very broad, like *GO:0008150 - biological_process*, or very specific like *GO:0044849 - estrous cycle*. We will remove GO terms which have more than 1000 or less than 10 genes annotated to them and remove duplicate annotations. #' ## ------------------------------------------------------------------------ GO_Remove <- MMU_GOP_Prop %>% count(GO_ID, GeneName) %>% count(GO_ID) %>% filter(nn < 10 | nn > 1000) MMU_GOP_Prop1 <- MMU_GOP_Prop %>% filter(!GO_ID %in% GO_Remove$GO_ID) %>% unique() rm(GO_Remove) #' #' Next, let's split the annotations into single and multiple isoform producing genes. #' ## ------------------------------------------------------------------------ MMU_GOP_Prop_SIG <- MMU_GOP_Prop1[(MMU_GOP_Prop1$GeneName) %in% (MMU_SIG$Gene_ID), ] %>% unique() #MMU_GOP_Prop_MIG <- MMU_GOP_Prop[(MMU_GOP_Prop$GeneName) %in% (MMU_MIG$GeneName),] #MMU_GOP_Prop_MIG <- droplevels(MMU_GOP_Prop_MIG) rm(MMU_GOP_Prop1) #' #' ## Format Pathway data #' #' Download the current gene-pathway data for Mouse from KEGG: http://rest.kegg.jp/link/mmu/pathway. Also download the gene-pathway data from BioCyc. In this, we consider that the Biocyc data downloaded is in the following format: #' #' Pathway1 Gene1 // Gene2 // ... // GeneN #' Pathway2 Gene1 // Gene2 // ... // GeneN #' Pathway3 Gene1 // Gene2 // ... // GeneN #' #' ## ------------------------------------------------------------------------ KEGG <- unique(read.table(KeggPath, header = F, sep = "\t")) MMU_BioCyc <- read.table( CycPath, header = T, sep = "\t", quote = "\"'", row.names = 1 ) MMU_BioCyc$Genes.of.pathway <- gsub(pattern = "\"", replacement = "", MMU_BioCyc$Genes.of.pathway) MMUCyC <- make_table(MMU_BioCyc) rm(MMU_BioCyc) #' #' Because the gene identifiers in KEGG are the Entrez_GeneID, we also need to map these IDs to GeneNames. For this, we will use the mapview files from NCBI Mouse Build37.2 ftp://ftp.ncbi.nlm.nih.gov/genomes/M_musculus/mapview/seq_gene.md.gz # Genome version: GRCm38.p4 #' ## ------------------------------------------------------------------------ Gene_mapping <- read.table(mapview, sep = "\t", header = T, comment.char = "") %>% filter(feature_type == "GENE") %>% select(10, 11) %>% mutate(feature_id = gsub("GeneID:" , "", feature_id, ignore.case = T)) %>% unique() #' #' The identifiers used in KEGG are gene id's. So, we map these to gene names and then merge it with the gene name-transcript mapping to get the transcript level pathway information. #' ## ------------------------------------------------------------------------ KEGG$V2 <- gsub(pattern = "mmu:", replacement = "", KEGG$V2) KEGG$V1 <- gsub(pattern = "path:", replacement = "", KEGG$V1) KEGG_GeneID_Mapped <- KEGG %>% left_join(Gene_mapping, by = c("V2" = "feature_id")) %>% unique() colnames(KEGG_GeneID_Mapped) <- c("Pathway", "GeneID", "Gene_ID") rm(KEGG) rm(Gene_mapping) KEGG_Transcript_Mapped <- KEGG_GeneID_Mapped %>% left_join(MGI_GeneName, by = "Gene_ID") %>% select(-GeneID,-Trans_ID) %>% unique() rm(KEGG_GeneID_Mapped) #' #' Similarly, we map transcripts to the BioCyc pathway data. #' ## ------------------------------------------------------------------------ MMUCyC_Mapped <- MMUCyC %>% left_join(MGI_GeneName, by = "Gene_ID") %>% select(-Trans_ID) %>% unique() rm(MMUCyC) #' #' We can now merge the KEGG and BioCyc data into a single dataframe. #' ## ------------------------------------------------------------------------ KEGG_Cyc_Merged <- rbind(MMUCyC_Mapped, KEGG_Transcript_Mapped) %>% unique() rm(KEGG_Transcript_Mapped) rm(MMUCyC_Mapped) gc() gc() #' #' Now, let's split the pathways data into single and multiple isoform producing genes. #' ## ------------------------------------------------------------------------ MMU_KEGG_Cyc_SIG <- KEGG_Cyc_Merged %>% filter(Gene_ID %in% MMU_SIG$Gene_ID) %>% unique() %>% droplevels() MMU_KEGG_Cyc_SIG <- unique(data.table(apply(MMU_KEGG_Cyc_SIG, c(1, 2), as.character))) %>% droplevels() rm(KEGG_Cyc_Merged) #' #' ## Format Protein-Protein Interaction data #' #' Download the current protein-protein interaction data from: #' - Agile Protein Interactomes DataServer (APID) #' #' Level 2 dataset which includes interactions with atleast 2 experimental evidences was obtained. ## ------------------------------------------------------------------------ MMU_APID <- read.table(APID, header = T, sep = "\t") MMU_APID_PPI <- MMU_APID[, c(4, 7)] MMU_APID_PPI <- droplevels(unique(MMU_APID_PPI)) rm(MMU_APID) colnames(MMU_APID_PPI) <- c("GeneA", "GeneB") #' #' - Biological General Repository for Interaction Datasets (BioGRID) #' #' We remove the interactions which involve non-mouse species. #' ## ------------------------------------------------------------------------ MMU_Biogrid <- read.table(BioGRID, header = T, sep = "\t", comment.char = "") MMU_Biogrid_PPI <- MMU_Biogrid %>% filter(Organism.Interactor.A == 10090 & Organism.Interactor.B == 10090) %>% dplyr::select(Official.Symbol.Interactor.A, Official.Symbol.Interactor.B) MMU_Biogrid_PPI <- droplevels(unique(MMU_Biogrid_PPI)) rm(MMU_Biogrid) colnames(MMU_Biogrid_PPI) <- c("GeneA", "GeneB") #' #' - Integrated Interactions Database (IID) #' #' Because there is no perfect way to accurately verify the interactions inferred from orthology, we remove all such interactions for which there is only orthologous evidence. #' ## ------------------------------------------------------------------------ MMU_IID <- read.table(IID, header = T, sep = "\t") MMU_IID_PPI <- MMU_IID %>% filter(evidence.type != "ortho" & symbol1 != "-" & symbol2 != "-") %>% dplyr::select(symbol1, symbol2) MMU_IID_PPI <- droplevels(unique(MMU_IID_PPI)) rm(MMU_IID) colnames(MMU_IID_PPI) <- c("GeneA", "GeneB") #' #' - mentha #' #' Used interactions with a score greater than 0.2 based on the line, "a single high throughput pull down experiment will receive a score of 0.2" in the mentha manual (http://www.nature.com/nmeth/journal/v10/n8/extref/nmeth.2561-S1.pdf ; Supp Note 2). #' ## ------------------------------------------------------------------------ MMU_Mentha <- read.table(mentha, header = T, sep = ";") MMU_Mentha_PPI <- MMU_Mentha %>% filter(Score >= 0.2) %>% dplyr::select(Gene.A, Gene.B) MMU_Mentha_PPI <- droplevels(unique(MMU_Mentha_PPI)) rm(MMU_Mentha) colnames(MMU_Mentha_PPI) <- c("GeneA", "GeneB") #' #' - IntAct Molecular Interaction Database #' #' First we remove interactions which have atleast one non-mouse interactor. The primary identifiers in IntAct data are the uniprot IDs. However, there is information about the gene name in the Aliases columns. We will extract the gene names using these aliases values. #' ## ------------------------------------------------------------------------ MMU_IntAct <- read.table( IntAct, header = T, sep = "\t", stringsAsFactors = F, strip.white = T, quote = "\"", comment.char = "" ) MMU_IntAct_PPI <- MMU_IntAct %>% filter( Taxid.interactor.A == "taxid:10090(mouse)|taxid:10090(Mus musculus)" & Taxid.interactor.B == "taxid:10090(mouse)|taxid:10090(Mus musculus)" ) %>% dplyr::select(Alias.es..interactor.A, Alias.es..interactor.B) %>% mutate(GeneA = sapply(Alias.es..interactor.A, function(x) sub(".*psi-mi: *(.*?) *\\(display_short.*", "\\1", x))) %>% mutate(GeneB = sapply(Alias.es..interactor.B, function(x) sub(".*psi-mi: *(.*?) *\\(display_short.*", "\\1", x))) %>% dplyr::select(GeneA, GeneB) MMU_IntAct_PPI$GeneA <- sapply(MMU_IntAct_PPI$GeneA, gsub, pattern = "_mouse_gene", replacement = "") MMU_IntAct_PPI$GeneB <- sapply(MMU_IntAct_PPI$GeneB, gsub, pattern = "_mouse_gene", replacement = "") MMU_IntAct_PPI <- droplevels(unique(MMU_IntAct_PPI)) rm(MMU_IntAct) gc() #' #' We can now merge all PPI data. #' ## ------------------------------------------------------------------------ MMU_PPI <- unique(rbind( MMU_APID_PPI, MMU_IID_PPI, MMU_IntAct_PPI, MMU_Mentha_PPI, MMU_Biogrid_PPI )) %>% mutate_if(is.factor, as.character) %>% filter(GeneA != GeneB) %>% unique() rm(MMU_APID_PPI) rm(MMU_Biogrid_PPI) rm(MMU_IID_PPI) rm(MMU_IntAct_PPI) rm(MMU_Mentha_PPI) gc() #' #' Now we extract the PPIs which involve single isoform producing genes and remove self-interactions. #' ## ------------------------------------------------------------------------ MMU_PPI_SIG <- MMU_PPI %>% filter(GeneA %in% MMU_SIG$Gene_ID, GeneB %in% MMU_SIG$Gene_ID, GeneA != GeneB) %>% unique() #' #' Now, we can map the transcripts to the PPI data. #' ## ------------------------------------------------------------------------ MMU_PPI_Mapped_SIG <- MMU_PPI_SIG %>% left_join(MGI_GeneName, by = c("GeneA" = "Gene_ID")) %>% left_join(MGI_GeneName, by = c("GeneB" = "Gene_ID")) %>% select(Transcript_ID.x, Transcript_ID.y) %>% mutate(Pathway = "PPI") %>% unique() colnames(MMU_PPI_Mapped_SIG) <- c("X1", "X2", "Pathway") #' #' Because the PPIs are undirected, we replicate A-B interaction to B-A interactions and combine them. #' ## ------------------------------------------------------------------------ MMU_PPI_SIG_Pairs1 <- MMU_PPI_Mapped_SIG %>% unique() MMU_PPI_SIG_Pairs2 <- MMU_PPI_Mapped_SIG %>% rename(X1 = X2, X2 = X1) %>% unique() MMU_PPI_SIG_Pairs <- rbind(MMU_PPI_SIG_Pairs1, MMU_PPI_SIG_Pairs2) %>% unique() rm(MMU_SIG) rm(MMU_PPI_SIG) rm(MMU_PPI) rm(MMU_PPI_Mapped_SIG) rm(MMU_PPI_SIG_Pairs2) rm(MMU_PPI_SIG_Pairs1) gc() gc() #' #' ## Generate functional transcript isoform pairs #' #' The GO annotations contain few entries that are marked as "NOT". This implies that the the gene product is not involved in that specific function. Although these are very few, we can still use these to create some *negative* transcript isoform pairs. #' We propagate the annotations for a GO term to all its child terms. #' If a gene is annotated as "Not" for one GO term, all pairs of this gene with all genes annotated to this term are labelled as Negative. ## ------------------------------------------------------------------------ MMU_Not_GOP <- MMU_GO %>% filter( MMU_GO$V9 == "P" & MMU_GO$V4 == "NOT" & MMU_GO$V7 != "IEA" & MMU_GO$V7 != "ND" & MMU_GO$V7 != "NAS" & MMU_GO$V13 == "taxon:10090" ) %>% select(2, 3, 5) %>% unique() colnames(MMU_Not_GOP) <- c("Gene_ID", "GeneName", "GO_ID") MMU_Not_GOP_Transcript <- MMU_Not_GOP %>% left_join(MGI_GeneName, by = c("GeneName" = "Gene_ID")) %>% select(-Trans_ID) %>% unique() MMU_GOP_Not_Prop <- MMU_Not_GOP_Transcript %>% left_join(BP_Child_Table1, by = "GO_ID") %>% select(-GO_ID) %>% rename(GO_ID = Children) %>% rbind(MMU_Not_GOP_Transcript) %>% as.data.table() %>% unique() rm(BP_Child_Table1, MMU_Not_GOP_Transcript, MMU_Not_GOP, MGI_GeneName, MMU_GO) gc() gc() MMU_GOP_Not_Prop_Transcript_Pairs <- MMU_GOP_Not_Prop[, tuples(Transcript_ID), by = GO_ID] %>% filter(X1 != X2) %>% rename(Transcript_ID.x = X1, Transcript_ID.y = X2) %>% unique() MMU_GOP_Not_All_GO_Entries_Transcript <- left_join(MMU_GOP_Not_Prop, MMU_GOP_Prop, by = "GO_ID") %>% filter(GeneName.x != GeneName.y) %>% select(GO_ID, Transcript_ID.x, Transcript_ID.y) %>% as.data.table() %>% unique() MMU_GOP_Not_All <- rbind(MMU_GOP_Not_Prop_Transcript_Pairs, MMU_GOP_Not_All_GO_Entries_Transcript) %>% na.omit() %>% unique() rm( MMU_GOP_Not_Transcript, MMU_GOP_Not_All_GO_Entries_Transcript, MMU_GOP_Not_Pairs, MMU_GOP_Not_Prop, MMU_GOP_Not_Prop_Transcript_Pairs, MMU_GOP_Prop ) gc() gc() colnames(MMU_GOP_Not_All) <- c("Pathway", "X1", "X2") #' #' Now, we will form positive pairs of transcripts produced by single isoform producing genes which are annotated to the same GO term or pathway. We will also remove self-pairs and finally merge these with the PPI pairs. #' ## ------------------------------------------------------------------------ colnames(MMU_GOP_Prop_SIG) <- c("GeneName", "Gene_ID", "Pathway", "Transcript_ID") MMU_GOP_KEGG_CYC_SIG <- rbind(MMU_GOP_Prop_SIG[, -c(1, 2)], MMU_KEGG_Cyc_SIG[, -2]) %>% unique() rm(MMU_GOP_Prop_SIG) rm(MMU_KEGG_Cyc_SIG) MMU_GOP_KEGG_CYC_SIG_Pairs <- MMU_GOP_KEGG_CYC_SIG[, tuples(Transcript_ID), by = Pathway] rm(MMU_GOP_KEGG_CYC_SIG) MMU_GOP_KEGG_CYC_PPI_SIG_Pairs <- rbind(MMU_GOP_KEGG_CYC_SIG_Pairs, MMU_PPI_SIG_Pairs) %>% filter(X1 != X2) %>% na.omit() %>% unique() %>% mutate(Label = "1") fwrite( unique(MMU_GOP_KEGG_CYC_PPI_SIG_Pairs[, 2:4]), "../Data/MMU_GOP_KEGG_CYC_PPI_SIG_Pairs_Labelled_11Nov18.txt", sep = ",", quote = F, row.names = F, col.names = F ) rm(MMU_GOP_KEGG_CYC_SIG_Pairs, MMU_PPI_SIG_Pairs) gc() gc() identical(table(unique(MMU_GOP_KEGG_CYC_PPI_SIG_Pairs$X1)), table(unique(MMU_GOP_KEGG_CYC_PPI_SIG_Pairs$X2))) #' ## Generate negative transcript isoform pairs #' #' If a mRNA isoform pair is annotated as both positive and negative, we consider such mRNA isoform pairs as positive because they are involved in at least one same GO term, pathway or a PPI. ## ------------------------------------------------------------------------ negativeNetwork <- MMU_GOP_Not_All %>% select(-Pathway) %>% igraph::graph.data.frame(directed = F) %>% igraph::simplify() positiveNetwork <- MMU_GOP_KEGG_CYC_PPI_SIG_Pairs %>% select(-Pathway,-Label) %>% unique() %>% igraph::graph.data.frame(directed = F) %>% igraph::simplify() MMU_Negative_pairs_final <- igraph::graph.difference(negativeNetwork, positiveNetwork) %>% igraph::simplify() %>% igraph::as_data_frame("edges") %>% mutate(Label = "0") fwrite( unique(MMU_Negative_pairs_final), "../Data/MMU_Negative_Pairs_Labelled_11Nov18.txt", sep = ",", quote = F, row.names = F, col.names = F ) fwrite( unique(MMU_Negative_pairs_final[, c(2, 1, 3)]), "../Data/MMU_Negative_Pairs_Labelled_11Nov18.txt", sep = ",", quote = F, row.names = F, col.names = F, append = T ) identical(table(unique(MMU_Negative_pairs_final$X1)), table(unique(MMU_Negative_pairs_final$X2)))