-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathvalidate_input_annotation.R
130 lines (112 loc) · 6.9 KB
/
validate_input_annotation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# PiGx RNAseq Pipeline.
#
# Copyright © 2022 Bora Uyar <[email protected]>
#
# This file is part of the PiGx RNAseq Pipeline.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Check the contents of the annotation files (e.g. GTF/Fasta)
# for potential errors to abort early.
args <- commandArgs(trailingOnly = TRUE)
gtfFile <- args[1] # path to gene annotations in GTF format
cDNAfastaFile <- args[2] # path to transcriptome sequences in fasta format
genomeFastaFile <- args[3] # path to DNA sequences in fasta format
outDir <- args[4] # where to save the annotation stats
message(date(), " Checking annotation files for potential issues")
# Check to see if the input GTF file is parseable.
message(date()," => Checking to see if GTF file can be properly imported from here: ",gtfFile)
gtf <- rtracklayer::import.gff(gtfFile)
message(date()," => Imported ",length(gtf)," features from the GTF file")
if(length(gtf) == 0) {
stop("ERROR: GTF file seems to be empty here:",gtfFile)
}
# check if gene_id field is accessible
if(!'gene_id' %in% colnames(GenomicRanges::mcols(gtf))){
stop("ERROR: Can't access gene_id column in the GTF File, check the format of your GTF file")
}
if(!'transcript_id' %in% colnames(GenomicRanges::mcols(gtf))){
stop("ERROR: Can't access transcript_id column in the GTF File, check the format of your GTF file")
}
message(date(), " => GTF file contains annotations for ",length(unique(gtf$gene_id)), " genes ",
"and ",length(unique(gtf$transcript_id)), " transcripts")
# Check to see if the cDNA fasta file can be imported
cDNA <- Biostrings::readDNAStringSet(cDNAfastaFile, format = 'fasta')
if(length(cDNA) == 0){
stop("ERROR: cDNA fasta file seems to be empty")
}
message(date(), " => Imported ",length(cDNA), " transcripts from the cDNA fasta file at ", cDNAfastaFile)
# Check to see if the transcript ids in the GTF file match the transcript ids in the cDNA Fasta file
# if not, give a warning how this will affect id matching for Salmon and give a suggestion on
# how to update the cDNA file
matched <- length(intersect(names(cDNA), unique(gtf$transcript_id)))
message(date(), " => Number of transcript ids matching between the GTF file and the cDNA fasta file: ",matched)
if(matched < 0.1 * length(unique(gtf$transcript_id))) {
warning(date(), " => Couldn't match the transcript ids between the GTF file and the cDNA fasta file.
This will cause problems in getting gene-level expression estimates when running Salmon.
However, the transcript-level expression estimation won't suffer.
Possible reason is the source databases of these annotations are different.
Another possible reason is the additional annotations on the transcript IDs in the
fasta file: example entry for the transcript 'ENST00000202017' in GTF file is
'ENST00000202017.5 cdna chromosome:GRCh38:20:31944342:31952092 ...' in the Fasta File.
If using annotations from ENSEMBL, the fasta file can be cleaned up with the following
`sed` command: ",
"sed 's/\\(ENST[0-9]*\\)\\.[0-9]*.*/\\1/g' <cDNA_file_path> > cDNA_file.cleaned.fasta")
}
# Check to see if the genome fasta can be imported and
# see if the chromosome names in the genome fasta are on
# the same convention as used in the GTF file
DNA <- Biostrings::readDNAStringSet(genomeFastaFile, format = 'fasta')
message(date(), " => Imported ",length(DNA)," chromosomes/contigs from the DNA fasta files at ",genomeFastaFile)
# ignore annotations in the fasta file next the chromosome ids:
# we have to assume the fasta header contains chromosome name + optional annotations separated by space
names(DNA) <- gsub(" .+$", "", names(DNA))
# abort if the chromosome names don't match in DNA and GTF
matched <- intersect(GenomeInfoDb::seqlevels(DNA), GenomeInfoDb::seqlevels(gtf))
message(date(), " => Number of chromosomes/contigs matching between the GTF file and the genome fasta file: ",length(matched))
message(date(), " => List of chromosomes/contigs matching between the GTF file and the genome fasta file:\n",
paste(matched, collapse = ' '))
if(length(matched) == 0) {
stop("ERROR: Couldn't match any of the chrosomosome ids between the GTF file and the genome fasta file.",
"\nThis will cause problems in getting expression estimates.",
"\nExample chromosome names in the genome fasta file: ", paste(head(GenomeInfoDb::seqlevels(DNA)), collapse = ' '),
"\nExample chromosome names in the GTF file: ", paste(head(GenomeInfoDb::seqlevels(gtf)), collapse = ' '))
}
# print out some stats about the input annotations
input_stats <- rbind(data.frame('Type' = 'GTF',
'Description' = 'Number of genes',
'Value' = length(unique(gtf$gene_id))),
data.frame('Type' = 'GTF',
'Description' = 'Number of transcripts',
'Value' = length(unique(gtf$transcript_id))),
data.frame('Type' = 'GTF',
'Description' = 'Naming convention',
'Value' = paste(GenomeInfoDb::seqlevelsStyle(gtf), collapse = ' ')),
data.frame('Type' = 'DNA',
'Description' = 'Number of chromosomes',
'Value' = length(DNA)),
data.frame('Type' = 'DNA',
'Description' = 'Naming convention',
'Value' = paste(GenomeInfoDb::seqlevelsStyle(DNA), collapse = ' ')),
data.frame('Type' = 'cDNA',
'Description' = 'Number of transcripts',
'Value' = length(cDNA)),
data.frame('Type' = 'cDNA',
'Description' = 'Mean length of transcripts',
'Value' = mean(lengths(cDNA))),
data.frame('Type' = 'cDNA vs GTF',
'Description' = 'Number of matching transcript identifiers',
'Value' = length(intersect(names(cDNA), unique(gtf$transcript_id)))))
write.table(input_stats, file = file.path(outDir, 'input_annotation_stats.tsv'),
sep = '\t', quote = F, row.names = F)
message(date(), " => Finished checking annotation files.")