RefSeq 数据库 GFF 格式转换

最近一项目被RefSeq数据库中gff注释格式折磨死,流程总是不能正确识别,各种方法改了格式,勉强运行,后续步骤继续报错,想着以后肯定还有更多这种情况,自己对文件格式的理解,写了格式转换脚本。

GFF和GTF格式非常类似,都被用来基因结构注释,由9列表示不同意义的特征组成,多行共同注释一个基因的详细结构。然而两者又有很大差异,脚本处理起来也是不同。相比而言GTF处理起来简单些,第三列feature可能是gene, transcript, exon, CDS这少数几种情况,最重要的描述基因类型名称的第九列attr很统一,都会包括一般常用的gene_id, gene_biotype, transcript_id, transcript_biotype等特征。而GFF文件包括一个明确的层级关系,genetranscript(存在mRNA,lncRNA,ncRNA,rRNA,tRNA等多种不同组织方式)的父节点,而transcript又是exonCDS的父节点,特征层级不同也决定了第九列attr的注释信息不同,gene id, name等信息也是层次关系关联,文件处理较麻烦。

NCBI RefSeq数据库中GFF注释格式感觉更是复杂怪异,废话不多说,直接上代码,愚蠢方法手撕GFF

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import sys
import gzip
from collections import defaultdict
from collections import Counter

def GFF_parser(file):
openfile = gzip.open if file.endswith('gz') else open
with openfile(file) as anno_file:
for line in anno_file:
if line.startswith('#'):
continue
seqname, source, feature, start, end, score, strand, \
frame, attributeStr = line.strip().split('\t')
attribute = [i.strip() for i in attributeStr.split(';') if i.strip() != '']
attr = {i.partition('=')[0]:i.partition('=')[2] for i in attribute}
yield seqname, source, feature, int(start), int(end), score, strand, frame, attr

def format_gtf_line(chro, feature, start, end, strand, attr, \
source=None, score=None, frame=None):
source = source if source else 'RefSeq'
score = score if score else '.'
frame = frame if frame else '.'
attrstr = ''.join(['{} "{}"; '.format(i[0], i[1]) for i in sorted(attr.items())])
gtf_line = '{chro}\t{source}\t{feature}\t{start}\t{end}\t{score}\t\
{strand}\t{frame}\t{attrstr}'.format(**locals())
return gtf_line

def main(gff, output):
gtf_out = open(output, 'w')

# RefSeq数据库中GFF文件feature收集,或去除无意义项,或替换
skip = ['region', 'miRNA', 'cDNA_match', 'repeat_region', 'D_loop', \
'match', 'origin_of_replication', 'centromere', 'sequence_feature']
transcript = ['mRNA', 'lnc_RNA', 'primary_transcript', 'tRNA', 'rRNA', \
'C_gene_segment', 'V_gene_segment', 'SRP_RNA', 'ncRNA', 'transcript']
biotype = ['lnc_RNA', 'tRNA', 'rRNA', 'SRP_RNA']

order = []
gene_info = defaultdict(dict)
transcript_info = defaultdict(dict)
check_gene_name = {} #检查重复gene_name
check_transcript_name = {}
for i in GFF_parser(gff):
chro, source, feature, start, end, score, strand, frame, attr = i

if feature in skip:
continue
elif feature == 'gene':
gene_id = attr['ID']
gene_info[gene_id]['name'] = attr['Name']
gene_info[gene_id]['biotype'] = attr['gene_biotype']
check_gene_name[gene_id] = attr['Name']
#使用字典transcript_info储存转录本信息
elif feature in transcript:
transcript_id = attr['ID']
gene_id = attr['Parent'] if 'Parent' in attr else transcript_id
transcript_name = attr['transcript_id'] if 'transcript_id' \
in attr else transcript_id
order.append(transcript_id)
transcript_info[transcript_id]['gtf_info'] = [chro, start, end, strand]
transcript_info[transcript_id]['name'] = transcript_name
transcript_info[transcript_id]['gene'] = gene_id
# 部分转录本不存在对应gene信息,直接关联转录类型
if feature in biotype:
transcript_info[transcript_id]['biotype'] = feature
check_transcript_name[transcript_id] = transcript_name
elif feature == 'exon':
transcript_id = attr['Parent']
try:
transcript_info[transcript_id]['exon'][1].append(start)
transcript_info[transcript_id]['exon'][2].append(end)
except KeyError:
transcript_info[transcript_id]['exon'] = [chro, [start,], \
[end,], strand]
elif feature == 'CDS':
transcript_id = attr['Parent']
try:
transcript_info[transcript_id]['CDS'][1].append(start)
transcript_info[transcript_id]['CDS'][2].append(end)
transcript_info[transcript_id]['CDS'][4].append(frame)
except KeyError:
transcript_info[transcript_id]['CDS'] = [chro, [start,], \
[end,], strand, [frame,]]
else:
sys.stderr.write('Warning: Annotation Feature "{}" is not collected, '\
'Unknown error may occur.\n'.format(feature))

# 检查重复gene_name, transcript_names,进行替换
dup_name = lambda list_in: set([i[0] for i in Counter(list_in).items() if i[1] > 1])
gene_name = [i[1] for i in check_gene_name.items()]
dup_gene_name = dup_name(gene_name)

transcript_name = [i[1] for i in check_transcript_name.items()]
dup_transcript_name = dup_name(transcript_name)

# 输出GTF文件,order控制输出顺序
for transcript in order:
attr = {}
gene_id = transcript_info[transcript]['gene']
try:
gene_name = gene_info[gene_id]['name']
gene_name = gene_id if gene_name in dup_gene_name else gene_name
except KeyError:
gene_name = gene_id
attr['gene_id'] = gene_name
transcript_name = transcript_info[transcript]['name']
transcript_name = transcript if transcript_name in
dup_transcript_name else transcript_name
attr['transcript_id'] = transcript_name
try:
attr['transcript_biotype'] = transcript_info[transcript]['biotype']
except KeyError:
attr['transcript_biotype'] = gene_info[gene_id]['biotype']
try:
attr['gene_biotype'] = gene_info[gene_id]['biotype']
except KeyError:
attr['gene_biotype'] = transcript_info[transcript]['biotype']
chro, start, end, strand = transcript_info[transcript]['gtf_info']
gtf_out.write('{}\n'.format(format_gtf_line(chro,'transcript',、
start,end,strand,attr)))
try:
exon_feature = transcript_info[transcript]['exon']
except KeyError:
exon_feature = transcript_info[transcript]['CDS'][:-1]
chro, start, end, strand = exon_feature
for i,j in enumerate(start):
gtf_out.write('{}\n'.format(format_gtf_line(chro,'exon',、
start[i],end[i],strand,attr)))
try:
cds_feature = transcript_info[transcript]['CDS']
chro, start, end, strand, frame = cds_feature
for i,j in enumerate(start):
gtf_out.write('{}\n'.format(format_gtf_line(chro, 'CDS', start[i], \
end[i], strand, attr, frame=frame[i])))
except KeyError:
continue
#部分注释信息不完善,多次使用try... except...输出替代的对应注释信息
gtf_out.close()

GFF格式包括gene, transcript, exon(CDS)三个层级,其中transcript进行上下关联最为重要,这里脚本以transcript作为键值进行数据处理,另外脚本还有一个假设就是gene, transcript对应ID在文件中唯一。

---------本文结束,感谢您的阅读---------