parse blast XML use python

2017.05


blast 输出 xml 格式结果,包括最完整的比对信息,经常需要自己从 blast.xml 结果中提取想要的信息,最开始都自己循环 xml 一行行匹配,一点都不 pythonic,陆续发现了一些新的 xml 解析实现,整理如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def parse_BLAST_XML(infile):
get = lambda x: re.search('>.*?<', x).group(0)[1:-1]

r_query_id = re.compile("<Iteration_query-ID>")
r_query_def = re.compile("<Iteration_query-def>")
r_query_len = re.compile("<Iteration_query-len>")
r_hit_num = re.compile("<Hit_num>")
r_hit_id = re.compile("<Hit_id>")
r_hit_def = re.compile("<Hit_def>")
r_hit_accession = re.compile("<Hit_accession>")
r_hit_len = re.compile("<Hit_len>")
r_hsp_num = re.compile("<Hsp_num>")
r_hsp_bit_score = re.compile("<Hsp_bit-score>")
r_hsp_score = re.compile("<Hsp_score>")
r_hsp_evalue = re.compile("<Hsp_evalue>")
r_hsp_query_from = re.compile("<Hsp_query-from>")
r_hsp_query_to = re.compile("<Hsp_query-to>")
r_hsp_hit_from = re.compile("<Hsp_hit-from>")
r_hsp_hit_to = re.compile("<Hsp_hit-to>")
r_hsp_query_frame = re.compile("<Hsp_query-frame>")
r_hsp_hit_frame = re.compile("<Hsp_hit-frame>")
r_hsp_identity = re.compile("<Hsp_identity>")
r_hsp_positive = re.compile("<Hsp_positive>")
r_hsp_gaps = re.compile("<Hsp_gaps>")
r_hsp_align_len = re.compile("<Hsp_align-len>")
r_hsp_qseq = re.compile("<Hsp_qseq>")
r_hsp_hseq = re.compile("<Hsp_hseq>")
r_hsp_midline = re.compile("<Hsp_midline>")

for line in open(infile,'r'):
if r_query_id.search(line) is not None:
query_id = get(line)
elif r_query_def.search(line) is not None:
query_def = get(line)
elif r_query_len.search(line) is not None:
query_len = get(line)
elif r_hit_num.search(line) is not None:
hit_num = get(line)
elif r_hit_id.search(line) is not None:
hit_id = get(line)
elif r_hit_def.search(line) is not None:
hit_def = get(line)

elif r_hit_accession.search(line) is not None:
hit_accession = get(line)
elif r_hit_len.search(line) is not None:
hit_len = get(line)
elif r_hsp_num.search(line) is not None:
hsp_num = get(line)
elif r_hsp_bit_score.search(line) is not None:
hsp_bit_score = get(line)
elif r_hsp_score.search(line) is not None:
hsp_score = get(line)
elif r_hsp_evalue.search(line) is not None:
hsp_evalue = get(line)
elif r_hsp_query_from.search(line) is not None:
hsp_query_from = get(line)
elif r_hsp_query_to.search(line) is not None:
hsp_query_to = get(line)
elif r_hsp_hit_from.search(line) is not None:
hsp_hit_from = get(line)
elif r_hsp_hit_to.search(line) is not None:
hsp_hit_to = get(line)
elif r_hsp_query_frame.search(line) is not None:
hsp_query_frame = get(line)
elif r_hsp_hit_frame.search(line) is not None:
hsp_hit_frame = get(line)
elif r_hsp_identity.search(line) is not None:
hsp_identity = get(line)
elif r_hsp_positive.search(line) is not None:
hsp_positive = get(line)
elif r_hsp_gaps.search(line) is not None:
hsp_gaps = get(line)
elif r_hsp_align_len.search(line) is not None:
hsp_align_len = get(line)
elif r_hsp_qseq.search(line) is not None:
hsp_qseq = get(line)
elif r_hsp_hseq.search(line) is not None:
hsp_hseq = get(line)
elif r_hsp_midline.search(line) is not None:
hsp_midline = get(line)

percent_ident = "%5.2f" % (((float(hsp_identity)/float(hsp_align_len))*100))
mismatch = str(int(hsp_align_len) - int(hsp_identity))
# ['query id', 'query name', 'subject id', 'identity(%)', 'alignment length', 'mismatches', 'gap opens', 'q. start', 'q. end', 's. start', 's. end', 'evalue', 'bit score', 'subject description']
yield [query_id, query_def, hit_id, percent_ident, hsp_align_len, mismatch, hsp_gaps, hsp_query_from, hsp_query_to, hsp_hit_from, hsp_hit_to, hsp_evalue, hsp_bit_score, hit_def]

Bio.Blast.NCBIXML.parse

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from __future__ import division
from Bio.Blast import NCBIXML
# biopython

def parse_blast_xml(result, threshold=1e-5):
'''
result: opened file handle of xml format
'''
for blastrecords in NCBIXML.parse(result):
query_id, _, query_descri = blastrecords.query.partition(' ')
query_len = blastrecords.query_length
for align in blastrecords.alignments:
sbjct_title = align.title
sbjct_accession = align.accession
sbjct_descri = align.hit_def
sbjct_len = align.length
for hsp in align.hsps:
hsp_evalue = hsp.expect
if hsp_evalue > threshold:
continue
hsp_score = hsp.score
hsp_gap = hsp.gaps
hsp_len = hsp.align_length
hsp_match = hsp.identities
hsp_frame = hsp.frame # nucl blast to protein database
hsp_strand = hsp.strand # nucl2prot
query_seq = hsp.query
query_start = hsp.query_start
query_end = hsp.query_end
sbjct_seq = hsp.sbjct
sbjct_start = hsp.sbjct_start
sbjct_end = hsp.sbjct_end
tmp = [
query_id,
query_descri,
query_seq,
query_len,
# hsp_frame,
# hsp_strand,
sbjct_accession,
# sbjct_title,
sbjct_descri,
sbjct_seq,
sbjct_len,
hsp_match / hsp_len,
hsp_len,
hsp_len - hsp_match,
hsp_gap,
query_start,
query_end,
sbjct_start,
sbjct_end,
hsp_evalue,
hsp_score,
]
# qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore
yield tmp

other python modules

1
2
3
# http://pythonguidecn.readthedocs.io/zh/latest/scenarios/xml.html
# http://www.runoob.com/python/python-xml.html
# 懒得再去看各种文档,挖坑...
---------本文结束,感谢您的阅读---------