Python生物信息学数据管理——第3、4章课后习题

103 阅读 0 评论 68 点赞

我是靠谱客的博主爱笑发带，这篇文章主要介绍Python生物信息学数据管理——第3、4章课后习题，现在分享给大家，希望可以做个参考。

#初学小白，仅供参考
3.1读取和写入文件

复制代码

1
2
3
4
5
6
7
fo=open('neuron_data.txt','r')
fi=open('neuron_data_副本.txt','w')
txt=fo.read()
fi.write(txt)
fo.close()
fi.close()

3.2计算平均值和标准差

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import math
fo=open('neuron_data.txt','r')
fi=open('结果.txt','w')
txt=fo.readlines()
data=[]
for i in txt:
data.append(float(i.strip()))
average=sum(data)/len(data)
total=0
for value in data:
total+=(value-average)**2
stddev=math.sqrt(total/len(data))
fi.write('平均长度'+str(average)+'n')
fi.write('标准差'+str(round(stddev,2)))
fo.close()
fi.close()

3.3/4/5核苷酸的频率、DNA序列的GC含量

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
fo=open('a序列.txt','r')
fi=open('结果.txt','w')
txt=fo.read()
data={}
for i in txt:
data[i]=data.get(i,0)+1
del data['n']
total=sum(data.values())
maxfre=0
for i in data:
fi.write('{}的频率为{:.1%}n'.format(i,data[i]/total))
if data[i]>maxfre:
maxfre=data[i]
base=i
fi.write('{}出现的频率最高:{:.1%}n'.format(base,maxfre/total))
fi.write('GC含量为{:.1%}'.format((data['G']+data['C'])/total))
fo.close()
fi.close()

4.1读取和写入多序列FASTA文件

复制代码

1
2
3
4
5
6
7
8
9
10
fasta_file=open('input.fasta','r')
for line in fasta_file:
if line[0]=='>':
AC=line.split('|')[1]
outfile=open(AC+'.fasta','w')
outfile.write(line)
if line[0]!='>':
outfile.write(line)
outfile.close()

4.2读取和过滤FASTA文件

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
fasta_file=open('input.fasta','r')
out_file=open('result.fasta','w')
seq=''
for line in fasta_file:
if line[0]=='>' and seq=='':
header=line
elif line[0]!='>':
seq+=line
else:
if seq[0]=='M' and seq.count('W')>1:
out_file.write(header+seq)
seq=''
header=line
if seq[0]=='M' and seq.count('W')>1:
out_file.write(header+seq)
fasta_file.close()
out_file.close()

4.3计算在FASTA格式中单个DNA序列的核苷酸频率

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
input_file=open('a序列.txt','r')
seq=''
for line in input_file:
if line[0]=='>':
header=line
elif line[0]!='>':
seq+=line
Total=float(len(seq))
count_A=seq.count('A')
count_T=seq.count('T')
count_C=seq.count('C')
count_G=seq.count('G')
print('A的频率{:.2}nT的频率{:.2}nC的频率{:.2}nG的频率{:.2}'.format(count_A/Total,count_T/Total,count_C/Total,count_G/Total))
input_file.close()

4.4计算在FASTA格式中多个DNA序列的核苷酸频率
4.5计算在GenBank中的多个DNA序列的核苷酸频率

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
fasta_file=open('a序列.txt','r')
seq=''
for line in fasta_file:
if line[0]=='>' and seq=='':
AC=line.split('|')[1].split('_cds_')[0]
elif line[0]!='>':
seq+=line
else:
a=seq.count('A')/float(len(seq))
t=seq.count('T')/float(len(seq))
c=seq.count('C')/float(len(seq))
g=seq.count('G')/float(len(seq))
print('{}tA:{:.2}tT:{:.2}tC:{:.2}tG:{:.2}'.format(AC,a,t,c,g))
seq=''
AC=line.split('|')[1].split('_cds_')[0]
a=seq.count('A')/float(len(seq))
t=seq.count('T')/float(len(seq))
c=seq.count('C')/float(len(seq))
g=seq.count('G')/float(len(seq))
print('{}tA:{:.2}tT:{:.2}tC:{:.2}tG:{:.2}'.format(AC,a,t,c,g))
fasta_file.close()