Chapter5 HOW DO WE COMPARE DNA SEQUENCES
Bioinformatics Algorithms-An_Active Learning Approach
http://bioinformaticsalgorithms.com/
一、
1983年,Russell Doolitte 将血小板源生长因子[platelet derived growth factor(PDGF),一种刺激细胞增殖的物质]和其它已知基因比对,发现它的序列和原癌基因(oncogene)的序列有很高的相似度。这让科学家猜测某些癌症是因为基因在不合适的时机作用所致(scientists hypothesized that some forms of cancer might be caused by a good gene doing the right thing at the wrong time.)。
二、提出问题 序列比对:动态规划法
1.全局比对:
'''
Code Challenge: Solve the Global Alignment Problem.
Input: Two protein strings written in the single-letter amino acid alphabet.
Output: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the
BLOSUM62 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
----------
Sample Input:
PLEASANTLY
MEANLY
----------
Sample Output:
8
PLEASANTLY
-MEA--N-LY
----------
@ Lo Kowngho 2018.9
'''
import numpy
from os.path import dirname def Grade(Symb1,Symb2):
Indx1 = symbolList[Symb1]
Indx2 = symbolList[Symb2]
return matrix[Indx1][Indx2] def Init_Graph_Global(l1,l2):
Graph = numpy.zeros([l2,l1])
for x in range(1,l2):
Graph[x][0] = Graph[x-1][0]-5
for y in range(1,l1):
Graph[0][y] = Graph[0][y-1]-5
return Graph def Init_Path(l1,l2):
Path = numpy.zeros([l2,l1])
for x in range(1,l2):
Path[x][0] = 1
for y in range(1,l1):
Path[0][y] = 2
return Path def buildGlobalAlignmentGraph(text1,text2): l1 = len(text1)
l2 = len(text2)
Graph = Init_Graph_Global(l1, l2)
Path = Init_Path(l1, l2) for x in range(1,l2):
for y in range(1,l1):
Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[y],text2[x]))
if Graph[x-1][y]-5==Graph[x][y]:
Path[x][y]=1
elif Graph[x][y-1]-5==Graph[x][y]:
Path[x][y]=2
else:
Path[x][y]=3
return [Graph,Path] def OutputGlobalAligement(Path,Graph,text1,text2):
outT1 = ''
outT2 = ''
l1 = len(text1)
l2 = len(text2)
x = l2-1
y = l1-1
while(x!=0 or y!=0):
if Path[x][y]==1:
outT1 += '-'
outT2 += text2[x]
x -= 1
elif Path[x][y]==2:
outT1 += text1[y]
outT2 += '-'
y -= 1
else:
outT1 += text1[y]
outT2 += text2[x]
x -= 1
y -= 1
return [outT1[::-1],outT2[::-1]] def ImportScoreMatrix():
dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n')
symbolList = dataset[0].split()
for i in range(len(symbolList)):
symbolList[i]=[symbolList[i],i]
symbolList = dict(symbolList)
print(symbolList)
matrix = []
for i in range(1,len(dataset)):
matrix.append(dataset[i].split()[1:])
for l in range(len(matrix)):
for i in range(len(matrix[l])):
matrix[l][i]=int(matrix[l][i])
return [matrix,symbolList] if __name__ == '__main__': [matrix,symbolList] = ImportScoreMatrix() dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
text1 = ''+dataset[0]
text2 = ''+dataset[1] [Graph,Path] = buildGlobalAlignmentGraph(text1, text2) [outT1,outT2] = OutputGlobalAligement(Path,Graph,text1,text2) print(int(Graph[-1][-1]))
print(outT1)
print(outT2)
全局比对 python
2. 局部比对
可以把局部比对想象成下面的Free Taxi场景,在开始和结尾都不受罚分约束,只在中间的某一过程受罚分约束。
在全局比对的基础上,状态转移方程在加上一个0,表示每一个点,既可以由→、↓、↘经过罚分得到,也可以直接由起点,不经罚分得到(Free Taxi)。
'''
Code Challenge: Solve the Local Alignment Problem.
Input: Two protein strings written in the single-letter amino acid alphabet.
Output: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum
score. Use the PAM250 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
---------------
Sample Input:
MEANLY
PENALTY
---------------
Sample Output:
15
EANL-Y
ENALTY
---------------
Lo Kwongho 2018.9
'''
import numpy
from os.path import dirname def Grade(Symb1,Symb2):
Indx1 = symbolList[Symb1]
Indx2 = symbolList[Symb2]
return matrix[Indx1][Indx2] def Init_Graph_Local(l1,l2):
Graph = numpy.zeros([l1,l2])
return Graph def Init_Path(l1,l2):
Path = numpy.ones([l1,l2])*-1
for x in range(1,l1):
Path[x][0] = 1
for y in range(1,l2):
Path[0][y] = 2
return Path def buildLocalAlignmentGraph(text1,text2):
l1 = len(text1)
l2 = len(text2)
Graph = Init_Graph_Local(l1, l2)
Path = Init_Path(l1, l2) for x in range(1,l1):
for y in range(1,l2):
Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[x],text2[y]),0)
if Graph[x-1][y]-5 == Graph[x][y]:
Path[x][y] = 1
elif Graph[x][y-1]-5==Graph[x][y]:
Path[x][y] = 2
elif Graph[x][y] == 0:
Path[x][y] = 0
else:
Path[x][y] = 3
maxVal = 0
maxIndx = [-1,-1]
for x in range(1,l1):
for y in range(1,l2):
if Graph[x][y]>maxVal:
maxVal=Graph[x][y]
maxIndx=[x,y] return [Graph,Path,maxIndx] def OutputLocalAligement(Path,Graph,text1,text2,maxIndx):
outT1 = ''
outT2 = ''
l1 = len(text1)
l2 = len(text2)
x = maxIndx[0]
y = maxIndx[1]
while(x!=0 or y!=0):
if Path[x][y]==1:
outT1 += text1[x]
outT2 += '-'
x -= 1
elif Path[x][y]==2:
outT1 += '-'
outT2 += text2[y]
y -= 1
elif Path[x][y]==3:
outT1 += text1[x]
outT2 += text2[y]
x -= 1
y -= 1
else:
x=0
y=0
return [outT1[::-1],outT2[::-1]] def ImportScoreMatrix():
dataset = open(dirname(__file__)+'PAM250.txt').read().strip().split('\n')
symbolList = dataset[0].split()
for i in range(len(symbolList)):
symbolList[i]=[symbolList[i],i]
symbolList = dict(symbolList)
matrix = []
for i in range(1,len(dataset)):
matrix.append(dataset[i].split()[1:])
for l in range(len(matrix)):
for i in range(len(matrix[l])):
matrix[l][i]=int(matrix[l][i])
return [matrix,symbolList] if __name__ == '__main__':
[matrix,symbolList] = ImportScoreMatrix() dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
text1 = ''+dataset[0]
text2 = ''+dataset[1] [Graph,Path,maxIndx] = buildLocalAlignmentGraph(text1,text2) [outT1,outT2]=OutputLocalAligement(Path,Graph,text1,text2,maxIndx)
print(int(Graph[maxIndx[0]][maxIndx[1]]))
print(outT1)
print(outT2)
局部比对 Python
3. Overlarp Alignment
'''
Code Challenge: Solve the Overlap Alignment Problem.
>>Input: Two strings v and w, each of length at most 1000.
>>Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of v and a prefix w' of w.
achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch and indel penalties are 2.
-------------------
Sample Input:
PAWHEAE
HEAGAWGHEE
-------------------
Sample Output:
1
HEAE
HEAG
-------------------
coder: Lo Kwongho
''' import numpy
from os.path import dirname def Init_Graph_Overlap(l1,l2):
Graph = numpy.zeros([l1,l2])
for y in range(1,l2):
Graph[0][y] = Graph[0][y-1]-1
return Graph def Init_Path(l1,l2):
Path = numpy.ones([l1,l2])*-1
for x in range(1,l1):
Path[x][0] = 1
for y in range(1,l2):
Path[0][y] = 2
return Path def buildOverlapAlignmentGraph(text1,text2):
l1 = len(text1)
l2 = len(text2)
Graph = Init_Graph_Overlap(l1, l2)
Path = Init_Path(l1,l2)
for x in range(1,l1):
for y in range(1,l2):
if text1[x]==text2[y]:
Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-2,Graph[x][y-1]-2)
else:
Graph[x][y]=max(Graph[x-1][y-1]-2,Graph[x-1][y]-2,Graph[x][y-1]-2)
if Graph[x][y]==Graph[x-1][y]-2:
Path[x][y]=1
elif Graph[x][y]==Graph[x][y-1]-2:
Path[x][y]=2
else:
Path[x][y]=3 maxVal = 0
maxIndx = -1
for i in range(l2):
if Graph[l1-1][i]>maxVal:
maxVal=Graph[l1-1][i]
maxIndx=i return [Graph,Path,maxIndx,maxVal] def OutputOverlapAligement(Path,Graph,text1,text2,maxIndx):
outT1 = ''
outT2 = ''
l1 = len(text1)
l2 = len(text2)
x = l1-1
y = maxIndx
while(y!=0):
if Path[x][y]==1:
outT1 += text1[x]
outT2 += '-'
x -= 1
elif Path[x][y]==2:
outT1 += '-'
outT2 += text2[y]
y -= 1
elif Path[x][y]==3:
outT1 += text1[x]
outT2 += text2[y]
x -= 1
y -= 1
else:
x=0
y=0
return [outT1[::-1],outT2[::-1]] if __name__ == '__main__':
dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
text1 = ''+dataset[0]
text2 = ''+dataset[1]
l1 = len(text1)
l2 = len(text2)
[Graph,Path,maxIndx,maxVal] = buildOverlapAlignmentGraph(text1,text2)
#print(Graph) [outText1,outText2]=OutputOverlapAligement(Path, Graph, text1, text2, maxIndx) print(int(maxVal))
print(outText1)
print(outText2)
Overlarp in python
4.Fitting Alignment
'''
Fitting Alignment Problem: Construct a highest-scoring fitting alignment between two strings.
>>Input: Strings v and w as well as a matrix Score.
>>Output: A highest-scoring fitting alignment of v and w as defined by the scoring matrix Score.
-------------------
Sample Input:
GTAGGCTTAAGGTTA
TAGATA
-------------------
Sample Output:
2
TAGGCTTA
TAGA--TA
-------------------
coder: Lo Kwongho
''' import numpy
from os.path import dirname def Init_Graph_Fiting(l1,l2):
Graph = numpy.zeros([l1,l2])
for y in range(1,l2):
Graph[0][y] = Graph[0][y-1]-1
return Graph def Init_Path(l1,l2):
Path = numpy.ones([l1,l2])*-1
for x in range(1,l1):
Path[x][0] = 1
for y in range(1,l2):
Path[0][y] = 2
return Path def buildFittingAlignmentGraph(text1,text2):
l1 = len(text1)
l2 = len(text2)
Graph = Init_Graph_Fiting(l1, l2)
Path = Init_Path(l1,l2)
for x in range(1,l1):
for y in range(1,l2):
if text1[x]==text2[y]:
Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-1,Graph[x][y-1]-1)
else:
Graph[x][y]=max(Graph[x-1][y-1]-1,Graph[x-1][y]-1,Graph[x][y-1]-1)
if Graph[x][y]==Graph[x-1][y]-1:
Path[x][y]=1
elif Graph[x][y]==Graph[x][y-1]-1:
Path[x][y]=2
else:
Path[x][y]=3 maxVal = 0
maxIndx = -1
for i in range(l1):
if Graph[i][l2-1]>maxVal:
maxVal=Graph[i][l2-1]
maxIndx=i return [Graph,Path,maxIndx,maxVal] def OutputFittingAligement(Path,Graph,text1,text2,maxIndx):
outT1 = ''
outT2 = ''
l1 = len(text1)
l2 = len(text2)
x = maxIndx
y = l2-1
while(y!=0):
if Path[x][y]==1:
outT1 += text1[x]
outT2 += '-'
x -= 1
elif Path[x][y]==2:
outT1 += '-'
outT2 += text2[y]
y -= 1
elif Path[x][y]==3:
outT1 += text1[x]
outT2 += text2[y]
x -= 1
y -= 1
else:
x=0
y=0
return [outT1[::-1],outT2[::-1]] if __name__ == '__main__':
dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
text1 = ''+dataset[0]
text2 = ''+dataset[1]
l1 = len(text1)
l2 = len(text2)
[Graph,Path,maxIndx,maxVal] = buildFittingAlignmentGraph(text1,text2) [outText1,outText2]=OutputFittingAligement(Path, Graph, text1, text2, maxIndx)
#print(Graph)
print(int(maxVal))
print(outText1)
print(outText2)
Fitting Alignment in python
这四种比对的关系如图:
全局比对 局部比对
Overlarp Alignment Fitting Alignment
5、基因的插入和删除,通常都是连续的一段,故在比对出现的连续空位,应该把它们当作一个整体看待。在比对的空位罚分中,生物学家认为,在每一条链上新开一个空位,应罚重分,而空位的延续,罚分应较少:
解决问题的方法是:开三个矩阵,每个矩阵代表一种方向。在→、↓方向上行走,代表产生空位。故每当从↘转移到→、↓,或者→、↓间转移,代表在某链上产生新空位,重罚,而在→、↓内转移,代表空位延续,轻罚。
'''
Code Challenge: Solve the Alignment with Affine Gap Penalties Problem.
>>Input: Two amino acid strings v and w (each of length at most 100).
>>Output: The maximum alignment score between v and w, followed by an alignment of v and w achieving this maximum score. Use the
BLOSUM62 scoring matrix, a gap opening penalty of 11, and a gap extension penalty of 1.
---------------------
Sample Input:
PRTEINS
PRTWPSEIN
---------------------
Sample Output:
8
PRT---EINS
PRTWPSEIN-
---------------------
coder: Lo Kwongho
'''
import numpy
from os.path import dirname
negINFINITY = -999
#Penalties
gs = -10 #gap_Start
ge = -1 #gap_Extend
#
def Grade(Symb1,Symb2):
Indx1 = symbolList[Symb1]
Indx2 = symbolList[Symb2]
return matrix[Indx1][Indx2] def initGraph(l1,l2):
Graph = [numpy.zeros([l1,l2] ,dtype=int) for i in range(3)] Graph[1][0][0] = negINFINITY
Graph[2][0][0] = negINFINITY
for x in range(1,l1):
Graph[0][x][0]=negINFINITY
if x==1:
Graph[1][x][0]=ge+gs
else:
Graph[1][x][0]=Graph[1][x-1][0]+ge
Graph[2][x][0]=negINFINITY
for y in range(1,l2):
Graph[0][0][y]=negINFINITY
if y ==1:
Graph[2][0][y]=ge+gs
else:
Graph[2][0][y]=Graph[2][0][y-1]+ge
Graph[1][0][y]=negINFINITY
return Graph def Init_Path(l1,l2):
Path = [numpy.ones([l1,l2])*-1 for i in range(3)]
'''for x in range(1,l1):
Path[x][0] = 1
for y in range(1,l2):
Path[0][y] = 2'''
return Path def buildAlignmentGraph(text1,text2,l1,l2): Graph = initGraph(l1,l2)
#Path = #Init_Path(l1,l2)
for x in range(1,l1):
for y in range(1,l2):
# X ######
Graph[1][x][y]=max(gs+ge+Graph[0][x-1][y],gs+ge+Graph[2][x-1][y],ge+Graph[1][x-1][y]) # Y ######
Graph[2][x][y]=max(gs+ge+Graph[0][x][y-1],gs+ge+Graph[1][x][y-1],ge+Graph[2][x][y-1]) # M ######
Graph[0][x][y]=Grade(text1[x], text2[y])+max(Graph[0][x-1][y-1],Graph[1][x-1][y-1],Graph[2][x-1][y-1]) maxVal = 0
maxIndx = -1
for i in range(3):
if Graph[i][l1-1][l2-1]>maxVal:
maxVal=Graph[i][l1-1][l2-1]
maxIndx=i
return [Graph,maxIndx,maxVal] def trackBack(Graph,maxIndx,text1,text2):
x = len(text1)-1
y = len(text2)-1
otext1 = ''
otext2 = ''
Indx = maxIndx
while(1):
if Indx==0:
otext1 += text1[x]
otext2 += text2[y]
if x ==1:
break
if Graph[0][x][y]==Graph[1][x-1][y-1]+Grade(text1[x], text2[y]):
Indx = 1
elif Graph[0][x][y]==Graph[2][x-1][y-1]+Grade(text1[x], text2[y]):
Indx = 2
else:
Indx = 0
x -= 1
y -= 1
elif Indx==1:
otext1 += text1[x]
otext2 += '-'
if x == 1:
break
if Graph[1][x][y]==Graph[0][x-1][y]+ge+gs:
Indx = 0
elif Graph[1][x][y]==Graph[2][x-1][y]+ge+gs:
Indx = 2
else:
Indx = 1
x -= 1
else:
otext1 += '-'
otext2 += text2[y]
if y == 1:
break
if Graph[2][x][y]==Graph[0][x][y-1]+ge+gs:
Indx = 0
elif Graph[2][x][y]==Graph[1][x][y-1]+ge+gs:
Indx = 1
else:
Indx = 2
y -= 1 return [otext1[::-1],otext2[::-1]] def ImportScoreMatrix():
dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n')
symbolList = dataset[0].split()
for i in range(len(symbolList)):
symbolList[i]=[symbolList[i],i]
symbolList = dict(symbolList)
matrix = []
for i in range(1,len(dataset)):
matrix.append(dataset[i].split()[1:])
for l in range(len(matrix)):
for i in range(len(matrix[l])):
matrix[l][i]=int(matrix[l][i])
return [matrix,symbolList] if __name__ == '__main__':
[matrix,symbolList] = ImportScoreMatrix() # 打分矩阵 dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
text1 = ''+dataset[0]
text2 = ''+dataset[1]
l1 = len(text1)
l2 = len(text2)
[Graph,maxIndx,maxVal] = buildAlignmentGraph(text1, text2, l1, l2)
#print(Graph) [output_text1,output_text2] = trackBack(Graph,maxIndx,text1,text2)
print(maxVal)
print(output_text1)
print(output_text2)
Alignment with Affine Gap Penalties
6 * 一种线性空间的比对方法 Space-Efficient Sequence Alignment(分治+动态规划)
https://www.cs.rit.edu/~rlaz/algorithms20082/slides/SpaceEfficientAlignment.pdf
'''
Code Challenge: Implement LinearSpaceAlignment to solve the Global Alignment Problem for a large dataset.
>>>Input: Two long (10000 amino acid) protein strings written in the single-letter amino acid alphabet.
>>>Output: The maximum alignment score of these strings, followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix and indel penalty σ = 5.
------------
Sample Input:
PLEASANTLY
MEANLY
------------
Sample Output:
8
PLEASANTLY
-MEA--N-LY
------------
coder: Lo Kwongho in 2018.9
'''
from os.path import dirname
import numpy
#
indel = -5
negINF = -9999
#
#
def Grade(Symb1,Symb2):
Indx1 = symbolList[Symb1]
Indx2 = symbolList[Symb2]
return matrix[Indx1][Indx2] def ImportScoreMatrix():
dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n')
symbolList = dataset[0].split()
for i in range(len(symbolList)):
symbolList[i]=[symbolList[i],i]
symbolList = dict(symbolList)
matrix = []
for i in range(1,len(dataset)):
matrix.append(dataset[i].split()[1:])
for l in range(len(matrix)):
for i in range(len(matrix[l])):
matrix[l][i]=int(matrix[l][i])
return [matrix,symbolList]
#
def half_Alignment(v,w):
nv = len(v)
mw = len(w)
s = numpy.zeros(shape=(nv+1,2),dtype=int)
for i in range(nv+1):
s[i,0] = indel*i
if mw==0:
return s[:,0] #
for j in range(mw):
s[0,1]=s[0,0]+indel
for i in range(nv):
s[i+1,1]=max(s[i,1]+indel,s[i+1,0]+indel,s[i,0]+Grade(w[j],v[i]))
s[:,0]=s[:,1]
return s[:,1] def midEdge(v,w):
nv = len(v)
mw = len(w)
mid = int((mw-1)/2)
wl = w[:mid]
wr = w[mid+1:]
pre = half_Alignment(v,wl)
suf = half_Alignment(v[::-1],wr[::-1])[::-1]
hs = [pre[i]+suf[i]+indel for i in range(nv+1)]
ds = [pre[i]+suf[i+1]+Grade(w[mid],v[i]) for i in range(nv)]
maxhs = max(hs)
maxds = max(ds)
if maxhs>maxds:
return ( (hs.index(maxhs),mid) ,(hs.index(maxhs),mid+1) )
else:
return ( (ds.index(maxds),mid) ,(ds.index(maxds)+1,mid+1) ) def build_Alignment_track(v,w):
vn = len(v)
wm = len(w)
if vn==0 and wm==0:
return []
elif vn==0:
return ['-']*wm
elif wm==0:
return ['|']*vn
((x1,y1),(x2,y2)) = midEdge(v,w)
if x1==x2:
edge = ['-']
else:
edge = ['\\']
wleft = w[:y1]
wright = w[y2:]
vupper = v[:x1]
vbotm = v[x2:] upper_left_track = build_Alignment_track(vupper,wleft)
bottom_right_track = build_Alignment_track(vbotm,wright)
return upper_left_track+edge+bottom_right_track def trackToString(v,w,track):
vi = 0
wj = 0
outv = ''
outw = ''
score = 0
for i in track:
if i == '|':
outv += v[vi]
outw += '-'
score += indel
vi += 1
elif i == '-':
outv += '-'
outw += w[wj]
score += indel
wj += 1
else:
outv += v[vi]
outw += w[wj]
score += Grade(v[vi], w[wj])
vi += 1
wj += 1 return [score,outv,outw] def LinearSpaceAlignment(v,w):
track = build_Alignment_track(v,w)
[score,outv, outw] = trackToString(v,w,track)
print(score)
print(outv)
print(outw) if __name__ == '__main__':
dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
[matrix,symbolList] = ImportScoreMatrix()
v = dataset[0]
w = dataset[1]
LinearSpaceAlignment(v,w)
Linear-Space Alignment