1
2 """
3 Created 2012
4 core Script for the generation of the artificial reference genome
5
6 The functions purpose is to to go through a list of positions and find balanced mutations
7 which fulfill the demands on the artificial reference genome. Once a initial start positions is
8 randomly selected all possible triplets with hamming distance 1 are generated and looked up
9 in a dictionary which contains all triplet positions in the input genome. If a suitable partner
10 is found for the initial mutation the next start positions is chosen randomly. Else: try all other
11 triplets with hamming distance 1 until no one is left. This process can be accelerated by allowing
12 unbalanced mutations, but this will cause differences in the NUC/AA distribution and the AA neighborhood.
13
14
15 @author: Sven Giese
16 """
17
18 import random as r
19 import Prep as INI
20
21
22
24 """
25 Returns a random mutation for a given AA and its Codon(DNA). The mutation is done in a way which supports the equilibrium
26 of the nucleotide distribution by only regarding hamming distance=1 Codons as possible mutations
27
28 @type AA: string
29 @param AA: Single AA.
30 @type Codon: string
31 @param Codon: 3-letter dna code.
32 @rtype: list,list
33 @return: A list of all valid mutations (triplet) and the coresponding AA.
34 """
35 temp_mutationlist = []
36 '''create a list of possible triplets within hamming distance 1 '''
37 for item in INI.genetic_code.keys():
38 isvalid = INI.isvalidtriplet(item,Codon)
39 ''' Hamming distance 1, AA is not equal to the given AA,forbid mutation to stopcodon '''
40 if (isvalid == True and AA !=INI.genetic_code[item] and INI.genetic_code[item]!="*"):
41 temp_mutationlist.append(item)
42
43
44 aalist = []
45
46 for item in temp_mutationlist:
47 if (item in INI.genetic_code):
48 aalist.append(INI.genetic_code[item])
49 else:
50 aalist.append("n")
51
52 return(temp_mutationlist,aalist)
53
54
56 """
57 Given two triplets, returns the differences between them plus the position
58
59 @type triplet_old: string
60 @param triplet_old: AA triplet.
61 @type triplet_new: string
62 @param triplet_new: AA triplet.
63 @rtype: Char,Char,int
64 @return: The new aminoacid, the old aminoacid and the position.
65 """
66 for i in range(0,3):
67 if (triplet_new[i]!=triplet_old[i]):
68
69 return (triplet_new[i],triplet_old[i],i)
70
71
73 """
74 Checks if a position is valid for mutation. It queries all neighboring positions (iprime +-distance) to check whether there already was a mutation in pdic
75
76 @type pdic: dictionary
77 @param pdic: Diciontary containing mutations and start/ stop codons..
78 @type iprime: int
79 @param iprime: Position of the prospective mutation (DNA level)
80 @type distance: int
81 @param distance: User defined parameter which limits the distance between two mutations.
82 @rtype: Bool
83 @return: Boolean which decides if the position is valid (1= yes,0 = no)
84 """
85
86
87 distance = distance-2
88
89 istforbidden = 0
90 for o in range(-distance,distance+2,1):
91 if (iprime+o in pdic):
92
93
94 if((pdic[iprime+o]=="E") or (pdic[iprime+o]=="S")):
95 if((o >3) or (o <-3)):
96 pass
97 else:
98 istforbidden = 1
99 break
100 else:
101 istforbidden = 1
102 break
103 else:
104 pass
105
106 return(istforbidden)
107
108
109
110
111 -def mutate_random(DNA,AminoAcid,distance,pdic,rev,header,Random,outputpath):
112 """
113 Mutates a given DNA(AminoAcid) Genomesequence on several positions (distance based on DISTANCE var. If one mutation is done
114 a compareable Triplet is searched to "reverse" the changes made in AA distribution, N distribution, AA neighborhood
115
116 @type DNA: list
117 @param DNA: DNA sequence of the reference genome.
118 @type AminoAcid: list
119 @param AminoAcid: AA sequence of the reference genome.
120 @type rev: Bool
121 @param rev: Boolean which decides if unbalanced mutations are allowed (only initial mutation is performed)
122 @type pdic: dictionary
123 @param pdic: Diciontary containing mutations and start/ stop codons..
124 @type header: string
125 @param header: Header for the resulting artificial reference file (fasta format).
126 @type Random: Bool
127 @param Random: Boolean for choosing on of the mutation modes (linear = 0,random = 1)
128 @type distance: int
129 @param distance: User defined parameter which limits the distance between two mutations.
130 @rtype: list
131 @return: Artificial reference genome sequence.
132 """
133
134 start = []
135 both = []
136 fobj2= open(outputpath+header+"_CompleteLog.txt","a")
137 fobj2.write("BalancedMutation"+"\t"+"NewAA" + "\t" + "OldAA"+"\t"+"NewAAPos"+"\t"+"OldAAPos" +"\t"+ "NewDNA"+"\t"+ "OldDNA"+ "\t"+"NewDNAPos"+"\t"+"OldDNAPos"+"\n")
138 fobj2.close()
139
140
141
142 samplespace = []
143 for i in range (2,len(AminoAcid),distance/3):
144 samplespace.append(i)
145
146
147
148 if (Random ==1):
149 r.shuffle(samplespace)
150 else:
151 pass
152
153 dna_list = list(DNA)
154 AminoAcid_list = list(AminoAcid)
155
156 '''the lookup dictionary for the aa triplets '''
157 lookup_dic = INI.createdic(AminoAcid)
158
159
160 gotit=False
161
162 succ_counter = 0
163 fail_counter = 0
164 skip = 0
165
166 ''' Main loop over the AminoAcid'''
167 for i in samplespace:
168 ''' no triplet left --> break '''
169 if(i+2 >len(AminoAcid)):
170 print("\t(finished...exceeded length of AA)")
171 continue
172
173 ''' AA which is going to be mutated'''
174 AA = AminoAcid_list[i]
175
176 '''index for dna : i*3 --> AminoAcid --> DNA
177 #not i*3+3 because i starts at AA 2 since we need a right and left neighbor'''
178 iprime = i*3
179
180 '''AA and corresponding DNA triplet for the middle AA '''
181 AA_triplet= AminoAcid_list[i-1]+AminoAcid_list[i]+AminoAcid_list[i+1]
182 DNA_triplet = DNA[iprime:iprime+3]
183
184
185 mutationsliste,aaliste = getMutation(AA,DNA_triplet)
186
187
188
189 val = isvalidposition(pdic, iprime, distance)
190 if (val ==1):
191 skip+=1
192 fobj2= open(outputpath+header+"_CompleteLog.txt","a")
193 fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(skipped)'"+"\n")
194 fobj2.close()
195 continue
196
197 else:
198 pass
199
200
201 for q,item in enumerate(mutationsliste):
202
203 if gotit==True:
204 break
205 else:
206 pass
207
208 ''' old and new variables for before/after the mutation '''
209 new_triplet = mutationsliste[q]
210 new_AA = aaliste[q]
211 new_N,old_N,position = getdifference(DNA_triplet,new_triplet)
212 new_AA_triplet = AA_triplet[0]+new_AA+AA_triplet[2]
213 tempdic = pdic
214 tempdic[iprime+position]="M"
215
216 if (new_AA_triplet in lookup_dic):
217 '''templist--> contains all starting positions of the "new_AA_triplet" which we want to substitute back '''
218 templist = lookup_dic[new_AA_triplet]
219
220
221
222 tempposition = [iprime+position,"M"]
223 for l in range(0,len(templist)):
224 posi = templist[l]
225
226 ''' suitable dna position found? '''
227 if (new_triplet == dna_list[posi*3+3]+dna_list[posi*3+3+1]+dna_list[posi*3+3+2]):
228 val = isvalidposition(tempdic, posi*3+3+position, distance)
229
230 if (val ==1):
231 skip+=1
232 continue
233 else:
234 pass
235
236 '''back substitution & do subs on 1st position'''
237 pdic[posi*3+3+position]="R"
238 dna_list[posi*3+3+position]= old_N
239
240 pdic[iprime+position]="M"
241 dna_list[iprime+position]= new_N
242
243 AminoAcid_list[i]= new_AA
244 AminoAcid_list[posi+1]= AA
245
246 gotit = True
247 succ_counter+=1
248
249 lookup_dic[new_AA_triplet].remove(posi)
250
251 '''writing the log file '''
252 fobj= open(outputpath+header+"_CompleteLog.txt","a")
253 fobj.write(str(1)+"\t"+AA_triplet + "\t" + new_AA_triplet+"\t"+str(i)+"\t"+str(posi) +"\t"+ DNA_triplet+"\t"+ str(new_triplet)+ "\t"+str(iprime+position)+"\t"+str(posi*3+3+position)+"\n")
254 fobj.close()
255
256
257 start.append(iprime+position)
258 both.extend([iprime+position,posi*3+3+position])
259 break
260
261
262 else:
263 continue
264
265
266 if (gotit==False):
267 fobj2= open(outputpath+header+"_CompleteLog.txt","a")
268 fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(tried)'"+"\n")
269 fobj2.close()
270 fail_counter+=1
271
272 if (rev==0):
273 pdic[iprime+position]="M"
274 dna_list[iprime+position]= new_N
275 AminoAcid_list[i]= new_AA
276 start.append(iprime+position)
277 both.extend([iprime+position])
278 elif (gotit==True):
279 gotit = False
280
281
282 print("\r\n########Some stats:########")
283 print("DNA length:\t" + str(len(DNA)))
284 print("max substitutions:\t" + str(len(DNA)/distance))
285 print("#Balanced Mutations:\t" + str(succ_counter))
286
287
288 return ("".join(dna_list))
289