Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
'''
Created on May 27, 2019
To execute the script, first copy it in the folder containing the input file
calling-syntax: python script.py high_scoring_hits.fasta domain_positions.txt > output_sequence_file.fasta
@author: mering_group
'''
import sys
## first, open the file containing the protein sequences:
input_file_handle = open(sys.argv[1])
## this dictionary will store the protein sequences from the file:
protein_sequence_dict = {}
sequence = "" #### Empty string
for line in input_file_handle: ## this loop is repeated for each line in the file
l = line.strip() ## remove any 'white space' characters at the end of the line
if l.startswith(">"): ## selects the lines starting with ">"
if sequence!= "":
protein_sequence_dict[uniprot_identifier] = sequence
identifier_line = l.split(" ") ## splits the line containing the identifier into words.
uniprot_identifier = identifier_line[0].strip(">")
sequence = ""
else:
sequence = sequence + l.strip("\n") ## removes the newline character and joins the sequence without newline character for a particular protein
## For the last entry:
protein_sequence_dict[uniprot_identifier] = sequence
input_file_handle.close()
## ok, now we have read in all protein sequences, and stored them in a hash.
## next, open the file with the domain coordinates:
domain_input_file_handle = open (sys.argv[2])
for line in domain_input_file_handle:
if not line.startswith("#"): ###### removes the line starting with hash
l = line.strip().split() ############# for each line, split it into the various words according to the file format
target_name = l[0]; accession = l[1]; tlen = l[2] ; query_name =l[3]; domain_accession = l[4];
qlen = l[5]; e_value = l[6]; score = l[7] ; bias = l[8];
index_nr = l[9]; index_of = l[10] ; c_evalue = l[11]; i_evalue = l[12];
domain_score = l[13]; domain_bias = l[14]; hmm_from = l[15]; hmm_to = l[16];
ali_from = l[17]; ali_to = l[18];env_from = l[19]; env_to = l[20] ;
acc = l[21];
# [Q1] Change the criteria to be use the domain score
corrected_evalue = float (c_evalue)
# [Q2] Change the threshold accordingly to only report
# domain scores higher than 190
if corrected_evalue <= 0.000001:
full_sequence = protein_sequence_dict[target_name] ## retrieve the corresponding protein sequence from our hash
domain_start = int(env_from)
domain_stop = int(env_to)
# [Q3] Change this line such that only the domain part of the
# full_sequence is used. String slicing is your friend
domain_sequence = full_sequence
print(">" + target_name + "." + env_from)
print(domain_sequence)
domain_input_file_handle.close()
## that's it, we're done.