''' Created on May 27, 2019 To execute the script, first copy it in the folder containing the input file calling-syntax: python script.py high_scoring_hits.fasta domain_positions.txt > output_sequence_file.fasta @author: mering_group ''' import sys ## first, open the file containing the protein sequences: input_file_handle = open(sys.argv[1]) ## this dictionary will store the protein sequences from the file: protein_sequence_dict = {} sequence = "" #### Empty string for line in input_file_handle: ## this loop is repeated for each line in the file l = line.strip() ## remove any 'white space' characters at the end of the line if l.startswith(">"): ## selects the lines starting with ">" if sequence!= "": protein_sequence_dict[uniprot_identifier] = sequence identifier_line = l.split(" ") ## splits the line containing the identifier into words. uniprot_identifier = identifier_line[0].strip(">") sequence = "" else: sequence = sequence + l.strip("\n") ## removes the newline character and joins the sequence without newline character for a particular protein ## For the last entry: protein_sequence_dict[uniprot_identifier] = sequence input_file_handle.close() ## ok, now we have read in all protein sequences, and stored them in a hash. ## next, open the file with the domain coordinates: domain_input_file_handle = open (sys.argv[2]) for line in domain_input_file_handle: if not line.startswith("#"): ###### removes the line starting with hash l = line.strip().split() ############# for each line, split it into the various words according to the file format target_name = l[0]; accession = l[1]; tlen = l[2] ; query_name =l[3]; domain_accession = l[4]; qlen = l[5]; e_value = l[6]; score = l[7] ; bias = l[8]; index_nr = l[9]; index_of = l[10] ; c_evalue = l[11]; i_evalue = l[12]; domain_score = l[13]; domain_bias = l[14]; hmm_from = l[15]; hmm_to = l[16]; ali_from = l[17]; ali_to = l[18];env_from = l[19]; env_to = l[20] ; acc = l[21]; # [Q1] Change the criteria to be use the domain score corrected_evalue = float (c_evalue) # [Q2] Change the threshold accordingly to only report # domain scores higher than 190 if corrected_evalue <= 0.000001: full_sequence = protein_sequence_dict[target_name] ## retrieve the corresponding protein sequence from our hash domain_start = int(env_from) domain_stop = int(env_to) # [Q3] Change this line such that only the domain part of the # full_sequence is used. String slicing is your friend domain_sequence = full_sequence print(">" + target_name + "." + env_from) print(domain_sequence) domain_input_file_handle.close() ## that's it, we're done.