Commit dbf813b8 authored by Mark Robinson's avatar Mark Robinson

add exam files

parent 5f473eab
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
'''
Created on May 27, 2019
To execute the script, first copy it into the folder containing the input file
calling-syntax: python script.py input_alignment_file.fasta > output_sequence_file.fasta
@author: mering_group
'''
import sys
def print_ungapped_alignment(alignment_path):
    """Print the FASTA alignment stored at *alignment_path* to standard output.

    Header lines (those starting with ">") are echoed unchanged.  Every other
    line is treated as sequence: it goes through the gap-removal step of
    exercise Q 3.b and is printed unless the result is whitespace only.

    Args:
        alignment_path: path of the BLAST generated alignment file.
    """
    # "with" closes the file even if an error interrupts the loop
    with open(alignment_path, "r") as input_file:
        for line in input_file:  # loop over each line of input file
            if line.startswith(">"):  # check for fasta header
                print(line, end='')  # print the fasta header
                continue
            # from here on the line is sequence
            ## Q 3.b modify using replace function to change "-" gap characters to ""
            seq_without_gaps = line  #add your code
            if seq_without_gaps.isspace():  # the new sequence is empty
                continue  # do not write the new sequence in the output file
            print(seq_without_gaps, end='')  # print the sequence without gaps


if __name__ == "__main__":
    print_ungapped_alignment(sys.argv[1])
'''
Created on May 27, 2019
To execute the script, first copy it into the folder containing the input file
calling-syntax: python script.py high_scoring_hits.fasta domain_positions.txt > output_sequence_file.fasta
@author: mering_group
'''
import sys
def read_protein_sequences(fasta_path):
    """Read a FASTA file and return a dictionary of its protein sequences.

    The key of each entry is the first space-separated word of the header
    line with ">" stripped from its ends; the value is the sequence joined
    into one string without newlines.  Sequence lines that appear before the
    first header are ignored, and an empty file yields an empty dictionary.

    Args:
        fasta_path: path of the file containing the protein sequences.

    Returns:
        dict mapping identifier -> sequence string.
    """
    protein_sequence_dict = {}
    uniprot_identifier = None  # no header line has been seen yet
    sequence_parts = []  # collected pieces of the current sequence
    with open(fasta_path) as input_file_handle:
        for line in input_file_handle:
            stripped = line.strip()  # remove surrounding 'white space' characters
            if stripped.startswith(">"):
                # a new header: store the entry collected so far, if any
                sequence = "".join(sequence_parts)
                if uniprot_identifier is not None and sequence != "":
                    protein_sequence_dict[uniprot_identifier] = sequence
                # the identifier is the first word of the header line
                uniprot_identifier = stripped.split(" ")[0].strip(">")
                sequence_parts = []
            else:
                sequence_parts.append(stripped)
    ## For the last entry:
    if uniprot_identifier is not None:
        protein_sequence_dict[uniprot_identifier] = "".join(sequence_parts)
    return protein_sequence_dict


def print_domain_sequences(protein_sequence_dict, domain_table_path):
    """Print a FASTA record for every domain hit that passes the threshold.

    Each non-comment line of the domain table is split on whitespace into the
    columns named below.  For hits that pass the threshold, a header of the
    form ">target_name.env_from" is printed, followed by the sequence.

    Args:
        protein_sequence_dict: dict mapping target name -> full sequence.
        domain_table_path: path of the file with the domain coordinates.

    Raises:
        KeyError: if a reported target name is missing from the dictionary.
        ValueError: if a data line has fewer than 22 columns.
    """
    with open(domain_table_path) as domain_input_file_handle:
        for line in domain_input_file_handle:
            if line.startswith("#"):  # skip the lines starting with hash
                continue
            fields = line.strip().split()  # split the line into its columns
            if not fields:  # a blank line carries no hit
                continue
            (target_name, accession, tlen, query_name, domain_accession,
             qlen, e_value, score, bias,
             index_nr, index_of, c_evalue, i_evalue,
             domain_score, domain_bias, hmm_from, hmm_to,
             ali_from, ali_to, env_from, env_to,
             acc) = fields[:22]
            # [Q1] Change the criterion to use the domain score
            corrected_evalue = float(c_evalue)
            # [Q2] Change the threshold accordingly to only report
            # domain scores higher than 190
            if corrected_evalue <= 0.000001:
                # retrieve the corresponding protein sequence from our dictionary
                full_sequence = protein_sequence_dict[target_name]
                domain_start = int(env_from)
                domain_stop = int(env_to)
                # [Q3] Change this line such that only the domain part of the
                # full_sequence is used. String slicing is your friend
                domain_sequence = full_sequence
                print(">" + target_name + "." + env_from)
                print(domain_sequence)


def extract_domain_sequences(fasta_path, domain_table_path):
    """Read all protein sequences, then report the selected domain hits."""
    ## first, read the file containing the protein sequences:
    protein_sequence_dict = read_protein_sequences(fasta_path)
    ## ok, now we have read in all protein sequences, and stored them in a
    ## dictionary. next, go through the file with the domain coordinates:
    print_domain_sequences(protein_sequence_dict, domain_table_path)
    ## that's it, we're done.


if __name__ == "__main__":
    extract_domain_sequences(sys.argv[1], sys.argv[2])
# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord
# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target
#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
gb|KTF85840.1| - 1164 Paxillin PF03535.13 202 7.5e-69 222.1 20.1 1 2 7.5e-69 7.5e-69 222.1 20.1 2 202 204 401 203 401 0.89 hypothetical protein cypCar_00025934 [Cyprinus carpio]
gb|KTF85840.1| - 1164 Paxillin PF03535.13 202 7.5e-69 222.1 20.1 2 2 9.2 9.2 -3.5 2.4 97 97 484 484 407 555 0.60 hypothetical protein cypCar_00025934 [Cyprinus carpio]
gb|RXN08385.1| - 1180 Paxillin PF03535.13 202 5.6e-65 209.5 19.6 1 1 2.6e-64 2.6e-64 207.3 19.6 11 202 248 424 238 424 0.85 paxillin isoform X3 [Labeo rohita]
ref|XP_016362386.1| - 533 Paxillin PF03535.13 202 1.9e-64 207.7 19.9 1 1 5e-64 5e-64 206.4 19.9 2 202 45 229 44 229 0.91 PREDICTED: paxillin-like [Sinocyclocheilus anshuiensis]
gb|KAF4113872.1| - 533 Paxillin PF03535.13 202 3.8e-64 206.7 21.9 1 1 1.1e-63 1.1e-63 205.3 21.9 2 202 45 229 44 229 0.90 hypothetical protein G5714_006417 [Onychostema macrolepis]
ref|XP_016389709.1| - 516 Paxillin PF03535.13 202 2.3e-63 204.2 20.2 1 1 6.2e-63 6.2e-63 202.8 20.2 2 202 45 229 44 229 0.91 PREDICTED: paxillin-like [Sinocyclocheilus rhinocerous]
gb|ROL53783.1| - 1210 Paxillin PF03535.13 202 2.5e-63 204.1 20.7 1 3 0.74 0.74 0.1 0.3 108 155 87 134 79 141 0.85 Paxillin [Anabarilius grahami]
gb|ROL53783.1| - 1210 Paxillin PF03535.13 202 2.5e-63 204.1 20.7 2 3 2.5e-63 2.5e-63 204.1 20.7 2 202 206 390 205 390 0.91 Paxillin [Anabarilius grahami]
gb|ROL53783.1| - 1210 Paxillin PF03535.13 202 2.5e-63 204.1 20.7 3 3 2.5 2.5 -1.6 0.6 92 109 465 482 400 536 0.72 Paxillin [Anabarilius grahami]
ref|XP_016304400.1| - 454 Paxillin PF03535.13 202 2.5e-62 200.8 19.4 1 1 6.9e-62 6.9e-62 199.4 19.4 2 202 45 229 44 229 0.90 PREDICTED: paxillin-like isoform X3 [Sinocyclocheilus anshuiensis]
ref|XP_016304315.1| - 533 Paxillin PF03535.13 202 3.8e-62 200.2 19.4 1 1 9.8e-62 9.8e-62 198.9 19.4 2 202 45 229 44 229 0.90 PREDICTED: paxillin-like isoform X2 [Sinocyclocheilus anshuiensis]
ref|XP_026101848.1| - 734 Paxillin PF03535.13 202 4.3e-62 200.1 23.2 1 2 2.1 2.1 -1.3 0.2 58 93 71 106 56 110 0.74 paxillin-like isoform X4 [Carassius auratus]
ref|XP_026101848.1| - 734 Paxillin PF03535.13 202 4.3e-62 200.1 23.2 2 2 4.3e-62 4.3e-62 200.1 23.2 2 202 246 430 245 430 0.90 paxillin-like isoform X4 [Carassius auratus]
ref|XP_016418196.1| - 1085 Paxillin PF03535.13 202 5e-62 199.8 20.7 1 1 5e-62 5e-62 199.8 20.7 2 202 45 229 44 229 0.90 PREDICTED: proteoglycan 4-like [Sinocyclocheilus rhinocerous]
ref|XP_026096319.1| - 533 Paxillin PF03535.13 202 5.1e-62 199.8 18.5 1 1 1.4e-61 1.4e-61 198.4 18.5 2 202 45 229 44 229 0.90 paxillin-like isoform X4 [Carassius auratus]
ref|XP_026101829.1| - 1092 Paxillin PF03535.13 202 8.3e-62 199.1 23.2 1 1 8.3e-62 8.3e-62 199.1 23.2 2 202 45 229 44 229 0.90 LIM domain-binding protein 3-like isoform X2 [Carassius auratus]
ref|XP_026096317.1| - 681 Paxillin PF03535.13 202 8.3e-62 199.1 18.5 1 1 2.2e-61 2.2e-61 197.7 18.5 2 202 193 377 192 377 0.90 paxillin-like isoform X2 [Carassius auratus]
ref|XP_026101823.1| - 1293 Paxillin PF03535.13 202 1.1e-61 198.8 23.2 1 2 3.6 3.6 -2.2 0.1 58 93 71 106 62 114 0.74 proteoglycan 4-like isoform X1 [Carassius auratus]
ref|XP_026101823.1| - 1293 Paxillin PF03535.13 202 1.1e-61 198.8 23.2 2 2 1.1e-61 1.1e-61 198.8 23.2 2 202 246 430 245 430 0.90 proteoglycan 4-like isoform X1 [Carassius auratus]
ref|NP_963882.1| - 533 Paxillin PF03535.13 202 1.1e-61 198.7 20.9 1 1 3e-61 3e-61 197.3 20.9 2 202 45 229 44 229 0.89 paxillin a [Danio rerio]
ref|XP_016304258.1| - 622 Paxillin PF03535.13 202 1.3e-61 198.5 19.4 1 3 1.3e-61 1.3e-61 198.5 19.4 2 202 45 229 44 229 0.90 PREDICTED: mucin-5AC-like isoform X1 [Sinocyclocheilus anshuiensis]
ref|XP_016304258.1| - 622 Paxillin PF03535.13 202 1.3e-61 198.5 19.4 2 3 3.7 3.7 -2.2 3.1 97 106 307 316 237 380 0.63 PREDICTED: mucin-5AC-like isoform X1 [Sinocyclocheilus anshuiensis]
ref|XP_016304258.1| - 622 Paxillin PF03535.13 202 1.3e-61 198.5 19.4 3 3 6.5 6.5 -3.0 2.2 116 151 393 428 358 435 0.74 PREDICTED: mucin-5AC-like isoform X1 [Sinocyclocheilus anshuiensis]
ref|XP_026096316.1| - 724 Paxillin PF03535.13 202 2.5e-61 197.6 18.5 1 2 4.9 4.9 -2.6 0.1 17 179 98 137 84 154 0.50 paxillin-like isoform X1 [Carassius auratus]
ref|XP_026096316.1| - 724 Paxillin PF03535.13 202 2.5e-61 197.6 18.5 2 2 2.5e-61 2.5e-61 197.6 18.5 2 202 236 420 235 420 0.90 paxillin-like isoform X1 [Carassius auratus]
ref|XP_016096561.1| - 1080 Paxillin PF03535.13 202 5.3e-60 193.2 18.5 1 2 5.3e-60 5.3e-60 193.2 18.5 2 202 45 227 44 228 0.89 PREDICTED: calphotin-like [Sinocyclocheilus grahami]
ref|XP_016096561.1| - 1080 Paxillin PF03535.13 202 5.3e-60 193.2 18.5 2 2 1.8 1.8 -1.2 0.3 91 129 237 279 229 299 0.68 PREDICTED: calphotin-like [Sinocyclocheilus grahami]
ref|XP_018965205.1| - 423 Paxillin PF03535.13 202 3.5e-50 161.2 8.1 1 2 3.5e-50 3.5e-50 161.2 8.1 83 202 1 119 1 119 0.97 PREDICTED: paxillin-like [Cyprinus carpio]
ref|XP_018965205.1| - 423 Paxillin PF03535.13 202 3.5e-50 161.2 8.1 2 2 7.3 7.3 -3.1 0.1 84 97 167 180 136 182 0.66 PREDICTED: paxillin-like [Cyprinus carpio]
ref|XP_026101837.1| - 982 Paxillin PF03535.13 202 2.3e-46 148.7 9.2 1 1 2.3e-46 2.3e-46 148.7 9.2 83 202 1 119 1 119 0.93 LIM domain-binding protein 3-like isoform X3 [Carassius auratus]
ref|XP_026096321.1| - 423 Paxillin PF03535.13 202 2.3e-46 148.7 5.9 1 2 2.3e-46 2.3e-46 148.7 5.9 83 202 1 119 1 119 0.96 paxillin-like isoform X5 [Carassius auratus]
ref|XP_026096321.1| - 423 Paxillin PF03535.13 202 2.3e-46 148.7 5.9 2 2 8.3 8.3 -3.3 0.0 84 97 167 180 142 182 0.70 paxillin-like isoform X5 [Carassius auratus]
#
# Program: hmmsearch
# Version: 3.3 (Nov 2019)
# Pipeline mode: SEARCH
# Query file: Paxillin.hmm
# Target file: high_scoring_hits.fasta
# Option settings: hmmsearch --domtblout domain_positions.txt Paxillin.hmm high_scoring_hits.fasta
# Current dir: /work/bio334_spring2020/data/exam_test
# Date: Tue May 26 07:54:43 2020
# [ok]
'''
Created on May 27, 2019
To execute the script, first copy it into the folder containing the input file
calling-syntax: python script.py input_alignment.fasta > output_alignment.fasta
author: von Mering group
'''
import sys
sequences_to_extract = 100  # change to select only the top n sequences


def print_top_sequences(alignment_path, max_sequences):
    """Print the first *max_sequences* FASTA records of an alignment file.

    Lines are copied to standard output until a header line (starting with
    ">") is reached after *max_sequences* headers have already been printed;
    reading stops at that point.

    Args:
        alignment_path: path of the alignment file to read.
        max_sequences: number of records to copy.
    """
    extracted_sequences = 0
    # "with" makes sure the file is closed, also when the loop stops early
    with open(alignment_path) as sequence_file:
        for line in sequence_file:
            if line.startswith('>'):
                if extracted_sequences >= max_sequences:
                    break  # enough records were written already
                extracted_sequences += 1
            print(line, end='')


if __name__ == "__main__":
    print_top_sequences(sys.argv[1], sequences_to_extract)
This diff is collapsed.
This diff is collapsed.
(
(
gi|1024921483|ref|XP_016304315.1|.44:0.00000,
gi|1024921481|ref|XP_016304258.1|.44:0.00000)
:0.00000,
gi|1024921485|ref|XP_016304400.1|.44:0.00000,
(
gi|1025160681|ref|XP_016418196.1|.44:0.00024,
(
gi|1020514649|ref|XP_016096561.1|.44:0.01958,
(
(
(
(
gi|1020470066|ref|XP_016140113.1|.37:0.00000,
gi|1020470064|ref|XP_016140112.1|.191:0.00000)
:0.09260,
(
gi|1024927658|ref|XP_016362386.1|.44:0.00435,
gi|1025331920|ref|XP_016389709.1|.44:0.00640)
:0.00955)
:0.01653,
gi|1101598532|ref|XP_018964430.1|.220:0.07487)
:0.02661,
(
gi|966650938|gb|KTF85840.1|.203:0.00480,
(
(
Query_30707.1:0.07103,
gi|317419106|emb|CBN81144.1|.50:0.27643)
:0.03756,
gi|1101600035|ref|XP_018965205.1|.1:0.01678)
:0.02341)
:0.01980)
:0.02896)
:0.00236)
:0.01052);
>gb|KTF85840.1|.203
SSPPQLTSPPAQTLNGSWVEKPESKHSSTQCLKVKQLFFSFLKSFSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPAAVMSSSLGSNLSELDRLLLELNAVQHSTPSFATEEAYPPKPASNTQRYVPENGVSSVVKAAPPKIEKPKRSAPGRGIEDVRPSVESLLNELESSVPAPAPAPSVPVVPELREAQEETPAQQ
>gb|RXN08385.1|.238
VSSPPRVTSPLAQTLNGSWVEKPESKPSSTQPFTSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPSAIMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEETYPPKPASNTQRYVPENGVSSVVKAPPPKIEKPKRNVPGRGIEDVRPSVESLLNELESSVPAPAPAPSVPVVHELREVQEETPAQQ
>ref|XP_016362386.1|.44
SSPPRVASPPAQTLNGSWVEKPESKHSSTQSFSSAPKSTSPRVSQSEEEHVYSFPNKQKTTDSPAAVMSSLLGSNLSELDRLLLELNAVQHSTPAFPTEETYPPKPASNAQRYVPENGVSSVVKAPPPKIEKPKRNAPGRGIEDVRPSVENLLNELESSVPAPAPAPAVPVVPELREVQEETPCQQ
>gb|KAF4113872.1|.44
SSPPRVTSPPAQTLNGSWVEKPESKHSSTQSVSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPAAVMSSSLGSNLSELDRLLLELNAVQHSTPAFPTEETYPHKPASNAQCYVPENGVSSVVKAPPSKMEKPKRSAPGRGIEDVRPSVESLLNELESSVPAPVPAPAVPVVPELREVQEETPAQQ
>ref|XP_016389709.1|.44
SSPPRVASPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPAAVMSSLLGSNLSELDRLLLELNAVQHSTPAFPTEETYPPKPASNAQRYVPENGVSSVVKAPPPKIEKPKRNAPGRGIEDVTPSVENLLNELESSVPAPAPAPAVPVVPELREVQEETPCQQ
>gb|ROL53783.1|.205
SSPPRVTSPPAETLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQTDEEHVYSFPNKQKTTDSPTAVMSSSLGSNLSELDRLLLELNAVQHSTPSFATEETYPSKPASNAQRYVPENGVSSGVKAAPPKIDKPKRSAPGRGIEDVRPSVESLLNELESSVPAPAPAPSVPMVPELREVQEETPTQQ
>ref|XP_016304400.1|.44
SSPPRLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPTAVMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYPPKPASNTQCYVPENGVSSVVKAPLPKIEKPKRSAHGWGIEDVRPSVESLLNELESSVPAPVTAPSVPVVPELREVQEETTAQQ
>ref|XP_016304315.1|.44
SSPPRLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPTAVMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYPPKPASNTQCYVPENGVSSVVKAPLPKIEKPKRSAHGWGIEDVRPSVESLLNELESSVPAPVTAPSVPVVPELREVQEETTAQQ
>ref|XP_026101848.1|.245
SSPPPLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEAEHVYSFPNKQKTTDPPAAVMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYQPKPASNTQRYVPENGVSSVVKAPPPKLEKPKRSAPGRGIEEVRPSVESLLNELESSVPAPAAAPSGPVVPESREVQEETPAQQ
>ref|XP_016418196.1|.44
SSPPRLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPTAVMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYPPKPASNTQCYVPENGVSSVVKAPLPKIEKPKRSAPGRGIEDVRPSVESLLNELESSVPAPVTAPSVPVVPELREVQEETTAQQ
>ref|XP_026096319.1|.44
SSPPRVASPPAQTLNGSWVEKPESKHSSTQSFSSAPKTASPRVSQSEDEHVYSFPNKQKTTDSPTAIMSSSLGSNLSELDRLLLELNAVQHSTPAFPTEETYPPKPASNAQRYVPENGVSSVVKAPPPMIEKPKRGAPGRGIEVVRPSMESMLHELESSVPAPAPAPAVPVVPEMREFQEETPTQQ
>ref|XP_026101829.1|.44
SSPPPLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEAEHVYSFPNKQKTTDPPAAVMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYQPKPASNTQRYVPENGVSSVVKAPPPKLEKPKRSAPGRGIEEVRPSVESLLNELESSVPAPAAAPSGPVVPESREVQEETPAQQ
>ref|XP_026096317.1|.192
SSPPRLASPPAQTLNGSWVEKPESKHSSTQSFSSAPKTASPRVSQSEDEHVYSFPNKQKTTDSPTAIMSSSLGSNLSELDRLLLELNAVQHSTPAFPTEETYPPKPASNAQRYVPENGVSSVVKAPPPMIEKPKRGAPGRGIEVVRPSMESMLHELESSVPAPAPAPAVPVVPEMREFQEETPTQQ
>ref|XP_026101823.1|.245
SSPPPLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEAEHVYSFPNKQKTTDPPAAVMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYQPKPASNTQRYVPENGVSSVVKAPPPKLEKPKRSAPGRGIEEVRPSVESLLNELESSVPAPAAAPSGPVVPESREVQEETPAQQ
>ref|NP_963882.1|.44
SSPPRVISPPAETLNGSWVEKPESKHSSTQSFNSAPKSSSPRVSQSEEEHVYSFPNKQKSIESPTAVMNSSLGSNLSELDRLLLELNAVQHSTPSFPAEETYPPKPASNTQRYVPENGVSSVVKAAPPKIEKPKRNIPAKVIEEVRPSVESLLNQLESSVPAAVPVSSVPMVSELRGVQEETPAQQ
>ref|XP_016304258.1|.44
SSPPRLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPTAVMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYPPKPASNTQCYVPENGVSSVVKAPLPKIEKPKRSAHGWGIEDVRPSVESLLNELESSVPAPVTAPSVPVVPELREVQEETTAQQ
>ref|XP_026096316.1|.235
SSPPRVASPPAQTLNGSWVEKPESKHSSTQSFSSAPKTASPRVSQSEDEHVYSFPNKQKTTDSPTAIMSSSLGSNLSELDRLLLELNAVQHSTPAFPTEETYPPKPASNAQRYVPENGVSSVVKAPPPMIEKPKRGAPGRGIEVVRPSMESMLHELESSVPAPAPAPAVPVVPEMREFQEETPTQQ
>ref|XP_016096561.1|.44
SSPPRLTSPPAQTLNGSWVEKPESKHSSTQSFSSAPKSASPRVSQSEEEHVYSFPNKQKTTDSPTAIMSSSLGSNLSELDRLLLELNAVQHSTPSFPTEEAYPPKPASNTQCYVPENGVLSVVKAPLPKIEKPKRSAPGRGIEDVRPSVERLLNELESSVPAPVTAPSVPVVPEFVQEETTAQQQ
>Seq_of_Interest
MNSSLGSNLSELDRLLLELNAVQHSTPSFPAEETYPPKPASNTQRYVPENGVSSVVKAA
PPKIEKPKRNIPAKVIEEVRPSVESLLNQLESSVPAAVPVSSVPMVSELRGVQEETPAQ
QQARISASSATRELDELMASLSDFKVQSNVNIAETFTIIHHLSYKCLIEFLNMIYCTTT
QSTC
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment