Commit d552ed2f authored by Can Ulutekin Alvarez's avatar Can Ulutekin Alvarez

Auto-saving for can.ulutekinalvarez on branch master from commit 8f4c6c43

parent 8f4c6c43
Pipeline #77087 passed with stage
in 13 minutes and 25 seconds
This diff is collapsed.
import sys # import sys module for open/close methods
def poly_calc(sequences):
size = len(sequences)
poly_sites = set()
for base_position in range(len(sequences[0])):
poly_bases = set()
poly_bases.add(sequences[0][base_position])
for seq in sequences:
if seq[base_position] != sequences[0][base_position]:
poly_bases.add(seq[base_position])
poly_sites.add(base_position)
if len(poly_bases) > 1:
seg_or_not = "segregated"
else:
seg_or_not = "not segregated"
print(poly_bases, end = "\t")
print(seg_or_not)
return len(poly_sites)
#-----------------------------------------------------------------------------------------------------------------
sequences = []
f = open(sys.argv[1])
for line in f:
if not line[0] == ">":
sequences.append(line.rstrip())
f.close()
total_seg_sites = 0 # Initialize a variable to store the total number of segregating sites
total_seg_sites = poly_calc(sequences)
print("total segregating sites =", total_seg_sites) # Show the total number of segregating sites
\ No newline at end of file
import sys
import math
def poly_site_calc(sequences):
size = len(sequences)
poly_sites = set()
for base_position in range(len(sequences[0])):
for seq in sequences:
if seq[base_position] != sequences[0][base_position]:
poly_sites.add(base_position)
return len(poly_sites)
def pi_calc(sequences):
size = len(sequences)
pi_list = []
for i in range(size):
for j in range(i+1, size):
non_equal = 0
for base in range(len(sequences[i])):
if sequences[i][base] != sequences[j][base]:
non_equal += 1
pi_list.append(non_equal/len(sequences[i]))
combinations = size * (size - 1) / 2
pi_value = sum(pi_list) / combinations
return pi_value
def pi_calc2(sequences):
size = len(sequences)
pi_list = []
for i in range(size):
for j in range(i+1, size):
non_equal = 0
for base in range(len(sequences[i])):
if sequences[i][base] != sequences[j][base]:
non_equal += 1
pi_list.append(non_equal/len(sequences[i]))
combinations = size * (size - 1) / 2
result = sum(pi_list) * len_sequence
return result
def teta_calc(sequences):
size = len(sequences)
poly_sites = set()
for i in range(1, size):
for base in range(len(sequences[i])):
if sequences[i][base] != sequences[0][base]:
poly_sites.add(base)
S = len(poly_sites) / len(sequences[0])
normalizer = 0
for i in range(len(sequences) - 1):
normalizer += 1 / (i+1)
teta = S / normalizer
return teta
def Tajimas_D(pi, teta, sequences):
d = pi - teta
dd = d * len(sequences[0])
a1 = sum([1.0/i for i in range(1, len(sequences))])
a2 = sum([1.0/i**2 for i in range(1, len(sequences))])
b1 = (len(sequences) + 1) / (3 * (len(sequences) - 1))
b2 = 2 * (len(sequences)**2 + len(sequences) + 3) / (9 * len(sequences) * (len(sequences) - 1))
c1 = b1 - (1 / a1)
c2 = b2 - (len(sequences) + 2) / (a1 * len(sequences)) + a2 / (a1**2)
e1 = c1 / a1
e2 = c2 / (a1**2 + a2)
total_seg_sites = poly_site_calc(sequences)
td = dd / (e1 * total_seg_sites + e2 * total_seg_sites * (total_seg_sites-1))**(1/2)
return td
#-----------------------------------------------------------------------------------------------------------------
# Load a FASTA file
sequences = []
f = open(sys.argv[1])
for line in f:
if not line[0] == ">":
sequences.append(line.rstrip())
f.close()
len_sequence = len(sequences[0])
# Calculate the total number of different nucleotides
total_diff = pi_calc2(sequences)
# Calculate the total number of segregating sites
total_seg_sites = poly_site_calc(sequences)
# Calculate pi and d
num_sequences = len(sequences)
combination = num_sequences*(num_sequences-1)/2.0
pi = pi_calc(sequences)
estimated_pi = teta_calc(sequences)
sum_ = sum([1.0/i for i in range(1, num_sequences)])
s = float(total_seg_sites)/len_sequence
d = pi - estimated_pi
PI = pi*len_sequence
dd = PI - total_seg_sites/sum_
td = Tajimas_D(pi, estimated_pi, sequences)
print("Total differences =", total_diff)
print("nC2 =", combination)
print("Lenght of sequence =", len_sequence)
print("Nucleotide diversity pi =", pi)
print("Pair-wise difference PI =", PI)
print("Number of SNP sites (total segregating sites) S =", total_seg_sites)
print("Frequency of SNP sites s = S/sequence_length =", s)
print("sum_{i=1}^{n-1}(1/i) =", sum_)
print("estimated pi = s/sum(1/i) =", estimated_pi)
print("d = pi - estimated_pi =", d)
print("Tajima's D=", td)
\ No newline at end of file
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
......@@ -18,6 +18,7 @@ for line in input_file: #loop over each line of input file
else: # when the line is the sequence
## Q 3.b modify using replace function to change "-" gap characters to ""
seq_without_gaps = line #add your code
seq_without_gaps = seq_without_gaps.replace("-", "")
if seq_without_gaps.isspace(): #checks if the new sequence is empty
pass #do nothing, i.e. do not write the new sequence in the output file
......
......@@ -9,7 +9,7 @@ import sys
sequence_file = open(sys.argv[1]) # read the alignment file
sequences_to_extract = 100 # change to select only the top n sequences
sequences_to_extract = 21 # change to select only the top n sequences
extracted_sequences = 0
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment