# Filename1 = info_entropy1.py
"""
The citation for these programs is: Thompson WA, Martwick A
and Weltman JK. (2009) Decimative Multiplication of Entropy Arrays, with
Application to Influenza. Entropy 11(3):351-359.
Compute cumulative entropy for input nucleotide sequences,
This suite of seven programs should work together as is.
You should:
(1) enter input file name and directory for f_in in
info_entropy1.py;
(2) enter the number of sequences (ns_in) in your dataset in
info_entropy1.py;
(3) enter the number of positions (np_in) in the nucleotide
sequence in info_entropy1.py;
(4) save the files, denoted by the "****"
separators, using the indicated filenames.
"""
from scipy import *
from pylab import *
from numpy import *
import cumulator
import v_protein # calculates entropy vector for encoding
nucleotide seq
f_in = "C:\Python25\my_file.fa" # Your input file
identification goes here.
# ns_in = number of seqs in dataset
ns_in = nnn # input ns_in for your dataset.
# np = number of nucleotide positions per sequence.
Sequences should be of equal length.
np_in = nnn # input np_in for your dataset.
# Calculate entropy vector for protein gene.
v_in = v_protein.main(f_in, ns_in, np_in)
# Calculate decimated vectors from entropy vector.
z=cumulator.main(v_in, np_in)
in_001 = z[0]
in_010 = z[1]
in_100 = z[2]
plot(in_001,"-", lw=3, color = "red",
label = "001")
plot(in_010,"-", lw=3, color = "cyan",
label = "010")
plot(in_100,"-", lw=3, color = "black",
label = "100")
legend(loc = 0) # optimize legend position.
xlabel("Nucleotide Position")
ylabel("Cumulative Information Entropy (bits)")
title("My Nucleotide Sequence")
show()
****************************************************************************
"""
Filename2 = cumulator.py
Determines cumulative sum of input entropy vector.
Decimation is performed.
"""
def main(v_seq, np_seq):
import numpy as
npy
import decimators
as decs
z = decs.main()
refseq001 = z[0]
refseq010 = z[1]
refseq100 = z[2]
seq_111 = v_seq
seq_001 =
seq_111*refseq001[0:np_seq]
seq_010 =
seq_111*refseq010[0:np_seq]
seq_100 =
seq_111*refseq100[0:np_seq]
seq_111_cum =
npy.zeros(np_seq)
seq_001_cum =
npy.zeros(np_seq)
seq_010_cum =
npy.zeros(np_seq)
seq_100_cum =
npy.zeros(np_seq)
for ndx3 in
range(1, np_seq):
cumul_111 =
seq_111_cum[ndx3 - 1]
cumul_001 =
seq_001_cum[ndx3 - 1]
cumul_010 =
seq_010_cum[ndx3 - 1]
cumul_100 =
seq_100_cum[ndx3 - 1]
seq_111_cum[ndx3] = cumul_111 + seq_111[ndx3]
seq_001_cum[ndx3] = cumul_001 + seq_001[ndx3]
seq_010_cum[ndx3] = cumul_010 + seq_010[ndx3]
seq_100_cum[ndx3] = cumul_100 + seq_100[ndx3]
return
seq_001_cum, seq_010_cum, seq_100_cum
****************************************************************************
# Filename3 = decimators,py
# decimating arrays
from numpy import *
def main():
refseq001 =
[0,0,1]*5000
refseq010 = [ 0,
1, 0]*5000
refseq100 = [ 1,
0, 0]*5000
refseq =
[refseq001, refseq010, refseq100]
refseq =
array(refseq)
# print refseq
return refseq
****************************************************************************
#Filnename4 = dist.py
import probs
import nt_entropy
def main(f_in, f_in_counts, ndx2):
my_fa = f_in
my_eval =
open(my_fa, 'r')
my_seqx=''
for ndx1 in
range(f_in_counts):
my_header=
my_eval.readline()
my_seq=my_eval.readline()
try:
my_seqx=my_seqx+my_seq[ndx2]
except
IndexError:
continue
a=my_seqx.count('A')
c=my_seqx.count('C')
g=my_seqx.count('G')
u=my_seqx.count('T')
px=probs.main(a,c,g,u)
H=nt_entropy.main(px[0], px[1], px[2], px[3])
return H
****************************************************************************
# Filename5 = nt_entropy.py
# Calculates entropy in bits.
from cmath import *
def main(a,c,g,u):
if a==0:
Ha=0
else:
Ha=a*log(1/a,2)
if c==0:
Hc=0
else:
Hc=c*log(1/c,2)
if g==0:
Hg=0
else:
Hg=g*log(1/g,2)
if u==0:
Hu=0
else:
Hu=u*log(1/u,2)
H=Ha+Hc+Hg+Hu
H=abs(H)
return H
***********************************************************************************
# Filename6 = probs.py
# Input counts, return probabilities
def main(a,c,g,u):
total_nt=float(a+c+g+u)
pa=a/total_nt
pc=c/total_nt
pg=g/total_nt
pu=u/total_nt
return pa, pc,
pg, pu
*************************************************************************************
# Filename7 = v_protein.py
"""
Calculates H vector from input nucleotide sequences
f_in = input file
ns_in = number of sequences in f_in
np_in = number of nucleotide positions in each sequence.
"""
from numpy import *
import dist
def main(f_in, ns_in, np_in):
x_in = f_in
seq_count = ns_in
ndx1 = np_in
v = zeros(np_in)
for ndx1 in
range(np_in):
Hx =
dist.main(x_in,seq_count,ndx1)
v[ndx1] =
Hx
v_out = v
return v_out
****************************************************************************************