# Filename1 = info

# Filename1 = info_entropy1.py

"""

The citation for these programs is: Thompson WA, Martwick A and Weltman JK. (2009) Decimative Multiplication of Entropy Arrays, with Application to Influenza. Entropy 11(3):351-359.

Compute cumulative entropy for input nucleotide sequences,

This suite of seven programs should work together as is.

You should:

(1) enter input file name and directory for f_in in info_entropy1.py;

(2) enter the number of sequences (ns_in) in your dataset in info_entropy1.py;

(3) enter the number of positions (np_in) in the nucleotide sequence in info_entropy1.py;

(4) save the files, denoted by the "****" separators, using the indicated filenames.

"""

from scipy import *

from pylab import *

from numpy import *

import cumulator

import v_protein # calculates entropy vector for encoding nucleotide seq

f_in = "C:\Python25\my_file.fa" # Your input file identification goes here.

# ns_in = number of seqs in dataset

ns_in = nnn # input ns_in for your dataset.

# np = number of nucleotide positions per sequence. Sequences should be of equal length.

np_in = nnn # input np_in for your dataset.

# Calculate entropy vector for protein gene.

v_in = v_protein.main(f_in, ns_in, np_in)

# Calculate decimated vectors from entropy vector.

z=cumulator.main(v_in, np_in)

in_001 = z[0]

in_010 = z[1]

in_100 = z[2]

plot(in_001,"-", lw=3, color = "red", label = "001")

plot(in_010,"-", lw=3, color = "cyan", label = "010")

plot(in_100,"-", lw=3, color = "black", label = "100")

legend(loc = 0) # optimize legend position.

xlabel("Nucleotide Position")

ylabel("Cumulative Information Entropy (bits)")

title("My Nucleotide Sequence")

show()

****************************************************************************

"""

Filename2 = cumulator.py

Determines cumulative sum of input entropy vector.

Decimation is performed.

"""

def main(v_seq, np_seq):

import numpy as npy

import decimators as decs

z = decs.main()

refseq001 = z[0]

refseq010 = z[1]

refseq100 = z[2]

seq_111 = v_seq

seq_001 = seq_111*refseq001[0:np_seq]

seq_010 = seq_111*refseq010[0:np_seq]

seq_100 = seq_111*refseq100[0:np_seq]

seq_111_cum = npy.zeros(np_seq)

seq_001_cum = npy.zeros(np_seq)

seq_010_cum = npy.zeros(np_seq)

seq_100_cum = npy.zeros(np_seq)

for ndx3 in range(1, np_seq):

cumul_111 = seq_111_cum[ndx3 - 1]

cumul_001 = seq_001_cum[ndx3 - 1]

cumul_010 = seq_010_cum[ndx3 - 1]

cumul_100 = seq_100_cum[ndx3 - 1]

seq_111_cum[ndx3] = cumul_111 + seq_111[ndx3]

seq_001_cum[ndx3] = cumul_001 + seq_001[ndx3]

seq_010_cum[ndx3] = cumul_010 + seq_010[ndx3]

seq_100_cum[ndx3] = cumul_100 + seq_100[ndx3]

return seq_001_cum, seq_010_cum, seq_100_cum

****************************************************************************

# Filename3 = decimators,py

# decimating arrays

from numpy import *

def main():

refseq001 = [0,0,1]*5000

refseq010 = [ 0, 1, 0]*5000

refseq100 = [ 1, 0, 0]*5000

refseq = [refseq001, refseq010, refseq100]

refseq = array(refseq)

# print refseq

return refseq

****************************************************************************

#Filnename4 = dist.py

import probs

import nt_entropy

def main(f_in, f_in_counts, ndx2):

my_fa = f_in

my_eval = open(my_fa, 'r')

my_seqx=''

for ndx1 in range(f_in_counts):

my_header= my_eval.readline()

my_seq=my_eval.readline()

try:

my_seqx=my_seqx+my_seq[ndx2]

except IndexError:

continue

a=my_seqx.count('A')

c=my_seqx.count('C')

g=my_seqx.count('G')

u=my_seqx.count('T')

px=probs.main(a,c,g,u)

H=nt_entropy.main(px[0], px[1], px[2], px[3])

return H

****************************************************************************

# Filename5 = nt_entropy.py

# Calculates entropy in bits.

from cmath import *

def main(a,c,g,u):

if a==0:

Ha=0

else: Ha=a*log(1/a,2)

if c==0:

Hc=0

else: Hc=c*log(1/c,2)

if g==0:

Hg=0

else: Hg=g*log(1/g,2)

if u==0:

Hu=0

else: Hu=u*log(1/u,2)

H=Ha+Hc+Hg+Hu

H=abs(H)

return H

***********************************************************************************

# Filename6 = probs.py

# Input counts, return probabilities

def main(a,c,g,u):

total_nt=float(a+c+g+u)

pa=a/total_nt

pc=c/total_nt

pg=g/total_nt

pu=u/total_nt

return pa, pc, pg, pu

*************************************************************************************

# Filename7 = v_protein.py

"""

Calculates H vector from input nucleotide sequences

f_in = input file

ns_in = number of sequences in f_in

np_in = number of nucleotide positions in each sequence.

"""

from numpy import *

import dist

def main(f_in, ns_in, np_in):

x_in = f_in

seq_count = ns_in

ndx1 = np_in

v = zeros(np_in)

for ndx1 in range(np_in):

Hx = dist.main(x_in,seq_count,ndx1)

v[ndx1] = Hx

v_out = v

return v_out

****************************************************************************************