# Filename1 = info_entropy1.py

"""

The citation for these programs is: Thompson WA, Martwick A and Weltman JK. (2009) Decimative Multiplication of Entropy Arrays, with Application to Influenza. Entropy 11(3):351-359.

 

Compute cumulative entropy for input nucleotide sequences,

This suite of seven programs should work together as is.

You should:

(1) enter input file name and directory for f_in in info_entropy1.py;

(2) enter the number of sequences (ns_in) in your dataset in info_entropy1.py;

(3) enter the number of positions (np_in) in the nucleotide sequence in info_entropy1.py;

(4) save the files, denoted by the "****" separators, using the indicated filenames.

 

"""

 

from scipy import *

from pylab import *

from numpy import *

import cumulator

import v_protein # calculates entropy vector for encoding nucleotide seq 

 

f_in = "C:\Python25\my_file.fa" # Your input file identification goes here.

 

# ns_in = number of seqs in dataset

ns_in = nnn # input ns_in for your dataset.

 

# np = number of nucleotide positions per sequence. Sequences should be of equal length.

np_in = nnn # input np_in for your dataset.

 

# Calculate entropy vector for protein gene.

v_in = v_protein.main(f_in, ns_in, np_in)

 

# Calculate decimated vectors from entropy vector.

z=cumulator.main(v_in, np_in)

in_001 = z[0]

in_010 = z[1]

in_100 = z[2]

 

plot(in_001,"-", lw=3, color = "red", label = "001")

plot(in_010,"-", lw=3, color = "cyan", label = "010")

plot(in_100,"-", lw=3, color = "black", label = "100")

 

legend(loc = 0) # optimize legend position.

 

xlabel("Nucleotide Position")

ylabel("Cumulative Information Entropy (bits)")

title("My Nucleotide Sequence")

 

show()

 

 

****************************************************************************

"""

Filename2 = cumulator.py

Determines cumulative sum of input entropy vector.

Decimation is performed.

"""

 

def main(v_seq, np_seq):

    import numpy as npy

    import decimators as decs

 

    z = decs.main()

    refseq001 = z[0]

    refseq010 = z[1]

    refseq100 = z[2]

 

    seq_111 = v_seq

    seq_001 = seq_111*refseq001[0:np_seq]

    seq_010 = seq_111*refseq010[0:np_seq]

    seq_100 = seq_111*refseq100[0:np_seq]

 

    seq_111_cum = npy.zeros(np_seq)

    seq_001_cum = npy.zeros(np_seq)

    seq_010_cum = npy.zeros(np_seq)

    seq_100_cum = npy.zeros(np_seq)

 

    for ndx3 in range(1, np_seq):

        cumul_111 = seq_111_cum[ndx3 - 1]

        cumul_001 = seq_001_cum[ndx3 - 1]

        cumul_010 = seq_010_cum[ndx3 - 1]

        cumul_100 = seq_100_cum[ndx3 - 1]

 

        seq_111_cum[ndx3] = cumul_111 + seq_111[ndx3]

        seq_001_cum[ndx3] = cumul_001 + seq_001[ndx3]

        seq_010_cum[ndx3] = cumul_010 + seq_010[ndx3]

        seq_100_cum[ndx3] = cumul_100 + seq_100[ndx3]

 

    return seq_001_cum, seq_010_cum, seq_100_cum

 

****************************************************************************

 

# Filename3 = decimators,py

# decimating arrays

from numpy import *

 

def main():

 

    refseq001 = [0,0,1]*5000

 

    refseq010 = [ 0, 1, 0]*5000

 

    refseq100 = [ 1, 0, 0]*5000

 

    refseq = [refseq001, refseq010, refseq100]

    refseq = array(refseq)

    # print refseq

    return refseq

 

****************************************************************************

#Filnename4 = dist.py

 

import probs

import nt_entropy

 

def main(f_in, f_in_counts, ndx2):

 

    my_fa = f_in

    my_eval = open(my_fa, 'r')

 

    my_seqx=''

 

    for ndx1 in range(f_in_counts):

        my_header= my_eval.readline()    

        my_seq=my_eval.readline()

       

        try:

            my_seqx=my_seqx+my_seq[ndx2]

        except IndexError:

            continue   

       

        a=my_seqx.count('A')

        c=my_seqx.count('C')

        g=my_seqx.count('G')

        u=my_seqx.count('T')   

 

    px=probs.main(a,c,g,u)

    H=nt_entropy.main(px[0], px[1], px[2], px[3])

 

    return H

 

****************************************************************************

# Filename5 = nt_entropy.py

 

# Calculates entropy in bits.

 

from cmath import *

 

def main(a,c,g,u):

     if a==0:

          Ha=0

     else: Ha=a*log(1/a,2)

 

     if c==0:

          Hc=0

     else: Hc=c*log(1/c,2)

 

     if g==0:

          Hg=0

     else: Hg=g*log(1/g,2)

 

     if u==0:

          Hu=0

     else: Hu=u*log(1/u,2)    

 

     H=Ha+Hc+Hg+Hu

     H=abs(H)

 

     return H

 

***********************************************************************************

 

# Filename6 = probs.py

# Input counts, return probabilities

 

def main(a,c,g,u):

    total_nt=float(a+c+g+u)

  

    pa=a/total_nt

    pc=c/total_nt

    pg=g/total_nt

    pu=u/total_nt

 

    return pa, pc, pg, pu

 

*************************************************************************************

 

 

# Filename7 = v_protein.py

 

"""

Calculates H vector from input nucleotide sequences

 

f_in = input file

ns_in = number of sequences in f_in

np_in = number of nucleotide positions in each sequence.

"""

 

from numpy import *

import dist

 

def main(f_in, ns_in, np_in):

    x_in = f_in

    seq_count = ns_in

    ndx1 = np_in

    v = zeros(np_in)

 

    for ndx1 in range(np_in):

        Hx = dist.main(x_in,seq_count,ndx1)

        v[ndx1] = Hx           

    v_out = v

   

    return v_out

 

****************************************************************************************