From: Dariusz Murakowski Date: Fri, 24 Apr 2015 04:15:13 +0000 (-0400) Subject: Copy over code to read seq2state file, which gives amino acid identity of each Potts... X-Git-Url: http://src.murakowski.org/?a=commitdiff_plain;h=892627c8431ed8d640349a7e804c39cf7d3caa0f;p=VirEvoDyn.git Copy over code to read seq2state file, which gives amino acid identity of each Potts state. --- diff --git a/ham_ss.cpp b/ham_ss.cpp index bcac30d..78a7834 100644 --- a/ham_ss.cpp +++ b/ham_ss.cpp @@ -134,6 +134,7 @@ PottsHamiltonian::PottsHamiltonian(const std::string &couplingsFile, const std:: { getCouplings(couplingsFile); getSeq2State(seq2stateFile); + this->size = aa2state.size(); } @@ -165,10 +166,63 @@ void PottsHamiltonian::getCouplings(const std::string &FILENAME) } + fclose(input); + } + +// Load each line of a *-seq2state.dat file into a vector of characters. + void PottsHamiltonian::getSeq2State(const std::string &FILENAME) { + FILE *input = fopen(FILENAME.c_str(),"r"); // -seq2state.dat + if (input == NULL) { perror((std::string("ERROR in PottsHamiltonian::getSeq2State: ") + FILENAME).c_str()); exit(1); } + + CharVector Schar; + + char c; + char o; + + while (fscanf(input,"%c",&c)==1) { + + Schar.push_back(std::vector()); + (Schar.back()).push_back(c); + + while (fscanf(input,"%c",&o)==1) { + + if (o=='\n' || o=='\r') break; + + fscanf(input,"%c",&c); + (Schar.back()).push_back(c); + + } + + } + + fclose(input); + + + //std::vector > aa2state; + for (unsigned i=0; i()); + // expect all lines to contain the 'X' character to represent all the other amino acids + std::vector::iterator begin = Schar[i].begin(), end = Schar[i].end(); + int Xidx = std::distance(begin, std::find(begin,end,'X')); + for (int aa=aa_BEGIN; aa != aa_END; ++aa) { + std::vector::iterator it = std::find(begin,end,int2aa.at(aa)); + int idx; + if (it != end) { // amino acid explicitly encoded + idx = std::distance(begin,it); + } + else { // amino acid represented by 'X' + idx = Xidx; + } + // commonest amino acid goes to end + // maybe easier to subtract 1 and mod? + idx = idx==0 ? Schar[i].size()-1 : idx-1; + aa2state.back()[aa] = idx; + } + } } diff --git a/ham_ss.h b/ham_ss.h index 951bce4..bbfbbdd 100644 --- a/ham_ss.h +++ b/ham_ss.h @@ -4,9 +4,11 @@ #include #include #include +#include #include // sqrt() #include "pop_ss.h" +#include "seqTools.h" typedef std::vector > AdjacencyList; @@ -14,7 +16,7 @@ typedef std::vector > Coupling; typedef std::set MutatedSiteSequence; -typedef std::vector PottsState; +typedef std::vector PottsState; class Virus; @@ -75,9 +77,11 @@ inline int index(int i, int j, size_t length) { class PottsHamiltonian { public: + unsigned int size; double bh; // fields "inverse temperature" multiplier double bJ; // couplings "inverse temperature" multiplier Coupling J; + std::vector > aa2state; PottsHamiltonian(const std::string &couplingsfile, const std::string &seq2stateFile); PottsHamiltonian() : bh(1.0), bJ(1.0) { } diff --git a/pop_ss.h b/pop_ss.h index 5b02bd9..083d656 100644 --- a/pop_ss.h +++ b/pop_ss.h @@ -11,6 +11,8 @@ #include "seqTools.h" +typedef std::vector PottsState; + class Hamiltonian; @@ -49,7 +51,7 @@ class NTVirus { public: double energy; std::string nt_seq; - std::vector aa_seq; + PottsState aa_seq; unsigned int L_nt; unsigned int L_aa; diff --git a/seqTools.cpp b/seqTools.cpp index d9d58fe..1d86b21 100644 --- a/seqTools.cpp +++ b/seqTools.cpp @@ -123,7 +123,7 @@ void getSeq2State(FILE *input, CharVector &p) } -#if 0 +#if HIDDEN int main(int argc, char *argv[]) { FILE *f = fopen(argv[1],"r"); @@ -148,7 +148,7 @@ int main(int argc, char *argv[]) std::vector::iterator begin = Schar[i].begin(), end = Schar[i].end(); int Xidx = std::distance(begin, std::find(begin,end,'X')); for (int aa=aa_BEGIN; aa != aa_END; ++aa) { - std::vector::iterator it = std::find(begin,end,int2aa[aa]); + std::vector::iterator it = std::find(begin,end,int2aa.at(aa)); int idx; if (it != end) { // amino acid explicitly encoded idx = std::distance(begin,it); @@ -174,7 +174,7 @@ int main(int argc, char *argv[]) std::vector config(configChar.size()); for (int i=0; i < config.size(); i++) { std::cout << configChar[i] << ' '; - config[i] = aa2int[configChar[i]]; + config[i] = aa2int.at(configChar[i]); } std::cout << '\n'; @@ -189,7 +189,7 @@ int main(int argc, char *argv[]) std::vector codon_seq; codon_seq.reserve(nt_seq.size()/3); for (unsigned i=0; i aa_seq; aa_seq.reserve(nt_seq.size()/3); for (unsigned i=0; i > CharVector; // ATT,ATC,ATA,ATG, ACT,ACC,ACA,ACG, AAT,AAC,AAA,AAG, AGT,AGC,AGA,AGG, // GTT,GTC,GTA,GTG, GCT,GCC,GCA,GCG, GAT,GAC,GAA,GAG, GGT,GGC,GGA,GGG}; -enum aa {aa_BEGIN, aa_unknown=aa_BEGIN, aa_A,aa_R,aa_N,aa_D,aa_C, aa_Q,aa_E,aa_G,aa_H,aa_I, aa_L,aa_K,aa_M,aa_F,aa_P, aa_S,aa_T,aa_W,aa_Y,aa_V, aa_B,aa_Z,aa_X, aa_star, aa_gap, aa_END}; +enum aa : unsigned {aa_BEGIN, aa_unknown=aa_BEGIN, aa_A,aa_R,aa_N,aa_D,aa_C, aa_Q,aa_E,aa_G,aa_H,aa_I, aa_L,aa_K,aa_M,aa_F,aa_P, aa_S,aa_T,aa_W,aa_Y,aa_V, aa_B,aa_Z,aa_X, aa_star, aa_gap, aa_END}; enum nt {nt_T,nt_C,nt_A,nt_G}; enum codon {cod_TTT,cod_TTC,cod_TTA,cod_TTG, cod_TCT,cod_TCC,cod_TCA,cod_TCG, cod_TAT,cod_TAC,cod_TAA,cod_TAG, cod_TGT,cod_TGC,cod_TGA,cod_TGG, cod_CTT,cod_CTC,cod_CTA,cod_CTG, cod_CCT,cod_CCC,cod_CCA,cod_CCG, cod_CAT,cod_CAC,cod_CAA,cod_CAG, cod_CGT,cod_CGC,cod_CGA,cod_CGG, diff --git a/ss.cpp b/ss.cpp index 115ad06..6523be8 100644 --- a/ss.cpp +++ b/ss.cpp @@ -489,21 +489,30 @@ void usage() int main(int argc, char *argv[]) { +///* std::string fname = argv[1]; - PottsHamiltonian H(fname,""); + std::string fname2 = argv[2]; + PottsHamiltonian H(fname,fname2); for (unsigned i=0; i