From f266821a7af4950c639a3f59a9d3c3ba8f7f103c Mon Sep 17 00:00:00 2001 From: Dariusz Murakowski Date: Thu, 23 Apr 2015 02:21:33 -0400 Subject: [PATCH] Copy seqTools.cpp to seqTools.h for inclusion to main. --- seqTools.h | 347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 347 insertions(+) create mode 100644 seqTools.h diff --git a/seqTools.h b/seqTools.h new file mode 100644 index 0000000..44c9f0d --- /dev/null +++ b/seqTools.h @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include + +typedef std::vector > IntVector; +typedef std::vector > CharVector; + +//enum class aa {BEGIN, unknown=BEGIN, A,R,N,D,C, Q,E,G,H,I, L,K,M,F,P, S,T,W,Y,V, B,Z,X, star, gap, END}; +enum aa {aa_BEGIN, aa_unknown=aa_BEGIN, aa_A,aa_R,aa_N,aa_D,aa_C, aa_Q,aa_E,aa_G,aa_H,aa_I, aa_L,aa_K,aa_M,aa_F,aa_P, aa_S,aa_T,aa_W,aa_Y,aa_V, aa_B,aa_Z,aa_X, aa_star, aa_gap, aa_END}; +enum class nt {T,C,A,G}; +enum class codon {TTT,TTC,TTA,TTG, TCT,TCC,TCA,TCG, TAT,TAC,TAA,TAG, TGT,TGC,TGA,TGG, + CTT,CTC,CTA,CTG, CCT,CCC,CCA,CCG, CAT,CAC,CAA,CAG, CGT,CGC,CGA,CGG, + ATT,ATC,ATA,ATG, ACT,ACC,ACA,ACG, AAT,AAC,AAA,AAG, AGT,AGC,AGA,AGG, + GTT,GTC,GTA,GTG, GCT,GCC,GCA,GCG ,GAT,GAC,GAA,GAG, GGT,GGC,GGA,GGG}; + + +typedef std::unordered_map str2int_map; +typedef std::unordered_map int2str_map; +//typedef std::unordered_map int2aa_map; +typedef std::unordered_map codon2aa_map; +typedef std::unordered_map aa2str_map; +typedef std::unordered_map str2aa_map; +typedef std::unordered_map codon2str_map; +typedef std::unordered_map str2codon_map; + +namespace std { + template<> struct hash { + inline size_t operator()(const aa &x) const { return static_cast(x); } + }; + template<> struct hash { + inline size_t operator()(const nt &x) const { return static_cast(x); } + }; + template<> struct hash { + inline size_t operator()(const codon &x) const { return static_cast(x); } + }; +} + +codon2aa_map codon2aa { + {codon::TTT, aa_F}, {codon::TTC, aa_F}, {codon::TTA, aa_L}, {codon::TTG, aa_L}, + {codon::TCT, aa_S}, {codon::TCC, aa_S}, {codon::TCA, aa_S}, {codon::TCG, aa_S}, + {codon::TAT, aa_Y}, {codon::TAC, aa_Y}, {codon::TAA, aa_star}, {codon::TAG, aa_star}, + {codon::TGT, aa_C}, {codon::TGC, aa_C}, {codon::TGA, aa_star}, {codon::TGG, aa_W}, + {codon::CTT, aa_L}, {codon::CTC, aa_L}, {codon::CTA, aa_L}, {codon::CTG, aa_L}, + {codon::CCT, aa_P}, {codon::CCC, aa_P}, {codon::CCA, aa_P}, {codon::CCG, aa_P}, + {codon::CAT, aa_H}, {codon::CAC, aa_H}, {codon::CAA, aa_Q}, {codon::CAG, aa_Q}, + {codon::CGT, aa_R}, {codon::CGC, aa_R}, {codon::CGA, aa_R}, {codon::CGG, aa_R}, + {codon::ATT, aa_I}, {codon::ATC, aa_I}, {codon::ATA, aa_I}, {codon::ATG, aa_M}, + {codon::ACT, aa_T}, {codon::ACC, aa_T}, {codon::ACA, aa_T}, {codon::ACG, aa_T}, + {codon::AAT, aa_N}, {codon::AAC, aa_N}, {codon::AAA, aa_K}, {codon::AAG, aa_K}, + {codon::AGT, aa_S}, {codon::AGC, aa_S}, {codon::AGA, aa_R}, {codon::AGG, aa_R}, + {codon::GTT, aa_V}, {codon::GTC, aa_V}, {codon::GTA, aa_V}, {codon::GTG, aa_V}, + {codon::GCT, aa_A}, {codon::GCC, aa_A}, {codon::GCA, aa_A}, {codon::GCG, aa_A}, + {codon::GAT, aa_D}, {codon::GAC, aa_D}, {codon::GAA, aa_E}, {codon::GAG, aa_E}, + {codon::GGT, aa_G}, {codon::GGC, aa_G}, {codon::GGA, aa_G}, {codon::GGG, aa_G}, +}; + +str2int_map aa2int { + {'A', 1}, {'R', 2}, {'N', 3}, {'D', 4}, {'C', 5}, + {'Q', 6}, {'E', 7}, {'G', 8}, {'H', 9}, {'I', 10}, + {'L', 11}, {'K', 12}, {'M', 13}, {'F', 14}, {'P', 15}, + {'S', 16}, {'T', 17}, {'W', 18}, {'Y', 19}, {'V', 20}, + {'B', 21}, {'Z', 22}, {'X', 23}, {'*', 24}, {'-', 25}, {'?', 0} +}; + +int2str_map int2aa = { + { 1, 'A'}, { 2, 'R'}, { 3, 'N'}, { 4, 'D'}, { 5, 'C'}, + { 6, 'Q'}, { 7, 'E'}, { 8, 'G'}, { 9, 'H'}, {10, 'I'}, + {11, 'L'}, {12, 'K'}, {13, 'M'}, {14, 'F'}, {15, 'P'}, + {16, 'S'}, {17, 'T'}, {18, 'W'}, {19, 'Y'}, {20, 'V'}, + {21, 'B'}, {22, 'Z'}, {23, 'X'}, {24, '*'}, {25, '-'}, {0, '?'} +}; + +aa2str_map aa2str = { + {aa_A,'A'}, {aa_R,'R'}, {aa_N,'N'}, {aa_D,'D'}, {aa_C,'C'}, + {aa_Q,'Q'}, {aa_E,'E'}, {aa_G,'G'}, {aa_H,'H'}, {aa_I,'I'}, + {aa_L,'L'}, {aa_K,'K'}, {aa_M,'M'}, {aa_F,'F'}, {aa_P,'P'}, + {aa_S,'S'}, {aa_T,'T'}, {aa_W,'W'}, {aa_Y,'Y'}, {aa_V,'V'}, + {aa_B,'B'}, {aa_Z,'Z'}, {aa_X,'X'}, {aa_star,'*'}, {aa_gap,'-'}, {aa_unknown,'?'} +}; + +str2aa_map str2aa = { + {'A',aa_A}, {'R',aa_R}, {'N',aa_N}, {'D',aa_D}, {'C',aa_C}, + {'Q',aa_Q}, {'E',aa_E}, {'G',aa_G}, {'H',aa_H}, {'I',aa_I}, + {'L',aa_L}, {'K',aa_K}, {'M',aa_M}, {'F',aa_F}, {'P',aa_P}, + {'S',aa_S}, {'T',aa_T}, {'W',aa_W}, {'Y',aa_Y}, {'V',aa_V}, + {'B',aa_B}, {'Z',aa_Z}, {'X',aa_X}, {'*',aa_star}, {'-',aa_gap}, {'?',aa_unknown} +}; + +codon2str_map codon2str = { + {codon::TTT, "TTT"}, {codon::TTC, "TTC"}, {codon::TTA, "TTA"}, {codon::TTG, "TTG"}, + {codon::TCT, "TCT"}, {codon::TCC, "TCC"}, {codon::TCA, "TCA"}, {codon::TCG, "TCG"}, + {codon::TAT, "TAT"}, {codon::TAC, "TAC"}, {codon::TAA, "TAA"}, {codon::TAG, "TAG"}, + {codon::TGT, "TGT"}, {codon::TGC, "TGC"}, {codon::TGA, "TGA"}, {codon::TGG, "TGG"}, + {codon::CTT, "CTT"}, {codon::CTC, "CTC"}, {codon::CTA, "CTA"}, {codon::CTG, "CTG"}, + {codon::CCT, "CCT"}, {codon::CCC, "CCC"}, {codon::CCA, "CCA"}, {codon::CCG, "CCG"}, + {codon::CAT, "CAT"}, {codon::CAC, "CAC"}, {codon::CAA, "CAA"}, {codon::CAG, "CAG"}, + {codon::CGT, "CGT"}, {codon::CGC, "CGC"}, {codon::CGA, "CGA"}, {codon::CGG, "CGG"}, + {codon::ATT, "ATT"}, {codon::ATC, "ATC"}, {codon::ATA, "ATA"}, {codon::ATG, "ATG"}, + {codon::ACT, "ACT"}, {codon::ACC, "ACC"}, {codon::ACA, "ACA"}, {codon::ACG, "ACG"}, + {codon::AAT, "AAT"}, {codon::AAC, "AAC"}, {codon::AAA, "AAA"}, {codon::AAG, "AAG"}, + {codon::AGT, "AGT"}, {codon::AGC, "AGC"}, {codon::AGA, "AGA"}, {codon::AGG, "AGG"}, + {codon::GTT, "GTT"}, {codon::GTC, "GTC"}, {codon::GTA, "GTA"}, {codon::GTG, "GTG"}, + {codon::GCT, "GCT"}, {codon::GCC, "GCC"}, {codon::GCA, "GCA"}, {codon::GCG, "GCG"}, + {codon::GAT, "GAT"}, {codon::GAC, "GAC"}, {codon::GAA, "GAA"}, {codon::GAG, "GAG"}, + {codon::GGT, "GGT"}, {codon::GGC, "GGC"}, {codon::GGA, "GGA"}, {codon::GGG, "GGG"}, +}; + +str2codon_map str2codon = { + {"TTT", codon::TTT}, {"TTC", codon::TTC}, {"TTA", codon::TTA}, {"TTG", codon::TTG}, + {"TCT", codon::TCT}, {"TCC", codon::TCC}, {"TCA", codon::TCA}, {"TCG", codon::TCG}, + {"TAT", codon::TAT}, {"TAC", codon::TAC}, {"TAA", codon::TAA}, {"TAG", codon::TAG}, + {"TGT", codon::TGT}, {"TGC", codon::TGC}, {"TGA", codon::TGA}, {"TGG", codon::TGG}, + {"CTT", codon::CTT}, {"CTC", codon::CTC}, {"CTA", codon::CTA}, {"CTG", codon::CTG}, + {"CCT", codon::CCT}, {"CCC", codon::CCC}, {"CCA", codon::CCA}, {"CCG", codon::CCG}, + {"CAT", codon::CAT}, {"CAC", codon::CAC}, {"CAA", codon::CAA}, {"CAG", codon::CAG}, + {"CGT", codon::CGT}, {"CGC", codon::CGC}, {"CGA", codon::CGA}, {"CGG", codon::CGG}, + {"ATT", codon::ATT}, {"ATC", codon::ATC}, {"ATA", codon::ATA}, {"ATG", codon::ATG}, + {"ACT", codon::ACT}, {"ACC", codon::ACC}, {"ACA", codon::ACA}, {"ACG", codon::ACG}, + {"AAT", codon::AAT}, {"AAC", codon::AAC}, {"AAA", codon::AAA}, {"AAG", codon::AAG}, + {"AGT", codon::AGT}, {"AGC", codon::AGC}, {"AGA", codon::AGA}, {"AGG", codon::AGG}, + {"GTT", codon::GTT}, {"GTC", codon::GTC}, {"GTA", codon::GTA}, {"GTG", codon::GTG}, + {"GCT", codon::GCT}, {"GCC", codon::GCC}, {"GCA", codon::GCA}, {"GCG", codon::GCG}, + {"GAT", codon::GAT}, {"GAC", codon::GAC}, {"GAA", codon::GAA}, {"GAG", codon::GAG}, + {"GGT", codon::GGT}, {"GGC", codon::GGC}, {"GGA", codon::GGA}, {"GGG", codon::GGG}, +}; + +/* +template +to x2y(from x) +{ + switch (typeof(from)) { + case int: + switch (typeof(to)) { + case float: + return (float)(x); + break; + } + break; + } +} +*/ +template +to x2y(from x) {return static_cast(x);} + + +void getSeq2State(FILE *input, CharVector &p) +{ + char c; + char o; + + while (fscanf(input,"%c",&c)==1) { + + p.push_back(std::vector()); + (p.back()).push_back(c); + + while (fscanf(input,"%c",&o)==1) { + + if (o=='\n' || o=='\r') break; + + fscanf(input,"%c",&c); + (p.back()).push_back(c); + + } + + } + +} + +int main(int argc, char *argv[]) +{ + FILE *f = fopen(argv[1],"r"); + if (f==NULL) { perror("could not load file"); exit(1); } + + CharVector Schar; + getSeq2State(f,Schar); + fclose(f); + + for (int i=0; i > aa2state; + for (int i=0; i()); + // expect all lines to contain the 'X' character to represent all the other amino acids + std::vector::iterator begin = Schar[i].begin(), end = Schar[i].end(); + int Xidx = std::distance(begin, std::find(begin,end,'X')); + for (int aa=aa_BEGIN; aa != aa_END; ++aa) { + std::vector::iterator it = std::find(begin,end,int2aa[aa]); + int idx; + if (it != end) { // amino acid explicitly encoded + idx = std::distance(begin,it); + } + else { // amino acid represented by 'X' + idx = Xidx; + } + // commonest amino acid goes to end + // maybe easier to subtract 1 and mod? + idx = idx==0 ? Schar[i].size()-1 : idx-1; + aa2state.back()[aa] = idx; + } + } + + for (int i = 0; i < aa2state.size(); i++) { + for (int aa=aa_BEGIN; aa != aa_END; ++aa) { + std::cout << aa2state[i][aa] << ' '; + } + std::cout << '\n'; + } + + std::vector configChar = {'A','Y'}; + std::vector config(configChar.size()); + for (int i=0; i < config.size(); i++) { + std::cout << configChar[i] << ' '; + config[i] = aa2int[configChar[i]]; + } + std::cout << '\n'; + + for (int i=0; i < config.size(); i++) { + std::cout << aa2state[i][config[i]] << ' '; + } + std::cout << '\n'; + + + + /* + std::cout << x2y(5) << std::endl; + std::cout << (int)(codon2aa[codon::TTT]) << std::endl; + std::cout << static_cast(codon2aa[codon::TTC]) << std::endl; + std::cout << aa2str[codon2aa[codon::TTC]] << std::endl; + */ + //std::cout << aa2str[str2aa[int2aa[aa2int[codon2aa[str2codon["TTC"]]]]]] << std::endl; + +#if 0 + + for(charmap::iterator i = aa2int.begin(); i != aa2int.end(); ++i) { + std::cout << aa2int[i->first] << '\t' << i->second << std::endl; + } + + + for(intmap::iterator i = int2aa.begin(); i != int2aa.end(); ++i) { + std::cout << int2aa[i->first] << '\t' << i->second << std::endl; + } +#endif + + return 0; +} + + +/* +aa2int = { + 'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5, + 'Q': 6, 'E': 7, 'G': 8, 'H': 9, 'I': 10, + 'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15, + 'S': 16, 'T': 17, 'W': 18, 'Y': 19, 'V': 20, + 'B': 21, 'Z': 22, 'X': 23, '*': 24, '-': 25, '?': 0 +} + + +int2aa = { + 1 :'A', 2 :'R', 3 :'N', 4 :'D', 5 :'C', + 6 :'Q', 7 :'E', 8 :'G', 9 :'H', 10 :'I', + 11 :'L', 12 :'K', 13 :'M', 14 :'F', 15 :'P', + 16 :'S', 17 :'T', 18 :'W', 19 :'Y', 20 :'V', + 21 :'B', 22 :'Z', 23 :'X', 24 :'*', 25 :'-', 0 :'?' +} + + +def codon2aa(c): + """ + Returns the amino acid character corresponding to the input codon. + """ + + # If all nucleotides are missing, return gap + + if c[0]=='-' and c[1]=='-' and c[2]=='-': + return '-' + + # If the first or second nucleotide is ambiguous, AA cannot be determined, return '*' + + elif c[0] in ['W', 'S', 'M', 'K', 'R', 'Y', '-'] or c[1] in ['W', 'S', 'M', 'K', 'R', 'Y', '-']: + return '*' + + # Else go to tree + + elif c[0]=='T': + if c[1]=='T': + if c[2] in ['T', 'C', 'Y']: return 'F' + elif c[2] in ['A', 'G', 'R']: return 'L' + else: return '*' + elif c[1]=='C': return 'S' + elif c[1]=='A': + if c[2] in ['T', 'C', 'Y']: return 'Y' + elif c[2] in ['A', 'G', 'R']: return '*' + else: return '*' + elif c[1]=='G': + if c[2] in ['T', 'C', 'Y']: return 'C' + elif c[2]=='A': return '*' + elif c[2]=='G': return 'W' + else: return '*' + else: return '*' + + elif c[0]=='C': + if c[1]=='T': return 'L' + elif c[1]=='C': return 'P' + elif c[1]=='A': + if c[2] in ['T', 'C', 'Y']: return 'H' + elif c[2] in ['A', 'G', 'R']: return 'Q' + else: return '*' + elif c[1]=='G': return 'R' + else: return '*' + + elif c[0]=='A': + if c[1]=='T': + if c[2] in ['T', 'C', 'Y']: return 'I' + elif c[2] in ['A', 'M', 'W']: return 'I' + elif c[2]=='G': return 'M' + else: return '*' + elif c[1]=='C': return 'T' + elif c[1]=='A': + if c[2] in ['T', 'C', 'Y']: return 'N' + elif c[2] in ['A', 'G', 'R']: return 'K' + else: return '*' + elif c[1]=='G': + if c[2] in ['T', 'C', 'Y']: return 'S' + elif c[2] in ['A', 'G', 'R']: return 'R' + else: return '*' + else: return '*' + + elif c[0]=='G': + if c[1]=='T': return 'V' + elif c[1]=='C': return 'A' + elif c[1]=='A': + if c[2] in ['T', 'C', 'Y']: return 'D' + elif c[2] in ['A', 'G', 'R']: return 'E' + else: return '*' + elif c[1]=='G': return 'G' + else: return '*' + + else: return '*' +*/ -- 2.7.4