// Pscan-ChIP 1.1g // // Copyright (C) 2013 Federico Zambelli and Giulio Pavesi // // // // This program is free software: you can redistribute it and/or modify // // it under the terms of the GNU General Public License as published by // // the Free Software Foundation, either version 3 of the License, or // // (at your option) any later version. // // // // This program is distributed in the hope that it will be useful, // // but WITHOUT ANY WARRANTY; without even the implied warranty of // // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // // GNU General Public License for more details. // // // // You should have received a copy of the GNU General Public License // // along with this program. If not, see . // // // // If you find Pscan-ChIP useful in your research please cite our paper: // // // // Zambelli F, Pesole G, Pavesi G. PscanChIP: Finding over-represented // // transcription factor-binding site motifs and their correlations in sequences from // // ChIP-Seq experiments. // // Nucleic Acids Res. 2013 Jul;41(Web Server issue):W535-43 #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; const double MIN_ADD_FREQ = 0.01, EPSILON = pow(10,-21), INF = pow(10,21), /*POS_VIS_THRESHOLD = pow(10,-5),*/ DIST_RATIO_THRESHOLD = 3, DIST_PV_THRESHOLD = pow(10,-5); const unsigned int MAX_REG_SIZE = 3000, ALPHABET_SIZE = 4, TOP_P = 1, DIST_HIT_THRESHOLD = 10; //const char ALPHABET[4] = {'A','C','G','T'}, RC_ALPHABET[4] = {'T','G','C','A'}; char VI['T' + 1]; string regfile, genome, matrixfile, additional_matrixfile, outfile, bgfile, selected, obstf, GALAXY_OUTPUT; unsigned int RSIZE = 150, MAX_L = 0, W_SIZE = 10, W_STEP = 5; bool SECOND_BEST_POS = false, SINGLE_STRAND = false, VERBOSE = false, GET_SEQUENCES = false, POSITIONAL = false, SINGLE_MATRIX = false, SPEARMAN = false, TWOBIT = false, QUIET = false; //NAIVE = false; ofstream fasta_out; double SINGLE_MATRIX_BG_AVG = 0; struct mpoint { unsigned int x; unsigned int y; }; struct bg { unsigned int Size; double Avg, Var; }; class sequence { private: string Chr, Name; char *Seq; unsigned int Start, End, E_Rank; char Strand, AntiStrand; bool good; public: sequence(string*, ifstream*, unsigned int); sequence(string*, unsigned int, string, unsigned int); char strand(); char antistrand(); string chr(); unsigned int start(); unsigned int end(); unsigned int e_rank(); string name(); char* seq(); bool is_good(); double score(); int mpos(); }; class matrix { friend void scanner(vector*, vector*); private: void normalizer(); void to_log(); void avg(); void max_min(); void rev_c(); double get_M_value(int, char); double get_rc_M_value(int, char); void calc_var(double); void calc_W_param(double); void welch_test(double); void welch_test_bg(double); // void welch_test_bg_G(double); void welch_test_pos(double); void z_test_pos(double); unsigned int L, bg_Size, BIN_NUM; double **M; double **rc_M; double max_score, min_score, t, dof, t_bg, dof_bg; double spearman; string Name, Id; bool O_U, O_U_bg; vector W_O_U; bool good, BG; bool norm_sec_step; vector v_score, vF_score; vector v_pos; vector v_strand; vector > W_R_v_score; vector W_R_Avg, W_R_Var, W_R_pvalue, W_R_T_pvalue; double R_Avg, F_Avg, R_Var, F_Var, /*G_Avg, G_Var,*/ pvalue, W_Avg, W_Var; double bg_Avg, bg_Var, pvalue_bg; public: matrix(vector*, map*); void display(); bool is_good(); string name(); string id(); string o_or_u(); string o_or_u_bg(); int W_o_or_u(unsigned int); void scan(vector*); double scan_seq_ds(char*, bool, unsigned int, vector*); double scan_seq_ss(char*, bool, unsigned int, char, vector*); double scan_seq_ds_second_best(char*, unsigned int, unsigned int *, bool *, int); double scan_seq_ss_second_best(char*, unsigned int, unsigned int *, bool *, unsigned int, bool); double get_score_at(unsigned int); unsigned int get_pos_at(unsigned int); unsigned int length(); bool get_strand_at(unsigned int); bool have_bg(); unsigned int bin_num(); double get_R_Avg(); double get_F_Avg(); double get_bg_Avg(); double get_R_Var(); double get_F_Var(); double max(); double min(); double get_pvalue(); double get_pvalue_bg(); double get_W_pvalue(unsigned int); double get_W_R_Avg_at_pos(unsigned int); double get_t(); double get_dof(); double get_spearman(); double get_W_Avg(); double get_W_R_T_pvalue(unsigned int); }; void command_line_parser(int, char**); void acquire_regions(vector*); void acquire_matrices(vector*, map*, string*); void acquire_background(map*); void scanner(vector*, vector*); void main_output(vector*, vector*, unsigned int); void main_output_single_matrix(vector*, vector*); void distance_analysis(vector*, vector*, unsigned int, multimap ::iterator>*); void display_help(); int main(int argc, char **argv) { srand(time(NULL)); vector Seq; vector Mat; map Bg; VI['A'] = 0; VI['C'] = 1; VI['G'] = 2; VI['T'] = 3; VI['N'] = 4; obstf.clear(); command_line_parser(argc, argv); if(GET_SEQUENCES) { string outfile = regfile; outfile += ".fasta"; fasta_out.open(outfile.c_str(), ios::out); } if(!regfile.empty() && !matrixfile.empty()) { if(!bgfile.empty()) { if(!QUIET) cerr << "\nAcquiring background file... "; acquire_background(&Bg); if(!QUIET) cerr << "done" << endl; } if(!QUIET) cerr << "\nReading binding profiles..."; acquire_matrices(&Mat, &Bg, &matrixfile); if(!additional_matrixfile.empty()) acquire_matrices(&Mat, &Bg, &additional_matrixfile); if(Mat.empty()) { if(!QUIET) cerr << "\nZero valid matrices acquired. Quitting..." << endl; exit(1); } if(!QUIET) cerr << "done: " << Mat.size() << " binding profiles acquired. Max matrix length = " << MAX_L << endl; if(Mat.size() == 1) { if(!QUIET) cerr << "\nSwitching to single matrix mode..." << endl; SINGLE_MATRIX = true; } if(!QUIET) cerr << "\nAcquiring regions..."; acquire_regions(&Seq); if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl; if(Seq.size() <= 2) POSITIONAL = false; if(Seq.empty()) { if(!QUIET) cerr << "\nZero regions to work on! Exiting..." << endl; exit(1); } if(GET_SEQUENCES) { fasta_out.close(); exit(0); } // for(unsigned int i = 0; i < Mat.size(); i++) // Mat[i].display(); if(!QUIET) cerr << "\nScanning..."; scanner(&Mat, &Seq); if(!QUIET) cerr << "\nWriting output..." << endl; if(!SINGLE_MATRIX) main_output(&Mat, &Seq, Seq.size()); else main_output_single_matrix(&Mat, &Seq); if(!QUIET) cerr << "\nOutput written in file: " << outfile << endl; } else if(GET_SEQUENCES && !regfile.empty()) { if(!QUIET) cerr << "\nAcquiring regions..."; acquire_regions(&Seq); if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl; fasta_out.close(); exit(0); } else display_help(); /* for(unsigned int i = 0; i < Seq.size(); i++) { cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << Seq[i].start() << '\t' << Seq[i].end() << '\t' << Mat[0].get_score_at(i) << '\t' << Mat[0].get_pos_at(i) << '\t' << Mat[0].get_strand_at(i) << '\t' << "Max = " << Mat[0].max() << '\t' << "Min = " << Mat[0].min() << endl << Seq[i].seq() << endl; cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "UPSTREAM" << endl << Seq[i].uf_seq() << endl; cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "DOWNSTREAM" << endl << Seq[i].df_seq() << endl; } */ return 0; } void display_help() { if(!QUIET) cerr << endl << "Syntax: pscan_chip -r region_file.bed -M matrixfile.wil -g genome [-pos] [-wsize window_size] [-wstep window_step] [-bg background_file] [-s region_size]" << endl << endl; exit(0); } void main_output_single_matrix(vector *Mat, vector *Seq) { outfile = regfile; outfile += "."; outfile += Mat->at(0).id(); string outfile2 = outfile; outfile += ".ris"; outfile2 += ".all.ris"; if(!GALAXY_OUTPUT.empty()) outfile = GALAXY_OUTPUT; multimap outmap; ofstream out(outfile.c_str(), ios::out); ofstream out2(outfile2.c_str(), ios::out); if(!out) { if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl; exit(1); } out << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE"; out2 << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE"; // if(SPEARMAN) // out << "\tE_RANK"; if(SECOND_BEST_POS) { out << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl; out2 << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl; } else { out << endl; out2 << endl; } matrix *M = &Mat->at(0); /* if(SINGLE_MATRIX_BG_AVG == 0) //QUESTO SERVE PER QUANDO NON C'E' IL VALORE DI BACKGROUND! { double loc_avg = 0, loc_stdev = 0; for(unsigned int i = 0; i < Seq->size(); i++) loc_avg += M->get_score_at(i); loc_avg /= Seq->size(); for(unsigned int i = 0; i < Seq->size(); i++) loc_stdev += pow(M->get_score_at(i) - loc_avg, 2); loc_stdev /= (Seq->size() - 1); loc_stdev = sqrt(loc_stdev); SINGLE_MATRIX_BG_AVG = loc_avg - (2*loc_stdev); if(!QUIET) cerr << "\nSMBG = " << SINGLE_MATRIX_BG_AVG; } */ char Oligo[M->length() + 1]; Oligo[M->length()] = '\0'; for(unsigned int i = 0; i < Seq->size(); i++) { // if(!QUIET) cerr << endl << M->get_score_at(i) << '\t' << M->get_bg_Avg() << '\t'; // if(M->get_score_at(i) < SINGLE_MATRIX_BG_AVG) //ONLY REPORT OCCURRENCES WITH SCORE >= BG_AVG // continue; // if(!QUIET) cerr << "passed"; ostringstream outbuf; sequence *S = &Seq->at(i); strncpy(Oligo, S->seq() + RSIZE/2 + (M->get_pos_at(i) - M->length()/2), M->length()); outbuf << S->chr() << '\t' << S->start() << '\t' << S->end() << '\t' << S->strand() << '\t' << S->start() + M->get_pos_at(i) - M->length()/2 << '\t' << S->start() + M->get_pos_at(i) + M->length()/2 << '\t' << (int)((M->get_pos_at(i) - M->length()/2) - RSIZE/2) << '\t' << (int)((M->get_pos_at(i) + M->length()/2 - 1) - RSIZE/2)<< '\t'; if(S->strand() == '+') { if(!M->get_strand_at(i)) outbuf << S->strand(); else outbuf << S->antistrand(); } else { if(!M->get_strand_at(i)) outbuf << S->antistrand(); else outbuf << S->strand(); } outbuf << '\t' << M->get_score_at(i) << '\t' << Oligo; // if(SPEARMAN) // outbuf << '\t' << S->e_rank(); if(SECOND_BEST_POS) { if(SINGLE_STRAND) { bool s; unsigned int P; double score; score = M->scan_seq_ss_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s,M->get_pos_at(i), S->strand()); strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length()); outbuf << '\t' << P << '\t' << score << '\t' << Oligo; } else { bool s; unsigned int P; double score; score = M->scan_seq_ds_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s, (int)M->get_pos_at(i)); strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length()); outbuf << '\t' << (int)P - (int)RSIZE/2 << '\t' << score << '\t' << Oligo << '\t'; if(s) outbuf << '-'; else outbuf << '+'; } outbuf << endl; } else outbuf << endl; outmap.insert(make_pair(M->get_score_at(i),outbuf.str())); } multimap::reverse_iterator mi = outmap.rbegin(); // unsigned int count = 0; while(mi != outmap.rend()) { // if(count >= outmap.size()/2 && outmap.size() > 1) //SOLO LA MIGLIORE META' DELLE OCCORRENZE!!! // break; if(mi->first >= SINGLE_MATRIX_BG_AVG || mi == outmap.rbegin()) out << mi->second; out2 << mi->second; mi++; // count++; } out.close(); out2.close(); return; } void main_output(vector *Mat, vector *Seq, unsigned int SIZE) { multimap ::iterator> S_map; outfile = regfile; outfile += ".res"; if(!GALAXY_OUTPUT.empty()) outfile = GALAXY_OUTPUT; ofstream out(outfile.c_str(),ios::out); if(!out) { if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl; exit(1); } out << "NAME\tID\tREG_N\tL_PVALUE O/U\tG_PVALUE O/U\tR_AVG\tR_STDEV\tF_AVG\tBG_AVG"; if(SPEARMAN) out << "\tSP_COR"; if(POSITIONAL) out << "\tPREF_POS\tPREF_POS_PV\n"; else out << endl; vector::iterator vi = Mat->begin(); while(vi != Mat->end()) { if(S_map.find(vi->get_pvalue()) != S_map.end()) //GFI WORKAROUND { if(S_map.find(vi->get_pvalue())->second->get_pvalue_bg() < vi->get_pvalue_bg()) { S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi)); vi++; continue; } else if((S_map.find(vi->get_pvalue())->second->get_pvalue_bg() > vi->get_pvalue_bg())) { S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi)); vi++; continue; } else { if(S_map.find(vi->get_pvalue())->second->get_spearman() > vi->get_spearman()) { S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi)); vi++; continue; } else if(S_map.find(vi->get_pvalue())->second->get_spearman() <= vi->get_spearman()) { S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi)); vi++; continue; } } } else S_map.insert(make_pair(vi->get_pvalue(), vi)); vi++; } multimap ::iterator>::iterator mi = S_map.begin(); while(mi != S_map.end()) { double pv1, pv2; pv1 = mi->second->get_pvalue() * Mat->size(); if(pv1 > 1) pv1 = 1; pv2 = mi->second->get_pvalue_bg() * Mat->size(); if(pv2 > 1) pv2 = 1; out << mi->second->name() << '\t' << mi->second->id() << '\t' << SIZE << '\t' << pv1 << ' ' << mi->second->o_or_u() << '\t'; if(mi->second->have_bg()) out << pv2 << ' ' << mi->second->o_or_u_bg() << '\t'; else out << "N/A N/A\t"; out << mi->second->get_R_Avg() << '\t' << pow(mi->second->get_R_Var(), 0.5) << '\t' << mi->second->get_F_Avg() << '\t'; if(mi->second->have_bg()) out << mi->second->get_bg_Avg() << '\t'; else out << "N/A" << '\t'; if(SPEARMAN) out << mi->second->get_spearman() << '\t'; if(POSITIONAL) { double min_pv = 1, min_pv_c; unsigned int min_pos; for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++) { if(mi->second->get_W_pvalue(i) <= min_pv) //&& mi->second->W_o_or_u(i) == 1) { min_pv = mi->second->get_W_pvalue(i); min_pos = i; } } min_pv_c = min_pv * (mi->second->bin_num() * Seq->size()); if(min_pv_c > 1) min_pv_c = 1; out << '[' << (int)(((W_STEP * min_pos) + mi->second->length()/2) - RSIZE/2) << ',' << (int)(((W_STEP * min_pos) + W_SIZE) + mi->second->length()/2 - RSIZE/2) << ']' << '\t' << min_pv_c;// << '\t' << mi->second->get_W_R_T_pvalue(min_pos) << '\t' << min_pv_c; } out << endl; mi++; } if(POSITIONAL) { out << endl << endl << "POSITIONAL ANALYSIS: W_SIZE = " << W_SIZE << " W_STEP = " << W_STEP << endl << endl; out << "NAME\tID\t"; mi = S_map.begin(); for(int i = -(int)(RSIZE/2); i <= (int)((RSIZE/2) - W_SIZE); i += W_STEP) out << i /*<< '|' << i + (int)(W_SIZE - 1)*/ << '\t'; out << endl; while(mi != S_map.end()) { /* bool gflag = false; for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++) if(mi->second->get_W_pvalue(i) <= POS_VIS_THRESHOLD) { gflag = true; break; } if(!gflag) { mi++; continue; } */ out << mi->second->name() << '\t' << mi->second->id() << '\t'; for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++) { // out << (-log(mi->second->get_W_pvalue(i))/log(10)) * mi->second->W_o_or_u(i) << '\t';//<< " [" << mi->second->W_o_or_u(i) << "]\t"; // double bpv = mi->second->get_W_pvalue(i) * (mi->second->bin_num() * Seq->size()); if(bpv > 1) bpv = 1; out << bpv << '\t'; } out << endl; mi++; } if(!obstf.empty()) distance_analysis(Mat, Seq, SIZE, &S_map); } out.close(); return; } void distance_analysis(vector *Mat, vector *Seq, unsigned int SIZE, multimap ::iterator> *S_map) { string dist_outfile = regfile, dist2_outfile = regfile, dist3_outfile = regfile; dist_outfile += ".dist"; dist2_outfile += ".exp_dist"; dist3_outfile += ".pv_dist"; ofstream out(dist_outfile.c_str()), out2, out3; if(!obstf.empty()) { out2.open(dist2_outfile.c_str()); out3.open(dist3_outfile.c_str()); out2 << "TF/EXP_HITS\t"; out3 << "TF/PVALUE\t"; } out << "TF1|TF2/HITS\t"; for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++) out << p << '\t'; out << endl; if(!obstf.empty()) { for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++) { out2 << p << '\t'; out3 << p << '\t'; } out2 << endl; out3 << endl; } multimap ::iterator>::iterator MRI = S_map->begin(); unsigned int LCOUNT = 0; while(MRI != S_map->end()) { if((MRI->second->get_pvalue_bg() > DIST_PV_THRESHOLD || MRI->second->o_or_u_bg() == "UNDER") && (MRI->second->get_pvalue() > DIST_PV_THRESHOLD || MRI->second->o_or_u() == "UNDER") ) { MRI++; continue; } // if(!QUIET) cerr << endl << MRI->second->name() << endl; LCOUNT++; map R_dist_map; if(!obstf.empty() && obstf == MRI->second->id()) { map M_hit_distr; for(unsigned int a = 0; a < SIZE; a++) { map::iterator mi; if(!MRI->second->get_strand_at(a)) mi = M_hit_distr.find(MRI->second->get_pos_at(a)); else mi = M_hit_distr.find(RSIZE - MRI->second->get_pos_at(a)); if(mi == M_hit_distr.end()) { if(!MRI->second->get_strand_at(a)) M_hit_distr[MRI->second->get_pos_at(a)] = 1; else M_hit_distr[RSIZE - MRI->second->get_pos_at(a)] = 1; } else mi->second++; } map::iterator mi = M_hit_distr.begin(); while(mi != M_hit_distr.end()) { for(int b = 0; b < RSIZE; b++) { int dpos = mi->first - b; map::iterator ri = R_dist_map.find(dpos); if(ri == R_dist_map.end()) R_dist_map[dpos] = mi->second; else ri->second += mi->second; } mi++; } map::iterator ri = R_dist_map.begin(); double risum = SIZE * RSIZE; out2 << MRI->second->name() << '\t'; while(ri != R_dist_map.end()) { ri->second /= risum; out2 << ri->second*SIZE << '\t'; ri++; } out2 << endl; } // multimap ::iterator>::iterator MRI2 = MRI; multimap ::iterator>::iterator MRI2 = S_map->begin(); while(MRI2 != S_map->end()) { vector hit_v((RSIZE*2) - 1, 0); if(MRI2 == MRI) { vector SBPOS; // vector SBSTRAND; if(SINGLE_STRAND) { for(unsigned int i = 0; i < Seq->size(); i++) { bool s; unsigned int P; double score; score = MRI->second->scan_seq_ss_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, MRI->second->get_pos_at(i), Seq->at(i).strand()); SBPOS.push_back(P); // SBSTRAND.push_back(s); } } else { for(unsigned int i = 0; i < Seq->size(); i++) { bool s; unsigned int P; double score; score = MRI->second->scan_seq_ds_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, (int)MRI->second->get_pos_at(i)); SBPOS.push_back(P + MRI->second->length()/2); // SBSTRAND.push_back(s); } } for(unsigned int k = 0; k < SIZE; k++) { int dist; if(!MRI->second->get_strand_at(k)) dist = (int)SBPOS.at(k) - (int)MRI->second->get_pos_at(k); else dist = ((int)(RSIZE-1) - (int)SBPOS.at(k)) - ((int)(RSIZE - 1) - (int)MRI->second->get_pos_at(k)); hit_v.at((RSIZE-1) + dist)++; } } else { for(unsigned int k = 0; k < SIZE; k++) { int dist; if(!MRI->second->get_strand_at(k)) dist = (int)MRI2->second->get_pos_at(k) - (int)MRI->second->get_pos_at(k); else dist = ((int)(RSIZE - 1 ) - (int)MRI2->second->get_pos_at(k)) - ((int)(RSIZE - 1 ) - (int)MRI->second->get_pos_at(k)); // if(!QUIET) cerr << endl << (int)((RSIZE-1) + dist) << endl; hit_v.at((RSIZE-1) + dist)++; } } if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty()) out << MRI->second->name() << "|" << MRI2->second->name() << '\t'; if(!obstf.empty() && obstf == MRI->second->id()) out3 << MRI->second->name() << "|" << MRI2->second->name() << '\t'; for(unsigned int p = 0; p < (RSIZE*2) - 1; p++) { if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty()) out << hit_v[p] << '\t'; if(!obstf.empty() && obstf == MRI->second->id()) { map::iterator ri = R_dist_map.find(p - (RSIZE - 1)); if(hit_v[p] >= DIST_HIT_THRESHOLD && hit_v[p]/(ri->second * SIZE) >= DIST_RATIO_THRESHOLD) { double pv = gsl_cdf_binomial_Q(hit_v[p], ri->second, SIZE); if(pv <= DIST_PV_THRESHOLD) out3 << pv << '\t'; else out3 << "NA" << '\t'; } else out3 << "NA" << '\t'; } } if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty()) out << endl; if(!obstf.empty() && obstf == MRI->second->id()) out3 << endl; MRI2++; } MRI++; } out.close(); if(!obstf.empty()) { out2.close(); out3.close(); } if(!LCOUNT) { remove(dist_outfile.c_str()); if(!obstf.empty()) { remove(dist2_outfile.c_str()); remove(dist3_outfile.c_str()); } } if(!QUIET) cerr << "\nDistance analysis written in file: " << dist_outfile << endl; return; } void scanner(vector *Mat, vector *Seq) { for(unsigned int i = 0; i < Mat->size(); i++) { if(VERBOSE) { if(!QUIET) cerr << "\nScanning sequences for " << Mat->at(i).name() << " (" << Mat->at(i).id() << ")"<< endl; } Mat->at(i).scan(Seq); Mat->at(i).calc_var((double)Seq->size()); if(POSITIONAL) Mat->at(i).calc_W_param((double)Seq->size()); if(VERBOSE) { if(!QUIET) cerr << "\nRegions avg = " << Mat->at(i).R_Avg << '\t' << "Flanks avg = " << Mat->at(i).F_Avg << '\t' << "Regions devstd = " << pow(Mat->at(i).R_Var,0.5) << '\t' << "Flanks devstd = " << pow(Mat->at(i).F_Var,0.5); if(!QUIET) cerr << "\nDoing Welch's t test... "; } Mat->at(i).welch_test((double)Seq->size()); if(Mat->at(i).have_bg()) Mat->at(i).welch_test_bg((double)Seq->size()); if(POSITIONAL) { // Mat->at(i).welch_test_pos((double)Seq->size()); Mat->at(i).z_test_pos((double)Seq->size()); } if(VERBOSE) { if(!QUIET) cerr << "done. Pvalue = " << Mat->at(i).get_pvalue() << ' ' << Mat->at(i).o_or_u() << endl; if(!QUIET) cerr << endl; } } return; } void acquire_matrices(vector *Mat, map *Bg, string *matfile) { ifstream in(matfile->c_str()); string line; if(!in) { if(!QUIET) cerr << "Can't find " << *matfile << endl; exit(1); } while(getline(in,line)) { if(line.empty()) continue; vector mbuf; mbuf.push_back(line); for(unsigned int i = 0; i < ALPHABET_SIZE; i++) { getline(in,line); if(!line.empty()) mbuf.push_back(line); } if(mbuf.size() == ALPHABET_SIZE + 1) Mat->push_back(matrix(&mbuf,Bg)); } in.close(); vector::iterator vi = Mat->begin(); while(vi != Mat->end()) { if(!vi->is_good()) { vi = Mat->erase(vi); continue; } vi++; } return; } void acquire_background(map *Bg) { ifstream in(bgfile.c_str()); if(!in) { if(!QUIET) cerr << "\nWarning: background file " << bgfile << " not found..." << endl; return; } string line; getline(in,line); while(getline(in, line)) { istringstream str(line); string id, trash; bg nbg; str >> trash >> id >> nbg.Size >> trash >> trash >> trash >> trash >> nbg.Avg >> nbg.Var; nbg.Var = pow(nbg.Var,2); Bg->insert(make_pair(id,nbg)); } in.close(); return; } void acquire_regions(vector *Seq) { ifstream in(regfile.c_str()); // map > Regs; map > Regs; //PREVENT REG DUPLICATES map > M_RANK; if(!in) { if(!QUIET) cerr << "Can't find " << regfile << endl; exit(1); } string line; unsigned int r_count = 0; while(getline(in,line)) { if(line.empty()) continue; if(line[0] == '#') continue; for(unsigned int i = 0; i < 3; i++) line[i] = tolower(line[i]); for(unsigned int i = 3; i < line.size(); i++) line[i] = toupper(line[i]); istringstream str(line); string chr; unsigned int start, end, mp; str >> chr >> start >> end; // Regs[chr].push_back(line); mp = start + ((end - start)/2); if(mp > (RSIZE + RSIZE/2) + 1) { Regs[chr][mp] = line; // if(!QUIET) cerr << endl << line; if(M_RANK.find(chr) != M_RANK.end()) //PREVENT REGIONS PRESENT MORE THAN ONCE TO DO TOO MUCH MESS WITH SPEARMAN CORRELATION if(M_RANK[chr].find(mp) != M_RANK[chr].end()) continue; M_RANK[chr][mp] = r_count; r_count++; } else if(!QUIET) cerr << "\nSkipping region: too close to chromosome start..." << endl << line << endl; } in.close(); map >::iterator mi = Regs.begin(); if(!TWOBIT) { while(mi != Regs.end()) { string file = genome; file += "/"; file += mi->first; file += ".raw"; in.open(file.c_str()); if(!in) { if(!QUIET) cerr << "\nCan't find " << file << ". Skipping chromosome:\n" << mi->first << endl; if(SPEARMAN) { if(!QUIET) cerr << "\nSpearman correlation disabled..." << endl; SPEARMAN = false; } mi++; in.close(); continue; } map::iterator si = mi->second.begin(); while(si != mi->second.end()) { Seq->push_back(sequence(&si->second, &in, M_RANK[mi->first][si->first])); si++; } in.close(); mi++; } } else { ostringstream str; ofstream outfile; str << ".pchip." << rand() << ".tmp"; string tmp1 = str.str(); outfile.open(tmp1.c_str(), ios::out); vector k1; vector k2; while(mi != Regs.end()) { map::iterator si = mi->second.begin(); while(si != mi->second.end()) { unsigned int Start, End; Start = si->first - RSIZE; End = si->first + RSIZE + 1 + MAX_L; outfile << mi->first << '\t' << Start << '\t' << End << '\t' << "." << endl; k1.push_back(mi->first); k2.push_back(si->first); si++; } mi++; } string cline = "twoBitToFa -bedPos -bed="; cline += str.str(); str << ".fa"; cline += " "; cline += genome; /* cline += "/"; cline += genome; cline += ".2bit "; */ cline += " "; cline += str.str(); // if(!QUIET) cerr << endl << "TwoBit command line:\n" << cline << endl; if(system(cline.c_str()) != 0) { if(!QUIET) cerr << endl << "twoBitToFa returned non 0 error code... aborting pscan_chip run!\n" << endl; exit(1); } else { ifstream in(str.str().c_str()); string seq; seq.clear(); if(!in) { if(!QUIET) cerr << endl << "Can't access twoBitToFa output file! Aborting pscan_chip run!\n" << endl; exit(1); } unsigned int scount = 0; while(getline(in,line)) { if(line.empty()) continue; if(line[0] == '>') { if(seq.empty()) continue; else { Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount))); seq.clear(); scount++; continue; } } else seq += line; } Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount))); //LAST SEQUENCE! in.close(); remove(str.str().c_str()); remove(tmp1.c_str()); } // exit(1); } vector::iterator vi = Seq->begin(); while(vi != Seq->end()) { if(!vi->is_good()) { vi = Seq->erase(vi); continue; } vi++; } return; } void command_line_parser(int argc, char **argv) { if(argc == 1) return; for(int i = 1; i < argc; i++) { string buf = argv[i]; if(buf == "-r") { if(i < argc - 1) regfile = argv[++i]; continue; } if(buf == "-g") { if(i < argc - 1) genome = argv[++i]; continue; } if(buf == "-s") { if(i < argc - 1) RSIZE = atoi(argv[++i]); continue; } if(buf == "-bgavg") { if(i < argc - 1) SINGLE_MATRIX_BG_AVG = atof(argv[++i]); continue; } if(buf == "-M") { if(i < argc - 1) matrixfile = argv[++i]; continue; } if(buf == "-MADD") { if(i < argc - 1) additional_matrixfile = argv[++i]; continue; } if(buf == "-m") { if(i < argc - 1) selected = argv[++i]; continue; } if(buf == "-bg") { if(i < argc - 1) bgfile = argv[++i]; continue; } if(buf == "-go") { if(i < argc - 1) GALAXY_OUTPUT = argv[++i]; continue; } if(buf == "-obs") { if(i < argc - 1) obstf = argv[++i]; continue; } if(buf == "-ss") { SINGLE_STRAND = true; continue; } if(buf == "-2bit") { TWOBIT = true; continue; } if(buf == "-quiet") { QUIET = true; continue; } if(buf == "-sbp") { SECOND_BEST_POS = true; continue; } if(buf == "-fasta") { GET_SEQUENCES = true; continue; } if(buf == "-pos") { POSITIONAL = true; continue; } if(buf == "-spear") { SPEARMAN = true; continue; } if(buf == "-wsize") { if(i < argc - 1) W_SIZE = atoi(argv[++i]); W_STEP = W_SIZE / 2; continue; } /* if(buf == "-wstep") { if(i < argc - 1) W_STEP = atoi(argv[++i]); continue; } if(buf == "-fast") { FAST_SCAN = true; continue; }*/ /* if(buf == "-naive") { NAIVE = true; continue; } */ if(buf == "-v") { VERBOSE = true; continue; } else { if(!QUIET) cerr << "\nUnknown option: " << buf << endl; exit(1); } /* if(buf == "-Q") { if(i < argc - 1) idfile = argv[++i]; continue; } if(buf == "-ql") { if(i < argc - 1) q_idlist = argv[++i]; continue; } if(buf == "-imgm") { if(i < argc - 1) bg_for_img = argv[++i]; continue; } if(buf == "-magic") { if(i < argc - 1) magic = argv[++i]; continue; }*/ } if(RSIZE > MAX_REG_SIZE) { if(!QUIET) cerr << "\nRegion size exceeds " << MAX_REG_SIZE << "..." << endl; exit(1); } if(RSIZE % W_SIZE != 0 || W_SIZE % 2 != 0 || RSIZE / W_SIZE < 5) { if(!QUIET) cerr << "\nWindow size must be even and both an exact divisor of region size and at least five times smaller of it: positional analysis disabled." << endl; POSITIONAL = false; } return; } //SEQUENCE CLASS BEGIN sequence::sequence(string *seq, unsigned int R, string chr, unsigned int mp) //CONSTRUCTOR FOR 2BIT INPUT { Chr = chr; Strand = '+'; AntiStrand = '-'; E_Rank = R; good = true; Start = mp - RSIZE/2; End = (mp + RSIZE/2) - 1; for(unsigned int i = 0; i < seq->size(); i++) seq->at(i) = toupper(seq->at(i)); Seq = new char[(RSIZE*2) + 1 + MAX_L]; // if(!QUIET) cerr << "\nseq.size = " << seq->size() << endl; // if(!QUIET) cerr << "\narray.size = " << (RSIZE*2) + 1 + MAX_L << endl; strcpy(Seq, seq->c_str()); if(seq->size() != RSIZE*2 + MAX_L + 1) { good = false; return; } if(GET_SEQUENCES) fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << endl << seq->substr(RSIZE/2, RSIZE) << endl; return; } sequence::sequence(string *line, ifstream *in, unsigned int R) { istringstream str(*line); string trash; int buf_s, buf_e, mp; good = true; E_Rank = R; str >> Chr >> buf_s >> buf_e >> Name >> trash >> Strand; if(Strand == 'R') Strand = '-'; if(Strand != '+' && Strand != '-') Strand = '+'; if(Strand == '+') AntiStrand = '-'; else AntiStrand = '+'; mp = buf_s + ((buf_e - buf_s)/2); Start = mp - RSIZE/2; End = (mp + RSIZE/2) - 1; // Seq = new char[RSIZE + 1]; // UF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2]; // DF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2]; Seq = new char[(RSIZE*2) + 1 + MAX_L]; in->seekg((Start) - RSIZE/2); in->get(Seq, (RSIZE*2 + 1) + MAX_L); if(in->gcount() != RSIZE*2 + MAX_L) { good = false; return; } if(GET_SEQUENCES) { char *out_seq; out_seq = new char[RSIZE + 1]; in->seekg(Start); in->get(out_seq, (RSIZE + 1)); fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << '\t' << "---" << '\t' << buf_s << '\t' << buf_e << '\t' << endl << out_seq << endl; delete[] out_seq; } /*in->seekg(((Start - 1) - RSIZE/2) - (MAX_L/2)); in->get(UF_Seq, RSIZE/2 + 1 + MAX_L /2); if(in->gcount() != RSIZE/2 + (MAX_L/2)) { good = false; return; } in->get(Seq, RSIZE + 1); if(in->gcount() != RSIZE) { good = false; return; } in->get(DF_Seq, RSIZE/2 + 1 + (MAX_L/2)); if(in->gcount() != RSIZE/2 + (MAX_L/2)) good = false; */ return; } unsigned int sequence::e_rank() { return E_Rank; } string sequence::chr() { return Chr; } /*string* sequence::seq() { return &Seq; }*/ char* sequence::seq() { return Seq; } char sequence::strand() { return Strand; } char sequence::antistrand() { return AntiStrand; } /*char* sequence::uf_seq() { return UF_Seq; } char* sequence::df_seq() { return DF_Seq; } */ string sequence::name() { return Name; } unsigned int sequence::start() { return Start; } unsigned int sequence::end() { return End; } bool sequence::is_good() { return good; } // SEQUENCE CLASS END // MATRIX CLASS BEGIN matrix::matrix(vector* mbuf, map *Bg) { string buf; istringstream bstr(mbuf->at(1)), nstr(mbuf->at(0)); L = 0; R_Avg = 0; F_Avg = 0; R_Var = 0; F_Var = 0; bg_Avg = 0; bg_Var = 0; BIN_NUM = RSIZE / W_STEP; W_R_Avg.resize(BIN_NUM, 0); W_R_Var.resize(BIN_NUM, 0); W_R_pvalue.resize(BIN_NUM, 0); W_R_T_pvalue.resize(BIN_NUM,0); spearman = 42; O_U = false; O_U_bg = false; W_O_U.resize(BIN_NUM, 0); BG = false; good = true; norm_sec_step = false; while(bstr >> buf) L++; if(L > RSIZE/2) { good = false; return; } if(L > MAX_L) MAX_L = L; if(!mbuf->at(0).empty()) { if(mbuf->at(0).at(0) != '>') { good = false; return; } } else { good = false; return; } // Name = mbuf->at(0).substr(1); nstr >> Id >> Name; Id = Id.substr(1); if(!selected.empty()) { // cerr << endl << selected << '\t' << Id << '\t' << Name << endl; if(selected != Id && selected != Name) { good = false; return; } } if(!Bg->empty()) { map::iterator mi = Bg->find(Id); if(mi == Bg->end()) mi = Bg->find(Name); if(mi != Bg->end()) { bg_Avg = mi->second.Avg; bg_Var = mi->second.Var; bg_Size = mi->second.Size; BG = true; } else if(!QUIET) cerr << "\nWarning: no background for matrix: " << Id << endl; } M = new double*[L]; rc_M = new double*[L]; for(unsigned int i = 1; i < ALPHABET_SIZE + 1; i++) { istringstream str(mbuf->at(i)); unsigned int count = 0; while(str >> buf) { if(i == 1) { M[count] = new double[ALPHABET_SIZE + 1]; rc_M[count] = new double[ALPHABET_SIZE + 1]; } M[count][i-1] = atof(buf.c_str()); count++; if(count > L) { good = false; return; } } if(count != L) { good = false; return; } } normalizer(); to_log(); rev_c(); avg(); max_min(); // set_top_pos(); if(min_score == max_score) good = false; return; } void matrix::display() { if(!QUIET) cerr << "\n>" << Name << '\t' << good << endl; for(unsigned int i = 0; i < ALPHABET_SIZE; i++) { for(unsigned int j = 0; j < L; j++) if(!QUIET) cerr << M[j][i] << '\t'; if(!QUIET) cerr << endl; } if(!QUIET) cerr << endl; // for(unsigned int i = 0; i < TOP_P; i++) // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << top_pos[i] + 1 << "-" << top_char[i] << '\t'; if(!QUIET) cerr << "\n\n>" << Name << '\t' << good << '\t' << "RC" << endl; for(unsigned int i = 0; i < ALPHABET_SIZE; i++) { for(unsigned int j = 0; j < L; j++) if(!QUIET) cerr << rc_M[j][i] << '\t'; if(!QUIET) cerr << endl; } if(!QUIET) cerr << endl; // for(unsigned int i = 0; i < TOP_P; i++) // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << rc_top_pos[i] + 1 << "-" << rc_top_char[i] << '\t'; if(!QUIET) cerr << endl; return; } bool matrix::is_good() { return good; } void matrix::normalizer() { double *sums = new double[L]; for(int x = 0; x < L; x++) { sums[x] = 0; for(int y = 0; y < ALPHABET_SIZE; y++) sums[x] += M[x][y]; } for(int x = 0; x < L; x++) for(int y = 0; y < ALPHABET_SIZE; y++) M[x][y] /= sums[x]; delete[] sums; if(!norm_sec_step) { for(int x = 0; x < L; x++) for(int y = 0; y < ALPHABET_SIZE; y++) M[x][y] += MIN_ADD_FREQ; norm_sec_step = true; normalizer(); } return; } void matrix::to_log() { for(int x = 0; x < L; x++) for(int y = 0; y < ALPHABET_SIZE; y++) M[x][y] = log(M[x][y]); return; } void matrix::avg() { // mat_avg = new double[L]; for(int x = 0; x < L; x++) { double buf = 0; for(int y = 0; y < ALPHABET_SIZE; y++) buf += M[x][y]; // mat_avg[x] = buf/ALPHABET_SIZE; M[x][ALPHABET_SIZE] = buf/ALPHABET_SIZE; rc_M[(L-1) -x][ALPHABET_SIZE] = buf/ALPHABET_SIZE; } return; } void matrix::max_min() { double min, max; max_score = 0; min_score = 0; for(int x = 0; x < L; x++) { min = M[x][0]; max = M[x][0]; for(int y = 1; y < ALPHABET_SIZE; y++) { if(M[x][y] < min) min = M[x][y]; if(M[x][y] > max) max = M[x][y]; } min_score += min; max_score += max; } return; } void matrix::rev_c() { for(unsigned int x = 0; x < L; x++) for(unsigned int y = 0; y < ALPHABET_SIZE; y++) rc_M[(L - 1) - x][(ALPHABET_SIZE - 1) - y] = M[x][y]; return; } void matrix::scan(vector *Seq) { v_score.reserve(Seq->size()); vF_score.reserve(Seq->size()); v_pos.reserve(Seq->size()); v_strand.reserve(Seq->size()); if(POSITIONAL) W_R_v_score.reserve(Seq->size()); unsigned int progress; if(!SINGLE_STRAND) { for(unsigned int i = 0; i < Seq->size(); i++) { if(VERBOSE) { progress = (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100); if(!(progress%5)) if(!QUIET) cerr << progress << '%'; } double max_flank_u, max_flank_d; vector W_buf(BIN_NUM, 0); { double r_score; max_flank_u = scan_seq_ds(Seq->at(i).seq() , true, RSIZE/2, &W_buf); r_score = scan_seq_ds(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, &W_buf); max_flank_d = scan_seq_ds(Seq->at(i).seq() + ((RSIZE/2)*3) , true, RSIZE/2, &W_buf); R_Avg += r_score; /* if(r_score < max_flank_u || r_score < max_flank_d) { if(max_flank_u > max_flank_d) G_Avg += max_flank_u; else G_Avg += max_flank_d; } else G_Avg += r_score;*/ if(POSITIONAL) { // for(unsigned int i = 0; i < BIN_NUM; i++) // W_R_Avg[i] += W_buf[i]; W_R_v_score.push_back(W_buf); } } if(max_flank_u > max_flank_d) { F_Avg += max_flank_u; vF_score.push_back(max_flank_u); } else { F_Avg += max_flank_d; vF_score.push_back(max_flank_d); } if(VERBOSE) if(!(progress%5)) if(!QUIET) cerr << (char)13; } } else { for(unsigned int i = 0; i < Seq->size(); i++) { if(VERBOSE) if(!QUIET) cerr << (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100) << '%'; double max_flank_u, max_flank_d; vector W_buf(BIN_NUM, 0); { double r_score; max_flank_u = scan_seq_ss(Seq->at(i).seq() , true, RSIZE/2, Seq->at(i).strand(), &W_buf); r_score += scan_seq_ss(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, Seq->at(i).strand(), &W_buf); max_flank_d = scan_seq_ss(Seq->at(i).seq() + ((RSIZE/2)*3), true, RSIZE/2, Seq->at(i).strand(), &W_buf); R_Avg += r_score; /* if(r_score < max_flank_u || r_score < max_flank_d) { if(max_flank_u > max_flank_d) G_Avg += max_flank_u; else G_Avg += max_flank_d; } else G_Avg += r_score;*/ if(POSITIONAL) { // for(unsigned int i = 0; i < BIN_NUM; i++) // W_R_Avg[i] += W_buf[i]; W_R_v_score.push_back(W_buf); } } if(max_flank_u > max_flank_d) { F_Avg += max_flank_u; vF_score.push_back(max_flank_u); } else { F_Avg += max_flank_d; vF_score.push_back(max_flank_d); } if(VERBOSE) if(!QUIET) cerr << (char)13; } } R_Avg /= Seq->size(); F_Avg /= Seq->size(); // G_Avg /= Seq->size(); // for(unsigned int i = 0; i < BIN_NUM; i++) // W_R_Avg[i] /= Seq->size(); if(SPEARMAN) { multimap SRM; for(unsigned int i = 0; i < v_score.size(); i++) SRM.insert(make_pair(v_score[i], i)); multimap::reverse_iterator mmi = SRM.rbegin(); int sr = 0; double d = 0; while(mmi != SRM.rend()) { // if(!QUIET) cerr << endl << mmi->first << '\t' << sr << '\t' << Seq->at(mmi->second).e_rank() << '\t' << this->name(); d += pow((sr - (int)Seq->at(mmi->second).e_rank()), 2); sr++; mmi++; } d *= 6; // if(!QUIET) cerr << endl << "d = " << d << "\tden = " << (Seq->size() * (pow(Seq->size(), 2) - 1)) << '\t' << "ssize = " << Seq->size(); d /= (Seq->size() * (pow(Seq->size(), 2) - 1)); spearman = 1 - d; } return; } void matrix::calc_var(double S) { for(unsigned int i = 0; i < S; i++) { R_Var += pow(v_score[i] - R_Avg,2); F_Var += pow(vF_score[i] - F_Avg,2); /* if(v_score[i] >= vF_score[i]) G_Var += pow(v_score[i] - G_Avg,2); else G_Var += pow(vF_score[i] - G_Avg,2);*/ } if(S > 1) { R_Var /= (S - 1); F_Var /= (S - 1); // G_Var /= (S - 1); } if(R_Var == 0) R_Var += EPSILON; if(F_Var == 0) F_Var += EPSILON; // if(G_Var == 0) // G_Var += EPSILON; return; } void matrix::calc_W_param(double S) { if(S > 1) S = S/2; //SOLO PER LA PRIMA META' DELLE SEQUENZE!!! W_Avg = 0; W_Var = 0; multimap score_seq_map; for(unsigned int i = 0; i < v_score.size(); i++) score_seq_map.insert(make_pair(v_score.at(i), i)); multimap::reverse_iterator mi = score_seq_map.rbegin(); for(unsigned int i = 0; i < S; i++) { for(unsigned int j = 0; j < BIN_NUM; j++) W_R_Avg[j] += W_R_v_score[mi->second][j]; // if(!QUIET) cerr << Name << '\t' << mi->first << endl; mi++; } for(unsigned int j = 0; j < BIN_NUM; j++) W_R_Avg[j] /= S; mi = score_seq_map.rbegin(); for(unsigned int i = 0; i < S; i++) { for(unsigned int j = 0; j < BIN_NUM; j++) W_R_Var[j] += pow(W_R_v_score[mi->second][j] - W_R_Avg[j],2); mi++; } /* for(unsigned int i = 0; i < S; i++) for(unsigned int j = 0; j < BIN_NUM; j++) W_R_Var[j] += pow(W_R_v_score[i][j] - W_R_Avg[j],2);*/ if(S > 1) { for(unsigned int j = 0; j < BIN_NUM; j++) W_R_Var[j] /= (S - 1); } for(unsigned int j = 0; j < BIN_NUM; j++) if(W_R_Var[j] == 0) W_R_Var[j] += EPSILON; for(unsigned int j = 0; j < BIN_NUM - 1; j++) //THE LAST BIN IS WASTED SINCE IT IS AN HALF BIN W_Avg += W_R_Avg[j]; W_Avg /= (BIN_NUM - 1); for(unsigned int j = 0; j < BIN_NUM - 1; j++) W_Var += pow(W_R_Avg[j] - W_Avg, 2); W_Var /= (BIN_NUM - 2); return; } void matrix::welch_test(double S) { t = R_Avg - F_Avg; if(t == 0) t = EPSILON; t /= pow((R_Var/S) + (F_Var/S),0.5); dof = pow((R_Var/S) + (F_Var/S),2); dof /= (pow(R_Var,2)/((pow(S,2)) * (S-1))) + (pow(F_Var,2)/((pow(S,2)) * (S-1))); if(t > 0) pvalue = gsl_cdf_tdist_Q(t,dof); else { pvalue = gsl_cdf_tdist_P(t,dof); O_U = true; } return; } void matrix::z_test_pos(double S) { // if(!QUIET) cerr << endl << Name; for(unsigned int i = 0; i < BIN_NUM; i++) { double se = sqrt(W_R_Var.at(i)), z; se /= sqrt(S/2); z = W_R_Avg.at(i) - W_Avg; z /=se; if(z >= 0) W_O_U.at(i) = true; W_R_pvalue.at(i) = gsl_cdf_ugaussian_Q(z); // if(!QUIET) cerr << endl << "W_R_Avg.at(i) " << W_R_Avg.at(i) << '\t' << W_Avg << '\t' << W_R_pvalue.at(i) << endl; } return; } void matrix::welch_test_pos(double S) { if(S > 1) S = S/2; //SOLO LA PRIMA META' DELLE SEQUENZE!!! // if(!QUIET) cerr << "\nRAVG = " << W_Avg << endl; for(unsigned int i = 0; i < BIN_NUM; i++) { double T, DOF; T = W_R_Avg.at(i) - W_Avg; // if(!QUIET) cerr << W_R_Avg.at(i) << '\t'; // if(!QUIET) cerr << endl << Name << endl << "T = " << T << '\t' << W_R_Avg.at(i) << '\t' << W_Avg; if(T == 0) T = EPSILON; T /= pow((W_R_Var.at(i)/S) + (W_Var/S),0.5); DOF = pow((W_R_Var.at(i)/S) + (W_Var/S),2); DOF /= (pow(W_R_Var.at(i),2)/((pow(S,2)) * (S-1))) + (pow(W_Var,2)/((pow(S,2)) * (S-1))); if(T > 0) W_R_T_pvalue.at(i) = gsl_cdf_tdist_Q(T,DOF); else { W_R_T_pvalue.at(i) = gsl_cdf_tdist_P(T,DOF); W_O_U.at(i) = true; } // if(!QUIET) cerr << "\tPV= " << W_R_pvalue.at(i); } // if(!QUIET) cerr << endl; return; } void matrix::welch_test_bg(double S1) { double S2 = bg_Size; // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl; t_bg = R_Avg - bg_Avg; if(t_bg == 0) t_bg = EPSILON; t_bg /= pow((R_Var/S1) + (bg_Var/S2),0.5); dof_bg = pow((R_Var/S1) + (bg_Var/S2),2); dof_bg /= (pow(R_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1))); if(t_bg > 0) pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg); else { pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg); O_U_bg = true; } return; } /*void matrix::welch_test_bg_G(double S1) { double S2 = bg_Size; // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl; t_bg = G_Avg - bg_Avg; if(t_bg == 0) t_bg = EPSILON; t_bg /= pow((G_Var/S1) + (bg_Var/S2),0.5); dof_bg = pow((G_Var/S1) + (bg_Var/S2),2); dof_bg /= (pow(G_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1))); if(t_bg > 0) pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg); else { pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg); O_U_bg = true; } return; }*/ double matrix::scan_seq_ds(char *S, bool flank, unsigned int l, vector *W_buf) { unsigned int pos = 0, max_pos = 0; double seq_score; bool strand = false; l -= L/2; while(pos < l) { double pos_score[2] = {0,0}; for(unsigned int j = 0; j < L; j++) { pos_score[0] += get_M_value(j, S[pos+j]); pos_score[1] += get_rc_M_value(j, S[pos+j]); } if(seq_score < pos_score[0] || !pos) { seq_score = pos_score[0]; max_pos = pos; strand = false; } if(seq_score < pos_score[1]) { seq_score = pos_score[1]; max_pos = pos; strand = true; } if(!flank && POSITIONAL) { unsigned int vpos = pos / W_STEP; if(W_buf->at(vpos) < pos_score[0] || !(pos % W_STEP)) W_buf->at(vpos) = pos_score[0]; if(W_buf->at(vpos) < pos_score[1]) W_buf->at(vpos) = pos_score[1]; if(vpos) { if(W_buf->at(vpos - 1) < pos_score[0]) W_buf->at(vpos - 1) = pos_score[0]; if(W_buf->at(vpos - 1) < pos_score[1]) W_buf->at(vpos - 1) = pos_score[1]; } } pos++; } seq_score = 1 + (seq_score - max_score)/(max_score - min_score); if(!flank && POSITIONAL) { // if(!QUIET) cerr << endl << Name << endl; unsigned int binm = 0; for(unsigned int i = 0; i < BIN_NUM; i++) { if(W_buf->at(i) != 0) W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score); else binm++; } BIN_NUM -= binm; } if(!flank) { v_score.push_back(seq_score); v_pos.push_back(max_pos + L/2); v_strand.push_back(strand); } return seq_score; } double matrix::scan_seq_ss_second_best(char *S, unsigned int l, unsigned int *P, bool *s, unsigned int POS, bool seq_strand) { unsigned int pos = 0, max_pos = 0; double seq_score = min_score; bool strand = false; while(pos < l) { if(pos == POS) { pos++; continue; } double pos_score[2] = {0,0}; if(seq_strand == '+') { for(unsigned int j = 0; j < L; j++) pos_score[0] += get_M_value(j, S[pos+j]); } else { for(unsigned int j = 0; j < L; j++) pos_score[0] += get_rc_M_value(j, S[pos+j]); } if(seq_score < pos_score[0]) { seq_score = pos_score[0]; max_pos = pos; strand = false; } pos++; } seq_score = 1 + (seq_score - max_score)/(max_score - min_score); *P = max_pos; *s = strand; return seq_score; } double matrix::scan_seq_ds_second_best(char *S, unsigned int l, unsigned int *P, bool *s, int POS) { int pos = 0, max_pos = 0, count = 0; double seq_score; bool strand = false; POS -= L/2; l -= L/2; while(pos < l) { // if(pos == POS) if(pos >= (POS - L) && pos <= (POS + L)) { pos++; continue; } double pos_score[2] = {0,0}; for(unsigned int j = 0; j < L; j++) { pos_score[0] += get_M_value(j, S[pos+j]); pos_score[1] += get_rc_M_value(j, S[pos+j]); } if(seq_score < pos_score[0] || !count) { seq_score = pos_score[0]; max_pos = pos; strand = false; } if(seq_score < pos_score[1]) { seq_score = pos_score[1]; max_pos = pos; strand = true; } pos++; count++; } seq_score = 1 + (seq_score - max_score)/(max_score - min_score); *P = max_pos ;//+ L/2; *s = strand; return seq_score; } /*double matrix::scan_seq_ds_quick(char *S, bool flank, unsigned int l) { unsigned int pos = 0, max_pos = 0; double seq_score = min_score; bool strand = false; while(pos < l) { double pos_score[2] = {0,0}; for(unsigned int j = 0; j < L; j++) { pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]); if((pos_score[0] + p_max[j+1]) <= seq_score) { pos_score[0] = min_score; break; } } for(unsigned int j = 0; j < L; j++) { pos_score[1] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]); if((pos_score[1] + p_max[j+1]) <= seq_score) { pos_score[1] = min_score; break; } } if(seq_score < pos_score[0]) { seq_score = pos_score[0]; max_pos = pos; strand = false; } if(seq_score < pos_score[1]) { seq_score = pos_score[1]; max_pos = pos; strand = true; } pos++; } seq_score = 1 + (seq_score - max_score)/(max_score - min_score); if(!flank) { v_score.push_back(seq_score); v_pos.push_back(max_pos); v_strand.push_back(strand); } return seq_score; } */ double matrix::scan_seq_ss(char *S, bool flank, unsigned int l, char seq_strand, vector *W_buf) { unsigned int pos = 0, max_pos = 0; double seq_score = min_score; bool strand = false; while(pos < l) { double pos_score[2] = {0,0}; if(seq_strand == '+') { for(unsigned int j = 0; j < L; j++) pos_score[0] += get_M_value(j, S[pos+j]); } else { for(unsigned int j = 0; j < L; j++) pos_score[0] += get_rc_M_value(j, S[pos+j]); } if(seq_score < pos_score[0]) { seq_score = pos_score[0]; max_pos = pos; strand = false; } if(!flank && POSITIONAL) { unsigned int vpos = pos / W_STEP; if(W_buf->at(vpos) < pos_score[0]) W_buf->at(vpos) = pos_score[0]; if(vpos) { if(W_buf->at(vpos - 1) < pos_score[0]) W_buf->at(vpos - 1) = pos_score[0]; } } pos++; } seq_score = 1 + (seq_score - max_score)/(max_score - min_score); if(POSITIONAL && !flank) for(unsigned int i = 0; i < W_buf->size(); i++) W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score); if(!flank) { v_score.push_back(seq_score); v_pos.push_back(max_pos); v_strand.push_back(strand); } return seq_score; } /* double matrix::scan_seq_ss_quick(char *S, bool flank, unsigned int l, char seq_strand) { unsigned int pos = 0, max_pos = 0; double seq_score = min_score; bool strand = false; while(pos < l) { double pos_score[2] = {0,0}; if(seq_strand == '+') { for(unsigned int j = 0; j < L; j++) { pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]); if((pos_score[0] + p_max[j+1]) <= seq_score) { pos_score[0] = min_score; break; } } } else { for(unsigned int j = 0; j < L; j++) { pos_score[0] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]); if((pos_score[0] + p_max[j+1]) <= seq_score) { pos_score[0] = min_score; break; } } } if(seq_score < pos_score[0]) { seq_score = pos_score[0]; max_pos = pos; strand = false; } pos++; } seq_score = 1 + (seq_score - max_score)/(max_score - min_score); if(!flank) { v_score.push_back(seq_score); v_pos.push_back(max_pos); v_strand.push_back(strand); } return seq_score; }*/ double matrix::get_M_value(int pos, char C) { return M[pos][VI[C]]; } double matrix::get_rc_M_value(int pos, char C) { return rc_M[pos][VI[C]]; } /* void matrix::set_top_pos() { multimap Tmap; vector p_max_buf, rc_p_max_buf; for(unsigned int x = 0; x < L; x++) { for(unsigned int y = 0; y < ALPHABET_SIZE; y++) { mpoint p; p.x = x; p.y = y; Tmap.insert(make_pair(M[x][y],p)); if(!y) { p_max_buf.push_back(M[x][y]); rc_p_max_buf.push_back(rc_M[x][y]); } else { if(M[x][y] > p_max_buf.back()) p_max_buf[p_max_buf.size() - 1] = M[x][y]; if(rc_M[x][y] > rc_p_max_buf.back()) rc_p_max_buf[rc_p_max_buf.size() - 1] = rc_M[x][y]; } } // if(!QUIET) cerr << endl << p_max_buf.back(); } multimap::reverse_iterator ri = Tmap.rbegin(); for(unsigned int i = 0; i < TOP_P; i++) { top_pos[i] = ri->second.x; top_char[i] = ALPHABET[ri->second.y]; rc_top_pos[i] = (L - 1) - ri->second.x; rc_top_char[i] = RC_ALPHABET[ri->second.y]; ri++; } // if(!QUIET) cerr << endl << name() << '\t' << max_score << '\t' << min_score << endl; while(p_max.size() != p_max_buf.size()) { double buf = -INF; unsigned int pos; for(unsigned int i = 0; i < p_max_buf.size(); i++) if(p_max_buf.at(i) > buf) { pos = i; buf = p_max_buf.at(i); } p_pos.push_back(pos); rc_p_pos.push_back((L - 1) - pos); p_max.push_back(buf); p_max_buf[pos] = -INF; } for(unsigned int i = 0; i < L; i++) { //if(!QUIET) cerr << p_max[i] << "//"<< '\t'; for(unsigned int j = i + 1; j < L; j++) p_max[i] += p_max[j]; // for(unsigned int j = i + 1; j < L; j++) // rc_p_max[i] += rc_p_max[j]; // if(!QUIET) cerr << p_max[i] << "(" << p_pos[i] << ")" << rc_p_max[i] << '\t'; } p_max.push_back(0); //WE NEED A 0 AT THE END OF THE VECTOR! // rc_p_max.push_back(0); // if(!QUIET) cerr << endl; return; } */ double matrix::get_score_at(unsigned int a) { return v_score.at(a); } unsigned int matrix::get_pos_at(unsigned int a) { return v_pos.at(a); } bool matrix::get_strand_at(unsigned int a) { return v_strand.at(a); } double matrix::max() { return max_score; } double matrix::min() { return min_score; } string matrix::name() { if(!Name.empty()) return Name; else return Id; } string matrix::id() { return Id; } double matrix::get_pvalue() { return pvalue; } double matrix::get_pvalue_bg() { return pvalue_bg; } double matrix::get_W_pvalue(unsigned int pos) { return W_R_pvalue.at(pos); } double matrix::get_W_R_Avg_at_pos(unsigned int pos) { return W_R_Avg.at(pos); } string matrix::o_or_u() { if(O_U) return "UNDER"; else return "OVER"; } string matrix::o_or_u_bg() { if(O_U_bg) return "UNDER"; else return "OVER"; } int matrix::W_o_or_u(unsigned int pos) { if(W_O_U[pos]) return -1; else return 1; } double matrix::get_R_Avg() { return R_Avg; } double matrix::get_F_Avg() { return F_Avg; } double matrix::get_R_Var() { return R_Var; } double matrix::get_F_Var() { return F_Var; } double matrix::get_bg_Avg() { return bg_Avg; } double matrix::get_t() { return t; } double matrix::get_dof() { return dof; } bool matrix::have_bg() { return BG; } unsigned int matrix::bin_num() { return BIN_NUM; } unsigned int matrix::length() { return L; } double matrix::get_spearman() { double s = spearman; s *= 10000; if(s > 0) s = floor(s); else if(s < 0) s = ceil(s); s /= 10000; return s; } double matrix::get_W_Avg() { return W_Avg; } double matrix::get_W_R_T_pvalue(unsigned int i) { return W_R_T_pvalue.at(i); } // MATRIX CLASS END