/* Program STRING. Version 1.1 (13-Oct-03) Academic version of the algorithm described in: V. Parisi, V. De Fonzo, and F. Aluffi-Pentini. STRING: a novel program for finding tandem repeats in DNA sequences. Bioinformatics, Vol. 19 no. 14, 2003, Pages 1733-1738. Presented to 6th Congress of SIMAI (May 2002). It can be freely used, but please reference the above paper. */ /* In order to find the Tandem Repeats (TR) in a sequence the user must call the integer function "string" with five arguments: - an unsigned long integer, the length of the sequence (in the comments called "SeqLen") - a char* pointer to the sequence (in the comments called "Seq") - a FILE* pointer to the (already open) file for the complete results (in the comments called "Outf1") - a FILE* pointer to the (already open) file for the main results table (in the comments called "Outf2") - an integer, the threshold of the score above which a TR is considered interesting (in the comments called "Score") The return value is an integer error indicator with values 0 : OK 1 : At least one warning occurred, and possibly some results may be unreliable If one puts the return value in a generic integer variable (here called "Res"), the call sequence is Res = string (SeqLen, Seq, Outf1, Outf2, Score); The function "string" has as side effects: to write on standard output some results in very concise form to write on the first output file all complete results to write on the second output file the results in table form without closing the output files WARNINGS Possible warnings are reported on the standard output and on the first output file All warnings are due to insufficient dimensioning of some work arrays and indicate the corresponding integer constant to be augmented List of the warnings WARNING: increase maxwin WARNING: increase max_tree_w at least to <> WARNING: increase max_tree_no WARNING: increase cmaxwin at least to <> WARNING: increase max_SIP_no WARNING: increase max_gp Note 1: The need of long work arrays depends on the degree of repetitiveness (and not on the length) of the sequence Note 2: Huge values of the above constants may of course require to redefine the type of some variables (for example from short to long) If an internal check detects an abnormal event, an error message is issued ("Internal error <>") and the execution is aborted. In this case please kindly report to the authors the input DNA sequence and the environment conditions needed to reproduce the error All the references to the symbols used in the paper are included in square brackets [ ] */ #include #include #include #include #define maxgamma 100 /* [GammaMax] */ #define maxwin 10000 /* see warnings */ #define cmaxwin 20000 /* see warnings */ #define nul 32767 #define aggwin(z) ((z + maxwin) % maxwin) #define caggwin(z) ((z + lc) % lc) int string (unsigned long, char*, FILE*, FILE*, int); void add_words (long, short); void treat_word (long, short); void rotate_word (char*, short); void find_word (char*, long); void add_word (char*); void reduce_word (char*); void store_SIP (long, long, char*); void circ_align (long, long, char*); void select_SIP (void); void print_results (void); void print_SIP (char*, long, short, long, short, long, short); void go_out (int); short sim, /* score for similarity */ dif, /* penalty for difference */ peg, /* penalty for existence of a gap */ plg, /* penalty for length of a gap */ min_out; /* minimum score for output */ /* 1st phase dynamic programming matrices */ /* val(ii,j) score at the element (ii,j) */ /* pre(ii,j) "pointer" from element (ii,j) to "previous" element along the chain ( constant "nul" if (ii,j) is the 1st element) pre = nul: chain start pre = 0: no gap pre > 0: gap up pre < 0: gap down */ /* pos(ii,j) and pha(ii,j) contain the ii-index and the j-index - of the 1st element (root) of the chain if (ii,j) is not the root of the chain - of the element of the chain with greatest score if (ii,j) is the root of the chain */ long val [maxwin][maxgamma]; /* max value: sim*maxwin */ short pos [maxwin][maxgamma]; /* max value: maxwin */ short pre [maxwin][maxgamma]; /* range: -/+maxgamma or nul */ short pha [maxwin][maxgamma]; /* [Gamma]-1 */ /* max value: maxgamma */ long va1 [maxwin]; long va2 [maxwin]; #define max_tree_no 2000 /* see warnings */ #define max_tree_w 150 /* see warnings */ /* tree of consensus words */ char tree_word [max_tree_no][max_tree_w+1]; /* consensus word */ long tree_sxp [max_tree_no]; /* pointer to left */ long tree_dxp [max_tree_no]; /* pointer to right */ long tree_no; /* number of words */ #define max_SIP_no 5000 /* see warnings */ char SIP_word [max_SIP_no][max_tree_w+1]; /* consensus word */ long SIP_ib [max_SIP_no]; /* I Beginning */ /* max value: bp_no */ short SIP_jb [max_SIP_no]; /* J Beginning */ /* max value: max_tree_w */ long SIP_ie [max_SIP_no]; /* I Ending */ /* max value: bp_no */ short SIP_je [max_SIP_no]; /* J Ending */ /* max value: max_tree_w */ long SIP_score [max_SIP_no]; /* score */ /* max value: sim*cmaxwin */ short SIP_sf [max_SIP_no]; /* Selection Flag */ /* -1: possibly nested */ /* 0: discarded */ /* +1: selected */ long SIP_no; /* 2nd phase dynamic programming matrices */ /* cval(ii,j) score of the element (ii,j) */ /* cpre(ii,j) "pointer" from element (ii,j) to "previous" element along the chain (constant "cnul" if (ii,j) is the 1st element) cpre = cnul: chain start cpre = 0: no gap cpre > 0: insertion cpre < 0: deletion */ /* cpos(ii,j) and ccyc(ii,j) contain the ii-index and the j-index - of the 1st element (root) of the chain if (ii,j) is not the root of the chain - of the element of the chain with greatest score if (ii,j) is the root of the chain */ long cval [cmaxwin][max_tree_w]; /* max value: sim*cmaxwin */ short cpos [cmaxwin][max_tree_w]; /* max value: cmaxwin */ short cpre [cmaxwin][max_tree_w]; /* range: -max_tree_w..cmaxwin */ short ccyc [cmaxwin][max_tree_w]; /* max value: max_tree_w */ short csim, cdif, cpeg1, cplg1, cpeg2, cplg2; long CntBases [26]; short cmemmin; /* value of Score (see comments above) */ #define min_score 150 #define max_score 10000 #define score_ratio (2.0 / 3.0) int if_maxwin; int if_max_tree_w; int if_max_tree_no; int if_cmaxwin; int if_max_SIP_no; int if_max_gp; char* s; unsigned long bp_no; FILE* fout1; FILE* fout2; int string(unsigned long qb, char* st, FILE *f1, FILE *f2, int st_score) { /* 1st index (y=pos=i...): position relative to window of an element 2nd index (x=pha=j...): displacement of an element */ short mpre; /* range: -/+maxgamma */ short ii, rig, i1; /* max value: maxwin */ short j, j1; /* max value: maxgamma */ long i; long icor; long a; long mval; /* maximum score of the chain */ long iib, iie; /* start and end of augmented extension */ long int_b, int_e; /* start and end of interesting zone */ int tolint = 200; /* [D0] */ int int_zone; /* boolean */ /* Gotoh-like algorithm variables */ long best_sc_i [maxgamma]; /* max value: sim*maxwin */ /* best score along i */ short best_pos_i [maxgamma]; /* max value: maxgamma */ /* best position along i */ long best_sc_j; /* max value: sim*maxwin */ /* best score along j */ short best_pos_j; /* max value: maxgamma */ /* best position along j */ short xi, xm; /* max value: maxgamma */ short yi, ym; /* max value: maxwin */ if (st_score < min_score) st_score = min_score; if (st_score > max_score) st_score = max_score; fout1 = f1; fout2 = f2; s = st; bp_no = qb; /* Scores [p] and [q] for base comparison and coefficients [W0] and [W1] of the affine gap penalty function */ /* phase 1 */ sim = 10; /* [p] */ dif = 30; /* [-q] */ peg = 100; /* [-W0] */ plg = 10; /* [-W1] */ /* phase 2 */ csim = 10; /* [p] */ cdif = 30; /* [-q] */ cpeg1 = 100; /* [-W0] for insertion */ cplg1 = 10; /* [-W1] for insertion */ cpeg2 = 100; /* [-W0] for deletion */ cplg2 = 10; /* [-W1] for deletion */ min_out = (short) (st_score*score_ratio); /* minimum score for output */ cmemmin = st_score; if_maxwin = 0; if_max_tree_w = 0; if_max_tree_no = 0; if_cmaxwin = 0; if_max_SIP_no = 0; if_max_gp = 0; for (ii = 0; ii < maxwin; ii++) /* pos. relative to window */ for (j = 0; j < maxgamma; j++) /* j =[Gamma]-1 */ { val [ii][j] = 0; /* dynamic programming matrices */ pre [ii][j] = nul; /* initialisation */ pha [ii][j] = j; pos [ii][j] = ii; } for (j = 0; j < maxgamma; j++) /* Gotoh-like algorithm */ { /* initialisations */ best_sc_i [j] = - peg - plg; best_pos_i [j] = 0; } int_zone = 0; /* interesting zone flag */ tree_no = 0; /* number of found words */ for (i = 0; i < bp_no + maxwin; i++) { rig = aggwin (i); /* pos. relative to window */ if (i >= maxwin) /* to skip the 1st time */ { for (j = 0; j < maxgamma; j++) { mval = val [pos [rig][j]][pha [rig][j]]; /* max score */ /* in chain */ if (mval > min_out && pre [rig][j] == nul) { iib = i-maxwin; /* true beginning */ /* (Interesting I Beginning) */ iie = iib + aggwin (pos [rig][j] - rig) + pha [rig][j] + 1; /* true end */ /* (Interesting I Ending) */ if (!int_zone) { int_b = iib; /* interesting zone beginning */ int_e = iie; /* interesting zone end */ } else { if (iie > int_e) int_e = iie; } int_zone = 1; add_words (i - maxwin, j); } } if (int_zone && (i - maxwin > int_e + tolint || i == bp_no + maxwin -1)) { int_b -= tolint/2; int_e += tolint/2; if (int_b < 0) int_b = 0; if (int_e > bp_no - 1) int_e = bp_no - 1; SIP_no = 0; /* number of considered alignments */ for (icor = 0; icor < tree_no; icor++) store_SIP (int_b, int_e, tree_word [icor]); select_SIP (); print_results (); int_zone = 0; tree_no = 0; } } if (i >= bp_no) continue; for (j = 0; j < maxgamma && (i + j + 1) < bp_no; j++) { /* no gap: i1 = i-1; j1 = j */ i1 = aggwin (i - 1); mval = val [i1][j]; mpre = 0; /* insertion of k bp: i1 = i-k-1; j1 = j+k */ if (j < maxgamma-1) { i1 = aggwin (i1 - 1); j1 = j + 1; best_sc_i [j] = best_sc_i [j1] - plg; best_pos_i [j] = best_pos_i [j1] + 1; a = val [i1][j1] - plg - peg; if (a >= best_sc_i [j]) { best_sc_i [j] = a; best_pos_i [j] = 1; } if (best_sc_i [j] > mval) { mval = best_sc_i [j]; mpre = best_pos_i [j]; } } /* deletion of k bp: i1 = i-1; j1 = j-k */ i1 = aggwin (i - 1); if (j == 0) /* Gotoh-like algorithm initialisations */ { best_sc_j = - peg - plg; best_pos_j = 0; } else { best_sc_j -= plg; a = val [i1][j-1] - peg - plg; if (a >= best_sc_j) { best_sc_j = a; best_pos_j = j-1; /* value of j of the best link */ } } if (best_sc_j > mval) { mval = best_sc_j; mpre = best_pos_j - j; /* -k */ } /* clearing of past */ if (mval <= 0) { mval = 0; mpre = nul; pha [rig][j] = j; pos [rig][j] = rig; } /* score updating at current element */ if (s [i] == s [i + j + 1]) mval += sim; else mval -= dif; if (mpre != nul) /* going on with chain */ { j1 = mpre + j; /* displacement of previous element */ if (mpre > 0) i1 = aggwin (rig - mpre - 1); else i1 = aggwin (rig - 1); xi = pha [i1][j1]; /* root was stored in previous el. */ yi = pos [i1][j1]; if_maxwin = (aggwin (rig - yi) + j >= maxwin - 1); if (if_maxwin) /* chain too long */ { printf ("WARNING: increase maxwin\n"); fprintf (fout1, "WARNING: increase maxwin\n"); if_maxwin = 1; return 1; } pha [rig][j] = xi; /* root */ pos [rig][j] = yi; /* is stored in current */ xm = pha [yi][xi]; /* best score element */ ym = pos [yi][xi]; /* was stored in root */ if (mval > val [ym][xm]) /* new best score element */ { pha [yi][xi] = j; /* best score element */ pos [yi][xi] = rig; /* is stored in root */ } } /* end if (mpre != nul) */ val [rig][j] = mval; /* current element */ pre [rig][j] = mpre; /* updating */ } /* end for j */ } /* end for i */ return (if_maxwin || if_max_tree_w || if_max_tree_no || if_cmaxwin || if_max_SIP_no || if_max_gp); } void add_words (long in, short j) /* extracts the interesting words for current alignment in, j = absolute position and displacement of the root */ { short rig, ym, yi, yc, ip, nva, k, i1; /* max value: maxwin */ short mpre, is; /* range: -/+maxgamma */ short x, xi, xm, xc, jp, jv; /* max value: maxgamma */ long mval; long y, iv, yn, inm; rig = aggwin (in); /* relative pos. of root */ xm = pha [rig][j]; /* displ. of best element */ ym = pos [rig][j]; /* relative pos. of best element */ xi = pha [ym][xm]; /* displ. of initial element */ yi = pos [ym][xm]; /* relative pos. of initial element */ if (xi != j || yi != rig) go_out (1); nva = 0; /* current chain number of elements */ inm = in + maxwin; yn = in; /* absolute pos. of root */ y = inm - (inm - ym) % maxwin; /* absolute pos. of best element */ x = xm; va1 [nva] = y; /* absolute pos. of 1st base of best element */ va2 [nva] = y + x + 1; /* absolute pos. of 2nd base of best element */ for (nva = 0; va1 [nva] > yn; nva++) /* proceeding backwards along chain from the best to root, sets va1 and va2 to absolute pos. of aligned base pairs */ { is = pre [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1]; if (is != nul) { if (!is) { va1 [nva + 1] = va1 [nva] - 1; va2 [nva + 1] = va2 [nva] - 1; } else { if (is > 0) { va1 [nva + 1] = va1 [nva] - is - 1; va2 [nva + 1] = va2 [nva] - 1; } else { va1 [nva + 1] = va1 [nva] - 1; va2 [nva + 1] = va2 [nva] + is - 1; } } } else { val [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = 0; go_out (2); } pre [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = nul; val [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = 0; pos [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = va1 [nva] % maxwin; pha [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = va2 [nva] - va1 [nva] - 1; } /* the aligned base pairs go from 0 to nva */ pre [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = nul; val [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = 0; pos [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = va1 [nva] % maxwin; pha [va1 [nva] % maxwin][va2 [nva] - va1 [nva] - 1] = va2 [nva] - va1 [nva] - 1; /* 2nd phase word adding */ if (va1 [0] > va2 [nva]) { for (k = 0; k <= nva; k += (va2 [k] - va1 [k])) treat_word (va1 [k], (short) (va2 [k] - va1 [k])); } /* clearing of past */ /* tree pruning */ for (iv = in + 1; iv < in + maxwin; iv++) { for (jv = 0; jv < maxgamma && (iv + jv + 1) < bp_no; jv++) { /* iv, jv: indices of the element to be cleared */ i1 = aggwin (iv); mpre = pre [i1][jv]; if (mpre == nul) continue; /* if (i1, jv)-element has (xi, yi) as chain root, it is cleared */ if (pha [i1][jv] != xi || pos [i1][jv] != yi) continue; jp = mpre + jv; if (mpre > 0) ip = aggwin (iv - mpre - 1); else ip = aggwin (iv - 1); /* ip, jp: previous element indices */ mval = val [ip][jp]; if (mpre > 0) mval -= peg + plg * mpre; if (mpre < 0) mval -= peg - plg * mpre; if (mval > 0) { /* going on with chain */ if (s [iv] == s [iv + jv + 1]) mval += sim; else mval -= dif; val [i1][jv] = mval; if (pre [ip][jp] == nul) /* (ip, jp)-element is: */ { /* root */ xc = jp; yc = ip; } else { /* not root */ xc = pha [ip][jp]; yc = pos [ip][jp]; } pha [i1][jv] = xc; pos [i1][jv] = yc; if (mval > val [pos [yc][xc]] [pha [yc][xc]]) { pha [yc][xc] = jv; pos [yc][xc] = i1; } } else { /* new chain */ pre [i1][jv] = nul; pha [i1][jv] = jv; pos [i1][jv] = i1; if (s [iv] == s [iv + jv + 1]) val [i1][jv] = sim; else val [i1][jv] = -dif; } } } } #define dmax_tree_w (max_tree_w*2) void treat_word (long start, short len) /* updates the tree with the 1st word in alphabetic order among all the cyclical permutations of the given word */ { char wor [max_tree_w+1]; if (len > max_tree_w) { if (!if_max_tree_w) { printf ("WARNING: increase max_tree_w to %ld\n", len); fprintf (fout1, "WARNING: increase max_tree_w to %ld\n", len); if_max_tree_w = 1; } return; } memcpy (wor, s + start, len); rotate_word (wor, len); wor [len] = 0; /* tree updating with the possibly reduced word */ reduce_word (wor); if (!tree_no) add_word (wor); /* build new root of tree */ else find_word (wor, 0); /* possibly add new leaf to tree */ } void rotate_word (char* wor, short len) /* finds the 1st word in alphabetic order among all the cyclical permutations of the given word */ { char dwor [dmax_tree_w]; short imax, icor; /* max: max_tree_w */ memcpy (dwor, wor, len); memcpy (dwor+len, wor, len); imax = 0; for (icor=1; icor < len; icor++) if (memcmp (dwor+imax, dwor+icor, len) > 0) imax = icor; memcpy (wor, dwor+imax, len); } #define tree_null -1 void reduce_word (char* wor) /* if the word is a TR of shorter words, we use the shortest one */ { short i, l, p, ip; /* max: max_tree_w */ l = strlen (wor); for (i = l; i > 1; i--) { if (l % i) continue; p = l / i; for (ip = 1; ip < i; ip++) if (memcmp (wor, wor + ip*p, p)) break; if (i != ip) continue; wor [p] = 0; return; } } void find_word (char* wor, long r) /* possibly add new word */ { int cmp; if (tree_no >= max_tree_no) { if (!if_max_tree_no) { printf ("WARNING: increase max_tree_no\n"); fprintf (fout1, "WARNING: increase max_tree_no\n"); if_max_tree_no = 1; } return; } cmp = strcmp (wor, tree_word [r]); if (!cmp) return; if (cmp < 0) { if (tree_sxp [r] >= 0) { find_word (wor, tree_sxp [r]); return; } tree_sxp [r] = tree_no; add_word (wor); return; } else { if (tree_dxp [r] >= 0) { find_word (wor, tree_dxp [r]); return; } tree_dxp [r] = tree_no; add_word (wor); return; } } void add_word (char* wor) /* build new leaf of tree */ { tree_sxp [tree_no] = tree_null; tree_dxp [tree_no] = tree_null; strcpy (tree_word [tree_no], wor); tree_no++; } #define cnul -16383 void store_SIP (long p_beg, long p_end, char* wor) /* 2nd phase */ /* circular alignment with consensus (SIP) storing */ { long i, lw, mval; short j; /* max: maxgamma */ short mpre; /* max: -max_tree_w..cmaxwin */ short iv, yc, ip; /* max: cmaxwin */ short lc; /* max: max_tree_w */ short jv, xc, jp; /* max: max_tree_w */ lw = p_end - p_beg + 1; lc = strlen (wor); /* consensus length */ if (lw > cmaxwin) { if (!if_cmaxwin) { printf ("WARNING: increase cmaxwin to %d\n", lw); fprintf (fout1, "WARNING: increase cmaxwin to %d\n", lw); if_cmaxwin = 1; } return; } circ_align (p_beg, p_end, wor); /* finding alignments */ for (i = 0; i < lw; i++) /* storing SIP */ for (j = 0; j < lc; j++) { if (cpre [i][j] == cnul) { mval = cval [cpos [i][j]][ccyc [i][j]] - lc * csim; if (mval > cmemmin) { if (SIP_no >= max_SIP_no) { if (!if_max_SIP_no) { printf ("WARNING: increase max_SIP_no\n"); fprintf (fout1, "WARNING: increase max_SIP_no\n"); if_max_SIP_no = 1; } return; } strcpy (SIP_word [SIP_no], wor); SIP_ib [SIP_no] = p_beg+i; SIP_ie [SIP_no] = p_beg+cpos [i][j]; SIP_jb [SIP_no] = j; SIP_je [SIP_no] = ccyc [i][j]; SIP_score [SIP_no] = mval; SIP_sf [SIP_no] = 1; SIP_no++; iv = cpos [i][j]; jv = ccyc [i][j]; while (iv > i) { /* proceeding backwards along chain from the best to root, clearing the chain */ mpre = cpre [iv][jv]; cpre [iv][jv] = cnul; cval [iv][jv] = 0; cpos [iv][jv] = iv; ccyc [iv][jv] = jv; if (mpre == cnul) go_out (3); if (mpre >= 0) { iv = iv - mpre -1; jv = caggwin (jv - 1); } else { iv = iv - 1; jv = caggwin (jv + mpre - 1); } } /* tree pruning */ for (iv = i + 1; iv < lw; iv++) { for (jv = 0; jv < lc; jv++) { /* iv, jv: indices of the element to be cleared */ mpre = cpre [iv][jv]; if (mpre == cnul) continue; /* if (i1, jv)-element has (xi, yi) as chain root, it is cleared */ if (ccyc [iv][jv] != j || cpos [iv][jv] != i) continue; if (mpre >= 0) { ip = iv - mpre -1; jp = caggwin (jv - 1); } else { ip = iv - 1; /* ip, jp: previous element indices */ jp = caggwin (jv + mpre - 1); } mval = cval [ip][jp]; if (mpre > 0) mval -= cpeg1 + cplg1 * mpre; /* insertion */ if (mpre < 0) mval -= cpeg2 - cplg2 * mpre; /* deletion */ if (mval > 0) { /* going on with chain */ if (s [p_beg+iv] == wor [jv]) mval += csim; else mval -= cdif; cval [iv][jv] = mval; if (cpre [ip][jp] == cnul) /* (ip, jp)-element is: */ { /* root */ xc = jp; yc = ip; } else { /* not root */ xc = ccyc [ip][jp]; yc = cpos [ip][jp]; } ccyc [iv][jv] = xc; cpos [iv][jv] = yc; if (mval > cval [cpos [yc][xc]] [ccyc [yc][xc]]) { ccyc [yc][xc] = jv; cpos [yc][xc] = iv; } } else { /* new chain */ cpre [iv][jv] = cnul; ccyc [iv][jv] = jv; cpos [iv][jv] = iv; if (s [p_beg+iv] == wor [jv]) mval = csim; else mval = -cdif; cval [iv][jv] = mval; } } } } } } } void circ_align (long p_beg, long p_end, char* wor) /* finds circular alignment with consensus (SIP) output: cval [i][j], cpre [i][j], ccyc [i][j], cpos [i][j] */ { short i, i1, yi, ym; /* max: cmaxwin */ short xm, j, j1, lc, xi; /* max: max_tree_w */ short mpre; /* range: -max_tree_w..cmaxwin */ long lw, mval, a; short best_pos_i [max_tree_w]; /* max: cmaxwin */ short best_pos_j; /* range: -/+max_tree_w */ long best_sc_j; /* max: csim * cmaxwin */ long best_sc_i [max_tree_w]; /* max: csim * cmaxwin */ lw = p_end - p_beg + 1; lc = strlen (wor); if (lw > cmaxwin) go_out (4); i = 0; for (j = 0; j < lc; j++) { cval [i][j] = 0; cpre [i][j] = cnul; ccyc [i][j] = j; cpos [i][j] = i; best_sc_i [j] = -cpeg1; best_pos_i [j] = 0; if (s [p_beg+i] == wor [j]) cval [i][j] += csim; else cval [i][j] -= cdif; } for (i = 1; i < lw; i++) for (j = 0; j < lc; j++) { /* no gap */ i1 = i - 1; j1 = caggwin (j - 1); mval = cval [i1][j1]; mpre = 0; /* insertion */ best_sc_i [j1] -= cplg1; if (i > 1) { if (cval [i-2][j1] >= best_sc_i [j1]) { best_sc_i [j1] = cval [i-2][j1]; best_pos_i [j1] = i-2; } /* only here best_sc_i is without initial penalty */ a = best_sc_i [j1] - cplg1 - cpeg1; if (a > mval) { mval = a; mpre = i - best_pos_i [j1] - 1; /* mpre > 0 */ } } /* deletion */ i1 = i-1; if (j == 0) { best_sc_j = - cpeg2 - cplg2; best_pos_j = 0; for (j1 = -2; -j1 <= lc; j1--) { a = cval [i1][caggwin (j1)] - cpeg2 - cplg2 * (-j1-1); if (a > best_sc_j) { best_sc_j = a; best_pos_j = j1; } } } else { best_sc_j -= cplg1; a = cval [i1][caggwin (j-2)] - cpeg2 - cplg2; if (a > best_sc_j) { best_sc_j = a; best_pos_j = j-2; } } if (best_sc_j > mval) { mval = best_sc_j; mpre = best_pos_j-j+1; /* mpre < 0 */ } /* clearing the past */ if (mval <= 0) { mval = 0; mpre = cnul; ccyc [i][j] = j; cpos [i][j] = i; } if (s [p_beg+i] == wor [j]) mval += csim; else mval -= cdif; if (mpre != cnul) { /* going on with chain */ if (mpre >= 0) { j1 = caggwin (j-1); i1 = i-1-mpre; } else { i1 = i-1; j1 = caggwin (j-1+mpre); } /* i1, j1: previous element indices */ xi = ccyc [i1][j1]; /* root */ yi = cpos [i1][j1]; /* was stored in previous element */ /* i, j: current element indices */ ccyc [i][j] = xi; /* root */ cpos [i][j] = yi; /* is stored in current element */ xm = ccyc [yi][xi]; /* best-score element */ ym = cpos [yi][xi]; /* was stored in root */ if (mval > cval [ym][xm]) /* new best-score element */ { ccyc [yi][xi] = j; /* best-score element */ cpos [yi][xi] = i; /* is stored in root */ } } /* end if (mpre != nul) */ cval [i][j] = mval; cpre [i][j] = mpre; } } #undef min #undef max #define min(x,y) ((x)>(y))?(y):(x) #define max(x,y) ((x)>(y))?(x):(y) void select_SIP (void) /* discard trivial SIPs */ { short icor, jcor; /* max: max_SIP_no */ long imi, ima, ls, lj; for (icor = 0; icor < SIP_no; icor++) for (jcor = 0; jcor < SIP_no; jcor++) { /* icor-SIP discards jcor-SIP, unless... */ if (!SIP_sf [jcor] || /* ...it is already discarded || */ icor == jcor || /* ...it is itself || */ SIP_score [icor] < SIP_score [jcor]) /* ...it has a greater score || */ continue; /* imi: start of overlapping */ imi = max (SIP_ib [icor], SIP_ib [jcor]); /* ima: end of overlapping */ ima = min (SIP_ie [icor], SIP_ie [jcor]); /* ls: length of overlapping */ ls = ima - imi + 1; /* lj: jcor-SIP sequence length */ lj = SIP_ie [jcor] - SIP_ib [jcor] + 1; if (lj > 2*ls) /* ...they overlap less than the half */ continue; /* of jcor-SIP sequence length || */ if (SIP_score [icor] == SIP_score [jcor] && jcor < icor) continue; /* ...they have same score and jcor < icor || */ if (3*strlen (SIP_word [jcor]) < 2*strlen (SIP_word [icor])) { /* ...its consensus is much shorter */ SIP_sf [jcor] = -1; /* flags possible nesting */ continue; } SIP_sf [jcor] = 0; /* discarded */ } /* screen output */ for (icor = 0; icor < SIP_no; icor++) if (SIP_sf [icor]) { printf ("%10d %10d %6d %7d ", SIP_ib [icor]+1, SIP_ie [icor]+1, strlen (SIP_word [icor]), SIP_score [icor]); if (strlen (SIP_word [icor]) < 30) printf (">%s<\n", SIP_word [icor]); else printf (">%30.30s...<\n", SIP_word [icor]); } } void print_results (void) /* final outputs */ { short icor; /* max: max_tree_w */ for (icor = 0; icor < SIP_no; icor++) { if (SIP_sf [icor]) { print_SIP (SIP_word [icor], SIP_ib [icor], SIP_jb [icor], SIP_ie [icor], SIP_je [icor], SIP_score [icor], SIP_sf [icor]); } } } #define max_gp 20000 /* see warnings */ short l_gp [max_gp]; /* graphic position relative to column */ /* (insertion lenght) */ /* max: cmaxwin */ long p_gp [max_gp]; /* position in sequence */ void print_SIP (char* wor, long inii, short inij, long fini, short finj, long sco, short sop) /* SIP output */ { short jcor, lc; /* max: max_tree_w */ short mpre; /* range: -max_tree_w..cmaxwin */ short ic; /* max: cmaxwin */ long icor, k; short maspa [max_tree_w]; /* max: cmaxwin */ char c; long ss; int fian = 40; int ins, del, match, mismatch; /* max: cmaxwin */ float nrip; short nps; /* max: cmaxwin */ /* nps: position relative to consensus (starting from the end) - normally increases by 1 - when insertion: increases by 1 (independently from number of inserted bases) - when deletion: increases by 1 for each deleted base - when is lc-multiple, change print row */ ins = del = 0; match = mismatch = 0; lc = strlen (wor); circ_align (inii, fini, wor); l_gp [0] = 1; p_gp [0] = fini; nps = 1; icor = fini; jcor = finj; if ((cval [fini - inii][finj] - lc * csim) != sco) go_out (5); while (icor > inii) { if (nps >= max_gp) { if (!if_max_gp) { printf ("WARNING: increase max_gp\n"); fprintf (fout1, "WARNING: increase max_gp\n"); if_max_gp = 1; } return; } mpre = cpre [icor-inii][jcor]; icor--; jcor = caggwin (jcor-1); if (mpre == cnul) go_out (6); if (!mpre) { l_gp [nps] = 1; p_gp [nps] = icor; } else if (mpre > 0) { /* insertion */ icor -= mpre; l_gp [nps] = 1 + mpre; p_gp [nps] = icor; ins++; } else { /* deletion */ del++; jcor = caggwin (jcor + mpre); for (k = 0; k < -mpre; k++) { if (nps >= max_gp) { if (!if_max_gp) { printf ("WARNING: increase max_gp\n"); fprintf (fout1, "WARNING: increase max_gp\n"); if_max_gp = 1; } return; } l_gp [nps] = 0; p_gp [nps] = icor; nps++; } l_gp [nps] = 1; p_gp [nps] = icor; } nps++; } if (icor != inii || jcor != inij) go_out (7); for (k = 0; k < lc; k++) maspa [k] = 1; for (k = nps-1; k >= 0; k--) { jcor = caggwin (nps-1-k); maspa [jcor] = max (maspa [jcor], l_gp [k]); } nrip = nps * 1.0 / lc; for (k = 'a'; k <= 'z'; k++) CntBases [k - 'a'] = 0; for (k = inii; k <= fini; k++) if (isalpha (s [k])) CntBases [tolower (s [k]) - 'a']++; fprintf (fout1, "\n{\n"); fprintf (fout1, "lw = %d, inii = %d, fini = %d, nrip = %.1f, ", lc, inii+1, fini+1, nrip); fprintf (fout1, "score = %d, word = >%s<\n", sco, wor); if (nrip > 2.5 && lc > 2) { fprintf (fout2, "%9ld %9ld %3d %5.1f %7d ", inii+1, fini+1, lc, nrip, sco); ss = max (0, inii - fian); fprintf (fout1, "\nbefore %d ", ss+1); for (k=ss; k < inii; k++) fprintf (fout1, "%c", s [k]); fprintf (fout1, " %d\n\n", inii); fprintf (fout1, "consensus "); for (k = 0; k < lc; k++) { jcor = caggwin (k+inij); fprintf (fout1, "%c", wor [jcor]); for (ic = 1; ic < maspa [k]; ic++) fprintf (fout1, "-"); } fprintf (fout1, "\n"); for (k = nps-1; k >= 0; k--) { jcor = caggwin (nps-1-k); if (!jcor) fprintf (fout1, "\n %8d ", p_gp [k]+1); for (ic = 0; ic < maspa [jcor]; ic++) { if (ic < l_gp [k]) { if (ic) fprintf (fout1, "%c", s [p_gp [k] + ic]); else { c = s [p_gp [k]]; if (c != wor [caggwin (jcor+inij)]) { fprintf (fout1, "%c", c); mismatch++; } else { if (isupper (c)) fprintf (fout1, "%c", tolower (c)); else if (islower (c)) fprintf (fout1, "%c", toupper (c)); match++; } } } else if (!ic) fprintf (fout1, "-"); /* l_gp [k] == 0 */ else fprintf (fout1, " "); } } ss = min (bp_no, fini + fian + 1); fprintf (fout1, "\n\nafter %d ", fini+2); for (k = fini + 1; k < ss; k++) fprintf (fout1, "%c", s [k]); fprintf (fout1, " %d\n", ss); fprintf (fout1, "\nins = %d del = %d\n", ins, del); fprintf (fout1, "match = %d mismatch = %d\n", match, mismatch); fprintf (fout2, "%3d %3d ", ins, del); fprintf (fout2, "%5d %5d ", match, mismatch); for (k = 'a'; k <= 'z'; k++) if (CntBases [k - 'a']) fprintf (fout1, " %c ", toupper (k)); fprintf (fout1, "\n"); for (k = 'a'; k <= 'z'; k++) if (CntBases [k - 'a']) fprintf (fout1, "%5.1f%%", CntBases [k - 'a']*100.0/(fini-inii+1)); fprintf (fout2, "%5.1f ", CntBases ['a' - 'a']*100.0/(fini-inii+1)); fprintf (fout2, "%5.1f ", CntBases ['c' - 'a']*100.0/(fini-inii+1)); fprintf (fout2, "%5.1f ", CntBases ['g' - 'a']*100.0/(fini-inii+1)); fprintf (fout2, "%5.1f ", CntBases ['t' - 'a']*100.0/(fini-inii+1)); if (nrip > 2.5) { if (sop == -1) { fprintf (fout2, " * "); fprintf (fout1, "\npossible nesting"); } else fprintf (fout2, " "); if (strlen (wor) <= 10) fprintf (fout2, "%s", wor); else fprintf (fout2, "%10.10s...", wor); fprintf (fout2, "\n"); } fprintf (fout1, "\n"); } fprintf (fout1, "}\n\n"); } void go_out (int err) { printf ("INTERNAL ERROR No %d\n", -err); fprintf (fout1, "\nINTERNAL ERROR No %d\n", -err); fclose (fout1); fclose (fout2); exit (-err); }