/* PSICOV - Protein Sparse Inverse COVariance analysis program */ /* by David T. Jones August 2011 - Copyright (C) 2011 University College London */ /* This code is licensed under the terms of GNU General Public License v2 or later */ /* Version 2.4 - Last Edit 28/12/16 */ #include #include #include #include #include #include #include #ifdef _OPENMP #include #endif #define FALSE 0 #define TRUE 1 #define SQR(x) ((x)*(x)) #define MAX(x,y) ((x)>(y)?(x):(y)) #define MIN(x,y) ((x)<(y)?(x):(y)) #define MAXSEQLEN 5000 #define MINEFSEQS (seqlen) /* Dump a rude message to standard error and exit */ void fail(char *fmt, ...) { va_list ap; va_start(ap, fmt) ; fprintf(stderr, "*** "); vfprintf(stderr, fmt, ap); fputc('\n', stderr); exit(-1); } /* Convert AA letter to numeric code (0-21) */ int aanum(int ch) { const static int aacvs[] = { 999, 0, 3, 4, 3, 6, 13, 7, 8, 9, 21, 11, 10, 12, 2, 21, 14, 5, 1, 15, 16, 21, 19, 17, 21, 18, 6 }; return (isalpha(ch) ? aacvs[ch & 31] : 20); } /* Allocate matrix */ void *allocmat(int rows, int columns, int size) { int i; void **p, *rp; rp = malloc(rows * sizeof(void *) + sizeof(int)); if (rp == NULL) fail("allocmat: malloc [] failed!"); *((int *)rp) = rows; p = rp + sizeof(int); for (i = 0; i < rows; i++) if ((p[i] = calloc(columns, size)) == NULL) fail("allocmat: malloc [][] failed!"); return p; } /* Allocate vector */ void *allocvec(int columns, int size) { void *p; p = calloc(columns, size); if (p == NULL) fail("allocvec: calloc failed!"); return p; } /* This subroutine computes the L1 regularized covariance matrix estimate using the algorithm described in the paper: J. Friedman, T. Hastie, R. Tibshirani: Sparse inverse covariance estimation with the graphical lasso Biostatistics, 9(3):432-441, July 2008. This code is adapted from the Fortran code described in the following report: M. A. Sustik & B. Calderhead: GLASSOFAST: An efficient GLASSO implementation Technical Report TR-12-29, University of Texas at Austin NOTE: that when multiple threads are used, we gain a huge time saving by avoiding full thread synchronisation when updating elements of the W (covariance) matrix. In multithreaded mode, the order of updates to the W matrix at each iteration will depend on the order in which threads complete. In practice, this hardly matters, because the algorithm is iterative, and in testing still converges to within 6 d.p. of the non-threaded code. If a very small degree of non-deterministic behaviour really worries you, then set the maximum number of threads to 1 (or compile without OpenMP). */ #define EPS (1.1e-15) #define BIG (1e9) int glassofast(const int n, double **S, double **L, const double thr, const int maxit, int approxflg, int warm, double **X, double **W) { int i, j, ii, iter, jj; double a, b, c, delta, dlx, dw, shr, sum, thrlasso, tmp, wd[MAXSEQLEN*21], wxj[MAXSEQLEN*21]; for (shr=ii=0; ii= 0.0) c = b / wd[i]; else c = -b / wd[i]; delta = c - X[j][i]; if (delta != 0.0 && (!approxflg || fabs(delta) > 1e-6)) { X[j][i] = c; for (ii=0; ii dlx) dlx = fabs(delta); } } } if (dlx < thrlasso) break; } wxj[j] = wd[j]; for (sum=ii=0; ii dw) dw = sum; for (ii=0; ii= 0; k--) sum -= a[i][k]*a[i][k]; if (sum <= 0.0) return TRUE; diag = sqrt(sum); #pragma omp parallel for for (j=i+1; jsc == ((struct sc_entry *)b)->sc) return 0; if (((struct sc_entry *)a)->sc < ((struct sc_entry *)b)->sc) return 1; return -1; } int main(int argc, char **argv) { int a, b, i, j, k, seqlen, nids, s, nseqs, ncon, opt, ndim, filtflg=0, approxflg=0, initflg=0, apcflg=1, maxit=10000, npair, nnzero, niter, jerr, shrinkflg=1, rawscflg = 1, pseudoc = 1, minseqsep = 5, overrideflg=0, ntries; unsigned int *wtcount, ccount[MAXSEQLEN]; double thresh=1e-4, del, **pcmat, *pcsum, pcmean, pc, trialrho, rhodefault = -1.0; double sum, score, **pa, wtsum, lambda, low_lambda, high_lambda, smean, fnzero, lastfnzero, rfact, r2, targfnzero = 0.0, scsum, scsumsq, mean, sd, zscore, ppv; double *weight, idthresh = -1.0, maxgapf = 0.9, besttd = 1.0, bestrho = 0.001; char buf[4096], seq[MAXSEQLEN], *blockfn = NULL, **aln; FILE *ifp; while ((opt = getopt(argc, argv, "aflnopr:b:i:t:c:g:d:j:z:")) >= 0) switch (opt) { case 'a': approxflg = 1; break; case 'n': shrinkflg = 0; break; case 'o': overrideflg = 1; break; case 'p': rawscflg = 0; break; case 'f': filtflg = 1; break; case 'l': apcflg = 0; break; case 'r': rhodefault = atof(optarg); break; case 'd': targfnzero = atof(optarg); if (targfnzero < 5e-5 || targfnzero >= 1.0) fail("Target density value must be in range 5e-5 >= d < 1!"); break; case 't': thresh = atof(optarg); break; case 'i': idthresh = 1.0 - atof(optarg)/100.0; break; case 'c': pseudoc = atoi(optarg); break; case 'j': minseqsep = atoi(optarg); break; case 'b': blockfn = strdup(optarg); break; case 'g': maxgapf = atof(optarg); break; case 'z': #ifdef _OPENMP omp_set_num_threads(atoi(optarg)); #endif break; case '?': exit(-1); } if (optind >= argc) fail("PSICOV V2.4 Usage: psicov [options] alnfile\n\nOptions:\n-a\t: use approximate Lasso algorithm\n-n\t: don't pre-shrink the sample covariance matrix\n-f\t: filter low-scoring contacts\n-p\t: output PPV estimates rather than raw scores\n-l\t: don't apply APC to Lasso output\n-r nnn\t: set initial rho parameter\n-d nnn\t: set target precision matrix sparsity (default 0 = not specified)\n-t nnn\t: set Lasso convergence threshold (default 1e-4)\n-i nnn\t: select BLOSUM-like weighting with given identity threshold (default selects threshold automatically)\n-c nnn\t: set pseudocount value (default 1)\n-j nnn\t: set minimum sequence separation (default 5)\n-g nnn\t: maximum fraction of gaps (default 0.9)\n-z nnn\t: set maximum no. of threads\n-b file\t: read rho parameter file\n"); ifp = fopen(argv[optind], "r"); if (!ifp) fail("Unable to open alignment file!"); for (nseqs=0;; nseqs++) if (!fgets(seq, MAXSEQLEN, ifp)) break; aln = allocvec(nseqs, sizeof(char *)); weight = allocvec(nseqs, sizeof(double)); wtcount = allocvec(nseqs, sizeof(unsigned int)); rewind(ifp); if (!fgets(seq, MAXSEQLEN, ifp)) fail("Bad alignment file!"); seqlen = strlen(seq)-1; if (!(aln[0] = malloc(seqlen))) fail("Out of memory!"); for (j=0; j 0 && k 0) { #pragma omp critical wtcount[i]++; wtcount[j]++; } } for (wtsum=i=0; i= 1.0 || ntries++ > 10) { /* Give up search - recalculate with best rho found so far and exit */ trialrho = bestrho; targfnzero = 0.0; } for (i=0; i maxgapf || pa[j][20] > maxgapf) rho[i*21+a][j*21+b] = BIG; /* Mask out regions if block-out list provided */ if (blockfn != NULL) { ifp = fopen(blockfn, "r"); for (;;) { if (fscanf(ifp, "%d %d %lf", &i, &j, &score) != 3) break; for (a=0; a<21; a++) for (b=0; b<21; b++) { rho[(i-1)*21+a][(j-1)*21+b] = score; rho[(j-1)*21+b][(i-1)*21+a] = score; } } fclose(ifp); } glassofast(ndim, cmat, rho, thresh, maxit, approxflg, initflg, wwi, ww); /* Quit with optimum rho value so far */ if (targfnzero <= 0.0) break; for (npair=nnzero=i=0; i 0.0 && fnzero != lastfnzero) { // printf("fnzero=%f lastfnzero=%f trialrho=%f oldtrialrho=%f\n", fnzero, lastfnzero, trialrho, trialrho/rfact); rfact = pow(rfact, log(targfnzero / fnzero) / log(fnzero / lastfnzero)); // printf("New rfact = %f\n", rfact); } lastfnzero = fnzero; /* Make a small trial step in the appropriate direction */ if (rfact == 0.0) rfact = (fnzero < targfnzero) ? 0.9 : 1.1; trialrho *= rfact; } /* Calculate background corrected scores using average product correction */ pcmat = allocmat(seqlen, seqlen, sizeof(double)); pcsum = allocvec(seqlen, sizeof(double)); pcmean = 0.0; for (i=0; i 0.0) { /* Calculate APC score */ if (apcflg) sclist[ncon].sc = pcmat[i][j] - pcsum[i] * pcsum[j] / SQR(seqlen - 1.0) / pcmean; else sclist[ncon].sc = pcmat[i][j]; scsum += sclist[ncon].sc; scsumsq += SQR(sclist[ncon].sc); sclist[ncon].i = i; sclist[ncon++].j = j; } qsort(sclist, ncon, sizeof(struct sc_entry), cmpfn); mean = scsum / ncon; sd = 1.25 * sqrt(scsumsq / ncon - SQR(mean)); /* Corrected for extreme-value bias */ for (i=0; i= 0.5 || (!ccount[sclist[i].i] || !ccount[sclist[i].j]) || !filtflg) { printf("%d %d 0 8 %f\n", sclist[i].i+1, sclist[i].j+1, ppv); ccount[sclist[i].i]++; ccount[sclist[i].j]++; } } return 0; }