/* Sequence and profile caches, used by the hmmpgmd daemon. * * Contents: * 2. P7_CACHEDB_SEQS: a daemon's cached sequence database * x. Benchmark driver * x. Unit tests * x. License and copyright information. */ #include "p7_config.h" #include #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_getopts.h" #include "esl_sq.h" #include "esl_sqio.h" #include "esl_stopwatch.h" #include "hmmer.h" #include "cachedb.h" #include "hmmpgmd.h" /* sort routines */ static int sort_seq(const void *p1, const void *p2) { int cmp; cmp = (((HMMER_SEQ *)p1)->idx < ((HMMER_SEQ *)p2)->idx); cmp -= (((HMMER_SEQ *)p1)->idx > ((HMMER_SEQ *)p2)->idx); return cmp; } int p7_seqcache_Open(char *seqfile, P7_SEQCACHE **ret_cache, char *errbuf) { int i; int inx; int val; int status; int32_t seq_cnt; int32_t db_cnt; int32_t db_inx[32]; uint32_t db_key; uint64_t res_cnt; uint64_t res_size; uint64_t hdr_size; char *hdr_ptr; char *res_ptr; char *desc_ptr; char *ptr; char buffer[512]; off_t offset; uint64_t total_mem; SEQ_DB *db = NULL; P7_SEQCACHE *cache = NULL; ESL_RANDOMNESS *rnd = NULL; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = NULL; ESL_ALPHABET *abc = NULL; ESL_SQASCII_DATA *ascii = NULL; if (errbuf) errbuf[0] = '\0'; /* CURRENTLY UNUSED. FIXME */ /* Open the target sequence database */ if ((status = esl_sqfile_Open(seqfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) return status; /* This is a bit of a hack. The first line contains database information. * * # ... * * The rest of the file is a fasta format. The fasta header is just * sequence number followed by a binary number indicating which * database this sequence occurs in. * * The header line will be read in, parsed and saved. Then the * parser will be repositioned after the line and used normally. */ ascii = &sqfp->data.ascii; fseek(ascii->fp, 0L, SEEK_SET); if (fgets(buffer, sizeof(buffer), ascii->fp) == NULL) return eslEFORMAT; if (buffer[0] != '#') return eslEFORMAT; ptr = buffer + 1; res_cnt = strtoll(ptr, &ptr, 10); seq_cnt = strtol(ptr, &ptr, 10); db_cnt = strtol(ptr, &ptr, 10); if (db_cnt > (sizeof(db_inx)/sizeof(db_inx[0]))) return eslEFORMAT; total_mem = sizeof(P7_SEQCACHE); ESL_ALLOC(cache, sizeof(P7_SEQCACHE)); memset(cache, 0, sizeof(P7_SEQCACHE)); if (esl_strdup(seqfile, -1, &cache->name) != eslOK) goto ERROR; total_mem += (sizeof(HMMER_SEQ) * seq_cnt); ESL_ALLOC(cache->list, sizeof(HMMER_SEQ) * seq_cnt); memset(cache->list, 0, sizeof(HMMER_SEQ) * seq_cnt); total_mem += (sizeof(SEQ_DB) * db_cnt); ESL_ALLOC(db, sizeof(SEQ_DB) * db_cnt); for (i = 0; i < db_cnt; ++i) { db[i].count = strtol(ptr, &ptr, 10); db[i].K = strtol(ptr, &ptr, 10); total_mem += (sizeof(HMMER_SEQ *) * db[i].count); ESL_ALLOC(db[i].list, sizeof(HMMER_SEQ *) * db[i].count); memset(db[i].list, 0, sizeof(HMMER_SEQ *) * db[i].count); } /* grab the unique identifier */ while (*ptr && isspace(*ptr)) ++ptr; i = strlen(ptr); ESL_ALLOC(cache->id, i+1); strcpy(cache->id, ptr); while (--i > 0 && isspace(cache->id[i])) cache->id[i] = 0; res_size = res_cnt + seq_cnt + 1; hdr_size = seq_cnt * 10; total_mem += res_size + hdr_size; ESL_ALLOC(cache->residue_mem, res_size); ESL_ALLOC(cache->header_mem, hdr_size); /* position the sequence file to the start of the first sequence. * this will force any buffers associated with the file to be reset. */ offset = ftell(ascii->fp); if ((status = esl_sqfile_Position(sqfp, offset)) != eslOK) goto ERROR; abc = esl_alphabet_Create(eslAMINO); sq = esl_sq_CreateDigital(abc); cache->db_cnt = db_cnt; cache->db = db; cache->abc = abc; cache->res_size = res_size; cache->hdr_size = hdr_size; cache->count = seq_cnt; hdr_ptr = cache->header_mem; res_ptr = cache->residue_mem; for (i = 0; i < db_cnt; ++i) db_inx[i] = 0; strcpy(buffer, "000000001"); inx = 0; while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { /* sanity checks */ if (inx >= seq_cnt) { printf("inx: %d\n", inx); return eslEFORMAT; } if (sq->n + 1 > res_size) { printf("inx: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; } if (hdr_size <= 0) { printf("inx: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; } /* generate the database key - modified to take the first word in the desc line. * The remaining part of the desc is then cached as the description. */ ptr = sq->desc;; desc_ptr = strchr(sq->desc, ' '); if(desc_ptr != NULL) { *desc_ptr= '\0'; ++desc_ptr; } val = 1; db_key = 0; while (*ptr) { if (*ptr == '1') db_key += val; val <<= 1; ++ptr; } if (db_key >= (1 << (db_cnt + 1))) { printf("inx: %d db %d %s\n", inx, db_key, sq->desc); return eslEFORMAT; } cache->list[inx].name = hdr_ptr; cache->list[inx].dsq = (ESL_DSQ *)res_ptr; cache->list[inx].n = sq->n; cache->list[inx].idx = inx; cache->list[inx].db_key = db_key; if(desc_ptr != NULL) esl_strdup(desc_ptr, -1, &(cache->list[inx].desc)); /* copy the digitized sequence */ memcpy(res_ptr, sq->dsq, sq->n + 1); res_ptr += (sq->n + 1); res_size -= (sq->n + 1); /* copy the index to the header */ strcpy(hdr_ptr, buffer); hdr_ptr += 10; hdr_size -= 10; /* increment the buffer string */ ++buffer[8]; for (i = 8; i > 0; --i) { if (buffer[i] > '9') { buffer[i] = '0'; buffer[i-1]++; } } esl_sq_Reuse(sq); ++inx; } if (status != eslEOF) { printf("Unexpected error %d at %d\n", status, inx); return status; } if (inx != seq_cnt) { printf("inx:: %d %d\n", inx, seq_cnt); return eslEFORMAT; } if (hdr_size != 0) { printf("inx:: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; } if (res_size != 1) { printf("inx:: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; } /* copy the final sentinel character */ *res_ptr++ = eslDSQ_SENTINEL; --res_size; /* sort the order of the database sequences */ rnd = esl_randomness_CreateFast(seq_cnt); for (i = 0 ; i < seq_cnt; ++i) { rnd->x = rnd->x * 69069 + 1; cache->list[i].idx = rnd->x; } esl_randomness_Destroy(rnd); qsort(cache->list, seq_cnt, sizeof(HMMER_SEQ), sort_seq); /* fill in the different databases and fix the index */ for (i = 0 ; i < seq_cnt; ++i) { inx = 0; db_key = cache->list[i].db_key; while (db_key) { if (db_key & 1) { SEQ_DB *db = cache->db + inx; if (db_inx[inx] >= db->count) { printf("sort:: %d %d\n", db_inx[inx], db->count); return eslEFORMAT; } db->list[db_inx[inx]] = &cache->list[i]; ++db_inx[inx]; } db_key >>= 1; ++inx; } cache->list[i].idx = (cache->list[i].name - cache->header_mem) / 10 + 1; } for (i = 0; i < cache->db_cnt; ++i) { printf("sequence database (%d):: %d %d\n", i, cache->db[i].count, db_inx[i]); } printf("\nLoaded sequence db file %s; total memory %" PRId64 "\n", seqfile, total_mem); esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); *ret_cache = cache; return eslOK; ERROR: if (sq != NULL) esl_sq_Destroy(sq); if (abc != NULL) esl_alphabet_Destroy(abc); if (cache != NULL) { if (cache->header_mem != NULL) free(cache->header_mem); if (cache->residue_mem != NULL) free(cache->residue_mem); if (cache->name != NULL) free(cache->name); if (cache->id != NULL) free(cache->id); free(cache); } for (i = 0; i < db_cnt; ++i) { if (db[i].list != NULL) free(db[i].list); } return eslEMEM; } void p7_seqcache_Close(P7_SEQCACHE *cache) { int i; if (cache->name) free(cache->name); if (cache->id) free(cache->id); if (cache->db) { for (i = 0; i < cache->db_cnt; ++i) { if (cache->db[i].list != NULL) free(cache->db[i].list); } free(cache->db); } if (cache->abc) esl_alphabet_Destroy(cache->abc); if (cache->list) free(cache->list); if (cache->residue_mem) free(cache->residue_mem); if (cache->header_mem) free(cache->header_mem); free(cache); } /***************************************************************** * x. Unit test *****************************************************************/ #ifdef CACHEDB_UTEST1 /* * gcc -O3 -malign-double -msse2 -o evalues-benchmark -I. -L. -I../easel -L../easel -Dp7EVALUES_BENCHMARK evalues.c -lhmmer -leasel -lm * gcc -g -O -pg -o evalues-benchmark -I. -L. -I../easel -L../easel -Dp7EVALUES_BENCHMARK evalues.c -lhmmer -leasel -lm * gcc -g -Wall -msse2 -o evalues-benchmark -I. -L. -I../easel -L../easel -Dp7EVALUES_BENCHMARK evalues.c -lhmmer -leasel -lm * * ./evalues-benchmark * * -malign-double is needed for gcc if the rest of HMMER was compiled w/ -malign-double * (i.e., our default gcc optimization) * * 27 Dec 08 on wanderoo: 24 msec per RRM_1 calibration; 37 msec for Caudal_act * profiling shows 75% in Forward; 12% esl_random(); <3% in MSVFilter. */ #include #include #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_getopts.h" #include "esl_sq.h" #include "esl_sqio.h" #include "esl_stopwatch.h" #include "hmmer.h" #include "cachedb.h" #include "hmmpgmd.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-o", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "direct output to file , not stdout", 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; typedef struct seq_info_s { ESL_DSQ *dsq; /* digitized sequence [1..n] */ int64_t n; /* length of dsq */ int32_t inx; /* length of dsq */ uint64_t db_key; /* flag for included databases */ struct seq_info_s *next; } SEQ_INFO; #define HASH_KEY 52807 #define SUB_HASH 1531 int main(int argc, char **argv) { FILE *ofp = stdout; int i; int cnt; int inx; int status; int seq_cnt; int db_inx; int db_key; int db_K[32]; int db_cnt[32]; uint64_t res_cnt; char *seqfile; char buffer[10]; ESL_GETOPTS *go = NULL; ESL_ALPHABET *abc = NULL; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = NULL; SEQ_INFO *info; SEQ_INFO **hash; time_t timep; if ((go = esl_getopts_Create(options)) == NULL) { printf("Failed to create options\n"); return 0; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_getopts_Destroy(go); return 0; } if (esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_getopts_Destroy(go); return 0; } if (esl_opt_ArgNumber(go) < 1) { printf("Must specify at least one database\n"); esl_getopts_Destroy(go); return 0; } if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) { printf("Failed to open output database %s for writing\n", esl_opt_GetString(go, "-o")); return 0; } } abc = esl_alphabet_Create(eslAMINO); sq = esl_sq_CreateDigital(abc); ESL_ALLOC(hash, sizeof(char *) * HASH_KEY); memset(hash, 0, sizeof(char *) * HASH_KEY); memset(db_K, 0, sizeof(db_K)); memset(db_cnt, 0, sizeof(db_cnt)); res_cnt = 0; seq_cnt = 0; cnt = 0; db_key = 1; db_inx = 1; while (db_inx <= esl_opt_ArgNumber(go)) { /* the first db just build the list */ seqfile = esl_opt_GetArg(go, db_inx); if ((status = esl_sqfile_Open(seqfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) { printf("Unable to open sequence database %s\n", seqfile); return status; } printf("Database %d: %s\n", db_inx, seqfile); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { int sum = 0; inx = 0; for (i = 1; i <= sq->n; ++i) { inx += (sq->dsq[i] * sq->dsq[i]); sum = inx % SUB_HASH; } inx = inx % HASH_KEY; info = hash[inx]; while (info != NULL) { if (sq->n == info->n && sum == info->inx && memcmp(sq->dsq, info->dsq, sq->n+2) == 0) { break; } info = info->next; } if (info == NULL) { ESL_ALLOC(info, sizeof(SEQ_INFO)); ESL_ALLOC(info->dsq, sq->n+2); memcpy(info->dsq, sq->dsq, sq->n+2); info->n = sq->n; info->db_key = db_key; info->inx = sum; ++db_cnt[db_inx-1]; info->next = hash[inx]; hash[inx] = info; res_cnt += sq->n; seq_cnt++; } else if ((info->db_key & db_key) == 0) { info->db_key |= db_key; ++db_cnt[db_inx-1]; } ++db_K[db_inx-1]; esl_sq_Reuse(sq); } esl_sqfile_Close(sqfp); db_key <<= 1; db_inx++; printf("\n"); } printf("Writing cache %s\n", esl_opt_GetString(go, "-o")); timep = time(NULL); fprintf(ofp, "# %" PRId64 " %d %d", res_cnt, seq_cnt, db_inx - 1); for (i = 0; i < db_inx - 1; ++i) fprintf(ofp, " %d %d", db_cnt[i], db_K[i]); fprintf(ofp, " %s", ctime(&timep)); strcpy(buffer, "000000001"); for (inx = 0; inx < HASH_KEY; ++inx) { info = hash[inx]; while (info) { int pos; char buf[80]; SEQ_INFO *next = info->next; fprintf(ofp, ">%s ", buffer); while (info->db_key) { fprintf(ofp, "%c", (info->db_key & 1) ? '1' : '0'); info->db_key >>= 1; } fprintf(ofp, "\n"); for (pos = 0; pos < info->n; pos += 60) { esl_abc_TextizeN(abc, info->dsq+pos+1, 60, buf); buf[60] = '\0'; fprintf(ofp, "%s\n", buf); } free(info->dsq); free(info); info = next; /* increment the buffer string */ ++buffer[8]; for (i = 8; i > 0; --i) { if (buffer[i] > '9') { buffer[i] = '0'; buffer[i-1]++; } } } } free(hash); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); if (ofp != stdout) fclose(ofp); if (esl_opt_IsOn(go, "-o")) { P7_CACHEDB_SEQS *cache = NULL; printf("Reading cache %s\n", esl_opt_GetString(go, "-o")); if ((status = cache_SeqDb(esl_opt_GetString(go, "-o"), &cache)) != eslOK) { printf("ERROR %d\n", status); return 0; } cache_SeqDestroy(cache); } esl_getopts_Destroy(go); return 0; ERROR: return status; } #endif /*CACHEDB_UTEST1*/ #ifdef CACHEDB_UTEST2 /* * gcc -O3 -malign-double -msse2 -o evalues-benchmark -I. -L. -I../easel -L../easel -Dp7EVALUES_BENCHMARK evalues.c -lhmmer -leasel -lm * gcc -g -O -pg -o evalues-benchmark -I. -L. -I../easel -L../easel -Dp7EVALUES_BENCHMARK evalues.c -lhmmer -leasel -lm * gcc -g -Wall -msse2 -o evalues-benchmark -I. -L. -I../easel -L../easel -Dp7EVALUES_BENCHMARK evalues.c -lhmmer -leasel -lm * * ./evalues-benchmark * * -malign-double is needed for gcc if the rest of HMMER was compiled w/ -malign-double * (i.e., our default gcc optimization) * * 27 Dec 08 on wanderoo: 24 msec per RRM_1 calibration; 37 msec for Caudal_act * profiling shows 75% in Forward; 12% esl_random(); <3% in MSVFilter. */ #include #include #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_getopts.h" #include "esl_sq.h" #include "esl_sqio.h" #include "esl_stopwatch.h" #include "hmmer.h" #include "cachedb.h" #include "hmmpgmd.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options] db1 [db2 ...]"; static char banner[] = "unit test for cached databases"; typedef struct seq_info_s { ESL_DSQ *dsq; /* digitized sequence [1..n] */ int64_t n; /* length of dsq */ uint64_t db_key; /* flag for included databases */ struct seq_info_s *next; } SEQ_INFO; int main(int argc, char **argv) { int status; ESL_GETOPTS *go = NULL; P7_CACHEDB_SEQS *cache = NULL; if ((go = esl_getopts_Create(options)) == NULL) { printf("Failed to create options\n"); return 0; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_getopts_Destroy(go); return 0; } if (esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_getopts_Destroy(go); return 0; } if (esl_opt_ArgNumber(go) < 1) { printf("Must specify at least one database\n"); esl_getopts_Destroy(go); return 0; } printf("Reading cache %s\n", esl_opt_GetArg(go, 1)); if ((status = cache_SeqDb(esl_opt_GetArg(go, 1), &cache)) != eslOK) { printf("ERROR %d\n", status); return 0; } cache_SeqDestroy(cache); esl_getopts_Destroy(go); return 0; } #endif /*CACHEDB_UTEST2*/ /***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Version 3.1b2; February 2015 * Copyright (C) 2015 Howard Hughes Medical Institute. * Other copyrights also apply. See the COPYRIGHT file for a full list. * * HMMER is distributed under the terms of the GNU General Public License * (GPLv3). See the LICENSE file for details. * * SVN $URL: https://svn.janelia.org/eddylab/eddys/src/hmmer/branches/3.1/src/cachedb.c $ * SVN $Id: cachedb.c 4437 2013-04-26 17:57:41Z wheelert $ *****************************************************************/