#!/usr/bin/env python3
# This script is partially based on
# https://github.com/soedinglab/uniclust-pipeline/blob/master/hhdatabase/reformat_old_cs219_ffindex.py
docstring='''
curate UniRef30_2020_01
    Convert the hhblits3 database UniRef30_2020_01 to hhblits2 format.

    Note that it is not always possible to perfectly convert an hhblits3
    format database, which can have unlimited sequence length, to an
    hhblits2 format database, which has an upper limit of 32763 residues.
    It is therefore safer to search the converted database in two steps:
    first with hhblits2 and then, in case hhblits2 fails, with hhblits3.
'''
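# Example usage (a sketch, not part of the original pipeline). Assuming this
# file is saved as curate.py and the HH-suite2 and HH-suite3 search binaries
# are available under the hypothetical names "hhblits2" and "hhblits3", the
# two-step search recommended above could look like:
#
#   python3 curate.py UniRef30_2020_01
#   hhblits2 -i query.fasta -d UniRef30_2020_01 -o query.hhr -oa3m query.a3m \
#       || hhblits3 -i query.fasta -d UniRef30_2020_01 -o query.hhr -oa3m query.a3m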
import sys, os
import mmap
from collections import namedtuple
import textwrap

FFindexEntry = namedtuple("FFindexEntry", "name, offset, length")

# Alphabets for rendering cs219 column states as printable characters.
# Index i gives the character for state i; only AS219 is used below.
AS62 ='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz*--'
AS90 ='!"#$%&\'()+,/0123456789:;<=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~*--'
AS219='!"#$%&\'()+,/0123456789:;<=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff*--'

def read_index(ffindex_filename):
    '''Read an ffindex index file into a list of (name, offset, length).'''
    entries = []
    fh = open(ffindex_filename)
    for line in fh:
        tokens = line.split("\t")
        entries.append(FFindexEntry(tokens[0], int(tokens[1]), int(tokens[2])))
    fh.close()
    return entries

def read_data(ffdata_filename):
    '''Memory-map an ffdata file for read-only access.'''
    fh = open(ffdata_filename, "rb")
    data = mmap.mmap(fh.fileno(), 0, prot=mmap.PROT_READ)
    fh.close()
    return data

def read_entry_data(entry, data):
    '''Return the bytes of one ffdata entry, without the trailing NUL byte.'''
    return data[entry.offset:entry.offset + entry.length - 1]

def main(db):
    if not os.path.isfile(db+"_a3m.ffdata"):
        sys.stderr.write("ERROR! No such file %s_a3m.ffdata\n"%db)
        sys.exit(1)

    # hhblits2 expects <db>_a3m_db and <db>_hhm_db; link them to the
    # corresponding hhblits3 ffdata files.
    if not os.path.isfile(db+"_a3m_db"):
        cmd="ln -s %s_a3m.ffdata %s_a3m_db"%(db,db)
        sys.stdout.write(cmd+'\n')
        os.system(cmd)
    if not os.path.isfile(db+"_hhm_db"):
        cmd="ln -s %s_hhm.ffdata %s_hhm_db"%(db,db)
        sys.stdout.write(cmd+'\n')
        os.system(cmd)

    # Rewrite the ffindex files with ".a3m" appended to each entry name,
    # as expected by hhblits2.
    if not os.path.isfile(db+"_hhm_db.index"):
        txt=''
        fp=open(db+"_hhm.ffindex",'r')
        for line in fp.read().splitlines():
            target,start,end=line.split()
            txt+="%s.a3m\t%s\t%s\n"%(target,start,end)
        fp.close()
        fp=open(db+"_hhm_db.index",'w')
        fp.write(txt)
        fp.close()
    if not os.path.isfile(db+"_a3m_db.index"):
        txt=''
        fp=open(db+"_a3m.ffindex",'r')
        for line in fp.read().splitlines():
            target,start,end=line.split()
            txt+="%s.a3m\t%s\t%s\n"%(target,start,end)
        fp.close()
        fp=open(db+"_a3m_db.index",'w')
        fp.write(txt)
        fp.close()

    # Convert the cs219 column-state ffdata into the single FASTA-like
    # <db>.cs219 file plus the <db>.cs219.sizes file used by hhblits2.
    if not os.path.isfile(db+".cs219") or \
       not os.path.isfile(db+".cs219.sizes"):
        input_data = read_data(db+"_cs219.ffdata")
        input_index = read_index(db+"_cs219.ffindex")
        fh = open(db+".cs219", "wb")

        total_length = 0
        nr_sequences = len(input_index)
        sys.stdout.write("Converting %d sequences in total\n"%nr_sequences)
        line_break = bytearray("\n", "utf-8")[0]
        processed_seq=0
        for entry in input_index:
            entry_data = read_entry_data(entry, input_data)

            # Skip the header line that precedes the raw column states.
            for i in range(len(entry_data)):
                if entry_data[i] == line_break:
                    entry_data = entry_data[(i+1):]
                    break

            total_length += len(entry_data)
            fh.write((">"+entry.name+"\n").encode("latin-1"))
            #fh.write(entry_data)
            # Map each raw state byte to its AS219 character and wrap to
            # 100 columns per line.
            fh.write((textwrap.fill(
                ''.join([AS219[c] for c in entry_data]),100)+'\n').encode("latin-1"))
            processed_seq+=1
            if processed_seq%10000==0:
                sys.stdout.write("Converting sequence %d\n"%processed_seq)
        fh.close()

        fh = open(db+".cs219.sizes", "w")
        fh.write(str(nr_sequences) + " " + str(total_length))
        fh.close()
    return

if __name__=="__main__":
    if len(sys.argv)!=2:
        sys.stderr.write(docstring)
        sys.exit(0)
    main(sys.argv[1])