#!/usr/bin/env python

"""
Transfers annotation between two genome records, based upon reciprocal
best blast hits. Missing, duplicated and unique proteins are also identified.
"""

import argparse
import drmaa
import glob
import pandas as pd
import re
import shutil
import stat
import sys
import tempfile
import os

from utils import check_format, drmaa_run
from Bio.Blast.Applications import NcbimakeblastdbCommandline
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from colorama import init, Fore, Style

def parse_genome(genome: str, locus_tags: list, type: str, tmpdir: str):
    """
    Extracts CDS annotations and protein translations from a genome record

    A fasta file of the selected protein translations is written to
    tmpdir, named after the genome file with its extension removed.
    If any CDS carries a 'pseudo' qualifier, a tab-delimited
    '<genome_name>.pseudo.txt' is written to the current directory.

    Required params:
        genome: path to genome file
        locus_tags: locus tags to restrict output to; an empty list
            selects every CDS
        type: format name passed to SeqIO.parse()
        tmpdir: path to temporary directory

    Returns:
        genome_name: genome file name without directory or extension
        df: DataFrame indexed by locus tag with 'gene' and 'product' columns
        pseudo_df: DataFrame indexed by locus tag with 'gene' and 'note'
            columns, one row per CDS carrying a 'pseudo' qualifier
    """
    genome_name = os.path.splitext(os.path.basename(genome))[0]
    output = os.path.join(tmpdir, genome_name)

    # Set membership keeps the per-feature filter cheap for long tag lists
    wanted = set(locus_tags)

    prots = []
    gene_mapping = {}
    pseudos = {}
    for record in SeqIO.parse(genome, type):
        for feature in record.features:
            if feature.type != 'CDS':
                continue

            qualifiers = feature.qualifiers
            gene = qualifiers.get('gene', [None])[0]
            locus_tag = qualifiers.get('locus_tag', [None])[0]
            translation = qualifiers.get('translation', [None])[0]
            product = qualifiers.get('product', [None])[0]
            # Reset for every feature: previously a pseudo CDS without a
            # note either raised UnboundLocalError or silently reused the
            # note of an earlier feature. An empty string keeps the 'note'
            # column non-null for every pseudogene row.
            note = qualifiers.get('note', [''])[0]

            if 'pseudo' in qualifiers:
                pseudos[locus_tag] = {
                    'gene': gene,
                    'note': note
                }

            if wanted and locus_tag not in wanted:
                continue

            gene_mapping[locus_tag] = {'gene': gene, 'product': product}
            if translation:
                # The gene name is only included in the header when present
                if gene:
                    description = "{}\t{}".format(gene, product)
                else:
                    description = "{}".format(product)
                prots.append(SeqRecord(Seq(translation), id=locus_tag,
                    description=description))

    df = pd.DataFrame.from_dict(data=gene_mapping, orient='index')
    pseudo_df = pd.DataFrame.from_dict(data=pseudos, orient='index')
    if len(pseudo_df.index):
        pseudo_df.to_csv('{}.pseudo.txt'.format(genome_name), sep='\t')
    SeqIO.write(prots, output, 'fasta')

    return(genome_name, df, pseudo_df)

def write_selected_seqs(index: dict, ids: list, fasta_name: str):
    """
    Writes the records for the given identifiers to a fasta file

    Required params:
        index: mapping of identifier to sequence record; every id is
            looked up with index[id], so a missing id raises KeyError
        ids: identifiers to output, in output order
        fasta_name: Output file name

    Returns:
        None
    """
    selected = [index[seq_id] for seq_id in ids]
    SeqIO.write(selected, fasta_name, 'fasta')

def index_db(tmpdir: str,db: str):
    """
    Builds a protein blast database from a fasta file

    Any output on STDERR from the indexing command is treated as fatal:
    it is printed and the program exits with status 1.

    Required parameters:
        tmpdir - path to temporary directory
        db - name of the fasta file (within tmpdir) to index

    Returns:
        None
    """
    fasta_path = os.path.join(tmpdir, db)
    makeblastdb = NcbimakeblastdbCommandline(dbtype='prot', input_file=fasta_path)
    # Calling the command line object runs it and returns (stdout, stderr)
    _, err_text = makeblastdb()

    if not err_text:
        return

    print("Warning - the following STDERR was reported by BLAST")
    print(err_text)
    sys.exit(1)

def create_blast_script(tmpdir:str, dbs: list, cov: int, num_align: int,
        num_threads: int = 8, evalue: float = 0.001):

    """Generates a bash script for queue submission which carries out
    a blastp search of one genome against another

    The script searches tmpdir/dbs[0] (query) against tmpdir/dbs[1]
    (database) and writes tabular output (-outfmt 6) to
    tmpdir/<dbs[0]>.<dbs[1]>.blast. The script itself is written to the
    current directory as '<dbs[0]>_vs_<dbs[1]>.sh' with mode 0755.

    Required params:
        tmpdir: Path to temp dir
        dbs: 2-element list: query name, database name
        cov: value passed to blastp -qcov_hsp_perc
        num_align: Number of alignments to report

    Optional params:
        num_threads: value passed to blastp -num_threads (default: 8)
        evalue: value passed to blastp -evalue (default: 0.001)

    Returns:
        Name of generated script
    """

    script_text = '''\
#!/bin/bash
db1_name=$(basename {query})
db2_name=$(basename {db})
blastp -query {tmpdir}/{query} -db {tmpdir}/{db} -num_threads {threads} -outfmt 6 -qcov_hsp_perc {cov} -num_alignments {num_align} -evalue {evalue} -out {tmpdir}/$db1_name.$db2_name.blast
'''.format(query=dbs[0], db=dbs[1], tmpdir=tmpdir, threads=num_threads,
        cov=cov, num_align=num_align, evalue=evalue)

    scriptname = "{}_vs_{}.sh".format(os.path.basename(dbs[0]), os.path.basename(dbs[1]))

    fd, tmpfile = tempfile.mkstemp()
    try:
        # The context manager closes the descriptor even if the write fails
        with os.fdopen(fd, "w") as handle:
            handle.write(script_text)
        os.chmod(tmpfile, 0o755)
        # shutil.copy carries the permission bits over to the final script
        shutil.copy(tmpfile, scriptname)
    finally:
        # Never leave the scratch file behind, whatever happened above
        os.remove(tmpfile)

    return(scriptname)

def run_blasts(tmpdir: str, dbs: list, recip: bool, len_thresh: int, num_align: int):
    """ Submits blast searches of dbs[0] vs dbs[1] and, optionally, the reverse

    One script is generated per search direction and all scripts are
    passed to drmaa_run() in a single call.

    Required params:
        tmpdir: path to temporary directory
        dbs: List of databases (query first, then database)
        recip: Also run dbs[1] vs dbs[0] if True
        len_thresh: query hsp coverage threshold passed on as 'cov'
        num_align: Number of alignments to report

    Returns:
        None
    """

    # Forward search always runs; the reverse direction is optional
    searches = [dbs]
    if recip:
        searches.append([dbs[1], dbs[0]])

    scripts = [
        create_blast_script(tmpdir=tmpdir, dbs=pair, cov=len_thresh, num_align=num_align)
        for pair in searches
    ]

    drmaa_run(tmpdir=tmpdir, scripts=scripts, job_name='Blast')

def parse_blasts(tmpdir: str, dbs: list, pident: int, recip: bool):

    """Loads tabular blast outputs into pandas DataFrames

    Results are read from tmpdir/<dbs[0]>.<dbs[1]>.blast and, when recip
    is True, also from tmpdir/<dbs[1]>.<dbs[0]>.blast. Only hits with a
    percent identity strictly greater than pident are kept.

    Required Params:
        tmpdir: path to temporary directory
        dbs: 2-element list of blast database names
        pident: percent-identity cutoff (exclusive)
        recip: Parse reciprocal blast results

    Returns:
        dfs: List of DataFrames of blast hits: forward search first,
            followed by the reverse search if recip is True
    """

    colnames = ['qseq', 'sseq', 'pident', 'length', 'mismatch', 'gapopen', 'qstart',
        'qend', 'sstart', 'send', 'evalue', 'bitscore']

    # Output files are named <query>.<database>.blast
    searches = [dbs]
    if recip:
        searches.append([dbs[1], dbs[0]])

    dfs = []
    for pair in searches:
        blast_path = os.path.join(tmpdir, "{}.blast".format(".".join(pair)))
        hits = pd.read_table(blast_path, header=None, names=colnames)
        dfs.append(hits.loc[hits['pident'] > pident])

    return(dfs)

def intersect_results(blast_dfs: list, gene_dfs: list):

    """Creates DataFrame combining blast hits and their annotations

    Each query/subject pair in blast_dfs[0] is joined to the reference
    annotation (on the query id) and to the subject annotation (on the
    subject id); pairs lacking either annotation are dropped.

    NOTE(review): only blast_dfs[0] is read here - blast_dfs[1] is never
    consulted, so hits are not filtered for reciprocity. Confirm whether
    this is intended.

    Required Params:
        blast_dfs: List of DataFrames of blast hits
        gene_dfs: List of DataFrames containing reference [0]
            and subject [1] annotations

    Returns:
        DataFrame with columns SubjID, SubjGene, SubjProduct, RefID,
        RefGene, RefProduct
    """

    annotation_cols = ['gene', 'product']
    ref_annot = gene_dfs[0][annotation_cols]
    subj_annot = gene_dfs[1][annotation_cols]

    pairs = blast_dfs[0][['qseq', 'sseq']].copy()
    # Reference columns pick up the _x suffix, subject columns _y
    merged = pairs.merge(ref_annot, left_on='qseq', right_index=True)
    merged = merged.merge(subj_annot, left_on='sseq', right_index=True)

    ordered = merged[['sseq', 'gene_y', 'product_y', 'qseq', 'gene_x', 'product_x']]
    common_hits = ordered.rename(columns={
        'qseq': 'RefID',
        'sseq': 'SubjID',
        'gene_x': 'RefGene',
        'product_x': 'RefProduct',
        'gene_y': 'SubjGene',
        'product_y': 'SubjProduct'
    })

    return(common_hits)

def run_mmseq(tmpdir: str, db: str):
    """ Submits an mmseqs easy-cluster job for a protein fasta file

    The generated script passes tmpdir/db to mmseqs as both its first
    and second argument, and tmpdir as its third. The script is saved
    as tmpdir/mmseqs_<db>.sh and handed to drmaa_run().

    Required params:
        tmpdir: path to temporary directory
        db: database to cluster

    Returns:
        None
    """

    script_body = '''\
#!/bin/bash
mmseqs easy-cluster {}/{} {}/{} {}
'''.format(tmpdir,db,tmpdir,db,tmpdir)

    handle, scratch = tempfile.mkstemp()
    with os.fdopen(handle, "w") as script:
        script.write(script_body)
    os.chmod(scratch, 0o755)

    # Copying carries the executable bits over to the final script
    scriptname = "{}/mmseqs_{}.sh".format(tmpdir, os.path.basename(db))
    shutil.copy(scratch, scriptname)
    os.remove(scratch)

    drmaa_run(tmpdir=tmpdir, scripts=[scriptname], job_name='mmseq')

def update_annotation(genome: str, common_hits: pd.DataFrame, truncated: pd.DataFrame):

    """
    Assigns reference annotations to subject genome record

    The updated records are written to '<name>.reannotated<ext>' in the
    current directory, where <name> and <ext> come from the subject
    genome file name. An existing file of that name is removed first.

    For every CDS with a locus tag, the gene name is taken from
    common_hits when exactly one row matches; when no row matches, a
    single match in truncated is used with '_tr' appended. The product
    is handled the same way (suffix ' (truncated)') but only for CDS
    which already carry a product qualifier. More than one matching row
    is recorded as a conflict and the gene name is left untouched.
    Null reference gene names / products are never transferred.

    Required params:
        genome: path to subject genome file
        common_hits: dataframe containing annotation mappings
        truncated: dataframe containing truncated proteins

    Returns:
        conflicts: list of conflicting annotations, each formatted as
            '<subject id array>:<reference id array>'
    """
    conflicts=[]

    (genome_name,suffix)=os.path.splitext(os.path.basename(genome))
    genome_file='.'.join([genome_name,'reannotated'])+suffix

    # Records are appended one at a time below, so clear any previous output
    if os.path.exists(genome_file):
        os.remove(genome_file)

    fmt=check_format(genome)
    for record in SeqIO.parse(genome, fmt):
        for feature in record.features:
            if feature.type!='CDS':
                continue

            # A CDS without a locus tag cannot be matched to any hit
            locus_tag=feature.qualifiers.get('locus_tag',[None])[0]
            if locus_tag is None:
                continue

            # Annotations are first transferred from the top hits, and if there
            # are none present we look for a truncated annotation instead.
            gene_data=common_hits.loc[common_hits['SubjID']==locus_tag]
            trunc_data=truncated.loc[truncated['SubjID']==locus_tag]

            gene_names=gene_data['RefGene'].values
            if len(gene_names)>1:
                # Multiple hits give conflicting annotations
                conflicts.append('{}:{}'.format(
                    gene_data['SubjID'].values,gene_data['RefID'].values))
            elif len(gene_names)==1:
                # Reference CDS without a gene name have a null entry; writing
                # that out would replace the subject's gene with 'None'
                if pd.notna(gene_names[0]):
                    feature.qualifiers['gene']=gene_names[0]
            else:
                trunc_names=trunc_data['RefGene'].values
                if len(trunc_names)>1:
                    conflicts.append('{}:{}'.format(
                        trunc_data['SubjID'].values,trunc_data['RefID'].values))
                elif len(trunc_names)==1 and pd.notna(trunc_names[0]):
                    feature.qualifiers['gene']='{}_tr'.format(trunc_names[0])

            if 'product' in feature.qualifiers:
                products=gene_data['RefProduct'].values
                if len(products)==1:
                    if pd.notna(products[0]):
                        feature.qualifiers['product']=products[0]
                elif len(products)==0:
                    trunc_products=trunc_data['RefProduct'].values
                    if len(trunc_products)==1 and pd.notna(trunc_products[0]):
                        feature.qualifiers['product']='{} (truncated)'.format(trunc_products[0])

        with open(genome_file,'a') as out:
            SeqIO.write(record, out, fmt)

    return(conflicts)

def reciprocal_blast(tmpdir: str, dbs: list, pident: int, len_thresh: int, gene_dfs: list):

    """Runs blast searches in both directions between two databases and
    builds the annotation mapping from the results

    The mapping is also saved, tab-delimited, to
    '<dbs[1]>_<dbs[0]>.annotation_mapping.txt' in the current directory.

    Required Params:
        tmpdir: path to temporary directory
        dbs: list of blast database names
        pident: percent-identity cutoff
        len_thresh: length threshold (%age) passed to run_blasts()
        gene_dfs: list of dataframes containing ref and subj gene annotations

    Returns:
        common_hits: dataframe produced by intersect_results()
    """

    print(Fore.GREEN + Style.BRIGHT + "\nIdentifying reciprocal top hits..." + Style.RESET_ALL)

    run_blasts(tmpdir=tmpdir, dbs=dbs, recip=True, len_thresh=len_thresh, num_align=1)
    hit_tables = parse_blasts(tmpdir=tmpdir, dbs=dbs, pident=pident, recip=True)
    common_hits = intersect_results(blast_dfs=hit_tables, gene_dfs=gene_dfs)

    mapping_file = '{}_{}.annotation_mapping.txt'.format(dbs[1],dbs[0])
    common_hits.to_csv(mapping_file, sep='\t', index=None)

    return(common_hits)

def report_conflicts(conflicts: list, ref_genes: pd.DataFrame, genome_name:str, subj_index: dict ):

    """Produces a report of genes with conflicting hits to the reference

    Nothing is written when conflicts is empty. Otherwise each conflict
    becomes one line of '<genome_name>.conflicts.txt' (subject id, then
    the candidate reference annotations), and the subject sequences are
    written to '<genome_name>.conflicts.fasta'.

    Required Parameters:
        conflicts: list of '<subject ids>:<reference ids>' strings
        ref_genes: data frame of reference gene annotations
        genome_name: prefix for the output file names
        subj_index: mapping of subject id to sequence record

    Returns: None
    """
    if not conflicts:
        return

    conflict_ids = []
    with open('{}.conflicts.txt'.format(genome_name),'w') as file:
        for conflict in conflicts:
            # Strip the brackets, quotes and newlines left by array formatting
            cleaned = re.sub(r'[\[\]\'\n]', '', conflict)
            subj_part, ref_part = cleaned.split(':')

            unique_subjs = set(subj_part.split(' '))
            if len(unique_subjs)>1:
                print('Warning: non-unique subject ids in conflict list')
            subj = unique_subjs.pop()
            conflict_ids.append(subj)

            ref_strs = []
            for ref in ref_part.split(' '):
                ref_vals = ref_genes.loc[ref,].values
                ref_strs.append('{} ({})'.format(ref_vals[0],ref_vals[1]))
            file.write('{}\t{}\n'.format(subj, '; '.join(ref_strs)))

    print(Fore.RED + "\n{} conflicting hit(s) identified".format(len(conflicts)) + Fore.RESET)
    write_selected_seqs(index=subj_index,ids=conflict_ids,
        fasta_name='{}.conflicts.fasta'.format(genome_name))

def find_truncated(tmpdir: str, genome:str, dbs: list, common_hits: pd.DataFrame, 
    gene_dfs: list, pident: int, trunc_len_thresh: int, subj_index: dict):

    """ Proteins present as a tophit in the subject search but not the reference search may be truncated...
        Identify candidates, then re-blast at a lower length threshold to see if they then come out
        as hits

        Results are written to '<dbs[1]>.truncated.txt' and
        '<dbs[1]>.truncated.fasta' in the current directory.

        Required Params:
            tmpdir: path to temporary directory
            genome: path to subject genome file
            dbs: list of blast db names
            common_hits: Dataframe of annotation mappings already made
            gene_dfs: list of dataframes containing gene annotations
            pident: blast %ID cutoff
            trunc_len_thresh: length threshold (%age) for the re-blast
            subj_index: mapping of subject id to sequence record

        Returns: 
            truncated: Dataframe of truncated proteins
            blast_dfs: List of dataframes of blast hits
    """
    print(Fore.GREEN + Style.BRIGHT + "\nIdentifying potentially truncated proteins" + Style.RESET_ALL)

    # Subject proteins which did not receive an annotation first time round
    all_subj_ids = set(gene_dfs[1].index.values)
    mapped_subj_ids = set(common_hits['SubjID'].values)
    unannotated_subjIDs = list(all_subj_ids - mapped_subj_ids)

    # The subject fasta file is rewritten to hold only the sequences of
    # interest, which restricts the outputs
    fmt = check_format(file=genome)
    parse_genome(genome=genome, locus_tags=unannotated_subjIDs, type=fmt, tmpdir=tmpdir)

    # Both directions are searched, since the results of the opposite
    # search are used in the subsequent find_missing() call
    run_blasts(tmpdir=tmpdir, dbs=[dbs[1],dbs[0]], recip=True, len_thresh=trunc_len_thresh, num_align=1)
    blast_dfs = parse_blasts(tmpdir=tmpdir, dbs=dbs, pident=pident, recip=True)

    # Here the subject is the query, so the join runs the opposite way round
    # to intersect_results(): sseq is the reference id, qseq the subject id
    annotation_cols = ['gene', 'product']
    pairs = blast_dfs[1][['sseq','qseq']].copy()
    merged = pairs.merge(gene_dfs[0][annotation_cols], left_on='sseq', right_index=True)
    merged = merged.merge(gene_dfs[1][annotation_cols], left_on='qseq', right_index=True)

    ordered = merged[['sseq', 'gene_x', 'product_x', 'qseq', 'gene_y', 'product_y']]
    truncated = ordered.rename(columns={
        'sseq': 'RefID',
        'qseq': 'SubjID',
        'gene_x': 'RefGene',
        'product_x': 'RefProduct',
        'gene_y': 'SubjGene',
        'product_y': 'SubjProduct'
    })

    print(Fore.CYAN + '{} truncated protein(s) identified'.format(len(truncated)) +Fore.RESET)
    truncated.to_csv('{}.truncated.txt'.format(dbs[1]), header=True, sep="\t",index=None)
    write_selected_seqs(index=subj_index,ids=truncated['SubjID'].tolist(),
        fasta_name='{}.truncated.fasta'.format(dbs[1]))

    return(truncated,blast_dfs)

def find_missing(blast_df: pd.DataFrame, gene_df: pd.DataFrame, truncated: pd.DataFrame, genome_name: str, pseudo_df: pd.DataFrame, ref_index:dict):
    """ Proteins missing from the subject genome are identified from the blast results 
    with the lower query hsp length cutoff

    A reference gene is reported as missing when it is neither a query
    in blast_df nor a reference id in truncated. The table is written to
    '<genome_name>.missing.txt' and the available sequences to
    '<genome_name>.missing.fasta'.

    Required parameters:
        blast_df: Dataframe holding results of ref->subj blast search at lower length threshold
        gene_df: Dataframe holding details of reference genes
        truncated: Dataframe of truncated proteins (uses the 'RefID' column)
        genome_name: prefix for the output file names
        pseudo_df: dataframe of pseudogenes in reference, indexed by id
        ref_index: dict of reference sequence records keyed by id

    Returns:
        DataFrame of missing reference genes
    """

    print(Fore.GREEN + Style.BRIGHT + "\nIdentifying missing proteins" + Style.RESET_ALL)
    hit_ids=set(blast_df['qseq'])
    ref_ids=set(gene_df.index.values)
    truncated_ids=set(truncated['RefID'])
    missing_ids=list(ref_ids-hit_ids-truncated_ids)
    missing=gene_df[gene_df.index.isin(missing_ids)]
    missing=missing.reset_index()
    missing.rename(columns={'index': 'RefId'},inplace=True)

    print(Fore.CYAN + '{} missing protein(s) identified'.format(len(missing)) + Fore.RESET)
    missing.to_csv('{}.missing.txt'.format(genome_name), header=True, sep="\t",index=None)

    # Pseudogenes, and any other id without a sequence in ref_index, are
    # dropped before writing sequences to prevent access to non-existent keys.
    # Filtering on the pseudo_df index (rather than its 'note' column) also
    # works when the reference has no pseudogenes at all, in which case
    # pseudo_df is empty and has no columns.
    pseudo_ids=set(pseudo_df.index)
    seq_ids=[ref_id for ref_id in missing['RefId'].tolist()
        if ref_id not in pseudo_ids and ref_id in ref_index]
    write_selected_seqs(index=ref_index,ids=seq_ids,
        fasta_name='{}.missing.fasta'.format(genome_name))
    
    return(missing)

def find_novel(tmpdir: str, dbs: list, gene_df: list, pident: int, len_thresh: int, genome_name: str, subj_index: dict):

    """ Identify novel protein sequences in the subject by lack of hits to the reference...

    The subject is searched against the reference; any subject id which
    never appears as a query in the filtered hits is reported as novel.
    Results are written to '<genome_name>.novel.txt' and
    '<genome_name>.novel.fasta'.

    Required Parameters:
        tmpdir: Path to temporary directory
        dbs: list of databases (reference, subject)
        gene_df: Dataframe of gene annotations in subject genome
        pident: %ID blast cutoff
        len_thresh: length threshold (%age) passed to run_blasts()
        genome_name: name of subject
        subj_index: mapping of subject id to sequence record

    Returns:
        DataFrame of novel subject genes
    """

    print(Fore.GREEN + Style.BRIGHT + "\nIdentifying novel proteins" + Style.RESET_ALL)

    # Subject is the query, reference the database
    search = [dbs[1], dbs[0]]
    run_blasts(tmpdir=tmpdir, dbs=search, recip=False, len_thresh=len_thresh, num_align=1)
    hits = parse_blasts(tmpdir=tmpdir, dbs=search, pident=pident, recip=False)[0]

    unmatched = set(gene_df.index.values) - set(hits['qseq'].values)
    novel_df = gene_df[gene_df.index.isin(list(unmatched))].reset_index()
    novel_df = novel_df.rename(columns={'index':'SubjId'})

    print(Fore.CYAN + '{} novel protein(s) identified'.format(len(novel_df)) + Fore.RESET)
    novel_df.to_csv('{}.novel.txt'.format(genome_name), header=True, sep="\t",index=None)
    write_selected_seqs(index=subj_index,ids=novel_df['SubjId'].tolist(),
        fasta_name='{}.novel.fasta'.format(genome_name))

    return(novel_df)


def find_duplicates(tmpdir: str, dbs:list, gene_df: list, pident: int,
        len_thresh:int, genome_name: str, subject_genome: str, subj_index: dict):

    """ Identify duplicated proteins within the subject genome

    The subject is blasted against itself; queries with more than one
    hit row, together with their hits, are then clustered with mmseqs
    and every cluster with more than one member is reported.
    Results are written to '<genome_name>.duplicates.txt' and, when any
    cluster is found, '<genome_name>.duplicates.fasta'.

    Required Parameters:
        tmpdir: Path to temporary directory
        dbs: list of databases (reference, subject)
        gene_df: Dataframe of gene annotations in subject genome
        pident: %ID blast cutoff
        len_thresh: length threshold (%age) passed to run_blasts()
        genome_name: name of subject
        subject_genome: path to subject genome file
        subj_index: mapping of subject id to sequence record

    Returns:
        list of the ids belonging to multi-member clusters
    """

    print(Fore.GREEN + Style.BRIGHT + "\nIdentifying duplicated proteins" + Style.RESET_ALL)
    run_blasts(tmpdir=tmpdir,dbs=[dbs[1],dbs[1]], recip=False, len_thresh=len_thresh, num_align=20)
    blast_df=parse_blasts(tmpdir=tmpdir, dbs=[dbs[1],dbs[1]], pident=pident, recip=False)[0]

    # Filter the counts Series directly: wrapping it in a DataFrame and
    # selecting the column by name breaks on pandas >= 2.0, where the
    # value_counts() result is named 'count' rather than after its source.
    hit_counts=blast_df['qseq'].value_counts()
    multi_queries=hit_counts[hit_counts>1].index
    multi_hit_details=blast_df[blast_df['qseq'].isin(multi_queries)]

    multi_hits=list(set(multi_hit_details['qseq']) | set(multi_hit_details['sseq']))

    ## Some of these hits may have local similarity to a longer sequence
    # so carry out clustering to determine who is truly duplicated...
    fmt=check_format(file=subject_genome)
    parse_genome(genome=subject_genome, locus_tags=multi_hits, type=fmt, tmpdir=tmpdir)
    run_mmseq(tmpdir=tmpdir,db=genome_name)

    # One row per cluster member; the first column (used as the index)
    # identifies the cluster the member in the second column belongs to
    colnames=['query','subject']
    clusters=pd.read_table(os.path.join(tmpdir, '{}_cluster.tsv'.format(genome_name)), 
        header=None,names=colnames,index_col=0)
    # Discard singletons...
    cluster_sizes=clusters.index.value_counts()
    multi_cluster=cluster_sizes[cluster_sizes>1].index
    clusters=clusters[clusters.index.isin(multi_cluster)]
    cluster_names=clusters.index.unique()

    cluster_reprs=[]
    with open('{}.duplicates.txt'.format(genome_name),'w') as file:
        file.write("Gene name\tProduct\tDuplicates\n")
        for cluster_name in cluster_names:
            members=clusters.loc[clusters.index==cluster_name,'subject'].tolist()
            cluster_reprs.extend(members)
            annots=gene_df.loc[cluster_name]
            file.write("{}\t{}\t{}\n".format(annots['gene'],annots['product'],",".join(members)))
    
    if len(cluster_names):
        print(Fore.CYAN + "{} duplicate protein(s) identified".format(len(cluster_names)) + Fore.RESET)
        write_selected_seqs(index=subj_index, ids=cluster_reprs, fasta_name='{}.duplicates.fasta'.format(genome_name))
    
    return(cluster_reprs)

def check_missing_and_novel(genome_name: str, novel: pd.DataFrame, missing: pd.DataFrame):
    '''
    Identifies any gene annotations which are detected as being both missing and novel - 
    these are likely due to annotation errors leading to the annotation term being applied
    to a different protein, and should be checked.
    
    Required_params:
        genome_name: Name of subject genome
        novel: DataFrame of novel genes
        missing: DataFrame of missing genes

    Returns:
        None
    '''
    
    novel_genes=novel.loc[novel['gene'].notna()]
    novel_genes=set(novel_genes['gene'].tolist())
    missing_genes=missing.loc[missing['gene'].notna()]
    missing_genes=set(missing_genes['gene'].tolist())
    missing_novel_isec=missing_genes.intersection(novel_genes)

    missing_novel_df=pd.DataFrame(list(missing_novel_isec),columns=['gene'])
    missing_novel_df=missing_novel_df.merge(novel,on='gene')
    missing_novel_df=missing_novel_df.merge(missing,on='gene')
    missing_novel_df.to_csv('{}.missing_and_novel.txt'.format(genome_name),sep="\t",header=True,index=None)
    if len(missing_novel_df):
        print(Fore.RED + "{} annotations identified as both novel and missing".format(len(missing_novel_df)) + Fore.RESET)

def main():
    """
    Command line entry point

    Parses the arguments, extracts and blast-indexes the proteins of the
    reference and subject genomes in a temporary directory created under
    the current directory, then runs each analysis step in turn.
    """

    init()
    parser = argparse.ArgumentParser(
        description="Transfer annotations between genome records based on reciprocal blast searches")
    parser.add_argument(
        '--reference', dest='ref', required=True,
        help='path to EMBL/Genbank file containing reference annotated genome')
    parser.add_argument(
        '--subject', dest='subj', required=True,
        help='path to EMBL/Genbank file containing subject annotated genome')
    parser.add_argument(
        '--percent_id', dest='pident', default=95, type=int,
        help='Percentage ID threshold for blast hits (default: 95)')
    parser.add_argument(
        '--len_thresh', dest='len_thresh', default=90, type=int,
        help='Percentage of subject sequence length match required (default: 90)')
    parser.add_argument(
        '--trunc_len_thresh', dest='trunc_len_thresh', default=50, type=int,
        help='Percentage of subject sequence length match required for identifying truncated sequences (default: 50)')
    args = parser.parse_args()

    # Each list holds the reference entry first, then the subject entry
    dbs, gene_dfs, pseudo_dfs = [], [], []

    with tempfile.TemporaryDirectory(dir='.') as tmpdir:
        for genome in (args.ref, args.subj):
            fmt = check_format(file=genome)
            dbname, gene_df, pseudo_df = parse_genome(
                genome=genome, locus_tags=[], type=fmt, tmpdir=tmpdir)
            dbs.append(dbname)
            gene_dfs.append(gene_df)
            pseudo_dfs.append(pseudo_df)
            index_db(tmpdir=tmpdir, db=dbname)

        ref_name, subj_name = dbs
        ref_genes, subj_genes = gene_dfs

        # The sequences are held in memory, keyed by id, for the fasta outputs
        ref_index = SeqIO.to_dict(SeqIO.parse('{}/{}'.format(tmpdir,ref_name), "fasta"))
        subj_index = SeqIO.to_dict(SeqIO.parse('{}/{}'.format(tmpdir,subj_name), "fasta"))

        # N.B. order of calls is important...the full subject fasta file is
        # replaced in find_truncated(), so searches requiring the full set
        # of subject proteins should occur before that call...
        common_hits = reciprocal_blast(
            tmpdir=tmpdir, dbs=dbs, pident=args.pident,
            len_thresh=args.len_thresh, gene_dfs=gene_dfs)

        novel = find_novel(
            tmpdir=tmpdir, dbs=dbs, gene_df=subj_genes, pident=args.pident,
            len_thresh=args.len_thresh, genome_name=subj_name, subj_index=subj_index)

        # The returned ids are not needed here; results are written to file
        find_duplicates(
            tmpdir=tmpdir, dbs=dbs, gene_df=subj_genes, pident=args.pident,
            len_thresh=args.len_thresh, genome_name=subj_name,
            subject_genome=args.subj, subj_index=subj_index)

        truncated, blast_dfs = find_truncated(
            tmpdir=tmpdir, genome=args.subj, dbs=dbs, common_hits=common_hits,
            gene_dfs=gene_dfs, pident=args.pident,
            trunc_len_thresh=args.trunc_len_thresh, subj_index=subj_index)

        missing = find_missing(
            blast_df=blast_dfs[0], gene_df=ref_genes, truncated=truncated,
            genome_name=subj_name, pseudo_df=pseudo_dfs[0], ref_index=ref_index)

        conflicts = update_annotation(
            genome=args.subj, common_hits=common_hits, truncated=truncated)

        report_conflicts(
            conflicts=conflicts, ref_genes=ref_genes, genome_name=subj_name,
            subj_index=subj_index)

        check_missing_and_novel(genome_name=subj_name, novel=novel, missing=missing)

        # Remove any '<name>.fa' files present in the current directory
        for db in dbs:
            fasta_file = '{}.fa'.format(db)
            if os.path.exists(fasta_file):
                os.remove(fasta_file)

# Run the pipeline only when executed as a script, not when imported
if __name__ == '__main__':
    main()
