##############################################################################################################
#
# Configurable parameters
#
##############################################################################################################

####################################################################
#
# BWA alignment (bwa mem)
#
# For full details of these parameters, see the bwa man page:
#   http://bio-bwa.sourceforge.net/bwa.shtml
#
# Values here are intended to match the tool defaults.
# NOTE(review): the bwa man page documents the -B (mismatch) default
# as 4 and the -U (unpaired pair penalty) default as 17; the values
# below (1 and 9) deviate from that -- confirm they are deliberate.
#
####################################################################

# Platform (for the read-group PL tag)
rgpl = "Illumina"

# bwa mem parameters
bwa_params = {
    'seed_length': 19,             # Minimum seed length (-k: int)
    'band_width': 100,             # Band width - gaps longer than this will not be found (-w: int)
    'z_dropoff': 100,              # Z-dropoff (-d: int)
    'reseeding': 1.5,              # Reseed for MEMs longer than seed_length*value (-r: float)
    'mem_discard': 10000,          # Discard MEMs with more occurrences than this (-c: int)
    'rescue_only': False,          # Mate-rescue only - skip proper-pair hit search (-P: bool)
    'match': 1,                    # Match score (-A: int)
    'mismatch': 1,                 # Mismatch penalty (-B: int)
    'gap_open': 6,                 # Gap open penalty (-O: int)
    'gap_extend': 1,               # Gap extension penalty (-E: int)
    'clipping_penalty': 5,         # Clipping penalty (-L: int)
    'unpaired_penalty': 9,         # Unpaired read-pair penalty (-U: int)
    'rg_header': '',               # Read-group header line (-R: str)
    'low_threshold': 30,           # Minimum score to output an alignment (-T: int)
    'output_all': False,           # Output all alignments for unpaired reads (-a: bool)
    'hard_clipping': False,        # Use hard clipping in SAM output (-H: bool)
    'mark_split_secondary': False, # Mark shorter split hits as secondary (-M: bool)
}

#####################################################################################################################################
#
# Mark Duplicates (Picard)
#
# Duplicate identification is carried out using either the Picard MarkDuplicates or MarkDuplicatesWithMateCigar tool.
#
# For full details of these parameters, see
# https://software.broadinstitute.org/gatk/documentation/tooldocs/4.0.0.0/picard_sam_markduplicates_MarkDuplicatesWithMateCigar.php
#
# Values defined here are initially set to the tool defaults.
#
#####################################################################################################################################

# Run the Picard duplicate-marking step at all? (bool)
run_mark_duplicates = True

# Picard MarkDuplicates / MarkDuplicatesWithMateCigar parameters
markdup_params = {
    # Picard program to use: 'MarkDuplicates' | 'MarkDuplicatesWithMateCigar'
    'program': 'MarkDuplicates',
    # Comment to include in the output header (--COMMENT: str)
    'comment': '',
    # Scoring strategy for choosing the non-duplicate read
    # (--DUPLICATE_SCORING_STRATEGY: 'TOTAL_MAPPED_REFERENCE_LENGTH'|'SUM_OF_BASE_QUALITIES'|'RANDOM')
    'scoring_strategy': 'TOTAL_MAPPED_REFERENCE_LENGTH',
    # Max optical duplicate set size; -1 disables the check (--MAX_OPTICAL_DUPLICATE_SET_SIZE: int)
    'optical_set_size': 300000,
    # Minimum distance (--MINIMUM_DISTANCE: int)
    'min_distance': -1,
    # Max pixel offset between clusters to be considered optical duplicates.
    # Default suits non-patterned flowcells; use 2500 otherwise (--OPTICAL_DUPLICATE_PIXEL_DISTANCE: int)
    'optical_pixel_distance': 100,
    # Regular expression parsing tile, x and y from read IDs (--READ_NAME_REGEX: str)
    'read_name_regex': '',
    # Remove duplicates instead of setting the SAM flag (--REMOVE_DUPLICATES: bool)
    'remove_duplicates': False,
    # Skip pairs with no mate CIGAR (--SKIP_PAIRS_WITH_NO_MATE_CIGAR: bool)
    'skip_no_mate': True,
}

#####################################################################################################################################
#
# Metrics - various Picard tools are run to gather metrics on the alignments.
#
# For full details of these parameters, see
#   https://software.broadinstitute.org/gatk/documentation/tooldocs/
#
# Values defined here are initially set to the tool defaults.
#
# Key prefixes: bd = CollectBaseDistributionByCycle, as = CollectAlignmentSummaryMetrics,
# gc = CollectGcBiasMetrics, is = CollectInsertSizeMetrics, oxoG = CollectOxoGMetrics,
# wgs = CollectWgsMetrics.
#
#####################################################################################################################################

# Picard metric-gathering parameters
metrics_params = {
    # Multiple metrics: accumulation level (--METRIC_ACCUMULATION_LEVEL: 'ALL_READS'|'SAMPLE'|'LIBRARY'|'READ_GROUP')
    'accumulation_level': 'ALL_READS',
    # Calculate distribution over aligned reads only (--ALIGNED_READS_ONLY: bool)
    'bd:aligned_only': False,
    # Use reads passing quality filters only (--PF_READS_ONLY: bool)
    'bd:pf_reads_only': False,
    # Max insert size (--MAX_INSERT_SIZE: int)
    'as:max_insert_size': 100000,
    # Expected pair orientation (--EXPECTED_PAIR_ORIENTATION: 'FR'|'RF'|'TANDEM')
    'as:pair_orientation': 'FR',
    # Adapter sequence (--ADAPTER_SEQUENCE: str)
    'as:adapter_sequence': '',
    # Scan window size (--SCAN_WINDOW_SIZE: int)
    'gc:window_size': 100,
    # Minimum genome fraction (--MINIMUM_GENOME_FRACTION: float)
    'gc:min_genome_fraction': 1.0e-05,
    # Also ignore duplicates (--ALSO_IGNORE_DUPLICATES: bool)
    'gc:ignore_duplicates': False,
    # Deviations (--DEVIATIONS: float)
    'is:deviations': 10.0,
    # Histogram width; None lets the tool decide (--HISTOGRAM_WIDTH: int)
    'is:histogram_width': None,
    # Minimum percentage threshold for consideration (--MINIMUM_PCT: float)
    'is:minimum_pct': 0.05,
    # Include duplicates in calculations (--INCLUDE_DUPLICATES: bool)
    'is:include_duplicates': False,
    # File of intervals to restrict analysis to (--INTERVALS: str)
    'oxoG:intervals': None,
    # VCF-format dbSNP file to exclude known polymorphisms (--DB_SNP: str)
    'oxoG:dbsnp': None,
    # Minimum base quality to be included in analysis (--MINIMUM_QUALITY_SCORE: int)
    'oxoG:min_qual': 20,
    # Minimum mapping quality for a base to be included (--MINIMUM_MAPPING_QUALITY: int)
    'oxoG:min_mapq': 30,
    # Minimum insert size; 0 allows unpaired reads (--MINIMUM_INSERT_SIZE: int)
    'oxoG:min_inssize': 60,
    # Maximum insert size; 0 allows unpaired reads (--MAXIMUM_INSERT_SIZE: int)
    'oxoG:max_inssize': 600,
    # Include reads which do not pass quality filtering (--INCLUDE_NON_PF_READS: bool)
    'oxoG:non_pf_reads': True,
    # Use original quality scores for filtering when available (--USE_OQ: bool)
    'oxoG:use_oq': True,
    # Context bases to include either side of the assayed base (--CONTEXT_SIZE: int)
    'oxoG:context_size': 1,
    # Count unpaired reads (--COUNT_UNPAIRED: bool)
    'wgs:count_unpaired': False,
    # Coverage cap (--COVERAGE_CAP: int)
    'wgs:cov_cap': 250,
    # Include base-quality histogram in outputs (--INCLUDE_BQ_HISTOGRAM: bool)
    'wgs:bq_histo': False,
    # Ignore reads at depths above this cap (--LOCUS_ACCUMULATION_CAP: int)
    'wgs:locus_cap': 100000,
    # Minimum base quality to be included in coverage (--MINIMUM_BASE_QUALITY: int)
    'wgs:min_qual': 20,
    # Minimum mapping quality for a read to be included (--MINIMUM_MAPPING_QUALITY: int)
    'wgs:min_mapq': 20,
    # Average read length (--READ_LENGTH: int)
    'wgs:read_length': 150,
    # Sample size for theoretical het sensitivity sampling (--SAMPLE_SIZE: int)
    'wgs:sample_size': 10000,
}

##############################################################################################################
#
# MultiQC
#
# Interactive plots are produced by default where there are fewer than 100
# samples; otherwise flat plots are produced.
#
##############################################################################################################

# multiqc parameters
mqc_params = {
    # Valid values: 'default'|'flat'|'interactive' (--flat / --interactive)
    'plot_type': 'default',
}
#
##############################################################################################################

import os

from dag_core import get_jobname, pickle_params, get_reference_file
from dag_map_bwa_mem_pe import get_bwa_args, get_markdup_args, get_metrics_args, get_multiqc_args, get_samples

# Directories
# Location of the paired-end fastq files ({sample}_1.fq.gz / {sample}_2.fq.gz)
fastqDir = "fastq/"

# Discover samples from the fastq directory and persist the ordered list.
# The per-sample rule greps .sample.list for each sample's line number and
# uses it as the read-group ID/PU, so the order written here matters.
SAMPLES = get_samples(fastqDir)
# Context manager guarantees the handle is flushed/closed even if a write fails.
with open('.sample.list', 'w') as sample_file:
    for sample in SAMPLES:
        sample_file.write("%s\n" % sample)

JOBNAME = get_jobname()

# Persist the parameter dictionaries so downstream helper scripts can recover them.
pickle_params(bwa_params, '.bwaparams')
pickle_params(markdup_params, '.markdup_params')
pickle_params(metrics_params, '.metrics_params')
pickle_params(mqc_params, '.mqcparams')

# Translate the parameter dictionaries into command-line argument strings
# that are interpolated into the rules' shell blocks below.
bwa_args = get_bwa_args(bwa_params)
parsed_markdup_dat = get_markdup_args(markdup_params)
markdup_args = parsed_markdup_dat['args']
markdup_program = parsed_markdup_dat['program']
metrics_args = get_metrics_args(metrics_params)
alignmentSummary_args = metrics_args['alignment_summary']
baseDistribution_args = metrics_args['base_distribution']
gcBias_args = metrics_args['gc_bias']
insertSize_args = metrics_args['insert_size']
oxoG_args = metrics_args['oxoG']
wgs_args = metrics_args['wgs']
multiqc_args = get_multiqc_args(mqc_params)

# Reference genome details: file name and index prefix.
REF = get_reference_file()
REFERENCE = REF.get('REFERENCE')
REFPREFIX = REF.get('PREFIX')

# Target file lists assembled into TARGETS for 'rule all'.
MULTIQC = ["multiqc/report.html"]
REPORT = expand("{jobname}.report.html", jobname=JOBNAME)

INDEX = expand("reference/{prefix}.bwt", prefix=REFPREFIX)

# Everything the 'index' rule must produce: the BWA index files plus the
# samtools .fai and Picard .dict files.
# NOTE(review): the .fai entry is expanded from REFPREFIX, which matches what
# the 'index' shell step writes only when the reference file name has no
# extension -- confirm against get_reference_file().
INDEX_FILES = (expand("reference/{prefix}.amb", prefix=REFPREFIX),
               expand("reference/{prefix}.ann", prefix=REFPREFIX),
               expand("reference/{prefix}.bwt", prefix=REFPREFIX),
               expand("reference/{prefix}.pac", prefix=REFPREFIX),
               expand("reference/{prefix}.sa", prefix=REFPREFIX),
               expand("reference/{reference}.fai", reference=REFPREFIX),
               expand("reference/{prefix}.dict", prefix=REFPREFIX))

# Sorted, indexed BAM plus index for every sample.
ALIGNMENTS = expand("alignments/{sample}.bam", sample=SAMPLES) + expand("alignments/{sample}.bam.bai", sample=SAMPLES)

# Files the multiqc rule is expected to emit.
MULTIQC_OUTPUTS = ("multiqc/report.html",
                   "multiqc/report_data/multiqc_data.json",
                   "multiqc/report_data/multiqc_picard_AlignmentSummaryMetrics.txt",
                   "multiqc/report_data/multiqc_picard_dups.txt",
                   "multiqc/report_data/multiqc_picard_gcbias.txt",
                   "multiqc/report_data/multiqc_general_stats.txt",
                   "multiqc/report_data/multiqc.log",
                   "multiqc/report_data/multiqc_sources.txt")

TARGETS = INDEX + ALIGNMENTS + MULTIQC + REPORT

# Default target: reference indices, per-sample BAMs (+ .bai), the MultiQC
# report and the final job report.
rule all:
    input: expand("{targets}",targets=TARGETS)

# Build the reference artefacts in $TMPDIR and copy them back to reference/:
# BWA index files (.amb/.ann/.bwt/.pac/.sa), samtools .fai and Picard .dict.
# NOTE(review): PREFIX is derived by stripping everything after the first '.'
# in the reference file name, while faidx names its output ${REFERENCE}.fai;
# INDEX_FILES expects reference/{prefix}.fai, so these only agree when the
# reference file name contains no '.' -- confirm.
rule index:
	version: "1.0"
	message: "Creating indices"
	input: expand("reference/{reference}",reference=REFPREFIX)
	output: INDEX_FILES
	shell: '''
		REFERENCE=$(basename {input})
		PREFIX=$(echo $REFERENCE|sed 's/\..*$//')
		echo REFERENCE=$REFERENCE
		echo PREFIX=$PREFIX
		cp -v {input} $TMPDIR
		cd $TMPDIR
		bwa index -p ${{PREFIX}} ${{REFERENCE}}
		samtools faidx $TMPDIR/${{REFERENCE}}
		samtools dict $TMPDIR/${{REFERENCE}} > ${{PREFIX}}.dict
		rm -v ${{REFERENCE}}
		cd -
		ls -l $TMPDIR
		cp -v $TMPDIR/* reference/
	'''
	
# Per-sample pipeline, run entirely in $TMPDIR: align paired-end reads with
# bwa mem, sort and index with samtools, optionally mark duplicates with the
# configured Picard tool, add read groups (ID/PU = the sample's line number
# in .sample.list), then gather the full set of Picard metrics. Results are
# copied back into alignments/, flagstats/ and the picard/ subdirectories.
# NOTE(review): flagstat output is only produced inside the
# run_mark_duplicates branch, but the 'cp ... flagstats/' near the end is
# unconditional -- with run_mark_duplicates=False that copy (and the declared
# picard/markdup/{sample}.duplicates.txt output) will fail. Confirm intended.
rule process_sample:
	version: "1.0"
	message: "Aligning reads"
	input:
		R1 = "fastq/{sample}_1.fq.gz",
		R2 = "fastq/{sample}_2.fq.gz",
		INDEX = INDEX_FILES
	output: ("alignments/{sample}.bam","alignments/{sample}.bam.bai","picard/markdup/{sample}.duplicates.txt")
	threads: 8
	shell:'''
		REFERENCE=$(ls reference|awk -F. '{{print $1}}'|sort -u)
		FQ1=$(basename {input.R1})
		FQ2=$(basename {input.R2})
		SAMPLE=$(echo $FQ1|awk -F_ '{{print $1}}')
		SAMPNUM=$(grep -n ${{SAMPLE}} .sample.list |awk -F: '{{print $1}}')
		echo REFERENCE=$REFERENCE
		echo FQ1=$FQ1
		echo FQ2=$FQ2
		echo SAMPNUM=$SAMPNUM
		cp -Rv reference/ $TMPDIR
		cp -v {input.R1} $TMPDIR
		cp -v {input.R2} $TMPDIR
		ORIGDIR=$(pwd)
		cd $TMPDIR
		if [ ! -e "reference/$REFERENCE.fa" ]; then
			cd reference
			ln -s $REFERENCE $REFERENCE.fa
			cd -
		fi
		ls -l reference
		BAMFILE=$TMPDIR/${{SAMPLE}}.bam
		BAIFILE=$TMPDIR/${{SAMPLE}}.bam.bai
		bwa mem -t 8 {bwa_args}  reference/$REFERENCE $FQ1 $FQ2 \
		 | samtools sort -o $BAMFILE -
		samtools index -@ 8 $BAMFILE
		if [[ {run_mark_duplicates} = "True" ]]; then
			NEWBAMFILE=$TMPDIR/${{SAMPLE}}.nodup.bam
			picard {markdup_program} {markdup_args} -I $BAMFILE -O $NEWBAMFILE -M ${{SAMPLE}}.duplicates.txt -R $TMPDIR/reference/$REFERENCE.fa
			mv $NEWBAMFILE $BAMFILE
			samtools index -@ 8 $BAMFILE
			samtools flagstat $BAMFILE > ${{SAMPLE}}.flagstat.txt
		fi
		NEWBAMFILE=$(echo $BAMFILE|sed 's/.bam/.rg.bam/')
		picard AddOrReplaceReadGroups I=$BAMFILE O=$NEWBAMFILE RGID=$SAMPNUM RGPL={rgpl} RGSM=${{SAMPLE}} RGLB=${{SAMPLE}} RGPU=${{SAMPNUM}}
		mv -v $NEWBAMFILE $BAMFILE
		picard CollectAlignmentSummaryMetrics -Dpicard.useLegacyParser=false {alignmentSummary_args} -I $BAMFILE -O ${{SAMPLE}}.alignment_metrics.txt -R $TMPDIR/reference/$REFERENCE.fa
		picard CollectBaseDistributionByCycle -Dpicard.useLegacyParser=false {baseDistribution_args} -I $BAMFILE -O ${{SAMPLE}}.base_distribution_metrics.txt -CHART ${{SAMPLE}}_base_distribution_chart.pdf
		picard CollectGcBiasMetrics -Dpicard.useLegacyParser=false {gcBias_args} -I $BAMFILE -O ${{SAMPLE}}.gcbias_metrics.txt -R $TMPDIR/reference/$REFERENCE.fa -S ${{SAMPLE}}.gcbias.summary.txt --CHART_OUTPUT ${{SAMPLE}}.gcbias.chart.pdf 
		picard CollectInsertSizeMetrics -Dpicard.useLegacyParser=false {insertSize_args} -I $BAMFILE -O ${{SAMPLE}}.insertsize_metrics.txt -H ${{SAMPLE}}.insertsize_histogram.pdf
		picard CollectOxoGMetrics -Dpicard.useLegacyParser=false {oxoG_args} -I $BAMFILE -O ${{SAMPLE}}.oxoG_metrics.txt -R $TMPDIR/reference/$REFERENCE.fa
		picard CollectWgsMetrics -Dpicard.useLegacyParser=false {wgs_args} -I $BAMFILE -O ${{SAMPLE}}.wgs_metrics.txt -R $TMPDIR/reference/$REFERENCE.fa
		ls -l
		cd $ORIGDIR
		cp -v $BAMFILE alignments/${{SAMPLE}}.bam
		cp -v $BAMFILE.bai alignments/${{SAMPLE}}.bam.bai
		mkdir -pv {{flagstats,picard/markdup,picard/alignment_metrics,picard/base_distribution,picard/gcbias,picard/insertsize,picard/oxogmetrics,picard/wgs_metrics}}
		cp -v $TMPDIR/${{SAMPLE}}.flagstat.txt flagstats/
		if [ -e $TMPDIR/${{SAMPLE}}.duplicates.txt ]; then
			cp -v $TMPDIR/${{SAMPLE}}.duplicates.txt picard/markdup/${{SAMPLE}}.duplicates.txt
		fi
		cp -v $TMPDIR/${{SAMPLE}}.alignment_metrics.txt picard/alignment_metrics/${{SAMPLE}}.alignment_metrics.txt
		cp -v $TMPDIR/${{SAMPLE}}.base_distribution* picard/base_distribution/
		cp -v $TMPDIR/${{SAMPLE}}.gcbias* picard/gcbias/
		cp -v $TMPDIR/${{SAMPLE}}.insertsize* picard/insertsize/
		cp -v $TMPDIR/${{SAMPLE}}.oxoG_metrics.txt picard/oxogmetrics/
		cp -v $TMPDIR/${{SAMPLE}}.wgs_metrics.txt picard/wgs_metrics/
	'''

# Aggregate the alignment and Picard metric outputs into a single MultiQC
# report (multiqc/report.html plus its report_data/ companion files).
rule multiqc:
    version: "1.0"
    message: "Running multiqc on job outputs..."
    input: expand("alignments/{sample}.bam",sample=SAMPLES)
    output: expand("{multiqc_outputs}",multiqc_outputs=MULTIQC_OUTPUTS) 
    shell:
        '''
			multiqc {multiqc_args} -f --filename report --outdir multiqc alignments picard
		'''

# Render the final job report ({jobname}.report.html) from the MultiQC output
# using the pipeline's report script.
rule report:
       message: "Generating report..."
       input: "multiqc/report.html"
       output: expand("{jobname}.report.html",jobname=JOBNAME)
       shell: "dag-map-bwa-mem-pe.report.py {output}"

onerror:
    if os.path.exists('.start_date.txt'):
        os.remove(".start_date.txt")
