#!/usr/bin/env bash

# Strict mode: abort on the first failing command, and let a pipeline fail
# when any of its stages fails (not just the last one).
set -eo pipefail

# ANSI SGR escape sequences. They are stored with a literal backslash, so they
# only take effect when expanded by something that interprets escapes (printf).
BOLD='\033[1m'
RED='\033[0;31m'
DEF='\033[0m'


# Print the command-line help on stdout, followed by one blank line, then
# terminate the script with exit status 0. Never returns to the caller.
function usage {
  # The quoted delimiter keeps the help text literal (no expansion), and the
  # here-doc is streamed straight to stdout instead of being captured first.
  cat <<'EOUSAGE'
Usage: dag-wf-rename-files [-i inputdir] [-o outputdir] [-m mapping_file] [-h]

  Renames batches of files to make them amenable to running through DAG workflows.
	Symbolic links are created in outputdir with a filename determined by the various fields
	in the mapping file.
 
  The required mapping file should be a tab delimited file containing the following fields:

  Original_filename Sample  Read

  where:
     original_filename:  the current name of the file
     sample: the sample name to which the file relates
     read:  either '1' or '2' (not required for single-ended fastq files)

	Any uncompressed fastq files found will be compressed using gzip

  Options:
	* -i: path to input directory containing files to rename
	* -o: path to output directory in which symlinks will be created
	* -m: mapping file of filename to sample mappings
	* -h: Display help
EOUSAGE

  # Blank separator line after the help text.
  echo
  exit 0
}

# Option parsing...
# The leading ':' puts getopts in silent mode: a missing option argument is
# reported as ':' and an unknown option as '?', both handled explicitly below.
optspec=":i:o:m:h"

while getopts "$optspec" optchar; do
  case "${optchar}" in
    i)
      in_dir=$OPTARG
      ;;
    o)
      out_dir=$OPTARG
      ;;
    m)
      mapping_file=$OPTARG
      ;;
    h)
      usage
      ;;
    :)
      printf '\n%bError: option -%s requires an argument%b\n\n' \
        "${RED}" "${OPTARG}" "${DEF}" >&2
      # usage terminates the shell it runs in, so run it in a subshell and
      # then exit with a failure status ourselves.
      (usage) >&2
      exit 1
      ;;
    \?)
      printf '\n%bError: unknown option -%s%b\n\n' \
        "${RED}" "${OPTARG}" "${DEF}" >&2
      (usage) >&2
      exit 1
      ;;
  esac
done

# All three of -i, -o and -m are mandatory; leaving one out is a usage error.
if [[ -z "${in_dir:-}" || -z "${out_dir:-}" || -z "${mapping_file:-}" ]]; then
  (usage) >&2
  exit 1
fi

if [[ ! -d "${in_dir}" ]]; then
  printf '\n%bError: input directory '\''%s'\'' not found...%b\n\n' \
    "${RED}" "${in_dir}" "${DEF}" >&2
  exit 1
fi

if [[ ! -f "${mapping_file}" ]]; then
  printf '\n%bError: mapping file '\''%s'\'' not found...%b\n\n' \
    "${RED}" "${mapping_file}" "${DEF}" >&2
  exit 1
fi

# The mapping file must be tab-delimited, so any space is treated as an error.
# grep -c exits non-zero when it counts zero matching lines; '|| true' keeps
# that from tripping errexit while still capturing the printed count.
SPACES=$(grep -c ' ' -- "${mapping_file}" || true)
if [[ "${SPACES:-0}" -gt 0 ]]; then
  {
    echo
    printf '%bERROR: Spaces were found in %s. This file must be tab-separated.%b\n' \
      "${RED}" "${mapping_file}" "${DEF}"
    echo "Spaces found are indicated below as '*'"
    echo
    # Show the offending lines with every space made visible.
    grep ' ' -- "${mapping_file}" | tr ' ' '*'
    echo
  } >&2
  exit 1
fi

mkdir -p -- "${out_dir}"

# Print "Error: <message>" in red on stderr, framed by blank lines.
_rename_files_error() {
  printf '\n%bError: %s%b\n\n' "${RED}" "$1" "${DEF}" >&2
}

#######################################
# Validate the mapping entries and create (or just preview) the symlinks.
#
# Each entry is "<original filename> TAB <sample> [TAB <read>]". The link is
# named "<sample>.fq" or "<sample>_<read>.fq", with ".gz" appended when the
# source file starts with the gzip magic number. Entirely empty lines are
# skipped.
#
# Globals:
#   mappings      (read) array of mapping lines, with or without trailing newline
#   mapping_file  (read) mapping file path, used in error messages only
#   in_dir        (read) directory holding the original files
#   out_dir       (read) directory in which the symlinks are created
#   LOGFILE       (read) file to which the verbose ln output is appended
#   RED, DEF      (read) colour escape sequences for error messages
# Arguments:
#   $1 - any non-empty value selects dry-run mode: only "old -> new" lines
#        are printed; empty/omitted creates the symlinks
# Outputs:
#   stdout: the preview lines or the verbose ln output; stderr: errors
# Exits:
#   with status 1 (terminating the calling shell) on any validation error
#######################################
function rename_files() {
  local dry_run=$1
  local line infile sample read_no outfile src_dir magic seen
  local -a vals=()
  local -A count_by_sample=() reads_by_sample=() lines_by_sample=()

  # Resolve the input directory to an absolute path once, so that the symlink
  # targets stay valid wherever out_dir lives. CDPATH is cleared so that cd
  # can neither pick another directory nor print one.
  if ! src_dir=$(CDPATH='' cd -- "${in_dir}" && pwd); then
    _rename_files_error "input directory '${in_dir}' not found..."
    exit 1
  fi

  # Pass 1: index the entries per sample using an exact match on column 2.
  # A substring search would also count e.g. "S10" (or a filename containing
  # the sample name) as an entry for sample "S1".
  for line in "${mappings[@]}"; do
    line=${line%$'\n'}
    IFS=$'\t' read -r -a vals <<< "${line}"
    sample=${vals[1]:-}
    [[ -n "${sample}" ]] || continue
    seen=${count_by_sample["${sample}"]:-0}
    count_by_sample["${sample}"]=$((seen + 1))
    # Reads are stored bracketed ("[1][2]") so they can be matched exactly.
    reads_by_sample["${sample}"]+="[${vals[2]:-}]"
    lines_by_sample["${sample}"]+="${line}"$'\n'
  done

  # Pass 2: validate each entry in file order, then preview or link it.
  for line in "${mappings[@]}"; do
    line=${line%$'\n'}
    # Tolerate empty lines (typically a trailing one) instead of aborting.
    [[ -n "${line}" ]] || continue

    IFS=$'\t' read -r -a vals <<< "${line}"
    infile=${vals[0]:-}
    sample=${vals[1]:-}
    read_no=${vals[2]:-}

    # Check we have at least two fields in the mapping data
    if [[ -z "${sample}" ]]; then
      printf '\n%bError: '\''%s'\'' is not a valid mapping file.\n' \
        "${RED}" "${mapping_file}" >&2
      printf 'Please check this is a tab-delimited file with 2 or more columns%b\n' \
        "${DEF}" >&2
      exit 1
    fi

    # Check the input file defined in field 1 exists
    if [[ ! -f "${src_dir}/${infile}" ]]; then
      _rename_files_error "File '${infile}' defined in ${mapping_file} not found in ${in_dir}..."
      exit 1
    fi

    # Check the read pairing information looks correct
    case "${read_no}" in
      1)
        seen=${count_by_sample["${sample}"]:-0}
        if ((seen != 2)); then
          _rename_files_error "Incorrect read pairing for sample ${sample}"
          {
            printf '%s\n' "Read paired data should have two files for each sample, with '1' and '2' in column 3 of the mapping file respectively"
            printf '%s entries were found for %s\n\n' "${seen}" "${sample}"
            printf 'Error found in following line:\n\n'
            printf '%s\n' "${lines_by_sample["${sample}"]}"
          } >&2
          exit 1
        fi
        if [[ "${reads_by_sample["${sample}"]}" != *'[2]'* ]]; then
          _rename_files_error "Read 1 for ${sample} in mapping file does not have corresponding entry for read 2"
          exit 1
        fi
        ;;
      2)
        # A read 2 entry is only valid if the sample also has a read 1 entry
        if [[ "${reads_by_sample["${sample}"]}" != *'[1]'* ]]; then
          _rename_files_error "Read 2 for ${sample} in mapping file does not have corresponding read 1 entry"
          exit 1
        fi
        ;;
    esac

    if [[ -z "${read_no}" ]]; then
      outfile="${sample}.fq"
    else
      outfile="${sample}_${read_no}.fq"
    fi

    # gzip streams start with the bytes 1f 8b. Looking at the content itself
    # cannot be fooled by "gzip" appearing in the path. A file that cannot be
    # read is deliberately treated as not compressed rather than as an error.
    magic=$(od -An -tx1 -N2 "${src_dir}/${infile}" 2>/dev/null | tr -d '[:space:]') || magic=''
    if [[ "${magic}" == '1f8b' ]]; then
      outfile="${outfile}.gz"
    fi

    if [[ -z "${dry_run}" ]]; then
      ln -sv -- "${src_dir}/${infile}" "${out_dir}/${outfile}" | tee -a "${LOGFILE}"
    else
      printf '%s -> %s\n' "${infile}" "${outfile}"
    fi
  done
}

# Main flow: preview the renaming, ask for confirmation, then create the links.
START_DATE=$(date +%d%m%y_%H%M%S)
# The log is written to the current working directory.
LOGFILE="fastq_rename.${START_DATE}.log"
# One array element per mapping line (-t drops the trailing newlines).
readarray -t mappings < "${mapping_file}"

echo
echo "Files will be renamed as follows:"
echo

# Any non-empty argument makes rename_files print the plan without linking.
rename_files 'dry_run'

echo
printf '%bPlease check the renaming above is correct\n' "${RED}"
# read returns non-zero at end of input (Ctrl-D, closed stdin). Without the
# fallback, errexit would end the script here, before the colour is reset.
# An unanswered prompt is treated as "no".
read -r -p "Do you wish to proceed? [y/N] " response || response=''
printf '%b' "${DEF}"

case "$response" in
  [yY][eE][sS] | [yY])
    echo
    rename_files
    echo
    echo "A log of changes made has been written to ${LOGFILE}"
    ;;
  *)
    echo "Exited without making any changes.."
    ;;
esac
