#!/usr/bin/env bash
#
# ==============================================================================
# Purpose: Copy files from one location to another location and perform minor
#          transformations and cleansing if needed.
# ---------------------------
# Called From: Tidal agent
# Author: estark
# ==============================================================================
#
# Flow for every input file matched by --files:
#   1. stage the file into ./tmp/<jobname>/input
#   2. if any transformation is needed: extract it (zip / tar.gz / tar) or move
#      it into ./tmp/<jobname>/unzipped, then trim lines, merge, split and
#      compress as requested
#   3. move the results into ./tmp/<jobname>/output and copy them to the target
#      (an s3:// location via the aws CLI, or a plain filesystem path)
#
# "{date}" in --files / --output_location is replaced with YYYYMMDD for each of
# the last --days days; "{env}" in --output_location is replaced with --env.

# Exit immediately if a command (or any stage of a pipeline) fails.
set -eo pipefail

readonly DATE_TOKEN='{date}'
readonly ENV_TOKEN='{env}'

# ----- script parameters (filled in by parse_args) ----------------------------
machine=""
pJOBNAME=""
pENV=""
pFILES=""
pTARGET=""
pDAYS=1
pSPLITROWS=0
pOUTPUT_FORMAT="gz"
pMERGE_FILES=""
pREMOVE_FIRST=0
pREMOVE_LAST=0
pREMOVE_HEADER=""
pADD_HEADER=""

# ----- working directories (filled in by setup_workdirs) ----------------------
rootdir=./tmp
pTMPDIR=""
pINPUTDIR=""
pUNZIPDIR=""
pSPLITDIR=""
pOUTPUTDIR=""

target_type=""
file_needs_transformation="No"

#######################################
# Prints the help text and terminates the script with status 1.
#######################################
usage() {
  cat <<'EOF'
Copies text files with the following parameters:
   --jobname=          REQUIRED. A job name with no spaces and special characters. Example : daily_transactions
   --env=              REQUIRED. Environment to run in (dev, e2e, prd, prf)
   --files=            REQUIRED. Files to copy
   --days=             OPTIONAL. Iterates {date} variable for Last d days.
                                 If --days not provided {date} = today. --days=1 = today, --days=2 = yesterday and today
   --output_format=    OPTIONAL. Default = gz . File output format. Available options : gz, zip, csv
   --output_location=  REQUIRED. File output target location (S3 or file system). {env} is replaced with --env.
                                 Example : s3://my_s3_bucket/my_target_prefix/
   --removefirst=      OPTIONAL. Removes first x lines from the file
   --removelast=       OPTIONAL. Removes last x lines from the file
   --removeheader      OPTIONAL. Removes header line (no param value is needed)
   --addheader         OPTIONAL. Adds the header line to each split files (no param value is needed)
   --mergefiles        OPTIONAL. Merges small files into a large file
   --splitrows=        OPTIONAL. Max number of rows you want in each file


   Examples :
    sh filecopy.sh \
       --jobname=daily_transactions \
       --env=prd \
       --files='/data/folder/subfolder/Fileprefix_{date}*tar.gz' \
       --days=1 \
       --output_format=csv \
       --output_location='/data2/anothersubfolder/' \
       --splitrows=500000 \
       --addheader

    sh filecopy.sh \
       --jobname=daily_transactions \
       --env=prd \
       --files='/data/folder/subfolder/Fileprefix_{date}*zip' \
       --days=10 \
       --output_format=gz \
       --output_location='s3://mybucket/myprefix/' \
       --removefirst=1 \
       --removelast=1 \
       --removeheader \
       --mergefiles
EOF
  exit 1
}

#######################################
# Parses the command line into the p* globals.
# Long options are handled through the getopts "-:" trick: "--name=value"
# arrives as option "-" with OPTARG "name=value".
# Arguments: the script's command line ("$@")
#######################################
parse_args() {
  local optspec=":hv-:"
  local optchar
  OPTIND=1
  while getopts "$optspec" optchar; do
    case "$optchar" in
      -)
        case "$OPTARG" in
          jobname=*) pJOBNAME="${OPTARG#*=}" ;;
          env=*) pENV="${OPTARG#*=}" ;;
          files=*) pFILES="${OPTARG#*=}" ;;
          days=*) pDAYS="${OPTARG#*=}" ;;
          splitrows=*) pSPLITROWS="${OPTARG#*=}" ;;
          output_format=*) pOUTPUT_FORMAT="${OPTARG#*=}" ;;
          output_location=*) pTARGET="${OPTARG#*=}" ;;
          removefirst=*) pREMOVE_FIRST="${OPTARG#*=}" ;;
          removelast=*) pREMOVE_LAST="${OPTARG#*=}" ;;
          removeheader) pREMOVE_HEADER='Yes' ;;
          addheader) pADD_HEADER='Yes' ;;
          mergefiles) pMERGE_FILES='Yes' ;;
          *) echo "Non-option argument: '-${OPTARG}'" >&2 ;;
        esac
        ;;
      h) usage ;;
      *) echo "Non-option argument: '-${OPTARG}'" >&2 ;;
    esac
  done
}

#######################################
# Exits with status 1 when a required option is missing.
# Arguments: $1 - value of the option, $2 - option name without the dashes
#######################################
require_arg() {
  if [[ -z "$1" ]]; then
    echo "Need to pass the --$2 argument"
    echo ""
    exit 1
  fi
}

#######################################
# Exits with status 1 when an option value is not an integer.
# Arguments: $1 - value of the option, $2 - option name without the dashes
#######################################
require_integer() {
  local int_re='^-?[0-9]+$'
  if ! [[ "$1" =~ $int_re ]]; then
    echo "The --$2 argument must be an integer (got '$1')" >&2
    exit 1
  fi
}

#######################################
# Validates the parsed parameters and normalises --days.
#######################################
validate_args() {
  local jobname_re='^[A-Za-z0-9_.-]+$'

  require_arg "$pJOBNAME" jobname
  require_arg "$pENV" env
  require_arg "$pFILES" files
  require_arg "$pTARGET" output_location

  # The job name becomes a directory under ./tmp that is removed with rm -rf,
  # so it must never be able to point outside of that directory.
  if ! [[ "$pJOBNAME" =~ $jobname_re ]] \
    || [[ "$pJOBNAME" == "." || "$pJOBNAME" == ".." ]]; then
    echo "The --jobname argument may only contain letters, digits, '_', '-' and '.'" >&2
    exit 1
  fi

  require_integer "$pDAYS" days
  require_integer "$pSPLITROWS" splitrows
  require_integer "$pREMOVE_FIRST" removefirst
  require_integer "$pREMOVE_LAST" removelast

  if ((pDAYS < 1)); then
    pDAYS=1
  fi
}

#######################################
# Derives the working directory names from the job name and (re)creates an
# empty job directory.
#######################################
setup_workdirs() {
  pTMPDIR=$rootdir/$pJOBNAME
  pINPUTDIR=$pTMPDIR/input
  pUNZIPDIR=$pTMPDIR/unzipped
  pSPLITDIR=$pTMPDIR/split
  pOUTPUTDIR=$pTMPDIR/output

  mkdir -p "$rootdir"
  rm -rf -- "${pTMPDIR:?}"
  mkdir -p "$pTMPDIR"
}

#######################################
# Succeeds when the given directory contains at least one entry.
# Arguments: $1 - directory
#######################################
dir_has_entries() {
  [[ -n "$(ls -A -- "$1" 2>/dev/null)" ]]
}

#######################################
# Prints how many existing paths the given glob pattern(s) expand to.
# Arguments: $1 - one or more whitespace separated glob patterns
#######################################
count_matching_files() {
  local pattern=$1
  local match
  local count=0
  # Word splitting and globbing of the pattern are intentional here.
  # shellcheck disable=SC2086
  for match in $pattern; do
    if [[ -e "$match" ]]; then
      count=$((count + 1))
    fi
  done
  echo "$count"
}

#######################################
# Empties the job directory and recreates the per-file working directories.
#######################################
clean_tempdirs() {
  rm -rf -- "${pTMPDIR:?}"/*
  mkdir -p "$pINPUTDIR" "$pUNZIPDIR" "$pSPLITDIR" "$pOUTPUTDIR"
}

#######################################
# Decides whether a file can be copied as-is or has to go through the
# extract / trim / merge / split / compress pipeline.
# Arguments: $1 - path of the file
# Globals:   file_needs_transformation (written: "Yes" or "No")
#######################################
check_if_file_needs_transformation() {
  local f=$1
  local current_file_format="${f##*.}"

  file_needs_transformation="No"

  # Any line trimming or splitting requires processing.
  if ((pREMOVE_FIRST > 0 || pREMOVE_LAST > 0 || pSPLITROWS > 0)); then
    file_needs_transformation="Yes"
  fi
  if [[ "$pREMOVE_HEADER" == 'Yes' ]]; then
    file_needs_transformation="Yes"
  fi

  # The file is not yet in the requested output format.
  if [[ "$current_file_format" != "$pOUTPUT_FORMAT" ]]; then
    file_needs_transformation="Yes"
  fi

  # Archives always have to be extracted. The match is done on the whole name
  # because the last extension of "x.tar.gz" is just "gz".
  case "$f" in
    *.zip | *.tar.gz | *.tar)
      file_needs_transformation="Yes"
      ;;
  esac
}

#######################################
# Extracts a zip archive into the unzip directory and deletes the archive.
# Arguments: $1 - path of the zip archive
#######################################
unzip_file() {
  local ff=$1
  unzip -o "$ff" -d "$pUNZIPDIR"
  rm -- "$ff"
  echo ">> File unziped :" "$ff"
}

#######################################
# Extracts a tar / tar.gz archive into the unzip directory and deletes it.
# Arguments: $1 - path of the archive
#######################################
untar_file() {
  local ff=$1
  case "$ff" in
    *.tar) tar -xf "$ff" -C "$pUNZIPDIR"/ ;;
    *) tar -zxf "$ff" -C "$pUNZIPDIR"/ ;;
  esac
  rm -- "$ff"
  echo ">> File untarred :" "$ff"
}

#######################################
# Removes the first $pREMOVE_FIRST lines of a file in place.
# Arguments: $1 - path of the file
#######################################
remove_first_lines() {
  local ff=$1
  local lines_to_start=$((pREMOVE_FIRST + 1))
  tail -n +"$lines_to_start" "$ff" >"$ff.new"
  mv -f -- "$ff.new" "$ff"
  echo ">> First $pREMOVE_FIRST line(s) removed :" "$ff"
}

#######################################
# Removes the last $pREMOVE_LAST lines of a file in place.
# Needs GNU head ("ghead" from coreutils on macOS) for the negative count.
# Arguments: $1 - path of the file
#######################################
remove_last_lines() {
  local ff=$1
  local head_cmd=head
  if [[ "$machine" == 'Mac' ]]; then
    head_cmd=ghead
    if ! command -v "$head_cmd" >/dev/null; then
      echo "ghead (GNU coreutils) is required for --removelast on macOS" >&2
      exit 1
    fi
  fi
  "$head_cmd" -n -"$pREMOVE_LAST" "$ff" >"$ff.new"
  mv -f -- "$ff.new" "$ff"
  echo ">> Last $pREMOVE_LAST line(s) removed :" "$ff"
}

#######################################
# Removes the first (header) line of a file in place.
# Arguments: $1 - path of the file
#######################################
remove_header() {
  local ff=$1
  tail -n +2 "$ff" >"$ff.new"
  mv -f -- "$ff.new" "$ff"
  echo ">> Header removed from :" "$ff"
}

#######################################
# Concatenates every file in the unzip directory into a single
# "<name up to the first dot>.txt" file, which replaces them.
# Arguments: $1 - name of the original input file
#######################################
merge_files() {
  local ffilename fname
  ffilename=$(basename "$1")
  fname="${ffilename%%.*}"

  mkdir -p "$pTMPDIR/merge"
  cat "$pUNZIPDIR"/* >"$pTMPDIR/merge/$fname.txt"
  rm -- "$pUNZIPDIR"/*
  mv "$pTMPDIR/merge/$fname.txt" "$pUNZIPDIR"/
  echo ">> Files have been merged :" "$fname.txt"
}

#######################################
# Zips every file of the unzip directory into its own "<file>.zip" archive and
# moves the results to the output directory.
#######################################
convert_files_to_zip() {
  local f
  for f in "$pUNZIPDIR"/*; do
    if [[ -f "$f" && "$f" != *.zip ]]; then
      # zip expects the archive name first; -j stores the bare file name
      # instead of the ./tmp/... working path.
      zip -j "$f.zip" "$f"
      rm -- "$f"
    fi
  done
  mv "$pUNZIPDIR"/* "$pOUTPUTDIR"/
  echo ">> File(s) zipped and moved to $pOUTPUTDIR"
}

#######################################
# Gzips every file of the unzip directory and moves the results to the output
# directory. Files that already end in .gz are moved untouched, because gzip
# refuses them with a non-zero status that would abort the script.
#######################################
convert_files_to_gzip() {
  local f
  for f in "$pUNZIPDIR"/*; do
    if [[ -f "$f" && "$f" != *.gz ]]; then
      gzip -f -- "$f"
    fi
  done
  mv "$pUNZIPDIR"/* "$pOUTPUTDIR"/
  echo ">> File(s) gzipped and moved to $pOUTPUTDIR"
}

#######################################
# Splits a file into pieces of at most $pSPLITROWS rows named
# "<file>.aa", "<file>.ab", ... next to the original, which is removed.
# With --addheader the first line of the original is repeated on top of
# every piece.
# Arguments: $1 - path of the file to split
#######################################
split_file() {
  local ff=$1
  local filedir filename header_file piece
  local pieces=0

  echo ">> File to split :" "$ff"
  filedir="$(dirname "$ff")"/
  filename=$(basename "$ff")

  if [[ "$pADD_HEADER" == "Yes" ]]; then
    echo ">> Headers will be added"
    # The header is kept outside of the split directory so that it is never
    # mistaken for one of the pieces.
    header_file="$pTMPDIR/tmp_header.txt"
    head -n 1 "$ff" >"$header_file"
    tail -n +2 "$ff" | split -l "$pSPLITROWS" - "${pSPLITDIR}/$filename."
    for piece in "$pSPLITDIR"/*; do
      if [[ -f "$piece" ]]; then
        cat "$header_file" "$piece" >"$piece.new"
        mv -f -- "$piece.new" "$piece"
      fi
    done
    rm -f -- "$header_file"
    echo ">> Headers added"
  else
    split -l "$pSPLITROWS" "$ff" "${pSPLITDIR}/$filename."
  fi

  for piece in "$pSPLITDIR"/*; do
    if [[ -f "$piece" ]]; then
      pieces=$((pieces + 1))
    fi
  done

  # split produces nothing for an empty body (e.g. a header-only file);
  # keep the original instead of losing the data.
  if ((pieces == 0)); then
    echo ">> Nothing to split, file left as is :" "$ff"
    return 0
  fi

  rm -- "$ff"
  mv "$pSPLITDIR"/* "$filedir"
  echo ">> File split completed"
}

#######################################
# Uploads the content of the output directory to the S3 target.
#######################################
copy_files_to_s3() {
  aws s3 cp "$pOUTPUTDIR" "$pTARGET" --recursive
  echo ">> File(s) copied to : " "$pTARGET"
}

#######################################
# Copies the content of the output directory to the filesystem target.
#######################################
copy_files_to_fs() {
  cp "$pOUTPUTDIR"/* "$pTARGET"
  echo ">> File(s) copied to : " "$pTARGET"
}

#######################################
# Runs the trim / merge / split / compress steps on the content of the unzip
# directory and moves the results to the output directory.
# Arguments: $1 - name of the original input file (used to name merged files)
#######################################
transform_unzipped_files() {
  local original_name=$1
  local file

  for file in "$pUNZIPDIR"/*; do
    # Skips sub-directories and the literal pattern left by an empty directory.
    if [[ ! -f "$file" ]]; then
      continue
    fi
    echo " >> File : " "$file"
    if [[ -s "$file" ]]; then
      if ((pREMOVE_FIRST > 0)); then
        remove_first_lines "$file"
      fi
      if ((pREMOVE_LAST > 0)); then
        remove_last_lines "$file"
      fi
      if [[ "$pREMOVE_HEADER" == 'Yes' ]]; then
        remove_header "$file"
      fi
    else
      echo " >> Removing empty file : " "$file"
      rm -- "$file"
    fi
  done

  if ! dir_has_entries "$pUNZIPDIR"; then
    echo ">>>> Unzip folder is Empty"
    return 0
  fi

  # Merge small files if needed
  if [[ "$pMERGE_FILES" == "Yes" ]]; then
    merge_files "$original_name"
  fi

  # Split files if needed. The glob is expanded once, so the pieces that
  # split_file moves back into this directory are not split again.
  if ((pSPLITROWS > 0)); then
    for file in "$pUNZIPDIR"/*; do
      if [[ -f "$file" ]]; then
        split_file "$file"
      fi
    done
  fi

  # Compress if asked; every branch also moves the files to the output folder.
  case "$pOUTPUT_FORMAT" in
    gz) convert_files_to_gzip ;;
    zip) convert_files_to_zip ;;
    *) mv "$pUNZIPDIR"/* "$pOUTPUTDIR"/ ;;
  esac
}

#######################################
# Processes every file matched by $pFILES and copies the result to $pTARGET.
# Exits with status 1 when an input file cannot be staged.
#######################################
copy_files() {
  local sourcefile inputfile filename

  echo "-----------------------------------------------------------------"
  echo ">> File(s) to be processed : " "$pFILES"
  echo "-----------------------------------------------------------------"

  # Word splitting and globbing of $pFILES are intentional: the option holds
  # one or more glob patterns.
  # shellcheck disable=SC2086
  for sourcefile in $pFILES; do
    echo ">>> File to Copy :" "$sourcefile"
    clean_tempdirs

    filename=$(basename "$sourcefile")
    inputfile=$pINPUTDIR/$filename
    if cp "$sourcefile" "$inputfile"; then
      echo ">>>" "$sourcefile" "has been copied to" "$inputfile"
    else
      echo "!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!"
      echo ">>> Cannot copy" "$sourcefile" "to" "$inputfile"
      echo "Exiting the script"
      echo "!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!"
      exit 1
    fi

    check_if_file_needs_transformation "$inputfile"
    if [[ "$file_needs_transformation" == 'Yes' ]]; then
      # Extract archives; anything else is assumed to be a csv/text file.
      case "$inputfile" in
        *.zip) unzip_file "$inputfile" ;;
        *.tar.gz | *.tar) untar_file "$inputfile" ;;
        *) mv "$inputfile" "$pUNZIPDIR" ;;
      esac
      transform_unzipped_files "$filename"
    else
      # Files that need no processing go straight from input to output.
      mv "$pINPUTDIR"/* "$pOUTPUTDIR"/
    fi

    # Copy the files from the output folder to the target location
    if dir_has_entries "$pOUTPUTDIR"; then
      if [[ "$target_type" == "s3" ]]; then
        copy_files_to_s3
      else
        copy_files_to_fs
      fi
    else
      echo ">>>> Output folder is Empty"
    fi
  done
}

#######################################
# Detects the operating system.
# Globals: machine (written: Linux, Mac, Cygwin, MinGw or UNKNOWN:<uname>)
#######################################
get_os_type() {
  local unameOut
  unameOut="$(uname -s)"
  case "$unameOut" in
    Linux*) machine=Linux ;;
    Darwin*) machine=Mac ;;
    CYGWIN*) machine=Cygwin ;;
    MINGW*) machine=MinGw ;;
    *) machine="UNKNOWN:${unameOut}" ;;
  esac
}

#######################################
# Prints the given YYYY-MM-DD date shifted by a number of days.
# BSD date (macOS) and GNU date need different invocations.
# Arguments: $1 - date as YYYY-MM-DD, $2 - signed offset in days (e.g. +1, -3)
#######################################
shift_date() {
  local base=$1
  local offset=$2
  if [[ "$machine" == 'Mac' ]]; then
    date -j -v "${offset}d" -f "%Y-%m-%d" "$base" +%Y-%m-%d
  else
    date -I -d "$base ${offset} day"
  fi
}

#######################################
# Runs copy_files once per day for the last $pDAYS days (oldest first),
# substituting {date} in the file pattern and in the target with YYYYMMDD.
#######################################
copy_files_for_each_date() {
  local filepath=$pFILES
  local target=$pTARGET
  local days_back=$((pDAYS - 1))
  local today tomorrow d stamp num_of_files

  echo ">>> Iterating dates"
  today=$(date +%Y-%m-%d)
  tomorrow=$(shift_date "$today" "+1")
  d=$(shift_date "$today" "-${days_back}")

  while [[ "$d" != "$tomorrow" ]]; do
    stamp=${d//-/}
    pFILES=${filepath//"$DATE_TOKEN"/$stamp}
    pTARGET=${target//"$DATE_TOKEN"/$stamp}

    num_of_files=$(count_matching_files "$pFILES")
    if ((num_of_files > 0)); then
      copy_files
    else
      echo "--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--"
      echo ">> No File(s) found to process : " "$pFILES"
      echo "--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--!--"
    fi

    d=$(shift_date "$d" "+1")
  done
}

#######################################
# Script entry point.
# Arguments: the script's command line ("$@")
#######################################
main() {
  # Only clear the screen when attached to a terminal (the scheduler has none).
  if [[ -t 1 ]]; then
    clear || true
  fi

  parse_args "$@"
  validate_args

  pTARGET=${pTARGET//"$ENV_TOKEN"/$pENV}
  if [[ "$pTARGET" == "s3://"* ]]; then
    target_type='s3'
  else
    target_type='fs'
  fi

  echo "------------------------------------------------------------------------------------"
  echo "Files : $pFILES"
  echo "Target : $pTARGET"
  echo "Output : $pOUTPUT_FORMAT"
  echo "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"

  setup_workdirs

  echo
  echo ">>>>>> FILE COPY STARTED <<<<<<<<<<"
  echo

  # Get the machine OS type
  get_os_type

  # Iterate dates if specified, otherwise execute a single copy
  if [[ "$pTARGET" == *"$DATE_TOKEN"* || "$pFILES" == *"$DATE_TOKEN"* ]]; then
    copy_files_for_each_date
  else
    copy_files
  fi

  rm -rf -- "${pTMPDIR:?}"

  echo
  echo ">>>>>> FILE COPY FINISHED <<<<<<<<<<"
  echo
}

main "$@"