#!/usr/bin/env /links/application/dsu/.bds/bds /* The BDS automatic command line parsing allows the control on what to run * bds yoda_analysis.bds -reRun true -latestFolder /home/sbsuser/yoda/150304_M01761_0119_000000000-ADTAN * * For trying out what would run use the dryRun flag: * bds -c /links/application/dsu/.bds/bds.config -dryRun -s ssh yoda_analysis.bds -reRun \ * -latestFolder /home/sbsuser/yoda/150304_M01761_0119_000000000-ADTAN \ * -runReadRTATimestamp -runSampleSheetCreation -runTriggerBcl2fastq -runDemultiplexStats -runRsyncOnDemux * -runRsyncFlowcell -runCreateFastqc -runBarcodeDistribution -runRsyncDemux */ bool reRun bool runReadRTATimestamp bool runSampleSheetCreation bool runTriggerBcl2fastq bool runDemultiplexStats bool runRsyncOnDemux bool runRsyncFlowcell bool runCreateFastqc bool runBowtie bool runBarcodeDistribution bool runRsyncDemux bool debugRun string sequencer string latestFolder runBase := "/links/shared/dsu/runs/$sequencer" if (latestFolder.isEmpty()) { latestFolder = getLatestFolder(runBase) } analysisStarted := "$latestFolder" + "/Analysis.started" analysisFinished := "$latestFolder" + "/Analysis.finished" runCompleted := "$latestFolder" + "/RTAComplete.txt" dss := "/links/shared/dsu/dss" # Drop boxes paths rtaIncoming := "$dss/v2_read-rta-timestamp" flowCellData := "$dss/v2_register-flowcell" unalignedData := "$dss/v2_register-flowlane" # is set in the function depending on the run #demuxData := "$dss/v2_read-demultiplex-stats-miseq-hiseq" fastqcData := "$dss/v2_register-fastqc" barcodeDistData := "$dss/v2_register-undetermined" reportsData := "$dss/v2_register-demuliplex-stats" demultiplexedFolder := "demultiplexed" marker := ".MARKER_is_finished_" #marker := ".MARKER_" samplePrefix := "BSSE_QGF_" if (!latestFolder.exists()) { print("Folder $latestFolder does not exist!\n\n") sys exit 1 } string fcName runFolderName := latestFolder.baseName() print("Runfolder: $runFolderName\n") splits := runFolderName.split("_") if (splits[3].startsWith("000")) { fcName = splits[3] } else { fcName = splits[3].substr(1) } print("Flowcell: $fcName\n") string model = get_model(splits[1]) print("Model: $model\n") bool {} taskList # ----------------------------------------------------------------------------- # Pre-Checks if (reRun) { removeOutputFiles() taskList = {"runReadRTATimestamp" => true, \ "runSampleSheetCreation" => true, \ "runTriggerBcl2fastq" => true, \ "runDemultiplexStats" => true, \ "runRsyncOnDemux" => true, \ "runRsyncFlowcell" => true, \ "runCreateFastqc" => true, \ "runBowtie" => true, \ "runBarcodeDistribution" => true, \ "runRsyncDemux" => true} } else { taskList = {"runReadRTATimestamp" => runReadRTATimestamp, \ "runSampleSheetCreation" => runSampleSheetCreation, \ "runTriggerBcl2fastq" => runTriggerBcl2fastq, \ "runDemultiplexStats" => runDemultiplexStats, \ "runRsyncOnDemux" => runRsyncOnDemux, \ "runRsyncFlowcell" => runRsyncFlowcell, \ "runCreateFastqc" => runCreateFastqc, \ "runBowtie" => runBowtie, \ "runBarcodeDistribution" => runBarcodeDistribution, \ "runRsyncDemux" => runRsyncDemux, \ "debugRun" => debugRun} } print("taskList\n$taskList\n") if (analysisStarted.canRead() && (!taskList{"debugRun"})) { print("Analysis already started/done for flowcell $runFolderName\n") exit 0 } if ( runCompleted.canRead() ) { print ("Run is complete, found: $runCompleted\n" ) analysisStarted.write("Started: " + getDate()) startAnalysis("$fcName", "$model") } # -------------------------------------------------------------------------- # Helper functions string getLatestFolder (string runBase) { string [] runFolderList folderList := runBase.dir("*") for (string folder : folderList) { fullfolder := "$runBase/$folder" if (fullfolder.isDir()) { runFolderList.add(fullfolder) } } runFolderList.sort() latestFolder := runFolderList.pop() return latestFolder } void removeOutputFiles() { print "Removing files...\n" string taskId task ( canFail := true, cpus := 1 ){ sys rm "$analysisStarted" "$analysisFinished" sys rm -rf "$latestFolder/$demultiplexedFolder" } wait taskId } string cleanString (string toClean) { return toClean.replace("-", "_") } string [] getLaneNumbers (string searchFolder, string fileRegex) { # try to figure out which lanes are present and returns the result as a string list string [] listOfLanes fileList := searchFolder.dir(fileRegex) for (string aFile : fileList) { string lane splittedName := aFile.split("_") if (splittedName[2].startsWith("L00")) { lane = splittedName[2].substr(3,4) } # Assuming that Illumina leaves out the Lane # information when there using the option "--no-lane-splitting" # with bcl2fastq else { lane = "1" } if (!listOfLanes.has(lane) && !lane.isEmpty()) { listOfLanes.add(lane) } } return listOfLanes } string get_model(string machineId) { """ Guesses the sequencer model from the run folder name Current Naming schema for Illumina run folders, as far as I know, no documentation found on this, Illumina introduced a field called on the NextSeq runParameters.xml. That might be an option for the future. Alternatively a combination of the fields and . MiSeq: 150130_M01761_0114_000000000-ACUR0 NextSeq: 150202_NS500318_0047_AH3KLMBGXX HiSeq 2000: 130919_SN792_0281_BD2CHRACXX HiSeq 2500: 150203_D00535_0052_AC66RWANXX HiSeq 3000: 150724_J00121_0017_AH2VYMBBXX HiSeq 4000: 150210_K00111_0013_AH2372BBXX HiSeq X: 141121_ST-E00107_0356_AH00C3CCXX """ if (machineId.startsWith("NS")) model = "NEXTSEQ_500" else if (machineId.startsWith("M")) model = "MISEQ" else if (machineId.startsWith("D")) model = "HISEQ_2500" else if (machineId.startsWith("SN")) model = "HISEQ_2000" else if (machineId.startsWith("J")) model = "HISEQ_3000" else if (machineId.startsWith("K")) model = "HISEQ_4000" else if (machineId.startsWith("ST")) model = "HISEQ_X" else model = "UNIDENTIFIED" return model } void rsyncRunFolder (string[] rsyncParameters, string source, string targetFolder, string markerFile) { targetFolder.mkdir() source.trim() joinedParameters := rsyncParameters.join() print("rsync $joinedParameters $source $targetFolder\n") string taskId = task /usr/bin/rsync $joinedParameters $source $targetFolder wait taskId if (!markerFile.isEmpty()) { task touch "$markerFile" } } # -------------------------------------------------------------------------- void startAnalysis (string fcName, string model) { # Main function # Read RTA timestamp if (taskList{"runReadRTATimestamp"}) { rsyncRunFolder(["-a"], \ "$runCompleted", \ "$rtaIncoming/$runFolderName", \ "$rtaIncoming/$marker$runFolderName") } # Create a sample sheet from data from openBIS if (taskList{"runSampleSheetCreation"}) { sampleSheetName := triggerSampleSheetCreation() } # Start Demultiplexing if (taskList{"runTriggerBcl2fastq"}) { triggerBcl2fastq(model) } # html demultiplexing overview if (taskList{"runDemultiplexStats"}) { triggerDemultiplexStats() } # Rsync Flow Cell Raw Data if (taskList{"runRsyncFlowcell"}) { sys mkdir "$flowCellData/$runFolderName" rsyncRunFolder (["-a", \ "--exclude='*.cif'", \ "--exclude='*.FWHMMap'", \ "--exclude='demultiplexed'", \ "--exclude='Images'", \ "--exclude='L00*'", \ "--exclude='fastqc'", \ "--exclude='*_pos.txt'"], \ "$latestFolder", \ "$flowCellData/$runFolderName", \ "$flowCellData/$marker$runFolderName") } if (taskList{"runCreateFastqc"}) { createFastqc () } # run in parallel par { if (taskList{"runBarcodeDistribution"}) { barcodeDistribution () } if (taskList{"runBowtie"}) { bowtie () } } wait # Rsync the demultiplexed files if (taskList{"runRsyncOnDemux"}) { rsyncOnDemux() sleep(600) } # Sync Statistic for Flowcell and Lane statistic if (taskList{"runRsyncDemux"}) { rsyncDemux() } if (!taskList{"debugRun"}) { analysisFinished.write("Finished: " + getDate()) send_mail ("Sequencer $sequencer: Analysis finished", "Analysis for flow cell $fcName is finished.") } } /* --------------------------------------------------------------------------*/ string triggerSampleSheetCreation() { createSampleSheetBinary := "/links/application/dsu/createSampleSheet/createSampleSheet_bcl2fastq.sh" splits := runFolderName.split("_") SampleSheetName := "SampleSheet_" + "$fcName" + ".csv" task $createSampleSheetBinary \ -f $fcName \ -o $latestFolder wait return SampleSheetName } void triggerBcl2fastq (string model) { sampleSheetName := "SampleSheet_" + "$fcName" + ".csv" bcl2fastqBinary := "/usr/local/bin/bcl2fastq" string laneSplitting # just appending the option if (model == "NEXTSEQ_500") bcl2fastqBinary += " --no-lane-splitting " task ( cpus := 16 ) { sys /usr/bin/nohup $bcl2fastqBinary \ --with-failed-reads \ --ignore-missing-bcls \ --ignore-missing-controls \ --ignore-missing-positions \ --ignore-missing-filter \ --runfolder-dir $latestFolder \ --input-dir $latestFolder/Data/Intensities/BaseCalls \ --output-dir $latestFolder/$demultiplexedFolder \ --min-log-level DEBUG \ --sample-sheet $latestFolder/$sampleSheetName \ > $latestFolder/nohup_$runFolderName.txt 2>> $latestFolder/nohup_$runFolderName.txt } wait } void rsyncOnDemux () { searchFolder := "$latestFolder/$demultiplexedFolder" listOfLanes := getLaneNumbers(searchFolder, "*R1_001*.fastq.gz") debug(listOfLanes) string [] filesPerLane for (string lane : listOfLanes) { string newFolderName = "Undetermined_" + cleanString(fcName) + "_" + lane #Undetermined_000000000_AH4PH_1 string newFolderPath = searchFolder + "/" + newFolderName newFolderPath.mkdir() # Assuming it is only one lane if (listOfLanes.size() == 1 && listOfLanes[0] == 1) { filesPerLane = searchFolder.dir("*.fastq.gz") } else { filesPerLane = searchFolder.dir("*L00" + lane + "*.fastq.gz") } for (string fastqFile : filesPerLane) { sys mv "$searchFolder/$fastqFile" "$searchFolder/$newFolderName" sys ln -s "$searchFolder/$newFolderName/$fastqFile" "$searchFolder/$fastqFile" } rsyncRunFolder (["-a"], \ "$newFolderPath", \ "$unalignedData", \ "$unalignedData/$marker$newFolderName") } # BSSE_QGF_36104_000000000_AH4PH_1 sampleDirList := searchFolder.dir("$samplePrefix*") for( string sampleFolder : sampleDirList) { rsyncRunFolder (["-a"], \ "$searchFolder/$sampleFolder", \ "$unalignedData", \ "$unalignedData/$marker$sampleFolder") } } void triggerDemultiplexStats () { rsyncRunFolder (["-a"], \ "$latestFolder/$demultiplexedFolder/Reports", \ "$reportsData/$runFolderName", \ "") rsyncRunFolder (["-a"], \ "$latestFolder/$demultiplexedFolder/Stats", \ "$reportsData/$runFolderName", \ "$reportsData/$marker$runFolderName") } void rsyncDemux () { conversionStatsFile := "ConversionStats.xml" demuxFile := "$latestFolder/$demultiplexedFolder/Stats/$conversionStatsFile" if (!demuxFile.canRead()) { printErr("Cannot read $demuxFile") } else { print("Found $demuxFile") } demuxData := "" folder_splits := runFolderName.split("_") if (folder_splits[1].startsWith("NS")) { demuxData = "$dss/v2_read-demultiplex-stats-nextseq" } else { demuxData = "$dss/v2_read-demultiplex-stats-miseq-hiseq" } rsyncRunFolder (["-a"], \ "$demuxFile", \ "$demuxData/$runFolderName", \ "$demuxData/$marker$runFolderName") } void createFastqc () { fastqcBinary := "/links/application/dsu/Python-scripts/fastqc_plots_improved.py" fastqcOutputFolder := "fastqc" outPutFolder := "$latestFolder/$demultiplexedFolder/$fastqcOutputFolder" task python3.4 $fastqcBinary \ --path $latestFolder/$demultiplexedFolder \ --outpath $outPutFolder \ --regex "*.fastq.gz" \ --debug wait listOfLanes := getLaneNumbers("$outPutFolder", "*R1_001*") print("$listOfLanes\n") string laneString # TODO distinguish between single lane and more lanes if (listOfLanes.size() > 1) { laneString = "L00" } else{ laneString = "" } for (string lane : listOfLanes) { filesPerLane := outPutFolder.dir("*$laneString$lane*.html") print("$filesPerLane") folderName := cleanString("$fcName") + "_" + "$lane" newFastqcFolder := "$outPutFolder/$folderName" newFastqcFolder.mkdir() for (string fastqcHtmlfile : filesPerLane) { sys mv "$outPutFolder/$fastqcHtmlfile" "$newFastqcFolder" } rsyncRunFolder (["-a"], \ "$newFastqcFolder", \ "$fastqcData", \ "$fastqcData/$marker$folderName") } } void barcodeDistribution () { barcodeDistBinary := "/links/application/dsu/barcodeDistribution/source/barcodeDistribution.py" searchFolder := "$latestFolder/$demultiplexedFolder/" listOfLanes := getLaneNumbers("$searchFolder", "*R1_001*.gz") print("$listOfLanes\n") string laneString # TODO distinguish between single lane and more lanes if (listOfLanes.size() > 1) { laneString = "L00" } else{ laneString = "" } for (string lane : listOfLanes) { outputFolder := cleanString("$fcName") + "_" + "$lane" fullOutputfolder := "$barcodeDistData/$outputFolder" regex := "Undetermined*$laneString$lane*.fastq.gz" fullOutputfolder.mkdir() task (cpus := 5) { sys python3.4 $barcodeDistBinary \ -f $fcName \ -p $latestFolder/$demultiplexedFolder \ -r $regex \ -o $fullOutputfolder \ -d } wait task touch "$barcodeDistData/$marker$outputFolder" } } void bowtie () { undeterminedPath := "$latestFolder/$demultiplexedFolder/Undetermined_indices/Sample_lane1" bowtie2Binary := "/links/application/dsu/aligner/bowtie2/bowtie2" bowtie2PhixIndices := "/links/application/dsu/Genomes/bowtie2_indexes/phix/phix" r1 := "$undeterminedPath/lane1_Undetermined_L001_R1_001.fastq.gz" r2 := "$undeterminedPath/lane1_Undetermined_L001_R2_001.fastq.gz" task ( cpus := 7 ) { sys $bowtie2Binary \ -p7 \ -x $bowtie2PhixIndices \ -1 $r1 \ -2 $r2 2>"$undeterminedPath/bowtie2_outPE.txt" \ -S "$undeterminedPath/bowtie2_phiX_mapped_reads_PE.sam" } } string getDate() { return sys date } void send_mail (string subject, string message){ mailList := "manuel.kohler@bsse.ethz.ch" task echo "$message" | /usr/bin/mutt -s "$subject" $mailList }