#!/usr/bin/env /links/application/dsu/.bds/bds /* The BDS automatic command line parsing allows the control on what to run * bds yoda_analysis.bds -reRun true -latestFolder /home/sbsuser/yoda/150304_M01761_0119_000000000-ADTAN * * For trying out what would run use the dryRun flag: * bds -c /links/application/dsu/.bds/bds.config -dryRun -s ssh yoda_analysis.bds -reRun \ * -latestFolder /home/sbsuser/yoda/150304_M01761_0119_000000000-ADTAN \ * -runReadRTATimestamp -runSampleSheetCreation -runTriggerBcl2fastq -runDemultiplexStats -runRsyncOnDemux * -runRsyncFlowcell -runCreateFastqc -runBarcodeDistribution -runRsyncDemux */ bool reRun bool runReadRTATimestamp bool runSampleSheetCreation bool runTriggerBcl2fastq bool runDemultiplexStats bool runRsyncFlowcell bool runCreateFastqc bool runAggregateFastqc bool runBarcodeDistribution bool runRsyncDemultiplexedFiles bool runRsyncLaneStatictics bool runBowtie bool runReadJSON bool runChecksums bool debugRun string sequencer string latestFolder string [] laneList int mismatches runBase := "/links/shared/dsu/runs/$sequencer" if (latestFolder.isEmpty()) { latestFolder = getLatestFolder(runBase) } analysisStarted := "$latestFolder" + "/Analysis.started" analysisFinished := "$latestFolder" + "/Analysis.finished" runCompleted := "$latestFolder" + "/RTAComplete.txt" dss := "/links/shared/dsu/dss" # Drop boxes paths rtaIncoming := "$dss/v2_read-rta-timestamp" flowCellData := "$dss/v2_register-flowcell" unalignedData := "$dss/v2_register-flowlane" # is set in the function depending on the run #demuxData := "$dss/v2_read-demultiplex-stats-miseq-hiseq" fastqcData := "$dss/v2_register-fastqc" fastqcAggregateData := "$dss/v2_register-fastqc-aggregate" barcodeDistData := "$dss/v2_register-undetermined" reportsData := "$dss/v2_register-demuliplex-stats" demultiplexedFolder := "demultiplexed" marker := ".MARKER_is_finished_" #marker := ".MARKER_" samplePrefix := "BSSE_QGF_" if (!latestFolder.exists()) { print("Folder $latestFolder does not exist!\n") exit 0 } string fcName runFolderName := latestFolder.baseName() print("Runfolder: $runFolderName\n") splits := runFolderName.split("_") if (splits[3].startsWith("000")) { fcName = splits[3] } else { fcName = splits[3].substr(1) } print("Flowcell: $fcName\n") string model = get_model(splits[1]) print("Model: $model\n") bool {} taskList # ----------------------------------------------------------------------------- # Pre-Checks if (reRun) { removeOutputFiles() if (laneList.isEmpty()) { taskList = {"runReadRTATimestamp" => true, \ "runSampleSheetCreation" => true, \ "runTriggerBcl2fastq" => true, \ "runDemultiplexStats" => true, \ "runRsyncDemultiplexedFiles" => true, \ "runRsyncFlowcell" => true, \ "runCreateFastqc" => true, \ "runAggregateFastqc" => true, \ "runBowtie" => true, \ "runBarcodeDistribution" => true, \ "runRsyncLaneStatictics" => true,\ "runReadJSON" => true, \ "runChecksums" => true, \ "debugRun" => debugRun} } else { taskList = {"runReadRTATimestamp" => false, \ "runSampleSheetCreation" => true, \ "runTriggerBcl2fastq" => true, \ "runDemultiplexStats" => true, \ "runRsyncDemultiplexedFiles" => true, \ "runRsyncFlowcell" => true, \ "runCreateFastqc" => true, \ "runAggregateFastqc" => true, \ "runBarcodeDistribution" => true, \ "runBowtie" => false, \ "runRsyncLaneStatictics" => true, \ "runReadJSON" => true, \ "runChecksums" => true, \ "debugRun" => true} } } else { taskList = {"runReadRTATimestamp" => runReadRTATimestamp, \ "runSampleSheetCreation" => runSampleSheetCreation, \ "runTriggerBcl2fastq" => runTriggerBcl2fastq, \ "runDemultiplexStats" => runDemultiplexStats, \ "runRsyncDemultiplexedFiles" => runRsyncDemultiplexedFiles, \ "runRsyncFlowcell" => runRsyncFlowcell, \ "runCreateFastqc" => runCreateFastqc, \ "runAggregateFastqc" => runAggregateFastqc, \ "runBowtie" => runBowtie, \ "runBarcodeDistribution" => runBarcodeDistribution, \ "runRsyncLaneStatictics" => runRsyncLaneStatictics, \ "runReadJSON" => runReadJSON, \ "runChecksums" => runChecksums, \ "debugRun" => debugRun} } print("taskList\n$taskList\n") if (analysisStarted.canRead() && (!taskList{"debugRun"})) { print("Analysis already started/done for flowcell $runFolderName\n") exit 0 } if ( runCompleted.canRead() ) { print ("Run is complete, found: $runCompleted\n" ) analysisStarted.write("Started: " + getDate()) startAnalysis("$fcName", "$model") } # -------------------------------------------------------------------------- # Helper functions string getLatestFolder (string runBase) { string [] runFolderList folderList := runBase.dir("*") for (string folder : folderList) { fullfolder := "$runBase/$folder" Analysisstarted := "$fullfolder/" + "Analysis.started" runComplete := "$fullfolder/" + "RTAComplete.txt" if (fullfolder.isDir() && !Analysisstarted.exists() && runComplete.exists()) { runFolderList.add(fullfolder) } } print("Run Folders to consider: " + "$runFolderList\n") string latestFolder = "does_not_exist" if (runFolderList.size() > 0 ) { runFolderList.sort() reversedList := runFolderList.reverse() latestFolder = reversedList.pop() } return latestFolder } void removeOutputFiles() { print "Removing files...\n" for (int i=1; i < 9; i++) { oldDemuxFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$i" print("Removing $oldDemuxFolder\n") string demuxTaskID task ( canFail := true, cpus := 1 ){ sys rm -rf "$oldDemuxFolder" } wait demuxTaskID } string markerTaskId task (canFail := true, cpus := 4 ){ sys rm -f "$analysisStarted" sys rm -f "$analysisFinished" } wait markerTaskId sleep(30) sys rm -f "$analysisStarted" } string cleanString (string toClean) { return toClean.replace("-", "_") } string [] getLaneNumbers (string searchFolder, string fileRegex) { # try to figure out which lanes are present and returns the result as a string list string [] listOfLanes fileList := searchFolder.dir(fileRegex) # print("$searchFolder\n") # print("$fileRegex\n") for (string aFile : fileList) { string lane splittedName := aFile.split("_") # print("splittedName") # print(splittedName) # 'Regular case': Undetermined_S0_L003_R1_001.fastq.gz if (splittedName[2].startsWith("L00") || splittedName[0].startsWith("BSSE")) { splitSize := splittedName.size() lane = splittedName[splitSize-3].substr(3,4) } # Assuming that Illumina leaves out the Lane # information when there using the option "--no-lane-splitting" # with bcl2fastq else { lane = "1" } if (!listOfLanes.has(lane) && !lane.isEmpty()) { listOfLanes.add(lane) } } return listOfLanes } int extractLaneNumberfromRunInfo () { string laneCount = sys /bin/grep LaneCount "$latestFolder/RunInfo.xml" | /bin/awk '{ print $2 }' | /usr/bin/tr -dc '0-9' laneCountInt := laneCount.parseInt() if (model == "NEXTSEQ_500") { laneCountInt = 1 } return laneCountInt } string get_model(string machineId) { """ Guesses the sequencer model from the run folder name Current Naming schema for Illumina run folders, as far as I know, no documentation found on this, Illumina introduced a field called on the NextSeq runParameters.xml. That might be an option for the future. Alternatively a combination of the fields and . MiSeq: 150130_M01761_0114_000000000-ACUR0 NextSeq: 150202_NS500318_0047_AH3KLMBGXX HiSeq 2000: 130919_SN792_0281_BD2CHRACXX HiSeq 2500: 150203_D00535_0052_AC66RWANXX HiSeq 3000: 150724_J00121_0017_AH2VYMBBXX HiSeq 4000: 150210_K00111_0013_AH2372BBXX HiSeq X: 141121_ST-E00107_0356_AH00C3CCXX """ if (machineId.startsWith("NS")) model = "NEXTSEQ_500" else if (machineId.startsWith("M")) model = "MISEQ" else if (machineId.startsWith("D")) model = "HISEQ_2500" else if (machineId.startsWith("SN")) model = "HISEQ_2000" else if (machineId.startsWith("J")) model = "HISEQ_3000" else if (machineId.startsWith("K")) model = "HISEQ_4000" else if (machineId.startsWith("ST")) model = "HISEQ_X" else model = "UNIDENTIFIED" return model } void rsyncRunFolder (string[] rsyncParameters, string source, string targetFolder, string markerFile) { targetFolder.mkdir() source.trim() joinedParameters := rsyncParameters.join() print("rsync $joinedParameters $source $targetFolder\n") string taskId = task /usr/bin/rsync $joinedParameters $source $targetFolder wait taskId if (!markerFile.isEmpty()) { task touch "$markerFile" } } int[] buildLaneList(string [] laneList) { """ Builds a list of lanes which need to be processed. Could be all lanes or a subset which is given by a parameter. """ int [] laneListInt if (laneList.isEmpty()) { int [] laneList int laneCount = extractLaneNumberfromRunInfo() for (int i=1; i < laneCount + 1; i++) { laneListInt.add(i) } } else { for (string lane : laneList) { laneListInt.add(lane.parseInt()) } } return laneListInt } # -------------------------------------------------------------------------- void startAnalysis (string fcName, string model) { # Main function int laneCount = extractLaneNumberfromRunInfo() int [] laneListInt laneListInt = buildLaneList(laneList) print("laneListInt: " + "$laneListInt\n") # Read RTA timestamp if (taskList{"runReadRTATimestamp"}) { rsyncRunFolder(["-a"], \ "$runCompleted", \ "$rtaIncoming/$runFolderName", \ "$rtaIncoming/$marker$runFolderName") } # Read the JSON created with illuminate if (taskList{"runReadJSON"}) { triggerRunReadJSON() } # Create a sample sheet from data from openBIS if (taskList{"runSampleSheetCreation"}) { sampleSheetName := triggerSampleSheetCreation() } # Start Demultiplexing for lanes if (taskList{"runTriggerBcl2fastq"}) { triggerBcl2fastq(model, laneListInt, mismatches) } # html demultiplexing overview if (taskList{"runDemultiplexStats"}) { triggerDemultiplexStats(laneListInt) } if (taskList{"runChecksums"}) { max_jobs := 15 if ((model != "MISEQ") || (model != "NEXTSEQ_500")){ max_jobs = 8 } triggerChecksums(max_jobs, laneListInt) } # Rsync Flow Cell Raw Data if (taskList{"runRsyncFlowcell"}) { # sys mkdir "$flowCellData/$runFolderName" rsyncRunFolder (["-a", \ "--exclude='*.cif'", \ "--exclude='*.FWHMMap'", \ "--exclude='demultiplexed*'", \ "--exclude='Images'", \ "--exclude='L00*'", \ "--exclude='fastqc'", \ "--exclude='*_pos.txt'"], \ "$latestFolder", \ "$flowCellData", \ "$flowCellData/$marker$runFolderName") } if (taskList{"runCreateFastqc"}) { createFastqc (laneListInt) } if (taskList{"runAggregateFastqc"}) { aggregateFastqc (laneListInt) } # run in parallel par { if (taskList{"runBarcodeDistribution"}) { # We skip this for now as we have the stats in the bcl2fastq barcodeDistribution (laneListInt) } if (taskList{"runBowtie"}) { bowtie () } } wait # Rsync the demultiplexed files: Register flow_lane if (taskList{"runRsyncDemultiplexedFiles"}) { rsyncDemultiplexedFiles(laneListInt) if (model == "MISEQ") { sleep(600) } else { sleep(1) } } # Sync Statistic for Flowcell and Lane statistic if (taskList{"runRsyncLaneStatictics"}) { rsyncLaneStatictics(laneListInt) # Ugly hack to ensure that the data have been registered and we can set the properties #if ((model != "MISEQ") || (model != "NEXTSEQ_500")){ sleep(7200) rsyncLaneStatictics(laneListInt) #} } if (!taskList{"debugRun"}) { analysisFinished.write("Finished: " + getDate()) send_mail ("Sequencer $sequencer: Analysis finished", "Analysis for flow cell $fcName is finished.") } } /* --------------------------------------------------------------------------*/ string triggerSampleSheetCreation() { createSampleSheetBinary := "/links/application/dsu/createSampleSheet/createSampleSheet_bcl2fastq.sh" splits := runFolderName.split("_") SampleSheetName := "SampleSheet_" + "$fcName" + ".csv" task $createSampleSheetBinary \ -f $fcName \ -o $latestFolder \ -s wait return SampleSheetName } void triggerBcl2fastq (string model, int [] laneList, int mismatches) { bcl2fastqBinary := "/usr/local/bin/bcl2fastq" string laneSplitting print("Starting demultiplexing using bcl2fastq\n") if (model == "NEXTSEQ_500") { # just appending the option bcl2fastqBinary += " --no-lane-splitting " } print("laneList" + "$laneList\n") for(int lane : laneList) { sampleSheetName := "SampleSheet_" + "$fcName" + "_" + "$lane" + ".csv" outDir := "$latestFolder/$demultiplexedFolder" + "_" + "$lane" nohupFile := "$latestFolder/" + "nohup_" + "$runFolderName" + "_" + "$lane" + ".txt" task ( cpus := 8 ) { sys /usr/bin/nohup $bcl2fastqBinary \ --with-failed-reads \ --ignore-missing-bcls \ --ignore-missing-controls \ --ignore-missing-positions \ --ignore-missing-filter \ --no-bgzf-compression \ --barcode-mismatches $mismatches \ --runfolder-dir $latestFolder \ --input-dir $latestFolder/Data/Intensities/BaseCalls \ --output-dir $outDir\ --min-log-level DEBUG \ --sample-sheet $latestFolder/$sampleSheetName \ > $nohupFile 2>> $nohupFile } wait } } void triggerDemultiplexStats (int [] laneCount) { print("triggerDemultiplexStats: laneCount: " + "$laneCount") binary := "/home/sbsuser/munge_demultiplex_files_bcl2fastq/source/mungeDemultiplexStats_bcl2fastq.py" task python3.5 $binary \ -p "$latestFolder" \ -o "$reportsData/$runFolderName" wait sleep(5) task touch "$reportsData/$marker$runFolderName" for(int lane : laneCount) { rsyncRunFolder (["-a"], \ "$latestFolder/$demultiplexedFolder" + "_" + "$lane" + "/Reports", \ "$reportsData/$runFolderName"+ "_" + "$lane", \ "") rsyncRunFolder (["-a"], \ "$latestFolder/$demultiplexedFolder" + "_" + "$lane" + "/Stats", \ "$reportsData/$runFolderName"+ "_" + "$lane", \ "$reportsData/$marker$runFolderName"+ "_" + "$lane") } } void createFastqc (int [] laneCount) { fastqcBinary := "/links/application/dsu/Python-scripts/fastqc_plots_improved.py" fastqcOutputFolder := "fastqc" for(int intLane : laneCount) { inputFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" outPutFolder := "$inputFolder/$fastqcOutputFolder" task python3.5 $fastqcBinary \ --path $inputFolder \ --outpath $outPutFolder \ --regex "*.fastq.gz" \ --debug wait filesPerLane := outPutFolder.dir("*$intLane*.html") print("$filesPerLane") folderName := cleanString("$fcName") + "_" + "$intLane" newFastqcFolder := "$outPutFolder/$folderName" newFastqcFolder.mkdir() for (string fastqcHtmlfile : filesPerLane) { sys mv "$outPutFolder/$fastqcHtmlfile" "$newFastqcFolder" } rsyncRunFolder (["-a"], \ "$newFastqcFolder", \ "$fastqcData", \ "$fastqcData/$marker$folderName") } } void aggregateFastqc (int [] laneCount) { fastqc_aggregate_binary := "/links/application/dsu/fastqc-aggregation/fastqc_aggregate/fastqc_aggregate.py" fastqcOutputFolder := "fastqc-aggregate" folderName := fastqcOutputFolder + "_" + cleanString("$fcName") outPutFolder := "$latestFolder$folderName" for(int intLane : laneCount) { inputFolder := "$latestFolder$demultiplexedFolder" + "_" + "$intLane/fastqc" filename := cleanString("$fcName") + "_" + "$intLane" + ".html" task python3.5 $fastqc_aggregate_binary --path $inputFolder --outpath $outPutFolder --filename "$filename" --ids M1,M2,M4,M5,M6,M8,M10 } wait rsyncRunFolder (["-a"], \ "$outPutFolder", \ "$fastqcAggregateData", \ "$fastqcAggregateData/$marker$folderName") } void barcodeDistribution (int [] laneCount) { barcodeDistBinary := "/links/application/dsu/barcodeDistribution/source/barcodeDistribution.py" for(int intLane : laneCount) { searchFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" listOfLanes := getLaneNumbers("$searchFolder", "*R1_001*.gz") print("$listOfLanes\n") string laneString # TODO distinguish between single lane and more lanes if (listOfLanes.size() > 1) { laneString = "L00" } else{ laneString = "" } for (string lane : listOfLanes) { outputFolder := cleanString("$fcName") + "_" + "$lane" fullOutputfolder := "$barcodeDistData/$outputFolder" regex := "Undetermined*$laneString$lane*R1*.fastq.gz" fullOutputfolder.mkdir() task (cpus := 2) { sys python3.5 $barcodeDistBinary \ -f $outputFolder \ -p $searchFolder \ -r $regex \ -o $fullOutputfolder \ -d } wait task touch "$barcodeDistData/$marker$outputFolder" } wait } } void rsyncDemultiplexedFiles (int [] laneCount) { for(int intLane : laneCount) { searchFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" nohupFile := "$latestFolder/" + "nohup_" + "$runFolderName" + "_" + "$intLane" + ".txt" listOfLanes := getLaneNumbers(searchFolder, "*R1_001*.fastq.gz") print(listOfLanes) debug(listOfLanes) string [] filesPerLane # just for the Undetermined for (string lane : listOfLanes) { string newFolderName = "Undetermined_" + cleanString(fcName) + "_" + lane #Undetermined_000000000_AH4PH_1 string newFolderPath = searchFolder + "/" + newFolderName # Assuming it is only one lane if (listOfLanes.size() == 1 && listOfLanes[0] == 1) { filesPerLane = searchFolder.dir("*.fastq.gz") } else { filesPerLane = searchFolder.dir("*L00" + lane + "*.fastq.gz") } newFolderPath.mkdir() print("Created $newFolderPath\n") for (string fastqFile : filesPerLane) { string moveAndLinkID task ( canFail := true, cpus := 1 ){ # sys echo "$searchFolder/$fastqFile" "$searchFolder/$newFolderName" sys [[ -f "$searchFolder/$fastqFile" ]] && mv "$searchFolder/$fastqFile" "$searchFolder/$newFolderName/" sys [[ ! -f "$searchFolder/$fastqFile" ]] && ln -s "$searchFolder/$newFolderName/$fastqFile" "$searchFolder/$fastqFile" } } sys chmod -R 774 "$newFolderPath" rsyncRunFolder (["-a"], \ "$nohupFile", \ "$unalignedData/$newFolderName", \ "") rsyncRunFolder (["-a"], \ "$newFolderPath", \ "$unalignedData", \ "$unalignedData/$marker$newFolderName") } # BSSE_QGF_36104_000000000_AH4PH_1 sampleDirList := searchFolder.dir("$samplePrefix*") for( string sampleFolder : sampleDirList) { sys chmod -R 774 "$searchFolder/$sampleFolder" rsyncRunFolder (["-a"], \ "$searchFolder/$sampleFolder", \ "$unalignedData", \ "") rsyncRunFolder (["-a"], \ "$nohupFile", \ "$unalignedData/$sampleFolder", \ "$unalignedData/$marker$sampleFolder") } } } void rsyncLaneStatictics (int [] laneCount) { print("Starting rsyncLaneStatictics with lanes $laneCount\n") for(int intLane : laneCount) { conversionStatsFile := "ConversionStats.xml" demuxFile := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" + "/Stats/$conversionStatsFile" if (!demuxFile.canRead()) { printErr("Cannot read $demuxFile") } else { print("Found $demuxFile") } demuxData := "" folder_splits := runFolderName.split("_") if (folder_splits[1].startsWith("NS")) { demuxData = "$dss/v2_read-demultiplex-stats-nextseq" } else { demuxData = "$dss/v2_read-demultiplex-stats-miseq-hiseq" } outputDir:= "$demuxData/$runFolderName" + "_" + "$intLane" markerFile := "$demuxData/$marker" + "$runFolderName" + "_" + "$intLane" print("$demuxFile") rsyncRunFolder (["-a"], \ "$demuxFile", \ "$outputDir", \ "$markerFile") } } void bowtie () { undeterminedPath := "$latestFolder/$demultiplexedFolder/Undetermined_indices/Sample_lane1" bowtie2Binary := "/links/application/dsu/aligner/bowtie2/bowtie2" bowtie2PhixIndices := "/links/application/dsu/Genomes/bowtie2_indexes/phix/phix" r1 := "$undeterminedPath/lane1_Undetermined_L001_R1_001.fastq.gz" r2 := "$undeterminedPath/lane1_Undetermined_L001_R2_001.fastq.gz" task ( cpus := 7 ) { sys $bowtie2Binary \ -p7 \ -x $bowtie2PhixIndices \ -1 $r1 \ -2 $r2 2>"$undeterminedPath/bowtie2_outPE.txt" \ -S "$undeterminedPath/bowtie2_phiX_mapped_reads_PE.sam" } } void triggerRunReadJSON() { monitoringBinary := "/links/application/dsu/monitor_Illumina/source/monitor.py" v2_read_json_dropbox := "/home/sbsuser/dss/v2_read-json" task /usr/local/bin/python2.7 $monitoringBinary \ --single $latestFolder \ --calculate \ --outpath $v2_read_json_dropbox wait task touch "$v2_read_json_dropbox/$marker$fcName" } void triggerChecksums(int max_jobs, int [] laneCount) { checksumBinary := "/links/application/dsu/crc32/create_checksum_file.sh" for(int intLane : laneCount) { inputFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" task $checksumBinary $inputFolder $max_jobs } wait } string getDate() { return sys date } void send_mail (string subject, string message){ mailList := "kohleman@ethz.ch cbeisel@ethz.ch" #mailList := "kohleman@ethz.ch" task echo "$message" | /usr/bin/mutt -s "$subject" $mailList }