#!/usr/bin/env /links/application/dsu/.bds/bds /* The BDS automatic command line parsing allows the control on what to run * bds yoda_analysis.bds -reRun true -latestFolder /home/sbsuser/yoda/150304_M01761_0119_000000000-ADTAN * * For trying out what would run use the dryRun flag: * bds -c /links/application/dsu/.bds/bds.config -dryRun -s ssh yoda_analysis.bds -reRun \ * -latestFolder /home/sbsuser/yoda/150304_M01761_0119_000000000-ADTAN \ * -runReadRTATimestamp -runSampleSheetCreation -runTriggerBcl2fastq -runDemultiplexStats -runRsyncOnDemux * -runRsyncFlowcell -runCreateFastqc -runBarcodeDistribution -runRsyncDemux */ bool reRun bool runReadRTATimestamp bool runSampleSheetCreation bool runTriggerBcl2fastq bool runDemultiplexStats bool runRsyncFlowcell bool runCreateFastqc bool runBarcodeDistribution bool runRsyncDemultiplexedFiles bool runRsyncLaneStatictics bool runBowtie bool debugRun string sequencer string latestFolder string [] laneList runBase := "/links/shared/dsu/runs/$sequencer" if (latestFolder.isEmpty()) { latestFolder = getLatestFolder(runBase) } analysisStarted := "$latestFolder" + "/Analysis.started" analysisFinished := "$latestFolder" + "/Analysis.finished" runCompleted := "$latestFolder" + "/RTAComplete.txt" dss := "/links/shared/dsu/dss" # Drop boxes paths rtaIncoming := "$dss/v2_read-rta-timestamp" flowCellData := "$dss/v2_register-flowcell" unalignedData := "$dss/v2_register-flowlane" # is set in the function depending on the run #demuxData := "$dss/v2_read-demultiplex-stats-miseq-hiseq" fastqcData := "$dss/v2_register-fastqc" barcodeDistData := "$dss/v2_register-undetermined" reportsData := "$dss/v2_register-demuliplex-stats" demultiplexedFolder := "demultiplexed" marker := ".MARKER_is_finished_" #marker := ".MARKER_" samplePrefix := "BSSE_QGF_" if (!latestFolder.exists()) { print("Folder $latestFolder does not exist!\n") exit 0 } string fcName runFolderName := latestFolder.baseName() print("Runfolder: $runFolderName\n") splits := runFolderName.split("_") if (splits[3].startsWith("000")) { fcName = splits[3] } else { fcName = splits[3].substr(1) } print("Flowcell: $fcName\n") string model = get_model(splits[1]) print("Model: $model\n") bool {} taskList # ----------------------------------------------------------------------------- # Pre-Checks if (reRun) { removeOutputFiles() if (laneList.isEmpty()) { taskList = {"runReadRTATimestamp" => true, \ "runSampleSheetCreation" => true, \ "runTriggerBcl2fastq" => true, \ "runDemultiplexStats" => true, \ "runRsyncDemultiplexedFiles" => true, \ "runRsyncFlowcell" => true, \ "runCreateFastqc" => true, \ "runBowtie" => true, \ "runBarcodeDistribution" => true, \ "runRsyncLaneStatictics" => true,\ "debugRun" => debugRun} } else { taskList = {"runReadRTATimestamp" => false, \ "runSampleSheetCreation" => true, \ "runTriggerBcl2fastq" => true, \ "runDemultiplexStats" => true, \ "runRsyncDemultiplexedFiles" => true, \ "runRsyncFlowcell" => true, \ "runCreateFastqc" => true, \ "runBarcodeDistribution" => true, \ "runBowtie" => false, \ "runRsyncLaneStatictics" => true, \ "debugRun" => true} } } else { taskList = {"runReadRTATimestamp" => runReadRTATimestamp, \ "runSampleSheetCreation" => runSampleSheetCreation, \ "runTriggerBcl2fastq" => runTriggerBcl2fastq, \ "runDemultiplexStats" => runDemultiplexStats, \ "runRsyncDemultiplexedFiles" => runRsyncDemultiplexedFiles, \ "runRsyncFlowcell" => runRsyncFlowcell, \ "runCreateFastqc" => runCreateFastqc, \ "runBowtie" => runBowtie, \ "runBarcodeDistribution" => runBarcodeDistribution, \ "runRsyncLaneStatictics" => runRsyncLaneStatictics, \ "debugRun" => debugRun} } print("taskList\n$taskList\n") if (analysisStarted.canRead() && (!taskList{"debugRun"})) { print("Analysis already started/done for flowcell $runFolderName\n") exit 0 } if ( runCompleted.canRead() ) { print ("Run is complete, found: $runCompleted\n" ) analysisStarted.write("Started: " + getDate()) startAnalysis("$fcName", "$model") } # -------------------------------------------------------------------------- # Helper functions string getLatestFolder (string runBase) { string [] runFolderList folderList := runBase.dir("*") for (string folder : folderList) { fullfolder := "$runBase/$folder" Analysisstarted := "$fullfolder/" + "Analysis.started" if (fullfolder.isDir() && !Analysisstarted.exists()) { runFolderList.add(fullfolder) } } string latestFolder = "does_not_exist" if (runFolderList.size() > 0 ) { runFolderList.sort() latestFolder = runFolderList.pop() } return latestFolder } void removeOutputFiles() { print "Removing files...\n" for (int i=1; i < 9; i++) { oldDemuxFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$i" print("Removing $oldDemuxFolder\n") string demuxTaskID task ( canFail := true, cpus := 1 ){ sys rm -rf "$oldDemuxFolder" } wait demuxTaskID } string markerTaskId task ( canFail := true, cpus := 1 ){ sys rm -f "$analysisStarted" sys rm -f "$analysisFinished" } wait markerTaskId } string cleanString (string toClean) { return toClean.replace("-", "_") } string [] getLaneNumbers (string searchFolder, string fileRegex) { # try to figure out which lanes are present and returns the result as a string list string [] listOfLanes fileList := searchFolder.dir(fileRegex) for (string aFile : fileList) { string lane splittedName := aFile.split("_") if (splittedName[2].startsWith("L00")) { lane = splittedName[2].substr(3,4) } # Assuming that Illumina leaves out the Lane # information when there using the option "--no-lane-splitting" # with bcl2fastq else { lane = "1" } if (!listOfLanes.has(lane) && !lane.isEmpty()) { listOfLanes.add(lane) } } return listOfLanes } int extractLaneNumberfromRunInfo () { string laneCount = sys grep LaneCount "$latestFolder/RunInfo.xml" | awk '{ print $2 }' | tr -dc [:digit:] laneCountInt := laneCount.parseInt() if (model == "NEXTSEQ_500") { laneCountInt = 1 } return laneCountInt } string get_model(string machineId) { """ Guesses the sequencer model from the run folder name Current Naming schema for Illumina run folders, as far as I know, no documentation found on this, Illumina introduced a field called on the NextSeq runParameters.xml. That might be an option for the future. Alternatively a combination of the fields and . MiSeq: 150130_M01761_0114_000000000-ACUR0 NextSeq: 150202_NS500318_0047_AH3KLMBGXX HiSeq 2000: 130919_SN792_0281_BD2CHRACXX HiSeq 2500: 150203_D00535_0052_AC66RWANXX HiSeq 3000: 150724_J00121_0017_AH2VYMBBXX HiSeq 4000: 150210_K00111_0013_AH2372BBXX HiSeq X: 141121_ST-E00107_0356_AH00C3CCXX """ if (machineId.startsWith("NS")) model = "NEXTSEQ_500" else if (machineId.startsWith("M")) model = "MISEQ" else if (machineId.startsWith("D")) model = "HISEQ_2500" else if (machineId.startsWith("SN")) model = "HISEQ_2000" else if (machineId.startsWith("J")) model = "HISEQ_3000" else if (machineId.startsWith("K")) model = "HISEQ_4000" else if (machineId.startsWith("ST")) model = "HISEQ_X" else model = "UNIDENTIFIED" return model } void rsyncRunFolder (string[] rsyncParameters, string source, string targetFolder, string markerFile) { targetFolder.mkdir() source.trim() joinedParameters := rsyncParameters.join() print("rsync $joinedParameters $source $targetFolder\n") string taskId = task /usr/bin/rsync $joinedParameters $source $targetFolder wait taskId if (!markerFile.isEmpty()) { task touch "$markerFile" } } int[] buildLaneList(string [] laneList) { """ Builds a list of lanes which need to be processed. Could be all lanes or a subset which is given by a parameter. """ int [] laneListInt if (laneList.isEmpty()) { int [] laneList int laneCount = extractLaneNumberfromRunInfo() for (int i=1; i < laneCount + 1; i++) { laneListInt.add(i) } } else { for (string lane : laneList) { laneListInt.add(lane.parseInt()) } } return laneListInt } # -------------------------------------------------------------------------- void startAnalysis (string fcName, string model) { # Main function int laneCount = extractLaneNumberfromRunInfo() int [] laneListInt laneListInt = buildLaneList(laneList) # Read RTA timestamp if (taskList{"runReadRTATimestamp"}) { rsyncRunFolder(["-a"], \ "$runCompleted", \ "$rtaIncoming/$runFolderName", \ "$rtaIncoming/$marker$runFolderName") } # Create a sample sheet from data from openBIS if (taskList{"runSampleSheetCreation"}) { sampleSheetName := triggerSampleSheetCreation() } # Start Demultiplexing for lanes if (taskList{"runTriggerBcl2fastq"}) { triggerBcl2fastq(model, laneListInt) } # html demultiplexing overview if (taskList{"runDemultiplexStats"}) { triggerDemultiplexStats(laneListInt) } # Rsync Flow Cell Raw Data if (taskList{"runRsyncFlowcell"}) { # sys mkdir "$flowCellData/$runFolderName" rsyncRunFolder (["-a", \ "--exclude='*.cif'", \ "--exclude='*.FWHMMap'", \ "--exclude='demultiplexed*'", \ "--exclude='Images'", \ "--exclude='L00*'", \ "--exclude='fastqc'", \ "--exclude='*_pos.txt'"], \ "$latestFolder", \ "$flowCellData", \ "$flowCellData/$marker$runFolderName") } if (taskList{"runCreateFastqc"}) { createFastqc (laneListInt) } # run in parallel par { if (taskList{"runBarcodeDistribution"}) { barcodeDistribution (laneListInt) } if (taskList{"runBowtie"}) { bowtie () } } wait # Rsync the demultiplexed files if (taskList{"runRsyncDemultiplexedFiles"}) { rsyncDemultiplexedFiles(laneListInt) sleep(1500) } # Sync Statistic for Flowcell and Lane statistic if (taskList{"runRsyncLaneStatictics"}) { rsyncLaneStatictics(laneListInt) } if (!taskList{"debugRun"}) { analysisFinished.write("Finished: " + getDate()) send_mail ("Sequencer $sequencer: Analysis finished", "Analysis for flow cell $fcName is finished.") } } /* --------------------------------------------------------------------------*/ string triggerSampleSheetCreation() { createSampleSheetBinary := "/links/application/dsu/createSampleSheet/createSampleSheet_bcl2fastq.sh" splits := runFolderName.split("_") SampleSheetName := "SampleSheet_" + "$fcName" + ".csv" task $createSampleSheetBinary \ -f $fcName \ -o $latestFolder \ -s wait return SampleSheetName } void triggerBcl2fastq (string model, int [] laneList) { bcl2fastqBinary := "/usr/local/bin/bcl2fastq" string laneSplitting if (model == "NEXTSEQ_500") { # just appending the option bcl2fastqBinary += " --no-lane-splitting " } for(int lane : laneList) { sampleSheetName := "SampleSheet_" + "$fcName" + "_" + "$lane" + ".csv" outDir := "$latestFolder/$demultiplexedFolder" + "_" + "$lane" nohupFile := "$latestFolder/" + "nohup_" + "$runFolderName" + "_" + "$lane" + ".txt" task ( cpus := 16 ) { sys /usr/bin/nohup $bcl2fastqBinary \ --with-failed-reads \ --ignore-missing-bcls \ --ignore-missing-controls \ --ignore-missing-positions \ --ignore-missing-filter \ --runfolder-dir $latestFolder \ --input-dir $latestFolder/Data/Intensities/BaseCalls \ --output-dir $outDir\ --min-log-level DEBUG \ --sample-sheet $latestFolder/$sampleSheetName \ > $nohupFile 2>> $nohupFile } wait } } void triggerDemultiplexStats (int [] laneCount) { for(int lane : laneCount) { rsyncRunFolder (["-a"], \ "$latestFolder/$demultiplexedFolder" + "_" + "$lane" + "/Reports", \ "$reportsData/$runFolderName"+ "_" + "$lane", \ "") rsyncRunFolder (["-a"], \ "$latestFolder/$demultiplexedFolder" + "_" + "$lane" + "/Stats", \ "$reportsData/$runFolderName"+ "_" + "$lane", \ "$reportsData/$marker$runFolderName"+ "_" + "$lane") } } void createFastqc (int [] laneCount) { fastqcBinary := "/links/application/dsu/Python-scripts/fastqc_plots_improved.py" fastqcOutputFolder := "fastqc" for(int intLane : laneCount) { inputFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" outPutFolder := "$inputFolder/$fastqcOutputFolder" task python3.4 $fastqcBinary \ --path $inputFolder \ --outpath $outPutFolder \ --regex "*.fastq.gz" \ --debug wait filesPerLane := outPutFolder.dir("*$intLane*.html") print("$filesPerLane") folderName := cleanString("$fcName") + "_" + "$intLane" newFastqcFolder := "$outPutFolder/$folderName" newFastqcFolder.mkdir() for (string fastqcHtmlfile : filesPerLane) { sys mv "$outPutFolder/$fastqcHtmlfile" "$newFastqcFolder" } rsyncRunFolder (["-a"], \ "$newFastqcFolder", \ "$fastqcData", \ "$fastqcData/$marker$folderName") } } void barcodeDistribution (int [] laneCount) { barcodeDistBinary := "/links/application/dsu/barcodeDistribution/source/barcodeDistribution.py" for(int intLane : laneCount) { searchFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" listOfLanes := getLaneNumbers("$searchFolder", "*R1_001*.gz") print("$listOfLanes\n") string laneString # TODO distinguish between single lane and more lanes if (listOfLanes.size() > 1) { laneString = "L00" } else{ laneString = "" } for (string lane : listOfLanes) { outputFolder := cleanString("$fcName") + "_" + "$lane" fullOutputfolder := "$barcodeDistData/$outputFolder" regex := "Undetermined*$laneString$lane*.fastq.gz" fullOutputfolder.mkdir() task (cpus := 2) { sys python3.4 $barcodeDistBinary \ -f $outputFolder \ -p $searchFolder \ -r $regex \ -o $fullOutputfolder \ -d } wait task touch "$barcodeDistData/$marker$outputFolder" } wait } } void rsyncDemultiplexedFiles (int [] laneCount) { for(int intLane : laneCount) { searchFolder := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" listOfLanes := getLaneNumbers(searchFolder, "*R1_001*.fastq.gz") debug(listOfLanes) string [] filesPerLane for (string lane : listOfLanes) { string newFolderName = "Undetermined_" + cleanString(fcName) + "_" + lane #Undetermined_000000000_AH4PH_1 string newFolderPath = searchFolder + "/" + newFolderName newFolderPath.mkdir() # Assuming it is only one lane if (listOfLanes.size() == 1 && listOfLanes[0] == 1) { filesPerLane = searchFolder.dir("*.fastq.gz") } else { filesPerLane = searchFolder.dir("*L00" + lane + "*.fastq.gz") } for (string fastqFile : filesPerLane) { sys mv "$searchFolder/$fastqFile" "$searchFolder/$newFolderName" sys ln -s "$searchFolder/$newFolderName/$fastqFile" "$searchFolder/$fastqFile" } sys chmod -R 774 "$newFolderPath" rsyncRunFolder (["-a"], \ "$newFolderPath", \ "$unalignedData", \ "$unalignedData/$marker$newFolderName") } # BSSE_QGF_36104_000000000_AH4PH_1 sampleDirList := searchFolder.dir("$samplePrefix*") for( string sampleFolder : sampleDirList) { sys chmod -R 774 "$searchFolder/$sampleFolder" rsyncRunFolder (["-a"], \ "$searchFolder/$sampleFolder", \ "$unalignedData", \ "$unalignedData/$marker$sampleFolder") } } } void rsyncLaneStatictics (int [] laneCount) { print("Starting rsyncLaneStatictics with lanes $laneCount\n") for(int intLane : laneCount) { conversionStatsFile := "ConversionStats.xml" demuxFile := "$latestFolder/$demultiplexedFolder" + "_" + "$intLane" + "/Stats/$conversionStatsFile" if (!demuxFile.canRead()) { printErr("Cannot read $demuxFile") } else { print("Found $demuxFile") } demuxData := "" folder_splits := runFolderName.split("_") if (folder_splits[1].startsWith("NS")) { demuxData = "$dss/v2_read-demultiplex-stats-nextseq" } else { demuxData = "$dss/v2_read-demultiplex-stats-miseq-hiseq" } outputDir:= "$demuxData/$runFolderName" + "_" + "$intLane" markerFile := "$demuxData/$marker" + "$runFolderName" + "_" + "$intLane" print("$demuxFile") rsyncRunFolder (["-a"], \ "$demuxFile", \ "$outputDir", \ "$markerFile") } } void bowtie () { undeterminedPath := "$latestFolder/$demultiplexedFolder/Undetermined_indices/Sample_lane1" bowtie2Binary := "/links/application/dsu/aligner/bowtie2/bowtie2" bowtie2PhixIndices := "/links/application/dsu/Genomes/bowtie2_indexes/phix/phix" r1 := "$undeterminedPath/lane1_Undetermined_L001_R1_001.fastq.gz" r2 := "$undeterminedPath/lane1_Undetermined_L001_R2_001.fastq.gz" task ( cpus := 7 ) { sys $bowtie2Binary \ -p7 \ -x $bowtie2PhixIndices \ -1 $r1 \ -2 $r2 2>"$undeterminedPath/bowtie2_outPE.txt" \ -S "$undeterminedPath/bowtie2_phiX_mapped_reads_PE.sam" } } string getDate() { return sys date } void send_mail (string subject, string message){ mailList := "manuel.kohler@bsse.ethz.ch" task echo "$message" | /usr/bin/mutt -s "$subject" $mailList }