library("ShortRead")

# Collect arguments
args <- commandArgs(TRUE)
print(args)
path <- (args[1])
flowcell <- (args[2])
pattern <- (args[3])
#flowcell <- "C3CFPACXX"
#path <- "/Users/kohleman/PycharmProjects/qgf/barcodeDistribution/testData"  
#pattern <- 'lane4'
filenames <- list.files(path, pattern=pattern, full.names=TRUE)
totalIndexList <- ""

plotTopIndices <- function (topIndices, xName="", path) {
  
  # Creates a barplot out of a data frame.
  #
  # Args:
  #   topIndices: data.frame which must have the columns 
  #   'totalIndexList' and 'Freq'
  #   xName: Name for the x-axes (xlab)
  #
  # Returns:
  #   an image/ a pdf
  
  fileName <- (paste(xName,".pdf", sep=""))
  fileFullPath <- (file.path(path, fileName))

  pdf(file=fileFullPath)
  par(mfcol=c(1, 1))
  countLength<- nchar(toString(max(topIndices$Freq)))
  
  par(las=2) 
  par(mar=c(5,8,4,2)) # increase y-axis margin.
  
  b <- barplot(topIndices$Freq, main="Top 15 indices distribution", xlab=paste("Count of Indices for", xName), horiz=TRUE,
               names=topIndices$totalIndexList, cex.names=.75, las=1)
  text(cex=.75, x=topIndices$Freq/2, y=b, labels=topIndices$Freq, xpd=TRUE)
  dev.off()
}

# -----------------------------------------------------------------------------

multmerge <- function(datalist, mergeBy){
  # Merges a list if data frames by the 'mergeBy' parameter
  Reduce(function(...) {merge(..., by = mergeBy, all=TRUE)}, datalist)
}

# returns string w/o leading or trailing whitespace
trim <- function (x) gsub("^\\s+|\\s+$", "", x)

# -----------------------------------------------------------------------------

streamFASTQ <- function (file, verbose = TRUE) {

  fileBaseName <- basename(file)
  strm <- FastqStreamer(file, readerBlockSize=1e7)
  if (verbose) {
    print (paste("Opening " ,fileBaseName))
  }
  s <- unlist(strsplit(fileBaseName, "_"))
    
  repeat {
    fq <- yield(strm)
    if (length(fq) == 0)
      break
    
    #print (fq)
    #print(sread(fq))
    #print(quality(fq))
    
    header <- id(fq)
    a <- (as.character(header))
    split <- (strsplit(a, ":"))
    indexList <- lapply(split, function(x) strsplit(x, ":")[10])
    indices <- unlist(indexList)
    
    totalIndexList <- c(totalIndexList, indices)
  }

  close(strm)

  counts <- as.data.frame(table(totalIndexList)) 
  sortedCounts <- counts[order(- counts$Freq), ]
  
  return (sortedCounts)
}

ldf <- lapply(filenames, streamFASTQ)

# if there are more than one FASTQ file we do some more stuff
if (length(ldf) > 1) {
  list1 <- multmerge(ldf, "totalIndexList")
  # Replacing all NAs with 0
  list1 [is.na(list1)] <- 0
  # sum up the values row-wise for each column, but leave out the first column
  list1$Freq<- apply(list1[,c(-1)],1,sum)
} else {
  print ("Only one FASTQ found")
  list1 <- ldf[[1]]
}

subList <- list1[c("totalIndexList","Freq")]
orderedDf <- subList[with(subList, order(- Freq)), ]
topIndices <- orderedDf[1:15, ]
#print (topIndices)

fileBaseName <- paste(flowcell, unlist(strsplit(basename(filenames[1]), "_"))[1], sep="_")
#fileBaseName <- trim(fileBaseName)
plotTopIndices (topIndices, fileBaseName, path)

#write.table(list1, file = "list1.csv", sep = ",", col.names = NA, qmethod = "double")