# Validate a header row: row 1 contains the strain ids, row 2 the value types, row 3 the value units
def validate_header_line(row, first_data_col, line, errors):
  """Validate the data columns of a single header row.

  row -- zero-based index of the header row. Row 0 is checked as strain ids,
         row 1 as value types and any other row as value units.
  first_data_col -- index of the first cell in line that is validated.
  line -- the cells of the header row.
  errors -- list that strain errors are appended to; it is also handed to
            isControlledVocabularyPropertyValid for the vocabulary checks.
  """
  # Compare the row index by value. 'is' tests object identity, which only
  # happens to work when the interpreter reuses the same integer object.
  if row == 0:
    # validate the strain
    for i in range(first_data_col, len(line)):
      strain = line[i]
      if not isStrainIdValid(strain):
        # Columns are reported one-based.
        errors.append(createFileValidationError("Strain in col " + str(i + 1) + " " + strainValidationErrorMessageFragment(strain)))

  elif row == 1:
    # validate the value type
    for i in range(first_data_col, len(line)):
      isControlledVocabularyPropertyValid(line[i],
        "value type", ['VALUE', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'ERROR', 'IQR'],
        "'Value', 'Mean', 'Median', 'Std', 'Var', 'Error', 'Iqr'",
        errors)

  else:
    # validate the value unit
    for i in range(first_data_col, len(line)):
      isControlledVocabularyPropertyValid(line[i],
        "value unit", ['MM', 'UM', 'RATIOT1', 'RATIOCS'], "'mM', 'uM', 'RatioT1', 'RatioCs'",
        errors)
      

def validate_data(time_series_data, first_data_row, first_data_col, errors):
  """Validate the header rows, the column header row and the compound ids.

  time_series_data -- object whose getRawDataLines() yields the sheet rows.
  first_data_row -- zero-based index of the row that must start with
                    'CompoundID'; all rows before it are header rows.
  first_data_col -- index of the first data column, forwarded to
                    validate_header_line.
  errors -- list that validation errors are appended to.

  Validation stops at the first row if the 'CompoundID' header is missing.
  """
  # NOTE: both patterns are anchored at the start only, so trailing
  # characters after the digits are accepted.
  chebiRegex = re.compile(r"^CHEBI:[0-9]+")
  bsbmeRegex = re.compile(r"^BSBME:[0-9]+")

  for lineCount, line in enumerate(time_series_data.getRawDataLines()):
    # Dispatch to another function to validate the header
    if lineCount < first_data_row:
      validate_header_line(lineCount, first_data_col, line, errors)
      continue

    # The header needs to be CompoundID. Compare by value: 'is' tests object
    # identity and can be False for two equal integers, which would make
    # this row be treated as a data row.
    if lineCount == first_data_row:
      if line[0] != "CompoundID":
        errors.append(createFileValidationError("The first data column must be 'CompoundID'"))
        break
      continue

    # The compound id should be one of these forms
    compoundId = line[0]
    if not (chebiRegex.match(compoundId) or bsbmeRegex.match(compoundId)):
      errors.append(createFileValidationError("Line " + str(lineCount + 1) + ", column 1 must be of the format 'CHEBI:#' or 'BSBME:#' (instead of " + compoundId + ")."))
    
def validate_metadata(time_series_data, errors):
  """Run the metadata checks for a 'METABOL HYBRID' sheet.

  time_series_data -- object whose getMetadataMap() supplies the metadata.
  errors -- list handed to ValidationHelper together with the metadata
            (presumably where the helper records problems -- confirm
            against ValidationHelper).

  The checks run in a fixed order: header format, the controlled vocabulary
  properties, then the data position specification.
  """
  helper = ValidationHelper(time_series_data.getMetadataMap(), errors)

  # validate the header format
  helper.validateExplicitHeaderFormat("METABOL HYBRID")

  # (property key, label, accepted values, accepted values as displayed)
  controlledProperties = (
    # the timepoint type
    ("TIMEPOINT TYPE", "time point type",
      ['EX', 'IN', 'SI'], "'EX', 'IN', 'SI'"),
    # the cell location
    ("CELL LOCATION", "cell location",
      ['CE', 'ES', 'ME', 'CY', 'NC'], "'CE', 'ES', 'ME', 'CY', 'NC'"),
    # the scale
    ("SCALE", "scale",
      ['LIN', 'LOG2', 'LOG10', 'LN'], "'lin', 'log2', 'log10', 'ln'"),
  )
  for propertyKey, label, accepted, acceptedText in controlledProperties:
    helper.validateControlledVocabularyProperty(propertyKey, label, accepted, acceptedText)

  # validate the data position specification
  helper.validateStartDataRowCol()


def validate_data_set_file(file):
  """Validate a data set file and return the list of validation errors.

  file -- object providing getAbsolutePath() and getName().

  When create_time_series_excel returns None for the file, the only error
  reported is that the file is not an Excel file. Otherwise the metadata is
  validated first, then the data, and all collected errors are returned
  (the list is empty when nothing was reported).
  """
  problems = []
  workbook = create_time_series_excel(file.getAbsolutePath())

  if workbook is not None:
    # Metadata first, so its errors precede the data errors.
    validate_metadata(workbook, problems)

    # The metadata states where the data starts: [0] is the row, [1] the column.
    origin = getInitialDataRowAndCol(workbook.getMetadataMap())
    validate_data(workbook, origin[0], origin[1], problems)
  else:
    problems.append(createFileValidationError(file.getName() + " is not an Excel file."))

  return problems