from datetime import datetime from eu.basynthec.cisd.dss import TimeSeriesDataExcel def set_data_type(data_set): data_set.setPropertyValue("DATA_TYPE", "OD600") def retrieve_experiment(tr, exp_id): """Get the specified experiment form the server. Return the experiment.""" if exp_id is None: exp = None else: exp = tr.getExperiment(exp_id) return exp def extract_strains(): """Extract the strains from the data sheet""" strains = [] lines = timeSeriesData.getRawDataLines() for i in range(1, len(lines)): line = lines[i] strains.append(line[0].upper()) return ",".join(strains) def strain_canonical(strainId): """Return the canonical form of the strainId""" if strainId.lower().startswith('jjs-din'): return "JJS-DIn" + strainId[7:] else: return strainId.upper() def assign_properties(dataset, metadata): """Assign properties to the data set from information in the data.""" propertyNameMap = { "STRAIN_NAMES": "STRAIN_NAMES", "TIMEPOINT TYPE": "TIMEPOINT_TYPE", "CELL LOCATION": "CELL_LOCATION", "VALUE TYPE": "VALUE_TYPE", "VALUE UNIT": "VALUE_UNIT", "SCALE": "SCALE" } for prop in metadata.keySet(): key = propertyNameMap.get(prop) if key is not None: value = metadata.get(prop) if (key == "STRAIN"): value = value + " (STRAIN)" dataset.setPropertyValue(key, strain_canonical(value)) def convert_data_to_tsv(tr, dataset, location): """Create a tsv file containing the data and add it to the data set.""" tr.createNewDirectory(dataset, location) tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + ".tsv") tsv = open(tsvFileName, 'w') for line in timeSeriesData.getRawDataLines(): for i in range(0, len(line) - 1): tsv.write(line[i]) tsv.write("\t") tsv.write(line[len(line) - 1]) tsv.write("\n") tsv.close() def convert_data_to_split_tsv(tr, dataset, location): """Create one tsv file per strain in the original data.""" raw_data_lines = timeSeriesData.getRawDataLines() # Extract the header -- this is shared by all files header_line = raw_data_lines[0] # In the header we don't need the strain, but we start with a run number header = 'RunNumber\t' + '\t'.join(header_line[1:len(header_line)]) tr.createNewDirectory(dataset, location) # Keep track of the strains, since a strain can be measured multiple times data_per_strain = {} lines_len = len(raw_data_lines) for i in range(1, len(raw_data_lines)): line = raw_data_lines[i] strain_name = line[0] strain_data = data_per_strain.setdefault(strain_name, []) # Append the line -- this is run number + the data strain_data.append(str(len(strain_data)) + '\t' + '\t'.join(line[1:len(line)])) # Create the files for strain in data_per_strain.iterkeys(): tsvFileName = tr.createNewFile(dataset, location, incoming.getName() + "_" + strain + ".tsv") tsv = open(tsvFileName, 'w') tsv.write(header) strain_data = data_per_strain[strain] for line in strain_data: tsv.write("\n") tsv.write(line) tsv.close() def store_original_data(tr, dataset, location): """Put the original data into the data set.""" tr.createNewDirectory(dataset, location) tr.moveFile(incoming.getAbsolutePath(), dataset, location + "/" + incoming.getName()) tr = service.transaction(incoming) timeSeriesData = TimeSeriesDataExcel.createTimeSeriesDataExcel(incoming.getAbsolutePath()) # create the data set and assign the metadata from the file dataset = tr.createNewDataSet("OD600") metadata = timeSeriesData.getMetadataMap() # Strains are not in the metadata, but in the data, so extract them metadata["STRAIN_NAMES"] = extract_strains() assign_properties(dataset, metadata) # Store the original and tsv data in data sets original_dataset = tr.createNewDataSet("EXCEL_ORIGINAL") set_data_type(original_dataset) store_original_data(tr, original_dataset, "xls") tsv_dataset = tr.createNewDataSet("TSV_MULTISTRAIN_EXPORT") set_data_type(tsv_dataset) convert_data_to_tsv(tr, tsv_dataset, "tsv-multi") tsv_split_dataset = tr.createNewDataSet("TSV_EXPORT") set_data_type(tsv_split_dataset) convert_data_to_split_tsv(tr, tsv_split_dataset, "tsv") # Make the original contain these contained_codes = [original_dataset.getDataSetCode(), tsv_dataset.getDataSetCode(), tsv_split_dataset.getDataSetCode()] dataset.setContainedDataSetCodes(contained_codes) # If no experiment has been set, then get the experiment from the excel file if dataset.getExperiment() is None: exp_id = metadata.get("EXPERIMENT") exp = retrieve_experiment(tr, exp_id) if exp is not None: dataset.setExperiment(exp) original_dataset.setExperiment(exp) tsv_dataset.setExperiment(exp) tsv_split_dataset.setExperiment(exp)