import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Scanner; import java.io.*; import org.apache.commons.lang.StringUtils; /** * @author walshs */ class TsvMangler { // The names of these files correspond to functions available in the openBIS // interface which are used to add Samples with properties and to attach // samples to experiment private final String sampleRegisterFile = "SAMPLES_register_from_a_file_and_attach.txt"; private final String sampleListFile = "SAMPLES_specify_the_list_of_existing_samples.txt"; // The Data Store Server default incoming directory is needed to // write a portion of it's service.properties // Assumes login is openBis private final String incomingDir = new String("~/sprint/datastore_server/data/incoming"); // An integer specification the row which contains column headings private int headRow; // The column headings specified by headRow are stored in header private ArrayList header; // Rows to ignore private ArrayList ignoreRow; // The columns which define a unique Sample key private ArrayList keyCols = new ArrayList(); // List of Samples with Properties private ArrayList swp = new ArrayList(); // The file to parse private String inFile; // The name of the DataSet used by the Date Set Server private String dataSetName; // The name of the experiment private String expt; // The project name private String project; // The TSV is stored as an ArrayList of ArrayLists (of Strings) private ArrayList> data = new ArrayList>(); // directory for writing output files private String topLevel = new String(); // Data Set Property Types column config private HashMap dspts = new HashMap(); // Sample Property Types column config private HashMap spts = new HashMap(); // CONSTRUCTER public TsvMangler(int h, ArrayList i, String in, ArrayList k, String e, HashMap d, String dsn, HashMap s, String p) { headRow = h; ignoreRow = i; inFile = in; expt = e; dspts = d; dataSetName = dsn; project = p; // Change column specs from 1 based references to 0 based refs // for Sample key columns Iterator iterator = k.iterator(); while (iterator.hasNext()) { Integer j = new Integer(iterator.next()); j--; keyCols.add(j); } // Change column specs from 1 based references to 0 based refs // for Sample Property Types for (Integer foo : s.keySet()) { Integer bar = foo; bar--; spts.put(bar, s.get(foo)); } loadTsvFile(); System.out.println("header is size ; " + header.size()); generateSampleKeys(); topLevel = ("/tmp/Tsv2openBIS/" + expt + '/' + System.getProperty("user.name") + "/" + System.currentTimeMillis() + "/"); File top = new File(topLevel); top.mkdirs(); System.out.println("Writing files to " + topLevel); } private void loadTsvFile() { // loads the tsv to memory // splits the data and header into seperate structures try { BufferedReader input = new BufferedReader(new FileReader(inFile)); try { String line = null; Integer lineNum = 0; while ((line = input.readLine()) != null) { lineNum++; if (line.matches("^\\s+$")) { System.err.println("Skipping line " + lineNum + " as it contains no data"); // Weird bug with TSV generated by excel // This line.match skips some empty lines because the println runs // However, it doesn't match all empty lines as some get put into the "data" // ArrayList // Can see this when using the debugger // It's difficult to inspect because the Mac command line tools such as tail // and less // don't recognise /r as a newline break. They display as ^M. // However, the java readLine doesn't have a problem. // Easy quick work around was to go into Excel and delete the blank row of // cells // so that they are truly empty } else { ArrayList dataLine = new ArrayList(); Scanner tokenize = new Scanner(line).useDelimiter("\t"); while (tokenize.hasNext()) { dataLine.add(tokenize.next()); } if (lineNum == headRow) { header = dataLine; } else if (ignoreRow.contains(lineNum)) { // do nothing } else { data.add(dataLine); } } } } finally { input.close(); } } catch (IOException ex) { ex.printStackTrace(); } } private void generateSampleKeys() { // Generate openBIS SAMPLE IDs Iterator> di = data.iterator(); String key2; while (di.hasNext()) { ArrayList row = di.next(); Iterator kci = keyCols.iterator(); StringBuffer buf = new StringBuffer(); while (kci.hasNext()) { Integer i = kci.next(); String value = row.get(i); buf.append(value + '_'); } key2 = new String(StringUtils.chop(buf.toString())); key2 = key2.toUpperCase(); SampleWithProperties s = new SampleWithProperties(key2); // now have to add properties Iterator pi = spts.keySet().iterator(); while (pi.hasNext()) { Integer key3 = pi.next(); String val = row.get(key3); val = val.toUpperCase(); s.addProperty(spts.get(key3), val); } swp.add(s); } } private String getDataSetPropertiesAsText(){ return StringUtils.join(dspts.values().toArray(),"\n"); } public void writeDataSetPropertyLoader() { File f = new File(topLevel + "DataSetProperties.txt"); String txt = "This file simply lists DataSet properties to be registerd for the DataSet " + dataSetName + "\n" + "When the HTTPInvoker API is available this method could write a script to automate\n" + "the registration process\n" + "Anyhow, for now we just list : \n\n" + getDataSetPropertiesAsText(); try { BufferedWriter fOut = new BufferedWriter(new FileWriter(f)); fOut.write(txt); fOut.close(); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); } } public void writeSamplePropertyLoader() { File f = new File(topLevel + "SampleProperties.txt"); String txt = "This file simply lists Sample properties to be registerd for the sample.\n" + "When the HTTPInvoker API is available this method could write a script to automate\n" + "the registration process\n" + "Anyhow, for now we just list : \n\n" + getSamplePropertiesAsText(); try { BufferedWriter fOut = new BufferedWriter(new FileWriter(f)); fOut.write(txt); fOut.close(); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); } } private Object[] getSamplePropertiesAsArray() { // Uses the first Samples With Properties object // All these object have the same Property Types return (swp.get(0)).getPropertyTypes().toArray(); } private String getSamplePropertiesAsText() { return StringUtils.join(getSamplePropertiesAsArray(), "\n"); } public void writeSamplesFile() { Iterator swpi = swp.iterator(); String head = ("identifier\t" + StringUtils.join(getSamplePropertiesAsArray(), "\t") + "\n"); StringBuffer body = new StringBuffer(""); StringBuffer sl = new StringBuffer(""); while (swpi.hasNext()) { SampleWithProperties sample = swpi.next(); body.append("CISD:/" + project + "/" + sample.getName() + "\t" + StringUtils.join(sample.getPropertyValues().toArray(), "\t") + "\n"); sl.append(sample.getName() + "\n"); } File f = new File(topLevel + sampleRegisterFile); File f2 = new File(topLevel + sampleListFile); try { BufferedWriter fOut = new BufferedWriter(new FileWriter(f)); fOut.write(head.toString()); fOut.write(body.toString()); fOut.close(); BufferedWriter fOut2 = new BufferedWriter(new FileWriter(f2)); fOut2.write(sl.toString()); fOut2.close(); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); } } private String getConfigFromTemplate(String t) { StringBuffer sb = new StringBuffer(); sb.append("\n"); sb .append("# ---------------------------------------------------------------------------\n"); sb.append("# " + t + " thread configuration\n"); sb .append("# ---------------------------------------------------------------------------\n"); sb.append("# The directory to watch for incoming data.\n"); sb.append(t + ".incoming-dir = data/incoming/" + t + "\n"); sb.append(t + ".incoming-data-completeness-condition = auto-detection\n"); sb.append(t + ".group-code = " + project + "\n"); sb.append("# ---------------- Plugin properties\n"); sb .append(t + ".data-set-info-extractor = ch.systemsx.cisd.etlserver.DefaultDataSetInfoExtractor\n"); sb.append("# following should be set to $ further up the file \n"); sb .append(t + ".data-set-info-extractor.entity-separator = ${data-set-file-name-entity-separator}\n"); sb.append(t + ".data-set-info-extractor.group-code = " + project + "\n"); sb .append(t + ".data-set-info-extractor.data-set-properties-file-name = data-set-properties.tsv\n"); sb.append(t + ".type-extractor = ch.systemsx.cisd.etlserver.SimpleTypeExtractor\n"); sb.append(t + ".type-extractor.file-format-type = TSV\n"); sb.append(t + ".type-extractor.locator-type = RELATIVE_LOCATION\n"); sb.append(t + ".type-extractor.data-set-type = " + t + "\n"); sb.append(t + ".storage-processor = ch.systemsx.cisd.etlserver.DefaultStorageProcessor\n"); return sb.toString(); } public void writeDssMkdir() { StringBuffer mkdir = new StringBuffer("#!/bin/bash\n" + "mkdir -v "); mkdir.append(incomingDir + "/" + dataSetName + "\n"); File f = new File(topLevel + "make_incoming_dirs.sh"); try { BufferedWriter fOut = new BufferedWriter(new FileWriter(f)); fOut.write(mkdir.toString()); fOut.close(); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); } } public void writeServiceDotProperties() { String inputsLine = new String("inputs=" + dataSetName); StringBuffer config = new StringBuffer(); String t = getConfigFromTemplate(dataSetName); config.append(t); String sptext = new String(inputsLine + "\n\n" + config.toString() + "\n\n"); File f = new File(topLevel + "thread_config_service_properties.txt"); try { BufferedWriter fOut = new BufferedWriter(new FileWriter(f)); fOut.write(sptext); fOut.close(); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); } } public void writeDataForDssUploadOneDirPerRowStyle() { Iterator> di = data.iterator(); Iterator swpi = swp.iterator(); // each row of data while (di.hasNext()) { ArrayList row = di.next(); String sample = swpi.next().getName(); // a cell of data Iterator ci = row.iterator(); Iterator hi = header.iterator(); File path = new File(topLevel + "/DataSet/" + dataSetName + "/" + sample + "/"); File ftsv = new File(path + "/data-set-properties.tsv"); File fres = new File(path + "/result.txt"); try { path.mkdirs(); BufferedWriter ftsvOut = new BufferedWriter(new FileWriter(ftsv)); // TSV BufferedWriter fresOut = new BufferedWriter(new FileWriter(fres)); // RESULT // write the first line of the result file while (hi.hasNext()) { String head = hi.next().toUpperCase(); fresOut.write("'" + head + "'" + "\t"); } fresOut.write("\n"); // write the first line of the data-set-properties file ftsvOut.write("property\tvalue\n"); // write the data and meta-data Integer j = 0; while (ci.hasNext()) { j++; //String cell = ci.next().toUpperCase(); String cell = ci.next(); if (dspts.containsKey(j)) { if (cell.equals("")){ System.err.println(dspts.get(j) + " is `" + cell + "` for sample " + sample); } else { ftsvOut.write(dspts.get(j) + "\t" + cell + "\n"); } } fresOut.write("'" + cell + "'" + "\t"); } System.out.println("printed " + j + " cells for sample " + sample + "\n"); Integer blankCells = header.size() - j; System.out.println("need to print " + blankCells + " blank cells\n"); while (blankCells > 0){ fresOut.write("''" + "\t"); blankCells--; } fresOut.write("\n"); ftsvOut.close(); fresOut.close(); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); } } } }