diff --git a/library_information_validator.py b/library_information_validator.py
new file mode 100755
index 0000000..fa6260a
--- /dev/null
+++ b/library_information_validator.py
@@ -0,0 +1,507 @@
+#!/usr/bin/env python
+
+# load libraries
+from __future__ import print_function
+import argparse
+from openpyxl import load_workbook
+import coloredlogs
+import logging
+import re
+import couchdb
+import numbers
+import decimal
+import yaml
+from numpy import setdiff1d
+from collections import Counter
+import Levenshtein as lev
+import sys
+
+# Set up a logger with colored output
+logger = logging.getLogger(__name__)
+logger.propagate = False  # otherwise the messages are logged twice
+coloredlogs.install(level='INFO', logger=logger,
+                    fmt='%(asctime)s %(levelname)s %(message)s')
+
+
+class LibrarySheet:
+    # Class attributes
+    SHEET_NAME = 'Sample information'
+    FIRST_LINE = 20  # first line where user-submitted data is located
+    SAMPLE_NAME_COL = 'P'  # user-defined sample name
+    POOL_NAME_COL = 'V'  # column of pool names in the pool summary
+    POOL_NAME_SAMPLE_COL = "O"  # column of pool names in the index definition
+    LENGTH_COL = 'Y'  # average fragment length of pools
+    MOLARITY_COL = 'AA'  # molarity of the pool, currently not used
+    SINDEX_COL = 'S'  # NGI standard index sequences (automated)
+    CINDEX_COL = 'T'  # custom index sequences
+    PLATE_ID = 'N6'  # plate ID as specified in the user sheet
+    MAX_DISTANCE = 2  # indexes in the same pool must differ in at least this many positions
+
+    # Initializer / Instance attributes
+    def __init__(self, library_info_sheet):
+        self.library_info_sheet = library_info_sheet
+        self.library_sheet = None
+        self.sample_rec = None
+        self.Project_Information = None
+
+    # instance methods
+    def getAccessUserSheet(self):
+        if self.library_sheet is None:
+            wb = load_workbook(self.library_info_sheet, read_only=True, data_only=True)
+            ws = wb[LibrarySheet.SHEET_NAME]
+            self.library_sheet = ws
+
+    def projectID(self):
+        """Retrieves the project and plate ID from the Excel sheet and checks that
+        the plate ID has the correct format."""
+        self.getAccessUserSheet()
+        plate_id = self.library_sheet[LibrarySheet.PLATE_ID].value
+        if(len(re.findall(r'P\d+P\d+', plate_id)) > 0):
+            project_id_user = re.findall(r'P\d+', plate_id)[0]
+        else:
+            logger.error(
+                'The given plate ID ({}) in cell {} has the wrong format. It should be in the format'
+                ' PxxxxxPx, where x are numbers. If you think your plate ID is correct, contact your project coordinator.'\
+                .format(plate_id, LibrarySheet.PLATE_ID)
+            )
+            sys.exit(1)
+        return([project_id_user, plate_id])
+
+    def getRows(self, column):
+        """Identifies all rows containing a sample name and discards rows without an entry.
+        Rows containing only whitespace trigger a warning and are disregarded in
+        subsequent tests."""
+        warning_empty_row = 0
+        cellID_withSample = list()
+        cellID_noSample = list()
+        for i in range(LibrarySheet.FIRST_LINE, LibrarySheet.FIRST_LINE + 96):
+            cell_id = "{col}{row_iter}".format(col=column, row_iter=i)
+            cell_value = str(self.library_sheet[cell_id].value)
+            if(cell_value.isspace()):
+                logger.warning(
+                    'Cell {} contains empty spaces only. Remove content.'.format(cell_id)
+                )
+                warning_empty_row += 1
+            elif(self.library_sheet[cell_id].value is not None):
+                cellID_withSample.append(i)
+            else:
+                cellID_noSample.append(cell_id)
+        return(cellID_withSample, warning_empty_row)
+
+    def ProjectInfo(self, config):
+        """
+        Retrieves the project information from couchDB and checks that the project
+        exists in couchDB and is unique.
+        Returns the project information and the full project plate ID.
+        """
+        # access the project in couchDB using information from the given config file
+        with open(config) as settings_file:
+            server_settings = yaml.load(settings_file, Loader=yaml.FullLoader)
+        couch = couchdb.Server(server_settings.get("couch_server", None))
+        db = couch["projects"]
+        # check the existence of the project number in couchDB
+        project_plate_ID = self.projectID()
+        project_id_found = db.view("project/project_id", key=project_plate_ID[0])
+        prow = project_id_found.rows
+        # Project not found
+        if len(prow) == 0:
+            logger.error(
+                'Project not found, please check your entry for the plate ID, it should have the format'
+                ' PxxxxxPx, where x are numbers. If your plate ID is correct, contact your project coordinator.'
+            )
+            sys.exit(1)
+        # more than one project found
+        elif len(prow) > 1:
+            logger.error(
+                'Project ID not unique, please check your entry for the plate ID, it should have the format'
+                ' PxxxxxPx, where x are numbers. If your plate ID is correct, contact your project coordinator.'
+            )
+            sys.exit(1)
+        else:
+            # store the document of the identified project and the full plate ID
+            self.Project_Information = db.get(prow[0].id)
+            self.Project_Plate_ID = project_plate_ID[1]
+
+    def validate_project_Name(self):
+        """
+        Reports the project name identified from the user-supplied plate/project ID so
+        that the project coordinator can verify it. Further checks that the plate
+        number is not already used in couchDB.
+        """
+        project_name_DB = self.Project_Information['project_name']
+        samples = self.Project_Information['samples'].keys()
+        plate = 'P{}_{}'.format(self.Project_Plate_ID.split("P")[1], self.Project_Plate_ID.split("P")[2])
+        logger.info(
+            'Plate ID {} was identified as project {}.'.format(self.Project_Plate_ID, project_name_DB))
+        found_plate = [s for s in samples if plate in s]
+        warning_project_name = 0
+        if(len(found_plate) > 0):
+            new_plate_no = int(self.Project_Plate_ID.split("P")[2])
+            new_plate_no += 1
+            new_plate_ID = 'P{}P{}'.format(self.Project_Plate_ID.split("P")[1], new_plate_no)
+            logger.warning(
+                'Plate number {} is already used. Please increase the plate number to {}.'.format(self.Project_Plate_ID, new_plate_ID))
+            warning_project_name = 1
+        return(warning_project_name)
+
+    def validate_sequencing_setup(self, info, cell_id_length):
+        """
+        Checks that the ordered number of cycles fits the given insert length.
+        OBS: a 15% difference is currently allowed before a warning is given;
+        this threshold is hard-coded at the moment.
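+        For example, a "2x151" setup gives 2 * 151 = 302 cycles; an average insert
+        length of 350 bp would then trigger a warning, since 302 * 1.15 = 347.3 < 350.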
+        """
+        warning_cycles = 0
+        cycle_no_string = info['details']['sequencing_setup']
+        cycle_no_array = cycle_no_string.split("x")
+        cycle_no = int(cycle_no_array[0]) * int(cycle_no_array[1])
+        insert_length = self.library_sheet[cell_id_length].value
+
+        if((int(cycle_no) * 1.15) < int(insert_length)):
+            logger.warning('The number of cycles ({} = {}) is lower than the insert length ({} bp) in cell {}.'\
+                .format(cycle_no_string, cycle_no, insert_length, cell_id_length))
+            warning_cycles += 1
+        return(warning_cycles)
+
+    def validate(self):
+        """
+        - identifies the samples in a pool
+        - detects missing entries in the pool column
+        - initiates the index check, which is done for each pool independently
+        """
+        cell_rowid_sample, warning_eR_sample = self.getRows(LibrarySheet.SAMPLE_NAME_COL)
+        cell_rowid_pool, warning_eR_pool_sample = self.getRows(LibrarySheet.POOL_NAME_SAMPLE_COL)
+        cell_rowid_pool_info, warning_eR_pool = self.getRows(LibrarySheet.POOL_NAME_COL)
+
+        # check for missing pool ID entries
+        if(len(cell_rowid_sample) > len(cell_rowid_pool)):
+            missing_pool_rowid_list = setdiff1d(cell_rowid_sample, cell_rowid_pool)
+            for missing_pool_rowid in missing_pool_rowid_list:
+                logger.error(
+                    'Missing pool definition in {}{}'.format(LibrarySheet.POOL_NAME_SAMPLE_COL, missing_pool_rowid))
+            sys.exit(1)
+
+        # initiate check for the sequencing setup and for discrepancies between the
+        # ordered number of cycles and the average fragment length
+        warnings_cycle = []
+        for row_nr in cell_rowid_pool_info:
+            cell_id_mol = "{col}{row_nr}".format(col=LibrarySheet.MOLARITY_COL, row_nr=row_nr)  # to work on later
+            cell_id_length = "{col}{row_nr}".format(col=LibrarySheet.LENGTH_COL, row_nr=row_nr)
+            cell_id_pool = "{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_COL, row_nr=row_nr)
+            validator = Validator(self.library_sheet, cell_id_mol)  # molarity is currently not checked
+            result_numeric, warnings_numeric = validator.validate_numeric()
+            warnings_c = self.validate_sequencing_setup(self.Project_Information, cell_id_length)
+            warnings_cycle.append(warnings_c)
+
+        # retrieve all defined pool IDs in order to analyse by pool later
+        pool_values = []
+        for row_nr in cell_rowid_sample:
+            current_cell_id_pool = "{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=row_nr)
+            current_cell_value_pool = self.library_sheet[current_cell_id_pool].value
+            pool_values.append(current_cell_value_pool)
+
+        # initiate check by pool for index issues
+        # - differences in index lengths
+        # - low diversity between indexes
+        # - duplicated indexes
+        # - differences in length within dual indexes
+        # - unbalanced pools
+        poolIDs = list(dict.fromkeys(pool_values))
+        # warnings_cycle is assumed to hold one entry per pool, in the same order as
+        # the pools appear in the pool summary
+        i = 0
+        current_pool_rows = []
+        pools_with_warnings = []
+        for pool in poolIDs:
+            for nrow_nr in cell_rowid_sample:
+                current_cell_id_pool = "{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=nrow_nr)
+                current_cell_value_pool = self.library_sheet[current_cell_id_pool].value
+                if(current_cell_value_pool == pool):
+                    current_pool_rows.append(nrow_nr)
+            validator = Validator(self.library_sheet)
+
+            dictionary_sel_index = validator.select_index(current_pool_rows, pool)
+            dictionary_val_index = \
+                validator.validate_index(dictionary_sel_index["result_index"],
+                                         pool,
+                                         dictionary_sel_index["sindex"])
+            # summary of warnings
+            all_warnings = [
+                dictionary_val_index["warning_low_div"],
+                dictionary_val_index["warning_index_length"],
+                dictionary_val_index["warning_index_balance"],
+                warnings_cycle[i],
+                dictionary_sel_index["warning_index_mix"],
+                dictionary_sel_index["warning_length_comp"]
+            ]
+            sums_warnings = sum(all_warnings)
+            if(sums_warnings > 0):
+                pools_with_warnings.append(pool)
+            current_pool_rows = []
+            i += 1
+
+        return(len(pools_with_warnings), len(poolIDs))
+
+
+class Validator(object):
+    # Initializer / Instance attributes
+    def __init__(self, access_sample_info_sheet, molarityID=None):
+        self.access_sample_info_sheet = access_sample_info_sheet
+        self.molarityID = molarityID
+
+    # instance methods
+    def validate_numeric(self):
+        """Checks whether the value of the given cell is numeric."""
+        warnings_numeric = 0
+        for checkNumbers in [self.molarityID]:
+            if not isinstance(self.access_sample_info_sheet[checkNumbers].value, numbers.Number):
+                try:
+                    float(self.access_sample_info_sheet[checkNumbers].value.replace(",", "."))
+                    logger.error(
+                        'Cell {} with value \"{}\" is not numeric due to a decimal point/comma clash.'\
+                        .format(self.access_sample_info_sheet[checkNumbers].coordinate, self.access_sample_info_sheet[checkNumbers].value)
+                    )
+                    warnings_numeric += 1
+                    return(False, warnings_numeric)
+                except ValueError:
+                    logger.error(
+                        'Cell {} with value \"{}\" is not numeric.'\
+                        .format(self.access_sample_info_sheet[checkNumbers].coordinate, self.access_sample_info_sheet[checkNumbers].value)
+                    )
+                    warnings_numeric += 1
+                except TypeError:
+                    if self.access_sample_info_sheet[checkNumbers].value is None:
+                        logger.error(
+                            'Cell {} should be numeric but is empty.'.format(self.access_sample_info_sheet[checkNumbers].coordinate)
+                        )
+                        warnings_numeric += 1
+                        return(False, warnings_numeric)
+                    else:
+                        raise
+        return(warnings_numeric == 0, warnings_numeric)
+
+    # currently not used
+    #def validate_mol(self, min_mol, max_mol):
+    #    warning_val_mol = 0
+    #    if(self.access_sample_info_sheet[self.concentrationID].value < min_mol) \
+    #    or (self.access_sample_info_sheet[self.concentrationID].value > max_mol):
+    #        logger.warning('Sample molarity ({}ng/ul) in cell {} is out of specifications: {}-{}ng/ul'\
+    #        .format(self.access_sample_info_sheet[self.molarityID].value,self.molarityID, min_mol, max_mol))
+    #        warning_val_mol += 1
+    #    return(True, warning_val_mol)
+
+    def select_index(self, pool_rows, pool):
+        '''
+        - identifies whether pools contain standard indexes, custom indexes or both
+        - detects non-nucleotide letters
+        - checks for duplicated indexes in pools
+        - detects missing indexes
+        - checks for a minimal distance between indexes
+        - checks for similar index lengths (warning)
+        - checks pool balance
+        '''
+        warning_mixed_indexes = 0
+        warning_component_length = 0
+
+        # retrieve the cell IDs for NGI standard indexes and custom indexes
+        cell_id_standard_index = []
+        cell_id_custom_index = []
+        for row_nr in pool_rows:
+            cell_id_standard_index.append("{col}{row_nr}".format(col=LibrarySheet.SINDEX_COL, row_nr=row_nr))
+            cell_id_custom_index.append("{col}{row_nr}".format(col=LibrarySheet.CINDEX_COL, row_nr=row_nr))
+
+        # retrieve index sequences for NGI standard indexes
+        sindex_list = []
+        sindex_absent = []
+        for sindex in cell_id_standard_index:
+            if (self.access_sample_info_sheet[sindex].value is None):
+                sindex_absent.append(sindex)
+            elif(self.access_sample_info_sheet[sindex].value == "noIndex"):
+                sindex_list.append(self.access_sample_info_sheet[sindex].value)
+            else:
+                # keep only the sequence given in parentheses
+                index_only = re.split(r'\(|\)', self.access_sample_info_sheet[sindex].value)[1]
+
+                # check that dual indexes have the same length in both components
+                split_index = re.split('[-_]', index_only)
+                if(len(split_index) > 1):
+                    if(len(split_index[0]) != len(split_index[1])):
+                        logger.warning("Length of the two components in dual index {} and {} is different."\
+                            .format(split_index[0], split_index[1]))
+                        warning_component_length += 1
+
+                # merge dual indexes into one sequence
+                index_only = re.sub('[-_]', '', index_only)
+                sindex_list.append(index_only)
+
+                # raise an error if both a custom and an NGI standard index are selected for the same sample
+                if (self.access_sample_info_sheet["{col}{row_nr}".format(col=LibrarySheet.CINDEX_COL, row_nr=sindex.split(LibrarySheet.SINDEX_COL)[1])].value is not None):
+                    logger.error('Custom and standard index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
+                        .format(LibrarySheet.SINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1], LibrarySheet.CINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1]))
+                    sys.exit(1)
+
+        # retrieve index sequences for custom indexes
+        cindex_list = []
+        cindex_absent = []
+        for cindex in cell_id_custom_index:
+            cindex_value = self.access_sample_info_sheet[cindex].value
+            if (cindex_value is None):
+                cindex_absent.append(cindex)
+            else:
+                # check that dual indexes have the same length in both components
+                split_cindex = re.split('[-_]', cindex_value)
+                if(len(split_cindex) > 1):
+                    if(len(split_cindex[0]) != len(split_cindex[1])):
+                        logger.warning("Length of the two components in dual index {} and {} is different.".format(split_cindex[0], split_cindex[1]))
+                        warning_component_length += 1
+
+                # merge dual indexes into one sequence
+                index_only = re.sub('[-_]', '', cindex_value)
+                cindex_list.append(index_only)
+
+        # identify missing index specifications for individual samples
+        # and detect whether NGI standard indexes or custom indexes are chosen for a given pool
+        sindex_chosen = True
+        if(len(cindex_absent) == len(pool_rows)):
+            sel_index = sindex_list
+            if(len(sindex_list) != len(pool_rows)):
+                for absent_index in sindex_absent:
+                    logger.error("Missing index in row {}.".format(re.sub(r"\D", "", absent_index)))
+                sys.exit(1)
+        elif(len(sindex_absent) == len(pool_rows)):
+            sel_index = cindex_list
+            sindex_chosen = False
+            if(len(cindex_list) != len(pool_rows)):
+                for absent_index in cindex_absent:
+                    logger.error("Missing index in row {}.".format(re.sub(r"\D", "", absent_index)))
+                sys.exit(1)
+        else:
+            sindex_chosen = False
+
+        # warn if a pool consists of a mix of NGI standard indexes and custom indexes;
+        # in a pure pool one of the two lists is empty, so the merged list is unchanged
+        sel_index = sindex_list + cindex_list
+        if(len(sel_index) != len(pool_rows)):
+            rowID_cindex_absent = [re.sub(r"\D", "", x) for x in cindex_absent]
+            rowID_sindex_absent = [re.sub(r"\D", "", x) for x in sindex_absent]
+
+            missing_indexes = list(set(rowID_cindex_absent).intersection(rowID_sindex_absent))
+            for mIndex in missing_indexes:
+                logger.error('Missing index in row {}.'.format(mIndex))
+
+            logger.warning('Mix of custom and standard indexes in pool {}.'\
+                .format(pool))
+            warning_mixed_indexes += 1
+
+        dictionary_index = {
+            "result_index": sel_index,
+            "sindex": sindex_chosen,
+            "warning_index_mix": warning_mixed_indexes,
+            "warning_length_comp": warning_component_length
+        }
+
+        # return warnings for the check summary
+        return(dictionary_index)
+
+    def validate_index(self, index_seq, pool_name, sindex):
+        '''
+        Performs the per-pool index checks: uniqueness, valid characters, minimum
+        diversity, consistent index length and colour balance.
+        '''
+        # allow the entry "noIndex" if only one sample is defined in the pool
+        c = Counter(index_seq)
+        if(c['noIndex'] > 0 and len(index_seq) != 1):
+            logger.error('Pool {} contains undefined index(es) (\"noIndex\").'.format(pool_name))
+            sys.exit(1)
+        elif(c['noIndex'] == 1 and len(index_seq) == 1):
+            logger.info('Pool {} containing one sample is not indexed.'.format(pool_name))
+        elif(c['noIndex'] == 0):
+            # check that all indexes in a pool are unique
+            for index, index_count in c.most_common():
+                if(index_count > 1):
+                    logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
+                        .format(index, pool_name))
+                    sys.exit(1)
+                else:
+                    break
+
+        warning_low_div = 0
+        index_count = 1
+        charRE = re.compile(r'[^ATCGNatcgn\-\.]')
+        for index in index_seq:
+            # check that indexes only contain valid letters
+            index_search = charRE.search(index)
+            if(bool(index_search) and index != "noIndex"):
+                logger.error('The index sequence \"{}\" in pool {} contains invalid characters.'\
+                    ' Allowed characters: A/T/C/G/N/a/t/c/g/n/-/.'
+                    .format(index, pool_name))
+                sys.exit(1)
+
+            # check that indexes within a pool have a minimum diversity
+            for i in range(index_count, len(index_seq)):
+                levenshtein_distance = lev.distance(index.lower(), index_seq[i].lower())
+                if levenshtein_distance < LibrarySheet.MAX_DISTANCE:
+                    logger.warning('The index sequences {} and {} in pool {}'\
+                        ' display low diversity (only {} nt difference).'\
+                        .format(index, index_seq[i], pool_name, levenshtein_distance))
+                    warning_low_div += 1
+            index_count += 1
+
+        # check index lengths
+        warning_index_length = 0
+        warning_index_balance = 0
+        index_length = []
+        for index in index_seq:
+            index_length.append(len(index))
+        count_length = Counter(index_length)
+
+        # check the colour balance in the pool
+        max_length = sorted(count_length.keys())[-1]
+        index_list_colour = []
+        for index in index_seq:
+            # collapse bases into colour groups (G/T together, A and C separate)
+            index_colour = index.replace('T', 'G').replace('A', 'B').replace('C', 'R')
+            index_list_colour.append(list(index_colour))
+        for row_nr in range(0, max_length):
+            column = []
+            for row in index_list_colour:
+                try:
+                    column.append(row[row_nr])
+                except IndexError:  # pools can contain samples with different index lengths
+                    logger.warning('There are {} different index lengths in pool {}, please double check the sequences.'\
+                        .format(len(count_length), pool_name))
+                    warning_index_length += 1
+            count_colour = Counter(column)
+            if(len(count_colour) < 2 and sum(count_colour.values()) > 1):
+                logger.warning('Indexes in pool {} are unbalanced at position {}.'\
+                    .format(pool_name, row_nr + 1))
+                warning_index_balance += 1
+
+        dictionary_val_index = {
+            "warning_low_div": warning_low_div,
+            "warning_index_length": warning_index_length,
+            "warning_index_balance": warning_index_balance
+        }
+        # return warnings for the check summary
+        return(dictionary_val_index)
+
+
+def main(input_sheet, config_statusDB):
+    # Instantiate the LibrarySheet object
+    sheetOI = LibrarySheet(input_sheet)
+    # get the project information from couchDB
+    sheetOI.ProjectInfo(config_statusDB)
+    # validate the project name to ensure correct identification in couchDB
+    sheetOI.validate_project_Name()
+    # validate all entries
+    pool_fail, pool_total = sheetOI.validate()
+    # final check summary
+    logger.info(
+        'Library submission check complete. {}/{} pool(s) pass without warnings.'\
+        .format((pool_total - pool_fail), pool_total))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('libraryInfoSheet',
+                        help="Completed sample information sheet sent to NGI by the user.")
+    parser.add_argument('config_statusDB',
+                        help="Settings file in YAML format to access statusDB, \
+                        in the format \"couch_server: http://:@tools.scilifelab.se:5984\"")
+    args = parser.parse_args()
+
+    main(args.libraryInfoSheet, args.config_statusDB)
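+
+# Example invocation (illustrative only; the file names below are placeholders,
+# not files shipped with this script):
+#
+#   python library_information_validator.py Sample_information.xlsx statusdb.yaml
+#
+# where statusdb.yaml contains a single entry of the form
+#   couch_server: http://<user>:<password>@tools.scilifelab.se:5984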