From f3a996264236382686f849a4ae127d2134f3f848 Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 27 Mar 2025 13:08:36 +0100 Subject: [PATCH 1/6] Formatting with Ruff --- deeplc/__init__.py | 10 ++- deeplc/__main__.py | 112 ++++++++++++++++++---------- deeplc/_argument_parser.py | 21 ++---- deeplc/_exceptions.py | 1 + deeplc/deeplc.py | 133 +++++++++++++++------------------- deeplc/feat_extractor.py | 94 +++++++++++------------- deeplc/gui.py | 7 +- deeplc/plot.py | 25 +++---- deeplc/trainl3.py | 6 +- examples/deeplc_example.py | 32 ++++---- pyproject.toml | 10 +++ streamlit/deeplc_streamlit.py | 9 ++- streamlit/streamlit_utils.py | 2 +- tests/test_deepcallc.py | 118 ++++++++++++++++-------------- tests/test_deeplc.py | 10 ++- tests/test_deeplc_lib.py | 22 +++--- tests/test_deeplc_retrain.py | 22 +++--- 17 files changed, 327 insertions(+), 307 deletions(-) diff --git a/deeplc/__init__.py b/deeplc/__init__.py index 953aa36..0b3c79f 100644 --- a/deeplc/__init__.py +++ b/deeplc/__init__.py @@ -1,14 +1,16 @@ -import sys +__all__ = ["DeepLC", "FeatExtractor"] +import sys -if sys.version_info >= (3,8): +if sys.version_info >= (3, 8): from importlib.metadata import version - __version__ = version('deeplc') + + __version__ = version("deeplc") else: import pkg_resources + __version__ = pkg_resources.require("deeplc")[0].version from deeplc.deeplc import DeepLC from deeplc.feat_extractor import FeatExtractor - diff --git a/deeplc/__main__.py b/deeplc/__main__.py index eb6079f..6c26032 100644 --- a/deeplc/__main__.py +++ b/deeplc/__main__.py @@ -1,7 +1,12 @@ """Main command line interface to DeepLC.""" __author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__credits__ = ["Robbin Bouwmeester", "Ralf Gabriels", "Prof. Lennart Martens", "Sven Degroeve"] +__credits__ = [ + "Robbin Bouwmeester", + "Ralf Gabriels", + "Prof. Lennart Martens", + "Sven Degroeve", +] __license__ = "Apache License, Version 2.0" __maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] __email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"] @@ -26,27 +31,28 @@ def setup_logging(passed_level): log_mapping = { - 'critical': logging.CRITICAL, - 'error': logging.ERROR, - 'warning': logging.WARNING, - 'info': logging.INFO, - 'debug': logging.DEBUG, + "critical": logging.CRITICAL, + "error": logging.ERROR, + "warning": logging.WARNING, + "info": logging.INFO, + "debug": logging.DEBUG, } if passed_level.lower() not in log_mapping: print( "Invalid log level. 
Should be one of the following: ", - ', '.join(log_mapping.keys()) + ", ".join(log_mapping.keys()), ) exit(1) logging.basicConfig( stream=sys.stdout, - format='%(asctime)s - %(levelname)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=log_mapping[passed_level.lower()] + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=log_mapping[passed_level.lower()], ) + def main(gui=False): """Main function for the CLI.""" argu = parse_arguments(gui=gui) @@ -55,13 +61,13 @@ def main(gui=False): # Reset logging levels if DEBUG (see deeplc.py) if argu.log_level.lower() == "debug": - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' - logging.getLogger('tensorflow').setLevel(logging.DEBUG) - warnings.filterwarnings('default', category=DeprecationWarning) - warnings.filterwarnings('default', category=FutureWarning) - warnings.filterwarnings('default', category=UserWarning) + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" + logging.getLogger("tensorflow").setLevel(logging.DEBUG) + warnings.filterwarnings("default", category=DeprecationWarning) + warnings.filterwarnings("default", category=FutureWarning) + warnings.filterwarnings("default", category=UserWarning) else: - os.environ['KMP_WARNINGS'] = '0' + os.environ["KMP_WARNINGS"] = "0" try: run(**vars(argu)) @@ -101,13 +107,13 @@ def run( for fm in file_model: if len(sel_group) == 0: sel_group = "_".join(fm.split("_")[:-1]) - fm_dict[sel_group]= fm + fm_dict[sel_group] = fm continue m_group = "_".join(fm.split("_")[:-1]) if m_group == sel_group: fm_dict[m_group] = fm file_model = fm_dict - + with open(file_pred) as f: first_line_pred = f.readline().strip() if file_cal: @@ -118,49 +124,70 @@ def run( # Read input files df_pred = pd.read_csv(file_pred) if len(df_pred.columns) < 2: - df_pred = pd.read_csv(file_pred,sep=" ") + df_pred = pd.read_csv(file_pred, sep=" ") df_pred = df_pred.fillna("") file_pred = "" list_of_psms = [] - for seq,mod,ident in zip(df_pred["seq"],df_pred["modifications"],df_pred.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) + for seq, mod, ident in zip(df_pred["seq"], df_pred["modifications"], df_pred.index): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident)) psm_list_pred = PSMList(psm_list=list_of_psms) df_pred = None else: psm_list_pred = read_file(file_pred) if "msms" in file_pred and ".txt" in file_pred: - mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() psm_list_pred.rename_modifications(mapper) # Allow for calibration file to be empty (undefined), fill in if/elif if present psm_list_cal = [] - if "modifications" in first_line_cal.split(",") and "seq" in first_line_cal.split(",") and file_cal: + if ( + "modifications" in first_line_cal.split(",") + and "seq" in first_line_cal.split(",") + and file_cal + ): df_cal = pd.read_csv(file_cal) if len(df_cal.columns) < 2: - df_cal = pd.read_csv(df_cal,sep=" ") + df_cal = pd.read_csv(df_cal, sep=" ") df_cal = df_cal.fillna("") file_cal = "" list_of_psms = [] - for seq,mod,ident,tr in zip(df_cal["seq"],df_cal["modifications"],df_cal.index,df_cal["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + for seq, mod, ident, tr in zip( + df_cal["seq"], df_cal["modifications"], 
df_cal.index, df_cal["tr"] + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=ident, + retention_time=tr, + ) + ) psm_list_cal = PSMList(psm_list=list_of_psms) df_cal = None elif file_cal: psm_list_cal = read_file(file_cal) if "msms" in file_cal and ".txt" in file_cal: - mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() psm_list_cal.rename_modifications(mapper) # Make a feature extraction object; you can skip this if you do not want to # use the default settings for DeepLC. Here we want to use a model that does # not use RDKit features so we skip the chemical descriptor making # procedure. - f_extractor = FeatExtractor( - cnn_feats=True, - verbose=verbose - ) - + f_extractor = FeatExtractor(cnn_feats=True, verbose=verbose) + # Make the DeepLC object that will handle making predictions and calibration dlc = DeepLC( path_model=file_model, @@ -173,9 +200,9 @@ def run( batch_num=batch_num, n_jobs=n_threads, verbose=verbose, - deeplc_retrain=transfer_learning + deeplc_retrain=transfer_learning, ) - + # Calibrate the original model based on the new retention times if len(psm_list_cal) > 0: logger.info("Selecting best model and calibrating predictions...") @@ -187,14 +214,19 @@ def run( if len(psm_list_cal) > 0: preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred) else: - preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred, calibrate=False) - - #df_pred["predicted_tr"] = preds + preds = dlc.make_preds( + seq_df=df_pred, + infile=file_pred, + psm_list=psm_list_pred, + calibrate=False, + ) + + # df_pred["predicted_tr"] = preds logger.info("Writing predictions to file: %s", file_pred_out) - - file_pred_out = open(file_pred_out,"w") + + file_pred_out = open(file_pred_out, "w") file_pred_out.write("Sequence proforma,predicted retention time\n") - for psm,tr in zip(psm_list_pred,preds): + for psm, tr in zip(psm_list_pred, preds): file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n") file_pred_out.close() diff --git a/deeplc/_argument_parser.py b/deeplc/_argument_parser.py index 09875cf..e12f526 100644 --- a/deeplc/_argument_parser.py +++ b/deeplc/_argument_parser.py @@ -74,17 +74,13 @@ def parse_arguments(gui=False): parser = ArgumentParser( prog="DeepLC", - description=( - "Retention time prediction for (modified) peptides using deep " "learning." 
- ), + description=("Retention time prediction for (modified) peptides using deep learning."), usage="deeplc [OPTIONS] --file_pred ", formatter_class=lambda prog: HelpFormatter(prog, max_help_position=42), add_help=False, ) - io_args = parser.add_argument_group( - "Input and output files", **gooey_args["io_args"] - ) + io_args = parser.add_argument_group("Input and output files", **gooey_args["io_args"]) io_args.add_argument( "--file_pred", required=True, @@ -97,9 +93,7 @@ def parse_arguments(gui=False): type=str, default=None, metavar="Input peptides for calibration" if gui else "", - help=( - "path to peptide CSV file with retention times to use for " "calibration" - ), + help=("path to peptide CSV file with retention times to use for calibration"), **gooey_args["file_cal"], ) io_args.add_argument( @@ -166,10 +160,7 @@ def parse_arguments(gui=False): dest="split_cal", default=50, metavar="split cal" if gui else "", - help=( - "number of splits in the chromatogram for piecewise linear " - "calibration fit" - ), + help=("number of splits in the chromatogram for piecewise linear calibration fit"), **gooey_args["split_cal"], ) model_cal_args.add_argument( @@ -265,8 +256,6 @@ def parse_arguments(gui=False): results = parser.parse_args() if not results.file_pred_out: - results.file_pred_out = ( - os.path.splitext(results.file_pred)[0] + "_deeplc_predictions.csv" - ) + results.file_pred_out = os.path.splitext(results.file_pred)[0] + "_deeplc_predictions.csv" return results diff --git a/deeplc/_exceptions.py b/deeplc/_exceptions.py index 1be41ae..c3cde3a 100644 --- a/deeplc/_exceptions.py +++ b/deeplc/_exceptions.py @@ -1,5 +1,6 @@ """DeepLC exceptions.""" + class DeepLCError(Exception): pass diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index ab49568..dcaa146 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -16,7 +16,6 @@ "Sven Degroeve", ] - # Default models, will be used if no other is specified. If no best model is # selected during calibration, the first model in the list will be used. import os @@ -44,10 +43,9 @@ from itertools import chain from tempfile import TemporaryDirectory -from sklearn.preprocessing import SplineTransformer from sklearn.linear_model import LinearRegression from sklearn.pipeline import make_pipeline - +from sklearn.preprocessing import SplineTransformer # If CLI/GUI/frozen: disable Tensorflow info and warnings before importing IS_CLI_GUI = os.path.basename(sys.argv[0]) in ["deeplc", "deeplc-gui"] @@ -233,9 +231,7 @@ def __init__( try: tf.config.threading.set_intra_op_parallelism_threads(n_jobs) except RuntimeError: - logger.warning( - "DeepLC tried to set intra op threads, but was unable to do so." 
- ) + logger.warning("DeepLC tried to set intra op threads, but was unable to do so.") if "NUMEXPR_MAX_THREADS" not in os.environ: os.environ["NUMEXPR_MAX_THREADS"] = str(n_jobs) @@ -300,20 +296,19 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]): if not self.predict_ccs: for seq, mod, ident in zip(seqs, mods, identifiers): - list_of_psms.append( - PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident) - ) + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) else: for seq, mod, ident, z in zip(seqs, mods, identifiers, charges): list_of_psms.append( - PSM(peptide=peprec_to_proforma(seq, mod, z), spectrum_id=ident) + PSM( + peptide=peprec_to_proforma(seq, mod, z), + spectrum_id=ident, + ) ) psm_list = PSMList(psm_list=list_of_psms) - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs - ) + return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) def do_f_extraction_pd(self, df_instances, charges=[]): """ @@ -336,11 +331,11 @@ def do_f_extraction_pd(self, df_instances, charges=[]): list_of_psms = [] if len(charges) == 0: for seq, mod, ident in zip( - df_instances["seq"], df_instances["modifications"], df_instances.index + df_instances["seq"], + df_instances["modifications"], + df_instances.index, ): - list_of_psms.append( - PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident) - ) + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) else: for seq, mod, ident, z in zip( df_instances["seq"], @@ -357,9 +352,7 @@ def do_f_extraction_pd(self, df_instances, charges=[]): psm_list = PSMList(psm_list=list_of_psms) - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs - ) + return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) def do_f_extraction_pd_parallel(self, df_instances): """ @@ -414,9 +407,7 @@ def do_f_extraction_psm_list(self, psm_list): pd.DataFrame feature matrix """ - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs - ) + return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) def do_f_extraction_psm_list_parallel(self, psm_list): """ @@ -454,9 +445,7 @@ def do_f_extraction_psm_list_parallel(self, psm_list): logger.debug("got feature extraction results") else: logger.debug("start feature extraction") - all_feats_async = pool.map_async( - self.do_f_extraction_psm_list, psm_list_split - ) + all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split) logger.debug("wait for feature extraction") all_feats_async.wait() @@ -465,9 +454,7 @@ def do_f_extraction_psm_list_parallel(self, psm_list): matrix_names = res[0].keys() all_feats = { matrix_name: dict( - enumerate( - chain.from_iterable((v[matrix_name].values() for v in res)) - ) + enumerate(chain.from_iterable((v[matrix_name].values() for v in res))) ) for matrix_name in matrix_names } @@ -493,9 +480,7 @@ def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max): # Use spline model within the range of X within_range = (uncal_preds >= cal_min) & (uncal_preds <= cal_max) - within_range = ( - within_range.ravel() - ) # Ensure this is a 1D array for proper indexing + within_range = within_range.ravel() # Ensure this is a 1D array for proper indexing # Create a prediction array initialized with spline predictions cal_preds = np.copy(y_pred_spline) @@ -586,10 +571,10 @@ def make_preds_core( predictions """ if calibrate: - assert ( - self.calibrate_dict - 
), "DeepLC instance is not yet calibrated.\ + assert self.calibrate_dict, ( + "DeepLC instance is not yet calibrated.\ Calibrate before making predictions, or use calibrate=False" + ) if len(X) == 0 and len(psm_list) > 0: if self.verbose: @@ -644,7 +629,12 @@ def make_preds_core( return ret_preds def make_preds( - self, psm_list=None, infile="", calibrate=True, seq_df=None, mod_name=None + self, + psm_list=None, + infile="", + calibrate=True, + seq_df=None, + mod_name=None, ): """ Make predictions for sequences, in batches if required. @@ -690,11 +680,12 @@ def make_preds( ) ) else: - for seq, mod, ident in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index - ): + for seq, mod, ident in zip(seq_df["seq"], seq_df["modifications"], seq_df.index): list_of_psms.append( - PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident) + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=ident, + ) ) psm_list = PSMList(psm_list=list_of_psms) @@ -810,7 +801,10 @@ def calibrate_preds_func_pygam( ) else: for seq, mod, ident, tr in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"] + seq_df["seq"], + seq_df["modifications"], + seq_df.index, + seq_df["tr"], ): list_of_psms.append( PSM( @@ -828,9 +822,7 @@ def calibrate_preds_func_pygam( # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) - for mtr, ptr in sorted( - zip(measured_tr, predicted_tr), key=lambda pair: pair[1] - ) + for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1]) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort], dtype=np.float32) predicted_tr = np.array([ptr for mtr, ptr in tr_sort], dtype=np.float32) @@ -848,9 +840,7 @@ def calibrate_preds_func_pygam( spline_model = linear_model linear_model_right = linear_model else: - spline = SplineTransformer( - degree=4, n_knots=int(len(measured_tr) / 500) + 5 - ) + spline = SplineTransformer(degree=4, n_knots=int(len(measured_tr) / 500) + 5) spline_model = make_pipeline(spline, LinearRegression()) spline_model.fit(predicted_tr.reshape(-1, 1), measured_tr) @@ -946,7 +936,10 @@ def calibrate_preds_func( ) else: for seq, mod, tr, ident in zip( - seq_df["seq"], seq_df["modifications"], seq_df["tr"], seq_df.index + seq_df["seq"], + seq_df["modifications"], + seq_df["tr"], + seq_df.index, ): list_of_psms.append( PSM( @@ -964,9 +957,7 @@ def calibrate_preds_func( # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) - for mtr, ptr in sorted( - zip(measured_tr, predicted_tr), key=lambda pair: pair[1] - ) + for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1]) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort]) predicted_tr = np.array([ptr for mtr, ptr in tr_sort]) @@ -1017,18 +1008,12 @@ def calibrate_preds_func( raise CalibrationError( "Not enough measured tr ({}) for the chosen number of splits ({}). 
" "Choose a smaller split_cal parameter or provide more peptides for " - "fitting the calibration curve.".format( - len(measured_tr), self.split_cal - ) + "fitting the calibration curve.".format(len(measured_tr), self.split_cal) ) if len(mtr_mean) == 0: - raise CalibrationError( - "The measured tr list is empty, not able to calibrate" - ) + raise CalibrationError("The measured tr list is empty, not able to calibrate") if len(ptr_mean) == 0: - raise CalibrationError( - "The predicted tr list is empty, not able to calibrate" - ) + raise CalibrationError("The predicted tr list is empty, not able to calibrate") # calculate calibration curves for i in range(0, len(ptr_mean)): @@ -1051,7 +1036,10 @@ def calibrate_preds_func( calibrate_min = v if v > calibrate_max: calibrate_max = v - calibrate_dict[str(round(v, self.bin_dist))] = [slope, intercept] + calibrate_dict[str(round(v, self.bin_dist))] = [ + slope, + intercept, + ] return calibrate_min, calibrate_max, calibrate_dict @@ -1115,7 +1103,10 @@ def calibrate_preds( ) else: for seq, mod, ident, tr in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"] + seq_df["seq"], + seq_df["modifications"], + seq_df.index, + seq_df["tr"], ): list_of_psms.append( PSM( @@ -1265,10 +1256,7 @@ def calibrate_preds( perf = sum(abs(np.array(measured_tr) - np.array(preds))) if self.verbose: - logger.debug( - "For %s model got a performance of: %s" - % (m_name, perf / len(preds)) - ) + logger.debug("For %s model got a performance of: %s" % (m_name, perf / len(preds))) if perf < best_perf: if self.deepcallc_mod: @@ -1293,9 +1281,7 @@ def calibrate_preds( self.model = best_model if self.deepcallc_mod: - self.deepcallc_model = train_en( - pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"] - ) + self.deepcallc_model = train_en(pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"]) # self.n_jobs = 1 @@ -1307,12 +1293,13 @@ def calibrate_preds( plotly_return_dict = {} plotly_df = pd.DataFrame( list(zip(temp_obs, temp_pred)), - columns=["Observed retention time", "Predicted retention time"], + columns=[ + "Observed retention time", + "Predicted retention time", + ], ) plotly_return_dict["scatter"] = deeplc.plot.scatter(plotly_df) - plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline( - plotly_df - ) + plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline(plotly_df) return plotly_return_dict return {} diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py index 1b12262..087baa1 100644 --- a/deeplc/feat_extractor.py +++ b/deeplc/feat_extractor.py @@ -51,15 +51,28 @@ class FeatExtractor: def __init__( self, main_path=os.path.dirname(os.path.realpath(__file__)), - lib_path_mod=os.path.join( - os.path.dirname(os.path.realpath(__file__)), "unimod/" - ), + lib_path_mod=os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/"), lib_aa_composition=os.path.join( os.path.dirname(os.path.realpath(__file__)), "aa_comp_rel.csv" ), split_size=7, verbose=True, - include_specific_posses=[0, 1, 2, 3, 4, 5, 6, -1, -2, -3, -4, -5, -6, -7], + include_specific_posses=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + -1, + -2, + -3, + -4, + -5, + -6, + -7, + ], add_sum_feat=False, ptm_add_feat=False, ptm_subtract_feat=False, @@ -81,9 +94,7 @@ def __init__( ptm_add_feat = cparser.getboolean("featExtractor", "ptm_add_feat") ptm_subtract_feat = cparser.getboolean("featExtractor", "ptm_subtract_feat") add_rolling_feat = cparser.getboolean("featExtractor", "add_rolling_feat") - include_unnormalized = cparser.getboolean( - 
"featExtractor", "include_unnormalized" - ) + include_unnormalized = cparser.getboolean("featExtractor", "include_unnormalized") include_specific_posses = ast.literal_eval( cparser.get("featExtractor", "include_specific_posses") ) @@ -241,9 +252,7 @@ def get_feats_mods( look_up_mod_subtract = {} look_up_mod_add = {} - len_init = len( - [ao + str(spl_s) for spl_s in range(split_size) for ao in atoms_order] - ) + len_init = len([ao + str(spl_s) for spl_s in range(split_size) for ao in atoms_order]) for index_name, mod, seq in zip(identifiers, mods, seqs): mod_dict[index_name] = dict( zip( @@ -265,9 +274,7 @@ def get_feats_mods( continue if self.verbose: - logger.debug( - "Time to calculate mod features: %s seconds" % (time.time() - t0) - ) + logger.debug("Time to calculate mod features: %s seconds" % (time.time() - t0)) df_ret = pd.DataFrame(mod_dict, dtype=int).T del mod_dict return df_ret @@ -388,15 +395,9 @@ def rolling_sum(a, n=2): peptide_composition = [mass.std_aa_comp[aa] for aa in seq] # Initialize all feature matrices - matrix = np.zeros( - (padding_length, len(dict_index.keys())), dtype=np.float16 - ) - matrix_hc = np.zeros( - (padding_length, len(dict_aa.keys())), dtype=np.float16 - ) - matrix_pos = np.zeros( - (len(positions), len(dict_index.keys())), dtype=np.float16 - ) + matrix = np.zeros((padding_length, len(dict_index.keys())), dtype=np.float16) + matrix_hc = np.zeros((padding_length, len(dict_aa.keys())), dtype=np.float16) + matrix_pos = np.zeros((len(positions), len(dict_index.keys())), dtype=np.float16) for i, position_composition in enumerate(peptide_composition): for k, v in position_composition.items(): @@ -437,21 +438,15 @@ def rolling_sum(a, n=2): except KeyError: warn_once(f"Could not add the following atom: {atom}") except IndexError: - warn_once( - f"Could not add the following atom: {pn} {atom} {val}" - ) + warn_once(f"Could not add the following atom: {pn} {atom} {val}") for i, peptide_position in enumerate(peptidoform.parsed_sequence): try: matrix_hc[i, dict_aa[peptide_position[0]]] = 1.0 except KeyError: - warn_once( - f"Skipping the following (not in library): {i} {peptide_position}" - ) + warn_once(f"Skipping the following (not in library): {i} {peptide_position}") except IndexError: - warn_once( - f"Could not add the following atom: {i} {peptide_position}" - ) + warn_once(f"Could not add the following atom: {i} {peptide_position}") if peptide_position[1] is not None: try: @@ -472,12 +467,11 @@ def rolling_sum(a, n=2): atom_change, ) in modification_composition.items(): try: - matrix[ - i, dict_index[atom_position_composition] - ] += atom_change + matrix[i, dict_index[atom_position_composition]] += atom_change if i in positions: matrix_pos[ - i, dict_index_pos[atom_position_composition] + i, + dict_index_pos[atom_position_composition], ] += atom_change elif i - seq_len in positions: matrix_pos[ @@ -492,12 +486,11 @@ def rolling_sum(a, n=2): atom_position_composition = sub( "\[.*?\]", "", atom_position_composition ) - matrix[ - i, dict_index[atom_position_composition] - ] += atom_change + matrix[i, dict_index[atom_position_composition]] += atom_change if i in positions: matrix_pos[ - i, dict_index_pos[atom_position_composition] + i, + dict_index_pos[atom_position_composition], ] += atom_change elif i - seq_len in positions: matrix_pos[ @@ -534,10 +527,12 @@ def rolling_sum(a, n=2): (seq.count("F") + seq.count("W") + seq.count("Y")) / float(seq_len), ) matrix_all = np.append( - matrix_all, (seq.count("D") + seq.count("E")) / float(seq_len) + matrix_all, + 
(seq.count("D") + seq.count("E")) / float(seq_len), ) matrix_all = np.append( - matrix_all, (seq.count("K") + seq.count("R")) / float(seq_len) + matrix_all, + (seq.count("K") + seq.count("R")) / float(seq_len), ) matrix_all = np.append(matrix_all, charge) @@ -584,7 +579,10 @@ def full_feat_extract( list_of_psms = [] for seq, mod, id in zip(seqs, mods, identifiers): list_of_psms.append( - PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=id) + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=id, + ) ) psm_list = PSMList(psm_list=list_of_psms) @@ -599,14 +597,10 @@ def full_feat_extract( if self.ptm_add_feat: if self.verbose: logger.debug("Extracting compositional add features for modifications") - X_feats_add = self.get_feats_mods( - psm_list, split_size=self.split_size, add_str="_add" - ) + X_feats_add = self.get_feats_mods(psm_list, split_size=self.split_size, add_str="_add") if self.ptm_subtract_feat: if self.verbose: - logger.debug( - "Extracting compositional subtract features for modifications" - ) + logger.debug("Extracting compositional subtract features for modifications") X_feats_neg = self.get_feats_mods( psm_list, split_size=self.split_size, @@ -642,9 +636,7 @@ def full_feat_extract( X = X_feats_neg if self.verbose: - logger.debug( - "Time to calculate all features: %s seconds" % (time.time() - t0) - ) + logger.debug("Time to calculate all features: %s seconds" % (time.time() - t0)) return X diff --git a/deeplc/gui.py b/deeplc/gui.py index ae4cf27..47ee007 100644 --- a/deeplc/gui.py +++ b/deeplc/gui.py @@ -1,8 +1,10 @@ """Graphical user interface.""" + import sys from multiprocessing import freeze_support from pathlib import Path import importlib.resources + try: from gooey import Gooey, local_resource_path except ImportError: @@ -17,7 +19,7 @@ # Get path to package_data/images # Workaround with parent of specific file required for Python 3.9+ support -with importlib.resources.path(img_module, 'config_icon.png') as resource: +with importlib.resources.path(img_module, "config_icon.png") as resource: _IMG_DIR = Path(resource).parent @@ -27,12 +29,13 @@ tabbed_groups=True, default_size=(720, 480), monospace_display=True, - target=None if getattr(sys, 'frozen', False) else "deeplc-gui" + target=None if getattr(sys, "frozen", False) else "deeplc-gui", ) def start_gui(): """Run main with GUI enabled.""" freeze_support() # Required for multiprocessing with PyInstaller main(gui=True) + if __name__ == "__main__": start_gui() diff --git a/deeplc/plot.py b/deeplc/plot.py index fd337d0..a4b2a37 100644 --- a/deeplc/plot.py +++ b/deeplc/plot.py @@ -12,7 +12,7 @@ def scatter( observed_column: str = "Observed retention time", xaxis_label: str = "Observed retention time", yaxis_label: str = "Predicted retention time", - plot_title: str = "Predicted vs. observed retention times" + plot_title: str = "Predicted vs. observed retention times", ) -> go.Figure: """ Plot a scatter plot of the predicted vs. observed retention times. 
@@ -42,15 +42,15 @@ def scatter( y=predicted_column, opacity=0.3, ) - + # Draw diagonal line fig.add_scatter( x=[min(df[observed_column]), max(df[observed_column])], y=[min(df[observed_column]), max(df[observed_column])], - mode='lines', - line=dict(color='red', width=3, dash='dash'), + mode="lines", + line=dict(color="red", width=3, dash="dash"), ) - + # Hide legend fig.update_layout( title=plot_title, @@ -58,7 +58,7 @@ def scatter( xaxis_title=xaxis_label, yaxis_title=yaxis_label, ) - + return fig @@ -85,9 +85,7 @@ def distribution_baseline( """ # Get baseline data baseline_df = pd.read_csv( - Path(__file__) - .absolute() - .parent.joinpath("baseline_performance/baseline_predictions.csv") + Path(__file__).absolute().parent.joinpath("baseline_performance/baseline_predictions.csv") ) baseline_df["rel_mae_best"] = baseline_df[ ["rel_mae_transfer_learning", "rel_mae_new_model", "rel_mae_calibrate"] @@ -97,9 +95,7 @@ def distribution_baseline( # Calculate current RMAE and percentile compared to baseline mae = sum(abs(df[observed_column] - df[predicted_column])) / len(df.index) mae_rel = (mae / max(df[observed_column])) * 100 - percentile = round( - (baseline_df["rel_mae_transfer_learning"] < mae_rel).mean() * 100, 1 - ) + percentile = round((baseline_df["rel_mae_transfer_learning"] < mae_rel).mean() * 100, 1) # Calculate x-axis range with 5% padding all_values = np.append(baseline_df["rel_mae_transfer_learning"].values, mae_rel) @@ -140,10 +136,7 @@ def distribution_baseline( ) fig.update_xaxes(range=[x_min, x_max]) fig.update_layout( - title=( - f"Current DeepLC performance compared to {len(baseline_df.index)} " - "datasets" - ), + title=(f"Current DeepLC performance compared to {len(baseline_df.index)} datasets"), xaxis_title="Relative mean absolute error (%)", ) diff --git a/deeplc/trainl3.py b/deeplc/trainl3.py index a28e083..f4bd276 100644 --- a/deeplc/trainl3.py +++ b/deeplc/trainl3.py @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. -This project was made possible by MASSTRPLAN. MASSTRPLAN received funding -from the Marie Sklodowska-Curie EU Framework for Research and Innovation +This project was made possible by MASSTRPLAN. MASSTRPLAN received funding +from the Marie Sklodowska-Curie EU Framework for Research and Innovation Horizon 2020, under Grant Agreement No. 675132. """ @@ -99,7 +99,7 @@ def train_en(X, y, n_jobs=16, cv=None): grid.fit(X, y) crossv_mod.set_params(**grid.best_params_) - + ret_mod.set_params(**grid.best_params_) ret_mod.fit(X, y) diff --git a/examples/deeplc_example.py b/examples/deeplc_example.py index 95a5c5d..25a2cf4 100644 --- a/examples/deeplc_example.py +++ b/examples/deeplc_example.py @@ -3,7 +3,7 @@ from matplotlib import pyplot as plt from deeplc import DeepLC, FeatExtractor -if __name__ == '__main__': +if __name__ == "__main__": # Input files peptide_file = "examples/datasets/dia.csv" @@ -14,14 +14,14 @@ df = df.fillna("") # Generate some identifiers, any kind of identifiers will do - df.index = ["Pep_"+str(dfi) for dfi in df.index] + df.index = ["Pep_" + str(dfi) for dfi in df.index] # Make a feature extraction object. # This step can be skipped if you want to use the default feature extraction # settings. In this example we will use a model that does not use RDKit features # so we skip the chemical descriptor making procedure. 
- #f_extractor = FeatExtractor(chem_descr_feat=False,verbose=False) + # f_extractor = FeatExtractor(chem_descr_feat=False,verbose=False) f_extractor = FeatExtractor( add_sum_feat=False, ptm_add_feat=False, @@ -30,26 +30,28 @@ chem_descr_feat=False, add_comp_feat=False, cnn_feats=True, - verbose=True + verbose=True, ) # Initiate a DeepLC instance that will perform the calibration and predictions dlc = DeepLC( - path_model=["deeplc/mods/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "deeplc/mods/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "deeplc/mods/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "deeplc/mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5"], + path_model=[ + "deeplc/mods/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "deeplc/mods/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "deeplc/mods/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "deeplc/mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + ], cnn_model=True, f_extractor=f_extractor, verbose=True, write_library=True, - use_library="deeplc/library/library.csv" + use_library="deeplc/library/library.csv", ) df["tr"] = list(range(len(df.index))) # To demonstrate DeepLC's callibration, we'll induce some an artificial # transformation into the retention times - df["tr"] = df["tr"]**0.85 + df["tr"] = df["tr"] ** 0.85 # Calibrate the original model based on the new retention times dlc.calibrate_preds(seq_df=df) @@ -66,10 +68,10 @@ print("c") # Compare calibrated and uncalibrated predictions - #print("Predictions (calibrated): ", preds_cal) - #print("Predictions (uncalibrated): ", preds_uncal) + # print("Predictions (calibrated): ", preds_cal) + # print("Predictions (uncalibrated): ", preds_uncal) - plt.scatter(df["tr"],preds_cal,label="Calibrated",s=1) - plt.scatter(df["tr"],preds_uncal,label="Uncalibrated",s=1) + plt.scatter(df["tr"], preds_cal, label="Calibrated", s=1) + plt.scatter(df["tr"], preds_uncal, label="Uncalibrated", s=1) plt.legend() - plt.savefig('deeplc_calibrated_vs_uncalibrated.png') + plt.savefig("deeplc_calibrated_vs_uncalibrated.png") diff --git a/pyproject.toml b/pyproject.toml index 31e8fe3..47b0895 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,3 +61,13 @@ build-backend = "setuptools.build_meta" [tool.setuptools] packages = ["deeplc"] include-package-data = true + +[tool.ruff] +line-length = 99 +target-version = 'py311' + +[tool.ruff.lint] +select = ["E", "F", "UP", "B", "SIM", "I"] + +[tool.ruff.format] +docstring-code-format = true diff --git a/streamlit/deeplc_streamlit.py b/streamlit/deeplc_streamlit.py index 399dc5d..fcd574d 100644 --- a/streamlit/deeplc_streamlit.py +++ b/streamlit/deeplc_streamlit.py @@ -109,7 +109,8 @@ def _main_page(self): st.subheader("Prediction speed boost") self.user_input["use_library"] = st.checkbox( - "Use prediction library for speed-up", help=self.texts.Help.use_library + "Use prediction library for speed-up", + help=self.texts.Help.use_library, ) st.markdown(self.texts.Help.use_library_agreement) @@ -334,9 +335,9 @@ class Sidebar: --- Currently using the following package versions:
- [![DeepLC](https://img.shields.io/badge/deeplc-{version('deeplc')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/compomics/deeplc) - [![Tensorflow](https://img.shields.io/badge/tensorflow-{version('tensorflow')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/tensorflow/tensorflow) - [![Streamlit](https://img.shields.io/badge/streamlit-{version('streamlit')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/streamlit/streamlit) + [![DeepLC](https://img.shields.io/badge/deeplc-{version("deeplc")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/compomics/deeplc) + [![Tensorflow](https://img.shields.io/badge/tensorflow-{version("tensorflow")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/tensorflow/tensorflow) + [![Streamlit](https://img.shields.io/badge/streamlit-{version("streamlit")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/streamlit/streamlit) Latest DeepLC version:
![PyPI](https://img.shields.io/pypi/v/deeplc?style=flat-square&logoColor=white&logo=pypi) diff --git a/streamlit/streamlit_utils.py b/streamlit/streamlit_utils.py index 3e6bec5..c3a7df7 100644 --- a/streamlit/streamlit_utils.py +++ b/streamlit/streamlit_utils.py @@ -80,4 +80,4 @@ def hide_streamlit_menu(): @st.cache def save_dataframe(df): """Save dataframe to file object, with streamlit cache.""" - return df.to_csv().encode('utf-8') + return df.to_csv().encode("utf-8") diff --git a/tests/test_deepcallc.py b/tests/test_deepcallc.py index 5505b51..71297db 100644 --- a/tests/test_deepcallc.py +++ b/tests/test_deepcallc.py @@ -2,76 +2,82 @@ from deeplc import DeepLC from matplotlib import pyplot as plt + def main(): peptide_file = "temp_data/PXD005573_mcp.csv" calibration_file = "temp_data/PXD005573_mcp.csv" pep_df = pd.read_csv(peptide_file, sep=",") - pep_df['modifications'] = pep_df['modifications'].fillna("") + pep_df["modifications"] = pep_df["modifications"].fillna("") cal_df = pd.read_csv(calibration_file, sep=",") - cal_df['modifications'] = cal_df['modifications'].fillna("") - + cal_df["modifications"] = cal_df["modifications"].fillna("") + pep_df = pep_df.sample(50) cal_df = cal_df.sample(50) - mods = ["C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_1fd8363d9af9dcad3be7553c39396960.hdf5", - #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_8c22d89667368f2f02ad996469ba157e.hdf5", - #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_5ee8aaa41d387bfffb8cda966348937c.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_f3c75e74dd7b16180edf6f6f0d78a4a6.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_5ee8aaa41d387bfffb8cda966348937c.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - 
"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5"] - - dlc = DeepLC(write_library=False, - use_library="", - pygam_calibration=True, - deepcallc_mod=True, - path_model=mods, - reload_library=False) + mods = [ + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + # "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_1fd8363d9af9dcad3be7553c39396960.hdf5", + # 
"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_8c22d89667368f2f02ad996469ba157e.hdf5", + # "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_5ee8aaa41d387bfffb8cda966348937c.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_f3c75e74dd7b16180edf6f6f0d78a4a6.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_5ee8aaa41d387bfffb8cda966348937c.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + 
"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + ] + + dlc = DeepLC( + write_library=False, + use_library="", + pygam_calibration=True, + deepcallc_mod=True, + path_model=mods, + reload_library=False, + ) dlc.calibrate_preds(seq_df=cal_df) preds = dlc.make_preds(seq_df=cal_df) - plt.scatter(cal_df["tr"],preds) + plt.scatter(cal_df["tr"], preds) plt.show() + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/test_deeplc.py b/tests/test_deeplc.py index 4a5af2f..ed60f92 100644 --- a/tests/test_deeplc.py +++ b/tests/test_deeplc.py @@ -22,9 +22,9 @@ def _r2_score(y_true, y_pred): def test_cli_basic(): """Test command line interface help message.""" - assert ( - subprocess.getstatusoutput("deeplc -h")[0] == 0 - ), "`deeplc -h` returned non-zero status code" + assert subprocess.getstatusoutput("deeplc -h")[0] == 0, ( + "`deeplc -h` returned non-zero status code" + ) def test_cli_full(): @@ -48,8 +48,10 @@ def test_cli_full(): train_df = pd.read_csv(file_path_pred) model_r2 = _r2_score(train_df["tr"], preds_df["predicted retention time"]) logging.info("DeepLC R2 score on %s: %f", file_path_pred, model_r2) - assert model_r2 > 0.90, f"DeepLC R2 score on {file_path_pred} below 0.9 \ + assert model_r2 > 0.90, ( + f"DeepLC R2 score on {file_path_pred} below 0.9 \ (was {model_r2})" + ) if __name__ == "__main__": diff --git a/tests/test_deeplc_lib.py b/tests/test_deeplc_lib.py index a17748c..e9d6b51 100644 --- a/tests/test_deeplc_lib.py +++ b/tests/test_deeplc_lib.py @@ -2,27 +2,27 @@ from deeplc import DeepLC from matplotlib import pyplot as plt + def main(): peptide_file = "examples/datasets/test_pred.csv" calibration_file = "examples/datasets/test_train.csv" pep_df = pd.read_csv(peptide_file, sep=",") - pep_df['modifications'] = pep_df['modifications'].fillna("") + pep_df["modifications"] = pep_df["modifications"].fillna("") cal_df = pd.read_csv(calibration_file, sep=",") - cal_df['modifications'] = cal_df['modifications'].fillna("") - - 
dlc = DeepLC(write_library=False, - use_library="", - reload_library=False) - #write_library=True, - #use_library="lib.csv", - #reload_library=True) + cal_df["modifications"] = cal_df["modifications"].fillna("") + + dlc = DeepLC(write_library=False, use_library="", reload_library=False) + # write_library=True, + # use_library="lib.csv", + # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) preds = dlc.make_preds(seq_df=cal_df) - plt.scatter(pep_df["tr"],preds) + plt.scatter(pep_df["tr"], preds) plt.show() + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/test_deeplc_retrain.py b/tests/test_deeplc_retrain.py index a67c7a4..8c09330 100644 --- a/tests/test_deeplc_retrain.py +++ b/tests/test_deeplc_retrain.py @@ -2,27 +2,27 @@ from deeplc import DeepLC from matplotlib import pyplot as plt + def main(): peptide_file = "examples/datasets/test_pred.csv" calibration_file = "examples/datasets/test_train.csv" pep_df = pd.read_csv(peptide_file, sep=",") - pep_df['modifications'] = pep_df['modifications'].fillna("") + pep_df["modifications"] = pep_df["modifications"].fillna("") cal_df = pd.read_csv(calibration_file, sep=",") - cal_df['modifications'] = cal_df['modifications'].fillna("") - - dlc = DeepLC(write_library=False, - deeplc_retrain=True, - reload_library=False) - #write_library=True, - #use_library="lib.csv", - #reload_library=True) + cal_df["modifications"] = cal_df["modifications"].fillna("") + + dlc = DeepLC(write_library=False, deeplc_retrain=True, reload_library=False) + # write_library=True, + # use_library="lib.csv", + # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) preds = dlc.make_preds(seq_df=cal_df) - plt.scatter(cal_df["tr"],preds) + plt.scatter(cal_df["tr"], preds) plt.show() + if __name__ == "__main__": - main() \ No newline at end of file + main() From a5ec35c2957c9cac247d255e68040ea1de0bb36f Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 27 Mar 2025 16:48:35 +0100 Subject: [PATCH 2/6] First refactoring of do_f_extraction* methods --- deeplc/deeplc.py | 497 ++++++++++++++++++++++++----------------------- 1 file changed, 256 insertions(+), 241 deletions(-) diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index dcaa146..9bb4051 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -4,6 +4,8 @@ This provides the main interface. For the library versions see the .yml file """ +from __future__ import annotations + __author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] __license__ = "Apache License, Version 2.0" __maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] @@ -16,26 +18,13 @@ "Sven Degroeve", ] -# Default models, will be used if no other is specified. If no best model is -# selected during calibration, the first model in the list will be used. 
-import os - -deeplc_dir = os.path.dirname(os.path.realpath(__file__)) -DEFAULT_MODELS = [ - "mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras", - "mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras", - "mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras", -] -DEFAULT_MODELS = [os.path.join(deeplc_dir, dm) for dm in DEFAULT_MODELS] - -LIBRARY = {} import copy import gc import logging -import math import multiprocessing import multiprocessing.dummy +import os import random import sys import warnings @@ -43,10 +32,28 @@ from itertools import chain from tempfile import TemporaryDirectory +import numpy as np +import pandas as pd +from psm_utils import PSM, PSMList +from psm_utils.io import read_file +from psm_utils.io.peptide_record import peprec_to_proforma from sklearn.linear_model import LinearRegression from sklearn.pipeline import make_pipeline from sklearn.preprocessing import SplineTransformer +# Default models, will be used if no other is specified. If no best model is +# selected during calibration, the first model in the list will be used. +DEEPLC_DIR = os.path.dirname(os.path.realpath(__file__)) +DEFAULT_MODELS = [ + "mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras", + "mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras", + "mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras", +] +DEFAULT_MODELS = [os.path.join(DEEPLC_DIR, dm) for dm in DEFAULT_MODELS] + +LIBRARY = {} + + # If CLI/GUI/frozen: disable Tensorflow info and warnings before importing IS_CLI_GUI = os.path.basename(sys.argv[0]) in ["deeplc", "deeplc-gui"] IS_FROZEN = getattr(sys, "frozen", False) @@ -57,34 +64,27 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=UserWarning) -# Supress warnings (or at least try...) +# Suppress warnings (or at least try...) logging.getLogger("tensorflow").setLevel(logging.ERROR) os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" -import numpy as np -import pandas as pd +# ruff: noqa: E402 import tensorflow as tf from deeplcretrainer import deeplcretrainer -from psm_utils.io import read_file -from psm_utils.io.peptide_record import peprec_to_proforma -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList try: from tensorflow.keras.models import load_model -except: +except Exception: from tensorflow.python.keras.models import load_model from tensorflow.python.eager import context from deeplc._exceptions import CalibrationError +from deeplc.feat_extractor import FeatExtractor from deeplc.trainl3 import train_en # Set to force CPU calculations os.environ["CUDA_VISIBLE_DEVICES"] = "-1" -# Feature extraction -from deeplc.feat_extractor import FeatExtractor - def warn(*args, **kwargs): pass @@ -124,78 +124,92 @@ class DeepLC: """ DeepLC predictor. - Parameters - ---------- - main_path : str - main path of module - path_model : str, optional - path to prediction model(s); leave empty to select the best of the - default models based on the calibration peptides - verbose : bool, default=True - turn logging on/off - bin_dist : float, default=2 - TODO - dict_cal_divider : int, default=50 - sets precision for fast-lookup of retention times for calibration; e.g. 
- 10 means a precision of 0.1 between the calibration anchor points - split_cal : int, default=50 - number of splits in the chromatogram for piecewise linear calibration - fit - n_jobs : int, optional - number of CPU threads to use - config_file : str, optional - path to configuration file - f_extractor : object :: deeplc.FeatExtractor, optional - deeplc.FeatExtractor object to use - cnn_model : bool, default=True - use CNN model or not - batch_num : int, default=250000 - prediction batch size (in peptides); lower to decrease memory footprint - write_library : bool, default=False - append new predictions to library for faster future results; requires - `use_library` option - use_library : str, optional - library file with previous predictions for faster results to read from, - or to write to - reload_library : bool, default=False - reload prediction library - Methods ------- - calibrate_preds(seqs=[], mods=[], identifiers=[], measured_tr=[], correction_factor=1.0, seq_df=None, use_median=True) + calibrate_preds Find best model and calibrate - make_preds(seqs=[], mods=[], identifiers=[], calibrate=True, seq_df=None, correction_factor=1.0, mod_name=None) + make_preds Make predictions """ library = {} - # TODO have a CCS flag here def __init__( self, - main_path=os.path.dirname(os.path.realpath(__file__)), - path_model=None, - verbose=True, - bin_dist=2, - dict_cal_divider=50, - split_cal=50, - n_jobs=None, - config_file=None, - f_extractor=None, - cnn_model=True, - batch_num=250000, - batch_num_tf=1024, - write_library=False, - use_library=None, - reload_library=False, - pygam_calibration=True, - deepcallc_mod=False, - deeplc_retrain=False, - predict_ccs=False, - n_epochs=20, - single_model_mode=True, + main_path: str | None = None, + path_model: str | None = None, + verbose: bool = True, + bin_dist: float = 2.0, + dict_cal_divider: int = 50, + split_cal: int = 50, + n_jobs: int | None = None, + config_file: str | None = None, + f_extractor: FeatExtractor | None = None, + cnn_model: bool = True, + batch_num: int = 250000, + batch_num_tf: int = 1024, + write_library: bool = False, + use_library: str | None = None, + reload_library: bool = False, + pygam_calibration: bool = True, + deepcallc_mod: bool = False, + deeplc_retrain: bool = False, + predict_ccs: bool = False, + n_epochs: int = 20, + single_model_mode: bool = True, ): + """ + Initialize the DeepLC predictor. + + Parameters + ---------- + main_path + Main path of the module. + path_model + Path to prediction model(s); if not provided, the best default model is selected based + on calibration peptides. + verbose + Turn logging on/off. + bin_dist + Precision factor for calibration lookup. + dict_cal_divider + Divider that sets the precision for fast lookup of retention times in calibration; e.g. + 10 means a precision of 0.1 between the calibration anchor points + split_cal + Number of splits in the chromatogram for piecewise linear calibration. + n_jobs + Number of CPU threads to use. + config_file + Path to a configuration file. + f_extractor + deeplc.FeatExtractor object to use. + cnn_model + Flag indicating whether to use the CNN model. + batch_num + Prediction batch size (in peptides); lower values reduce memory footprint. + batch_num_tf + Batch size for TensorFlow predictions. + write_library + Whether to append new predictions to a library for faster future access. + use_library + Library file to read from or write to for prediction caching. + reload_library + Whether to reload the prediction library. 
+ pygam_calibration + Flag to enable calibration using PyGAM. + deepcallc_mod + Flag specific to deepcallc mode. + deeplc_retrain + Flag indicating whether to perform retraining (transfer learning) of prediction models. + predict_ccs + Flag to control prediction of CCS values. + n_epochs + Number of epochs used in retraining if deeplc_retrain is enabled. + single_model_mode + Flag to use a single model instead of multiple default models. + + """ # if a config file is defined overwrite standard parameters if config_file: cparser = ConfigParser() @@ -204,29 +218,31 @@ def __init__( split_cal = cparser.getint("DeepLC", "split_cal") n_jobs = cparser.getint("DeepLC", "n_jobs") - self.main_path = main_path + self.main_path = main_path or os.path.dirname(os.path.realpath(__file__)) + self.path_model = self._get_model_paths(path_model, single_model_mode) self.verbose = verbose self.bin_dist = bin_dist - self.calibrate_dict = {} - self.calibrate_min = float("inf") - self.calibrate_max = 0 - self.n_epochs = n_epochs + self.dict_cal_divider = dict_cal_divider + self.split_cal = split_cal + self.n_jobs = multiprocessing.cpu_count() if n_jobs is None else n_jobs + self.config_file = config_file + self.f_extractor = f_extractor or FeatExtractor() self.cnn_model = cnn_model - self.batch_num = batch_num self.batch_num_tf = batch_num_tf - self.dict_cal_divider = dict_cal_divider - self.split_cal = split_cal - self.n_jobs = n_jobs - - if self.n_jobs == None: - max_threads = multiprocessing.cpu_count() - self.n_jobs = max_threads - - self.use_library = use_library self.write_library = write_library - + self.use_library = use_library self.reload_library = reload_library + self.pygam_calibration = pygam_calibration + self.deepcallc_mod = deepcallc_mod + self.deeplc_retrain = deeplc_retrain + self.predict_ccs = predict_ccs + self.n_epochs = n_epochs + + # Calibration variables + self.calibrate_dict = {} + self.calibrate_min = float("inf") + self.calibrate_max = 0 try: tf.config.threading.set_intra_op_parallelism_threads(n_jobs) @@ -236,26 +252,6 @@ def __init__( if "NUMEXPR_MAX_THREADS" not in os.environ: os.environ["NUMEXPR_MAX_THREADS"] = str(n_jobs) - if path_model: - self.model = path_model - else: - if single_model_mode: - self.model = [DEFAULT_MODELS[0]] - else: - self.model = DEFAULT_MODELS - - if f_extractor: - self.f_extractor = f_extractor - else: - self.f_extractor = FeatExtractor() - - self.pygam_calibration = pygam_calibration - self.deeplc_retrain = deeplc_retrain - - self.deepcallc_mod = deepcallc_mod - - self.predict_ccs = predict_ccs - if self.deepcallc_mod: self.write_library = False self.use_library = None @@ -273,139 +269,110 @@ def __str__(self): |_| """ - def do_f_extraction(self, seqs, mods, identifiers, charges=[]): + @staticmethod + def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) -> list[str]: + """Get the model paths based on the passed model path and the single model mode.""" + if passed_model_path: + return [passed_model_path] + + if single_model_mode: + return [DEFAULT_MODELS[0]] + + return DEFAULT_MODELS + + def do_f_extraction( + self, + seqs: list[str], + mods: list[str | None], + identifiers: list[str], + charges: list[int] | None = None, + ): """ - Extract all features we can extract; without parallelization; use if you - want to run feature extraction with a single core + Extract all features we can extract; without parallelization; use if you want to run + feature extraction with a single core. 
Parameters ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods + seqs + List of stripped peptide sequences for each entry. + mods + List of PEPREC-formatted modifications for each entry. + identifiers + List of identifiers for each entry. + charges + List of charges for each entry. Only required if CCS prediction is enabled. Returns ------- pd.DataFrame - feature matrix + Feature matrix. """ - list_of_psms = [] - - if not self.predict_ccs: - for seq, mod, ident in zip(seqs, mods, identifiers): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) - else: - for seq, mod, ident, z in zip(seqs, mods, identifiers, charges): - list_of_psms.append( - PSM( - peptide=peprec_to_proforma(seq, mod, z), - spectrum_id=ident, - ) - ) - - psm_list = PSMList(psm_list=list_of_psms) - + psm_list = _lists_to_psm_list(seqs, mods, identifiers, charges, n_jobs=self.n_jobs) return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) - def do_f_extraction_pd(self, df_instances, charges=[]): + def do_f_extraction_pd(self, dataframe: pd.DataFrame, charges: list[int] | None = None): """ - Extract all features we can extract; without parallelization; use if - you want to run feature extraction with a single thread; and use a - defined dataframe + Extract all features we can extract; without parallelization; use if you want to run + feature extraction with a single thread; and use a defined dataframe. Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + dataframe + Dataframe containing the sequences (column:seq), modifications (column:modifications), + identifiers (index), and optionally charges (column:charge). + charges + List of charges for each entry. Only required if CCS prediction is enabled and not + present in the dataframe. Returns ------- pd.DataFrame - feature matrix - """ - - list_of_psms = [] - if len(charges) == 0: - for seq, mod, ident in zip( - df_instances["seq"], - df_instances["modifications"], - df_instances.index, - ): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) - else: - for seq, mod, ident, z in zip( - df_instances["seq"], - df_instances["modifications"], - df_instances.index, - charges=df_instances["charges"], - ): - list_of_psms.append( - PSM( - peptide=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - ) - ) - - psm_list = PSMList(psm_list=list_of_psms) + Feature matrix. + """ + psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs) return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) - def do_f_extraction_pd_parallel(self, df_instances): + def do_f_extraction_pd_parallel( + self, dataframe: pd.DataFrame, charges: list[int] | None = None + ): """ - Extract all features we can extract; with parallelization; use if you - want to run feature extraction with multiple threads; and use a defined - dataframe + Extract all features we can extract; with parallelization; use if you want to run feature + extraction with multiple threads; and use a defined dataframe. 
Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + dataframe + Dataframe containing the sequences (column:seq), modifications (column:modifications), + identifiers (index), and optionally charges (column:charge). + charges + List of charges for each entry. Only required if CCS prediction is enabled and not + present in the dataframe. Returns ------- pd.DataFrame - feature matrix - """ - # self.n_jobs = 1 - - df_instances_split = np.array_split(df_instances, math.ceil(self.n_jobs / 4.0)) - if multiprocessing.current_process().daemon: - logger.warning( - "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." - ) - pool = multiprocessing.dummy.Pool(1) - else: - pool = multiprocessing.Pool(math.ceil(self.n_jobs / 4.0)) + Feature matrix. - if self.n_jobs == 1: - df = self.do_f_extraction_pd(df_instances) - else: - df = pd.concat(pool.map(self.do_f_extraction_pd, df_instances_split)) - pool.close() - pool.join() - return df + """ + psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs) + return self.do_f_extraction_psm_list_parallel(psm_list) def do_f_extraction_psm_list(self, psm_list): """ - Extract all features we can extract; without parallelization; use if - you want to run feature extraction with a single thread; and use a - defined dataframe + Extract all features we can extract; without parallelization; use if you want to run + feature extraction with a single thread; and use a defined dataframe Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + psm_list + PSMList with PSMs to extract features for. Returns ------- pd.DataFrame feature matrix + """ return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) @@ -417,36 +384,22 @@ def do_f_extraction_psm_list_parallel(self, psm_list): Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + psm_list + PSMList with PSMs to extract features for. Returns ------- pd.DataFrame feature matrix + """ - # TODO for multiproc I am still expecting a pd dataframe, this is not the case anymore, they are dicts - # self.n_jobs = 1 logger.debug("prepare feature extraction") - if multiprocessing.current_process().daemon: - logger.warning( - "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." 
-            )
-            psm_list_split = split_list(psm_list, self.n_jobs)
-            pool = multiprocessing.dummy.Pool(1)
-        elif self.n_jobs > 1:
-            psm_list_split = split_list(psm_list, self.n_jobs)
-            pool = multiprocessing.Pool(self.n_jobs)
-
-        if self.n_jobs == 1:
-            logger.debug("start feature extraction")
-            all_feats = self.do_f_extraction_psm_list(psm_list)
-            logger.debug("got feature extraction results")
-        else:
-            logger.debug("start feature extraction")
-            all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split)
+        psm_list_split = split_list(psm_list, self.n_jobs)
+
+        logger.debug("start feature extraction")
+        with multiprocessing.Pool(self.n_jobs) as pool:
+            all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split)
             logger.debug("wait for feature extraction")
             all_feats_async.wait()
             logger.debug("get feature extraction results")
@@ -454,18 +407,12 @@ def do_f_extraction_psm_list_parallel(self, psm_list):
             matrix_names = res[0].keys()
             all_feats = {
                 matrix_name: dict(
-                    enumerate(chain.from_iterable((v[matrix_name].values() for v in res)))
+                    enumerate(chain.from_iterable(v[matrix_name].values() for v in res))
                 )
                 for matrix_name in matrix_names
             }
-
-            # all_feats = pd.concat(all_feats_async.get())
-
             logger.debug("got feature extraction results")
-            pool.close()
-            pool.join()
-
         return all_feats

     def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max):
@@ -1325,3 +1272,71 @@ def split_seq(self, a, n):
         k, m = divmod(len(a), n)
         result = (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
         return result
+
+
+def _get_pool(n_jobs: int) -> multiprocessing.pool.Pool | multiprocessing.dummy.Pool:
+    """Get a Pool object for parallel processing."""
+    if multiprocessing.current_process().daemon:
+        logger.warning(
+            "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic "
+            "processes can't have children."
+        )
+        return multiprocessing.dummy.Pool(1)
+    elif n_jobs == 1:
+        return multiprocessing.dummy.Pool(1)
+    else:
+        max_n_jobs = multiprocessing.cpu_count()
+        if n_jobs > max_n_jobs:
+            logger.warning(
+                f"Number of jobs ({n_jobs}) exceeds the number of CPUs ({max_n_jobs}). "
+                f"Setting number of jobs to {max_n_jobs}."
+            )
+            return multiprocessing.Pool(max_n_jobs)
+        else:
+            return multiprocessing.Pool(n_jobs)
+
+
+def _lists_to_psm_list(
+    sequences: list[str],
+    modifications: list[str | None],
+    identifiers: list[str],
+    charges: list[int] | None,
+    n_jobs: int = 1,
+) -> PSMList:
+    """Convert lists of sequences, modifications, identifiers, and charges into a PSMList."""
+    if not charges:
+        charges = [None] * len(sequences)
+
+    def create_psm(args):
+        sequence, modifications, identifier, charge = args
+        return PSM(
+            peptidoform=peprec_to_proforma(sequence, modifications, charge=charge),
+            spectrum_id=identifier,
+        )
+
+    args_list = list(zip(sequences, modifications, identifiers, charges, strict=True))
+    with _get_pool(n_jobs) as pool:
+        list_of_psms = pool.map(create_psm, args_list)
+    return PSMList(psm_list=list_of_psms)
+
+
+# TODO: I'm not sure what the expected behavior was for charges; they were parsed
+# from the dataframe, while the passed list was used to check whether they should be
+# parsed. I'll allow both, with priority for the passed charges (see the sketch below).
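+# The precedence rule described in the TODO above amounts to the following
+# standalone sketch. Illustrative only; `resolve_charges` is a hypothetical
+# name and is not used anywhere in this module:
+#
+#     def resolve_charges(passed_charges, dataframe):
+#         """Prefer explicitly passed charges over a 'charge' column."""
+#         if passed_charges:
+#             return list(passed_charges)
+#         if "charge" in dataframe.columns:
+#             return list(dataframe["charge"])
+#         return [None] * len(dataframe)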
+def _dataframe_to_psm_list(
+    dataframe: pd.DataFrame,
+    charges: list[int] | None,
+    n_jobs: int = 1,
+) -> PSMList:
+    """Convert a DataFrame with sequences, modifications, and identifiers into a PSMList."""
+    sequences = dataframe["seq"]
+    modifications = dataframe["modifications"]
+    identifiers = dataframe.index
+
+    if not charges:
+        if "charge" in dataframe.columns:
+            charges = dataframe["charge"]
+        else:
+            charges = [None] * len(sequences)
+
+    return _lists_to_psm_list(sequences, modifications, identifiers, charges, n_jobs=n_jobs)

From 6588d4d2341d319547646ab28264534c0e1654cf Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 23 Apr 2025 17:34:37 +0200
Subject: [PATCH 3/6] Further cleanup; refactor feature generation; try dask
 for multiprocessing

---
 deeplc/__init__.py           |  14 +-
 deeplc/__main__.py           |  14 +-
 deeplc/deeplc.py             | 978 ++++++++++++-----------------------
 deeplc/feat_extractor.py     | 856 ++++++++----------------------
 deeplc/trainl3.py            |   2 +-
 examples/deeplc_example.py   |   5 +-
 pyproject.toml               |   2 +-
 tests/test_deepcallc.py      |   5 +-
 tests/test_deeplc_lib.py     |   5 +-
 tests/test_deeplc_retrain.py |   5 +-
 tests/test_feat_extractor.py |  93 ++++
 11 files changed, 660 insertions(+), 1319 deletions(-)
 create mode 100644 tests/test_feat_extractor.py

diff --git a/deeplc/__init__.py b/deeplc/__init__.py
index 0b3c79f..e4fb659 100644
--- a/deeplc/__init__.py
+++ b/deeplc/__init__.py
@@ -1,16 +1,8 @@
-__all__ = ["DeepLC", "FeatExtractor"]
+__all__ = ["DeepLC"]

-import sys
+from importlib.metadata import version

-if sys.version_info >= (3, 8):
-    from importlib.metadata import version
-
-    __version__ = version("deeplc")
-else:
-    import pkg_resources
-
-    __version__ = pkg_resources.require("deeplc")[0].version
+__version__ = version("deeplc")

 from deeplc.deeplc import DeepLC
-from deeplc.feat_extractor import FeatExtractor

diff --git a/deeplc/__main__.py b/deeplc/__main__.py
index 6c26032..e76278b 100644
--- a/deeplc/__main__.py
+++ b/deeplc/__main__.py
@@ -17,12 +17,12 @@
 import warnings

 import pandas as pd
+from psm_utils.io import read_file
 from psm_utils.io.peptide_record import peprec_to_proforma
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
-from psm_utils.io import read_file

-from deeplc import __version__, DeepLC, FeatExtractor
+from deeplc import DeepLC, __version__
 from deeplc._argument_parser import parse_arguments
 from deeplc._exceptions import DeepLCError

@@ -182,16 +182,10 @@ def run(
             index_col=0,
         )["value"].to_dict()
         psm_list_cal.rename_modifications(mapper)
-    # Make a feature extraction object; you can skip this if you do not want to
-    # use the default settings for DeepLC. Here we want to use a model that does
-    # not use RDKit features so we skip the chemical descriptor making
-    # procedure.
-    f_extractor = FeatExtractor(cnn_feats=True, verbose=verbose)

     # Make the DeepLC object that will handle making predictions and calibration
     dlc = DeepLC(
         path_model=file_model,
-        f_extractor=f_extractor,
         cnn_model=True,
         split_cal=split_cal,
         dict_cal_divider=dict_divider,
@@ -212,9 +206,9 @@ def run(
     # Make predictions; calibrated or uncalibrated
     logger.info("Making predictions using model: %s", dlc.model)
     if len(psm_list_cal) > 0:
         preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
     else:
         preds = dlc.make_preds(
             seq_df=df_pred,
             infile=file_pred,
             psm_list=psm_list_pred,
diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py
index 9bb4051..f91cedd 100644
--- a/deeplc/deeplc.py
+++ b/deeplc/deeplc.py
@@ -29,14 +29,16 @@
 import sys
 import warnings
 from configparser import ConfigParser
-from itertools import chain
+from pathlib import Path
 from tempfile import TemporaryDirectory

 import numpy as np
 import pandas as pd
-from psm_utils import PSM, PSMList
+from dask import compute, delayed
+from psm_utils import PSM, Peptidoform, PSMList
 from psm_utils.io import read_file
 from psm_utils.io.peptide_record import peprec_to_proforma
+from sklearn.base import BaseEstimator
 from sklearn.linear_model import LinearRegression
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import SplineTransformer
@@ -79,8 +81,8 @@
 from tensorflow.python.eager import context

 from deeplc._exceptions import CalibrationError
-from deeplc.feat_extractor import FeatExtractor
-from deeplc.trainl3 import train_en
+from deeplc.feat_extractor import aggregate_encodings, encode_peptidoform, unpack_features
+from deeplc.trainl3 import train_elastic_net

 # Set to force CPU calculations
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -93,7 +95,6 @@ def warn(*args, **kwargs):
     import warnings

     warnings.warn = warn
-

 warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
@@ -105,9 +106,10 @@ def split_list(a, n):
     return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


-def divide_chunks(l, n):
-    for i in range(0, len(l), n):
-        yield l[i : i + n]
+def divide_chunks(list_, chunk_size):
+    """Yield successive chunks of size ``chunk_size`` from ``list_``."""
+    for i in range(0, len(list_), chunk_size):
+        yield list_[i : i + chunk_size]


 def reset_keras():
@@ -140,14 +142,15 @@ def __init__(
         main_path: str | None = None,
         path_model: str | None = None,
         verbose: bool = True,
-        bin_dist: int = 2,
+        bin_distance: int = 2,
         dict_cal_divider: int = 50,
         split_cal: int = 50,
         n_jobs: int | None = None,
         config_file: str | None = None,
-        f_extractor: FeatExtractor | None = None,
+        f_extractor: None = None,
         cnn_model: bool = True,
-        batch_num: int = 250000,
+        batch_num: int = int(1e6),
         batch_num_tf: int = 1024,
         write_library: bool = False,
         use_library: str | None = None,
@@ -183,7 +186,7 @@ def __init__(
         config_file
             Path to a configuration file.
         f_extractor
-            deeplc.FeatExtractor object to use.
+            Deprecated.
         cnn_model
             Flag indicating whether to use the CNN model.
         batch_num
@@ -221,12 +224,11 @@ def __init__(
         self.main_path = main_path or os.path.dirname(os.path.realpath(__file__))
         self.path_model = self._get_model_paths(path_model, single_model_mode)
         self.verbose = verbose
-        self.bin_dist = bin_dist
+        self.bin_distance = bin_distance
         self.dict_cal_divider = dict_cal_divider
         self.split_cal = split_cal
         self.n_jobs = multiprocessing.cpu_count() if n_jobs is None else n_jobs
         self.config_file = config_file
-        self.f_extractor = f_extractor or FeatExtractor()
         self.cnn_model = cnn_model
         self.batch_num = batch_num
         self.batch_num_tf = batch_num_tf
@@ -239,6 +241,15 @@ def __init__(
         self.predict_ccs = predict_ccs
         self.n_epochs = n_epochs

+        # Keep `self.model` as an alias of `self.path_model`; the rest of the class
+        # still refers to `self.model`.
+        self.model = self.path_model
+
+        if f_extractor:
+            warnings.warn("f_extractor argument is deprecated.", DeprecationWarning)
+
+        # TODO REMOVE!!!
+        self.verbose = True
+
         # Calibration variables
         self.calibrate_dict = {}
         self.calibrate_min = float("inf")
@@ -280,145 +291,59 @@ def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) ->

         return DEFAULT_MODELS

-    def do_f_extraction(
+    def _extract_features(
         self,
-        seqs: list[str],
-        mods: list[str | None],
-        identifiers: list[str],
-        charges: list[int] | None = None,
-    ):
-        """
-        Extract all features we can extract; without parallelization; use if you want to run
-        feature extraction with a single core.
-
-        Parameters
-        ----------
-        seqs
-            List of stripped peptide sequences for each entry.
-        mods
-            List of PEPREC-formatted modifications for each entry.
-        identifiers
-            List of identifiers for each entry.
-        charges
-            List of charges for each entry. Only required if CCS prediction is enabled.
-
-        Returns
-        -------
-        pd.DataFrame
-            Feature matrix.
-        """
-        psm_list = _lists_to_psm_list(seqs, mods, identifiers, charges, n_jobs=self.n_jobs)
-        return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs)
-
-    def do_f_extraction_pd(self, dataframe: pd.DataFrame, charges: list[int] | None = None):
-        """
-        Extract all features we can extract; without parallelization; use if you want to run
-        feature extraction with a single thread; and use a defined dataframe.
-
-        Parameters
-        ----------
-        dataframe
-            Dataframe containing the sequences (column:seq), modifications (column:modifications),
-            identifiers (index), and optionally charges (column:charge).
-        charges
-            List of charges for each entry. Only required if CCS prediction is enabled and not
-            present in the dataframe.
-
-        Returns
-        -------
-        pd.DataFrame
-            Feature matrix.
-
-        """
-        psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs)
-        return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs)
-
-    def do_f_extraction_pd_parallel(
-        self, dataframe: pd.DataFrame, charges: list[int] | None = None
-    ):
-        """
-        Extract all features we can extract; with parallelization; use if you want to run feature
-        extraction with multiple threads; and use a defined dataframe.
+        peptidoforms: list[str | Peptidoform] | PSMList,
+        chunk_size: int = 10000,
+    ) -> dict[str, dict[int, np.ndarray]]:
+        """Extract features for all peptidoforms."""
+        if isinstance(peptidoforms, PSMList):
+            peptidoforms = [psm.peptidoform for psm in peptidoforms]
+
+        if self.n_jobs <= 1:
+            logger.debug("Running feature extraction in single-threaded mode...")
+            encodings = [
+                encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in peptidoforms
+            ]

-        Parameters
-        ----------
-        dataframe
-            Dataframe containing the sequences (column:seq), modifications (column:modifications),
-            identifiers (index), and optionally charges (column:charge).
-        charges
-            List of charges for each entry. Only required if CCS prediction is enabled and not
-            present in the dataframe.
+        else:
+            logger.debug("Preparing feature extraction with Dask")
+            # Process peptidoforms in larger chunks to reduce task overhead.
+            peptidoform_strings = [str(pep) for pep in peptidoforms]  # Faster pickling of strings

-        Returns
-        -------
-        pd.DataFrame
-            Feature matrix.
+            def chunked_encode(chunk):
+                return [encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in chunk]

-        """
-        psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs)
-        return self.do_f_extraction_psm_list_parallel(psm_list)
+            tasks = [
+                delayed(chunked_encode)(peptidoform_strings[i : i + chunk_size])
+                for i in range(0, len(peptidoform_strings), chunk_size)
+            ]

-    def do_f_extraction_psm_list(self, psm_list):
-        """
-        Extract all features we can extract; without parallelization; use if you want to run
-        feature extraction with a single thread; and use a defined dataframe
+            logger.debug("Starting feature extraction with Dask")
+            # `num_workers` (not `workers`) sets the pool size of the local processes scheduler
+            chunks_encodings = compute(*tasks, scheduler="processes", num_workers=self.n_jobs)

-        Parameters
-        ----------
-        psm_list
-            PSMList with PSMs to extract features for.
+            # Flatten the list of lists.
+            encodings = [enc for chunk in chunks_encodings for enc in chunk]

-        Returns
-        -------
-        pd.DataFrame
-            feature matrix
+        # Aggregate the encodings into a single dictionary.
+        aggregated_encodings = aggregate_encodings(encodings)

-        """
-        return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs)
+        logger.debug("Finished feature extraction")

-    def do_f_extraction_psm_list_parallel(self, psm_list):
-        """
-        Extract all features we can extract; without parallelization; use if
-        you want to run feature extraction with a single thread; and use a
-        defined dataframe
+        return aggregated_encodings

-        Parameters
-        ----------
-        psm_list
-            PSMList with PSMs to extract features for.
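+    # A standalone, illustrative version of the chunked dask.delayed pattern used
+    # in `_extract_features` above (toy encode function, not DeepLC's):
+    #
+    #     from dask import compute, delayed
+    #
+    #     def encode_chunk(chunk):
+    #         return [len(item) for item in chunk]  # stand-in for encode_peptidoform
+    #
+    #     tasks = [
+    #         delayed(encode_chunk)(items[i : i + 1000])
+    #         for i in range(0, len(items), 1000)
+    #     ]
+    #     chunked = compute(*tasks, scheduler="processes", num_workers=4)
+    #     flat = [value for chunk in chunked for value in chunk]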
-
-        Returns
-        -------
-        pd.DataFrame
-            feature matrix
-
-        """
-        logger.debug("prepare feature extraction")
-
-        psm_list_split = split_list(psm_list, self.n_jobs)
-
-        logger.debug("start feature extraction")
-        with multiprocessing.Pool(self.n_jobs) as pool:
-            all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split)
-            logger.debug("wait for feature extraction")
-            all_feats_async.wait()
-            logger.debug("get feature extraction results")
-            res = all_feats_async.get()
-            matrix_names = res[0].keys()
-            all_feats = {
-                matrix_name: dict(
-                    enumerate(chain.from_iterable(v[matrix_name].values() for v in res))
-                )
-                for matrix_name in matrix_names
-            }
-            logger.debug("got feature extraction results")
-        return all_feats
+    def _apply_calibration_core(
+        self,
+        uncal_preds: np.ndarray,
+        cal_dict: dict | list[BaseEstimator],
+        cal_min: float,
+        cal_max: float,
+    ) -> np.ndarray:
+        """Apply calibration to the predictions."""
+        if len(uncal_preds) == 0:
+            return np.array([])

-    def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max):
         cal_preds = []
-        if len(uncal_preds) == 0:
-            return np.array(cal_preds)
         if self.pygam_calibration:
             linear_model_left, spline_model, linear_model_right = cal_dict
             y_pred_spline = spline_model.predict(uncal_preds.reshape(-1, 1))
@@ -442,341 +367,223 @@ def calibrate_preds_func(
         else:
             for uncal_pred in uncal_preds:
                 try:
-                    slope, intercept = cal_dict[str(round(uncal_pred, self.bin_dist))]
+                    slope, intercept = cal_dict[str(round(uncal_pred, self.bin_distance))]
                     cal_preds.append(slope * (uncal_pred) + intercept)
                 except KeyError:
                     # outside of the prediction range ... use the last
                     # calibration curve
                     if uncal_pred <= cal_min:
-                        slope, intercept = cal_dict[str(round(cal_min, self.bin_dist))]
+                        slope, intercept = cal_dict[str(round(cal_min, self.bin_distance))]
                         cal_preds.append(slope * (uncal_pred) + intercept)
                     elif uncal_pred >= cal_max:
-                        slope, intercept = cal_dict[str(round(cal_max, self.bin_dist))]
+                        slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))]
                         cal_preds.append(slope * (uncal_pred) + intercept)
                     else:
-                        slope, intercept = cal_dict[str(round(cal_max, self.bin_dist))]
+                        slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))]
                         cal_preds.append(slope * (uncal_pred) + intercept)
+
         return np.array(cal_preds)

-    def make_preds_core_library(self, psm_list=[], calibrate=True, mod_name=None):
+    def _make_preds_core_library(self, psm_list=None, calibrate=True, mod_name=None):
+        """Get predictions stored in library and calibrate them if needed."""
+        psm_list = [] if psm_list is None else psm_list
         ret_preds = []
         for psm in psm_list:
             ret_preds.append(LIBRARY[psm.peptidoform.proforma + "|" + mod_name])

         if calibrate:
-            try:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            if isinstance(self.calibrate_min, dict):
+                # if multiple models are used, use the model name to get the
+                # calibration values (DeepCallC mode)
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict[mod_name],
                     self.calibrate_min[mod_name],
                     self.calibrate_max[mod_name],
                 )
-            except:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            else:
+                # if only one model is used, use the same calibration values
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict,
                     self.calibrate_min,
                     self.calibrate_max,
                 )
+            # Convert to an array first: the PyGAM branch of _apply_calibration_core
+            # calls .reshape on its input
+            ret_preds = self._apply_calibration_core(
+                np.asarray(ret_preds), calibrate_dict, calibrate_min, calibrate_max
+            )
+
         return ret_preds

-    def make_preds_core(
+    def _make_preds_core(
         self,
-        X=[],
-        X_sum=[],
-        X_global=[],
-        X_hc=[],
-        psm_list=[],
+        X: np.ndarray | None = None,
+        X_sum: np.ndarray | None = None,
+        X_global: np.ndarray | None = None,
+        X_hc: np.ndarray | None = None,
         calibrate=True,
         mod_name=None,
-    ):
-        """
-        Make predictions for sequences
-        Parameters
-        ----------
-        seq_df : object :: pd.DataFrame
-            dataframe containing the sequences (column:seq), modifications
-            (column:modifications) and naming (column:index); will use parallel
-            by default!
-        seqs : list
-            peptide sequence list; should correspond to mods and identifiers
-        mods : list
-            naming of the mods; should correspond to seqs and identifiers
-        identifiers : list
-            identifiers of the peptides; should correspond to seqs and mods
-        calibrate : boolean
-            calibrate predictions or just return the predictions
-        correction_factor : float
-            correction factor to apply to predictions
-        mod_name : str or None
-            specify a model to use instead of the model assigned originally to
-            this instance of the object
-        Returns
-        -------
-        np.array
-            predictions
-        """
+    ) -> np.ndarray:
+        """Make predictions."""
+        # Check calibration state
         if calibrate:
             assert self.calibrate_dict, (
-                "DeepLC instance is not yet calibrated.\
-                Calibrate before making predictions, or use calibrate=False"
+                "DeepLC instance is not yet calibrated. Calibrate before making predictions, or "
+                "use `calibrate=False`"
             )

-        if len(X) == 0 and len(psm_list) > 0:
-            if self.verbose:
-                logger.debug("Extracting features for the CNN model ...")
-            X = self.do_f_extraction_psm_list_parallel(psm_list)
-
-            X_sum = np.stack(list(X["matrix_sum"].values()))
-            X_global = np.concatenate(
-                (
-                    np.stack(list(X["matrix_all"].values())),
-                    np.stack(list(X["pos_matrix"].values())),
-                ),
-                axis=1,
-            )
-            X_hc = np.stack(list(X["matrix_hc"].values()))
-            X = np.stack(list(X["matrix"].values()))
-        elif len(X) == 0 and len(psm_list) == 0:
-            return []
+        if X is None or len(X) == 0:
+            return np.array([])

         ret_preds = []
-
-        mod = load_model(mod_name)
-        try:
-            X
-            ret_preds = mod.predict(
-                [X, X_sum, X_global, X_hc],
-                batch_size=self.batch_num_tf,
-                verbose=int(self.verbose),
-            ).flatten()
-        except UnboundLocalError:
-            logger.debug("X is empty, skipping...")
-            ret_preds = []
+        model = load_model(mod_name)
+        ret_preds = model.predict(
+            [X, X_sum, X_global, X_hc],
+            batch_size=self.batch_num_tf,
+            verbose=int(self.verbose),
+        ).flatten()

         if calibrate:
-            try:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            if isinstance(self.calibrate_min, dict):
+                # if multiple models are used, use the model name to get the
+                # calibration values (DeepCallC mode)
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict[mod_name],
                     self.calibrate_min[mod_name],
                     self.calibrate_max[mod_name],
                 )
-            except:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            else:
+                # if only one model is used, use the same calibration values
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict,
                     self.calibrate_min,
                     self.calibrate_max,
                 )
-        # clear_session()
+            ret_preds = self._apply_calibration_core(
+                ret_preds, calibrate_dict, calibrate_min, calibrate_max
+            )
+
         gc.collect()
         return ret_preds

     def make_preds(
         self,
-        psm_list=None,
-        infile="",
-        calibrate=True,
-        seq_df=None,
-        mod_name=None,
+        psm_list: PSMList | None = None,
+        infile: str | Path | None = None,
+        seq_df: pd.DataFrame | None = None,
+        calibrate: bool = True,
+        mod_name: str | None = None,
     ):
         """
         Make predictions for sequences, in batches if required.
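+
+        An illustrative usage sketch (assumes a calibrated `DeepLC` instance `dlc`
+        and a `psm_utils` `PSMList` named `psm_list`; not an exact transcript):
+
+        >>> preds = dlc.make_preds(psm_list=psm_list)
+        >>> len(preds) == len(psm_list)
+        True
+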
Parameters ---------- - seq_df : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index); will use parallel - by default! - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods - calibrate : boolean - calibrate predictions or just return the predictions - correction_factor : float - correction factor to apply to predictions - mod_name : str or None - specify a model to use instead of the model assigned originally to - this instance of the object + psm_list + PSMList object containing the peptidoforms to predict for. + infile + Path to a file containing the peptidoforms to predict for. + seq_df + DataFrame containing the sequences (column:seq), modifications + (column:modifications) and naming (column:index). + calibrate + calibrate predictions or just return the predictions. + mod_name + specify a model to use instead of the model assigned originally to this instance of the + object. Returns ------- np.array predictions """ - if type(seq_df) == pd.core.frame.DataFrame: - list_of_psms = [] - if self.predict_ccs: - for seq, mod, ident, z in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df.index, - seq_df["charge"], - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - ) - ) + if psm_list is None: + if seq_df is not None: + psm_list = _dataframe_to_psm_list(seq_df) + elif infile is not None: + psm_list = _file_to_psm_list(infile) else: - for seq, mod, ident in zip(seq_df["seq"], seq_df["modifications"], seq_df.index): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=ident, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - - if len(infile) > 0: - psm_list = read_file(infile) - if "msms" in infile and ".txt" in infile: - mapper = pd.read_csv( - os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "unimod/map_mq_file.csv", - ), - index_col=0, - )["value"].to_dict() - psm_list.rename_modifications(mapper) + raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.") + + if len(psm_list) == 0: + logger.warning("No PSMs to predict for.") + return [] ret_preds_batches = [] for psm_list_t in divide_chunks(psm_list, self.batch_num): ret_preds = [] - if len(psm_list_t) > 0: - if self.verbose: - logger.debug("Extracting features for the CNN model ...") - - X = self.do_f_extraction_psm_list_parallel(psm_list_t) - X_sum = np.stack(list(X["matrix_sum"].values())) - X_global = np.concatenate( - ( - np.stack(list(X["matrix_all"].values())), - np.stack(list(X["pos_matrix"].values())), - ), - axis=1, - ) - X_hc = np.stack(list(X["matrix_hc"].values())) - X = np.stack(list(X["matrix"].values())) - else: - return [] - if isinstance(self.model, dict): - for m_group_name, m_name in self.model.items(): - ret_preds.append( - self.make_preds_core( - X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=m_name, - ) - ) - ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)]) - elif mod_name is not None: - ret_preds = self.make_preds_core( - X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=mod_name, - ) + # Extract features for the CNN model + features = self._extract_features(psm_list_t) + X_sum, X_global, X_hc, X = 
unpack_features(features)
+
+            # Check if model was provided, and if not, whether multiple models are selected in
+            # the DeepLC object or not.
+            if mod_name:
+                model_names = [mod_name]
+            elif isinstance(self.model, dict):
+                model_names = [m_name for m_group_name, m_name in self.model.items()]
             elif isinstance(self.model, list):
-                for m_name in self.model:
-                    ret_preds.append(
-                        self.make_preds_core(
+                model_names = self.model
+            elif isinstance(self.model, str):
+                model_names = [self.model]
+            else:
+                raise ValueError("Invalid model name provided.")
+
+            # Get predictions
+            if len(model_names) > 1:
+                # Iterate over models if multiple were selected
+                model_predictions = []
+                for model_name in model_names:
+                    model_predictions.append(
+                        self._make_preds_core(
                             X=X,
                             X_sum=X_sum,
                             X_global=X_global,
                             X_hc=X_hc,
                             calibrate=calibrate,
-                            mod_name=m_name,
+                            mod_name=model_name,
                         )
                     )
-                ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)])
+                # Average the predictions from all models
+                ret_preds = np.mean(model_predictions, axis=0)
+
             else:
-                ret_preds = self.make_preds_core(
+                # Use the single model
+                ret_preds = self._make_preds_core(
                     X=X,
                     X_sum=X_sum,
                     X_global=X_global,
                     X_hc=X_hc,
                     calibrate=calibrate,
-                    mod_name=self.model,
+                    mod_name=model_names[0],
                 )
-            ret_preds_batches.extend(ret_preds)
-        return ret_preds_batches
-        # TODO make this multithreaded
-        # should be possible with the batched list
+            ret_preds_batches.append(ret_preds)

-    def calibrate_preds_func_pygam(
-        self,
-        psm_list=None,
-        correction_factor=1.0,
-        seq_df=None,
-        measured_tr=None,
-        use_median=True,
-        mod_name=None,
-    ):
-        if type(seq_df) == pd.core.frame.DataFrame:
-            list_of_psms = []
-            # TODO include charge here
-            if self.predict_ccs:
-                for seq, mod, ident, tr, z in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                    seq_df["charge"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod, charge=z),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
-            else:
-                for seq, mod, ident, tr in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
-            psm_list = PSMList(psm_list=list_of_psms)
+        all_ret_preds = np.concatenate(ret_preds_batches, axis=0)

-        measured_tr = [psm.retention_time for psm in psm_list]
+        return all_ret_preds

-        predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name)
+    def _calibrate_preds_pygam(
+        self,
+        measured_tr: np.ndarray,
+        predicted_tr: np.ndarray,
+    ) -> tuple[float, float, list[BaseEstimator]]:
+        """Make calibration curve for predictions using PyGAM."""
+        logger.debug("Getting predictions for calibration...")
         # sort two lists, predicted and observed based on measured tr
         tr_sort = [
             (mtr, ptr)
-            for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1])
+            for mtr, ptr in sorted(
+                zip(measured_tr, predicted_tr, strict=True), key=lambda pair: pair[1]
+            )
         ]
         measured_tr = np.array([mtr for mtr, ptr in tr_sort], dtype=np.float32)
         predicted_tr = np.array([ptr for mtr, ptr in tr_sort], dtype=np.float32)

-        # predicted_tr = list(predicted_tr)
-        # measured_tr = list(measured_tr)
-
         # Fit a SplineTransformer model
         if self.deeplc_retrain:
             spline = SplineTransformer(degree=2, n_knots=10)
@@ -815,96 +622,19 @@ def calibrate_preds_func_pygam(
             [linear_model_left, spline_model, linear_model_right],
         )

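+    # For illustration, the [linear_left, spline, linear_right] triple returned
+    # above can be reproduced on toy data roughly as follows. Sketch only: the toy
+    # data and the 10% edge windows are assumptions, not DeepLC's exact fitting
+    # procedure (imports match this module's):
+    #
+    #     rng = np.random.default_rng(42)
+    #     predicted = np.sort(rng.uniform(0, 60, 200)).reshape(-1, 1)
+    #     measured = 1.3 * predicted.ravel() + 5 + rng.normal(0, 0.3, 200)
+    #
+    #     spline_model = make_pipeline(
+    #         SplineTransformer(degree=2, n_knots=10), LinearRegression()
+    #     )
+    #     spline_model.fit(predicted, measured)
+    #
+    #     # Linear fits on the outermost points take the role of linear_model_left
+    #     # and linear_model_right: extrapolation beyond the calibrated range.
+    #     n_edge = len(predicted) // 10
+    #     linear_left = LinearRegression().fit(predicted[:n_edge], measured[:n_edge])
+    #     linear_right = LinearRegression().fit(predicted[-n_edge:], measured[-n_edge:])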
- def calibrate_preds_func( + def _calibrate_preds_piecewise_linear( self, - psm_list=None, - correction_factor=1.0, - seq_df=None, - use_median=True, - mod_name=None, - ): - """ - Make calibration curve for predictions - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods - measured_tr : list - measured tr of the peptides; should correspond to seqs, identifiers, - and mods - correction_factor : float - correction factor that needs to be applied to the supplied measured - trs - seq_df : object :: pd.DataFrame - a pd.DataFrame that contains the sequences, modifications and - observed retention times to fit a calibration curve - use_median : boolean - flag to indicate we need to use the median valuein a window to - perform calibration - mod_name - specify a model to use instead of the model assigned originally to - this instance of the object - - Returns - ------- - float - the minimum value where a calibration curve was fitted, lower values - will be extrapolated from the minimum fit of the calibration curve - float - the maximum value where a calibration curve was fitted, higher values - will be extrapolated from the maximum fit of the calibration curve - dict - dictionary with keys for rounded tr, and the values concern a linear - model that should be applied to do calibration (!!! what is the - shape of this?) - """ - if type(seq_df) == pd.core.frame.DataFrame: - list_of_psms = [] - # TODO include charge here - if self.predict_ccs: - for seq, mod, tr, ident, z in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df["tr"], - seq_df.index, - seq_df["charge"], - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - retention_time=tr, - ) - ) - else: - for seq, mod, tr, ident in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df["tr"], - seq_df.index, - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=ident, - retention_time=tr, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - - measured_tr = [psm.retention_time for psm in psm_list] - - predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name) - + measured_tr: np.ndarray, + predicted_tr: np.ndarray, + use_median: bool = True, + ) -> tuple[float, float, dict[str, tuple[float]]]: + """Make calibration curve for predictions.""" # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) - for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1]) + for mtr, ptr in sorted( + zip(measured_tr, predicted_tr, strict=False), key=lambda pair: pair[1] + ) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort]) predicted_tr = np.array([ptr for mtr, ptr in tr_sort]) @@ -916,10 +646,9 @@ def calibrate_preds_func( calibrate_min = float("inf") calibrate_max = 0 - if self.verbose: - logger.debug( - "Selecting the data points for calibration (used to fit the linear models between)" - ) + logger.debug( + "Selecting the data points for calibration (used to fit the linear models between)" + ) # smooth between observed and predicted split_val = predicted_tr[-1] / self.split_cal @@ -948,14 +677,13 @@ def calibrate_preds_func( mtr_mean.append(sum(mtr) / len(mtr)) ptr_mean.append(sum(ptr) / len(ptr)) - if self.verbose: - logger.debug("Fitting the linear models between the 
points") + logger.debug("Fitting the linear models between the points") if self.split_cal >= len(measured_tr): raise CalibrationError( - "Not enough measured tr ({}) for the chosen number of splits ({}). " - "Choose a smaller split_cal parameter or provide more peptides for " - "fitting the calibration curve.".format(len(measured_tr), self.split_cal) + f"Not enough measured tr ({len(measured_tr)}) for the chosen number of splits " + f"({self.split_cal}). Choose a smaller split_cal parameter or provide more " + "peptides for fitting the calibration curve." ) if len(mtr_mean) == 0: raise CalibrationError("The measured tr list is empty, not able to calibrate") @@ -975,118 +703,84 @@ def calibrate_preds_func( # optimized predictions using a dict to find calibration curve very # fast for v in np.arange( - round(ptr_mean[i], self.bin_dist), - round(ptr_mean[i + 1], self.bin_dist), - 1 / ((self.bin_dist) * self.dict_cal_divider), + round(ptr_mean[i], self.bin_distance), + round(ptr_mean[i + 1], self.bin_distance), + 1 / ((self.bin_distance) * self.dict_cal_divider), ): if v < calibrate_min: calibrate_min = v if v > calibrate_max: calibrate_max = v - calibrate_dict[str(round(v, self.bin_dist))] = [ - slope, - intercept, - ] + calibrate_dict[str(round(v, self.bin_distance))] = (slope, intercept) return calibrate_min, calibrate_max, calibrate_dict def calibrate_preds( self, - psm_list=None, - infile="", - measured_tr=[], - correction_factor=1.0, - location_retraining_models="", - psm_utils_obj=None, - sample_for_calibration_curve=None, - seq_df=None, - use_median=True, + psm_list: PSMList | None = None, + infile: str | Path | None = None, + seq_df: pd.DataFrame | None = None, + measured_tr: np.ndarray | None = None, + location_retraining_models: str = "", + sample_for_calibration_curve: int | None = None, + use_median: bool = True, return_plotly_report=False, - ): + ) -> dict | None: """ Find best model and calibrate. Parameters ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods + psm_list + PSMList object containing the peptidoforms to predict for. + infile + Path to a file containing the peptidoforms to predict for. + seq_df + DataFrame containing the sequences (column:seq), modifications (column:modifications), + naming (column:index), and optionally charge (column:charge) and measured retention + time (column:tr). measured_tr : list - measured tr of the peptides; should correspond to seqs, identifiers, - and mods + Measured retention time used for calibration. Should correspond to the PSMs in the + provided PSMs. If None, the measured retention time is taken from the PSMs. correction_factor : float - correction factor that needs to be applied to the supplied measured - trs - seq_df : object :: pd.DataFrame - a pd.DataFrame that contains the sequences, modifications and - observed retention times to fit a calibration curve - use_median : boolean - flag to indicate we need to use the median valuein a window to - perform calibration + correction factor that needs to be applied to the supplied measured tr's + location_retraining_models + Location to save the retraining models; if None, a temporary directory is used. + sample_for_calibration_curve + Number of PSMs to sample for calibration curve; if None, all provided PSMs are used. 
+        use_median
+            Whether to use the median value in a window to perform calibration; only applies to
+            piecewise linear calibration, not to PyGAM calibration.
+        return_plotly_report
+            Whether to return a plotly report with the calibration results.

         Returns
         -------
+        dict | None
+            Dictionary with plotly report information or None.

         """
-        if type(seq_df) == pd.core.frame.DataFrame:
-            list_of_psms = []
-            if self.predict_ccs:
-                for seq, mod, ident, tr, z in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                    seq_df["charge"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod, charge=z),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
+        # Getting PSMs either from psm_list, seq_df, or infile
+        if psm_list is None:
+            if seq_df is not None:
+                psm_list = _dataframe_to_psm_list(seq_df)
+            elif infile is not None:
+                psm_list = _file_to_psm_list(infile)
             else:
-                for seq, mod, ident, tr in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
-            psm_list = PSMList(psm_list=list_of_psms)
-        elif psm_utils_obj:
-            psm_list = psm_utils_obj
+                raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.")

+        # Getting measured retention time either from measured_tr or provided PSMs
+        if measured_tr is None:
+            measured_tr = [psm.retention_time for psm in psm_list]
+            if None in measured_tr:
+                raise ValueError("Not all PSMs have an observed retention time.")
+
+        # Ensuring self.model is list of strings
         if isinstance(self.model, str):
             self.model = [self.model]

-        if len(infile) > 0:
-            psm_list = read_file(infile)
-            if "msms" in infile and ".txt" in infile:
-                mapper = pd.read_csv(
-                    os.path.join(
-                        os.path.dirname(os.path.realpath(__file__)),
-                        "unimod/map_mq_file.csv",
-                    ),
-                    index_col=0,
-                )["value"].to_dict()
-                psm_list.rename_modifications(mapper)
-
-        measured_tr = [psm.retention_time for psm in psm_list]
-
-        if self.verbose:
-            logger.debug("Start to calibrate predictions ...")
-        if self.verbose:
-            logger.debug("Ready to find the best model out of: %s" % (self.model))
+        logger.debug("Start to calibrate predictions ...")
+        logger.debug(f"Ready to find the best model out of: {self.model}")

         best_perf = float("inf")
         best_calibrate_min = 0.0
@@ -1101,7 +795,8 @@ def calibrate_preds(
         temp_pred = []

         if self.deeplc_retrain:
-            # The following code is not required in most cases, but here it is used to clear variables that might cause problems
+            # The following code is not required in most cases, but here it is used to clear
+            # variables that might cause problems
            _ = tf.Variable([1])

             context._context = None
@@ -1109,17 +804,14 @@ def calibrate_preds(

             tf.config.threading.set_inter_op_parallelism_threads(1)

-            if len(location_retraining_models) > 0:
+            if location_retraining_models:
+                t_dir_models = location_retraining_models
+                os.makedirs(t_dir_models, exist_ok=True)
+            else:
                 t_dir_models = TemporaryDirectory().name
                 os.mkdir(t_dir_models)
-            else:
-                t_dir_models = location_retraining_models
-                try:
-                    os.mkdir(t_dir_models)
-                except:
-                    pass

-            # Here we will apply transfer learning we specify previously trained models in the 'mods_transfer_learning'
+            # Here we apply transfer learning, specifying previously trained models in
+            # 'mods_transfer_learning'
             models = deeplcretrainer.retrain(
                 {"deeplc_transferlearn": psm_list},
                 outpath=t_dir_models,
@@ -1132,50 +824,37 @@ def calibrate_preds(

         self.model = models

-        if
isinstance(sample_for_calibration_curve, int): + # Limit calibration to a subset of PSMs if specified + if sample_for_calibration_curve: psm_list = random.sample(list(psm_list), sample_for_calibration_curve) measured_tr = [psm.retention_time for psm in psm_list] - for m in self.model: - if self.verbose: - logger.debug("Trying out the following model: %s" % (m)) + for model_name in self.model: + logger.debug(f"Trying out the following model: {model_name}") + predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=model_name) + if self.pygam_calibration: - calibrate_output = self.calibrate_preds_func_pygam( - psm_list, - measured_tr=measured_tr, - correction_factor=correction_factor, - seq_df=seq_df, - use_median=use_median, - mod_name=m, - ) + calibrate_output = self._calibrate_preds_pygam(measured_tr, predicted_tr) else: - calibrate_output = self.calibrate_preds_func( - psm_list, - correction_factor=correction_factor, - seq_df=seq_df, - use_median=use_median, - mod_name=m, + calibrate_output = self._calibrate_preds_piecewise_linear( + measured_tr, predicted_tr, use_median=use_median ) + self.calibrate_min, self.calibrate_max, self.calibrate_dict = calibrate_output + # TODO: Currently, calibration dict can be both a dict (linear) or a list of models + # (PyGAM)... This should be handled better in the future. - ( - self.calibrate_min, - self.calibrate_max, - self.calibrate_dict, - ) = calibrate_output - - if type(self.calibrate_dict) == dict: - if len(self.calibrate_dict.keys()) == 0: - continue - - m_name = m.split("/")[-1] + # Skip this model if calibrate_dict is empty + # TODO: Should this do something when using PyGAM and calibrate_dict is a list? + if isinstance(self.calibrate_dict, dict) and len(self.calibrate_dict.keys()) == 0: + continue - preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=m) + m_name = model_name.split("/")[-1] - if self.deepcallc_mod: - m_group_name = "deepcallc" - else: - m_group_name = "_".join(m_name.split("_")[:-1]) + # Get new predictions with calibration + preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=model_name) + m_group_name = "deepcallc" if self.deepcallc_mod else "_".join(m_name.split("_")[:-1]) + m = model_name try: pred_dict[m_group_name][m] = preds mod_dict[m_group_name][m] = m @@ -1195,23 +874,19 @@ def calibrate_preds( mod_calibrate_min_dict[m_group_name][m] = self.calibrate_min mod_calibrate_max_dict[m_group_name][m] = self.calibrate_max - for m_name in pred_dict.keys(): - preds = [sum(a) / len(a) for a in zip(*list(pred_dict[m_name].values()))] + for m_name in pred_dict: + preds = [sum(a) / len(a) for a in zip(*list(pred_dict[m_name].values()), strict=True)] if len(measured_tr) == 0: perf = sum(abs(seq_df["tr"] - preds)) else: perf = sum(abs(np.array(measured_tr) - np.array(preds))) - if self.verbose: - logger.debug("For %s model got a performance of: %s" % (m_name, perf / len(preds))) + logger.debug(f"For {m_name} model got a performance of: {perf / len(preds)}") if perf < best_perf: - if self.deepcallc_mod: - m_group_name = "deepcallc" - else: - m_group_name = m_name - # TODO is deepcopy really required? + m_group_name = "deepcallc" if self.deepcallc_mod else m_name + # TODO is deepcopy really required? 
+                best_calibrate_dict = copy.deepcopy(mod_calibrate_dict[m_group_name])
                best_calibrate_min = copy.deepcopy(mod_calibrate_min_dict[m_group_name])
                best_calibrate_max = copy.deepcopy(mod_calibrate_max_dict[m_group_name])
@@ -1228,18 +903,18 @@ def calibrate_preds(
         self.model = best_model

         if self.deepcallc_mod:
-            self.deepcallc_model = train_en(pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"])
-
-        # self.n_jobs = 1
+            self.deepcallc_model = train_elastic_net(
+                pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"]
+            )

-        logger.debug("Model with the best performance got selected: %s" % (best_model))
+        logger.debug(f"Model with the best performance got selected: {best_model}")

         if return_plotly_report:
             import deeplc.plot

             plotly_return_dict = {}

             plotly_df = pd.DataFrame(
-                list(zip(temp_obs, temp_pred)),
+                list(zip(temp_obs, temp_pred, strict=True)),
                 columns=[
                     "Observed retention time",
                     "Predicted retention time",
@@ -1249,32 +924,10 @@ def calibrate_preds(
             plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline(plotly_df)

             return plotly_return_dict
-        return {}
-
-    def split_seq(self, a, n):
-        """
-        Split a list (a) into multiple chunks (n)
-
-        Parameters
-        ----------
-        a : list
-            list to split
-        n : list
-            number of chunks
-
-        Returns
-        -------
-        list
-            chunked list
-        """
-
-        # since chunking is not alway possible do the modulo of residues
-        k, m = divmod(len(a), n)
-        result = (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
-        return result
+        return None


-def _get_pool(n_jobs: int) -> multiprocessing.pool.Pool | multiprocessing.dummy.Pool:
+def _get_pool(n_jobs: int) -> multiprocessing.Pool | multiprocessing.dummy.Pool:  # type: ignore
     """Get a Pool object for parallel processing."""
     if multiprocessing.current_process().daemon:
         logger.warning(
@@ -1301,22 +954,29 @@ def _lists_to_psm_list(
     modifications: list[str | None],
     identifiers: list[str],
     charges: list[int] | None,
+    retention_times: list[float] | None = None,
     n_jobs: int = 1,
 ) -> PSMList:
-    """Convert lists of sequences, modifications, identifiers, and charges into a PSMList."""
+    """Convert lists into a PSMList using Dask for parallel processing."""
     if not charges:
         charges = [None] * len(sequences)

+    if not retention_times:
+        retention_times = [None] * len(sequences)
+
     def create_psm(args):
-        sequence, modifications, identifier, charge = args
+        sequence, modifications, identifier, charge, retention_time = args
         return PSM(
             peptidoform=peprec_to_proforma(sequence, modifications, charge=charge),
             spectrum_id=identifier,
+            retention_time=retention_time,
         )

-    args_list = list(zip(sequences, modifications, identifiers, charges, strict=True))
-    with _get_pool(n_jobs) as pool:
-        list_of_psms = pool.map(create_psm, args_list)
+    args_list = list(
+        zip(sequences, modifications, identifiers, charges, retention_times, strict=True)
+    )
+    tasks = [delayed(create_psm)(args) for args in args_list]
+    # `num_workers` keeps the processes scheduler bounded by the requested n_jobs
+    list_of_psms = list(compute(*tasks, scheduler="processes", num_workers=n_jobs))
     return PSMList(psm_list=list_of_psms)


@@ -1332,11 +992,27 @@ def _dataframe_to_psm_list(
     sequences = dataframe["seq"]
     modifications = dataframe["modifications"]
     identifiers = dataframe.index
-
-    if not charges:
-        if "charge" in dataframe.columns:
-            charges = dataframe["charge"]
-        else:
-            charges = [None] * len(sequences)
-
-    return _lists_to_psm_list(sequences, modifications, identifiers, charges, n_jobs=n_jobs)
+    retention_times = dataframe["tr"] if "tr" in dataframe.columns else None
+
+    if not charges and "charge" in dataframe.columns:
+        charges = dataframe["charge"]
dataframe["charge"] + + return _lists_to_psm_list( + sequences, modifications, identifiers, charges, retention_times, n_jobs=n_jobs + ) + + +def _file_to_psm_list(input_file: str | Path) -> PSMList: + """Read a file into a PSMList, optionally mapping MaxQuant modifications labels.""" + psm_list = read_file(input_file) + if "msms" in input_file and ".txt" in input_file: + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() + psm_list.rename_modifications(mapper) + + return psm_list diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py index 087baa1..ecda640 100644 --- a/deeplc/feat_extractor.py +++ b/deeplc/feat_extractor.py @@ -1,651 +1,233 @@ -""" -This code is used to extract features for peptides used in DeepLC (or seperately -if required). +"""Feature extraction for DeepLC.""" -For the library versions see the .yml file -""" +from __future__ import annotations -__author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__credits__ = [ - "Robbin Bouwmeester", - "Ralf Gabriels", - "Arthur Declercq", - "Prof. Lennart Martens", - "Sven Degroeve", -] -__license__ = "Apache License, Version 2.0" -__maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"] - -# Native imports -import os -import math -import time -from configparser import ConfigParser -import ast -from re import sub import logging +import warnings +from re import sub import numpy as np -import pandas as pd -from psm_utils.io.peptide_record import peprec_to_proforma -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList +from psm_utils.psm import Peptidoform from pyteomics import mass logger = logging.getLogger(__name__) -class FeatExtractor: - """ - Place holder, fill later - - Parameters - ---------- - - Returns - ------- - - """ - - def __init__( - self, - main_path=os.path.dirname(os.path.realpath(__file__)), - lib_path_mod=os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/"), - lib_aa_composition=os.path.join( - os.path.dirname(os.path.realpath(__file__)), "aa_comp_rel.csv" - ), - split_size=7, - verbose=True, - include_specific_posses=[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - -1, - -2, - -3, - -4, - -5, - -6, - -7, - ], - add_sum_feat=False, - ptm_add_feat=False, - ptm_subtract_feat=False, - add_rolling_feat=False, - include_unnormalized=True, - add_comp_feat=False, - cnn_feats=True, - ignore_mods=False, - config_file=None, - ): - # if a config file is defined overwrite standard parameters - if config_file: - cparser = ConfigParser() - cparser.read(config_file) - lib_path_mod = cparser.get("featExtractor", "lib_path_mod").strip('"') - split_size = cparser.getint("featExtractor", "split_size") - verbose = cparser.getboolean("featExtractor", "verbose") - add_sum_feat = cparser.getboolean("featExtractor", "add_sum_feat") - ptm_add_feat = cparser.getboolean("featExtractor", "ptm_add_feat") - ptm_subtract_feat = cparser.getboolean("featExtractor", "ptm_subtract_feat") - add_rolling_feat = cparser.getboolean("featExtractor", "add_rolling_feat") - include_unnormalized = cparser.getboolean("featExtractor", "include_unnormalized") - include_specific_posses = ast.literal_eval( - cparser.get("featExtractor", "include_specific_posses") - ) - - self.main_path = main_path - - # Get the atomic composition of AAs - self.lib_aa_composition = self.get_aa_composition(lib_aa_composition) - - self.split_size = split_size - self.verbose = 
verbose - - self.add_sum_feat = add_sum_feat - self.ptm_add_feat = ptm_add_feat - self.ptm_subtract_feat = ptm_subtract_feat - self.add_rolling_feat = add_rolling_feat - self.cnn_feats = cnn_feats - self.include_unnormalized = include_unnormalized - self.include_specific_posses = include_specific_posses - self.add_comp_feat = add_comp_feat - self.ignore_mods = ignore_mods - - def __str__(self): - return """ - _____ _ _____ __ _ _ _ - | __ \ | | / ____| / _| | | | | | | - | | | | ___ ___ _ __ | | | | ______| |_ ___ __ _| |_ _____ _| |_ _ __ __ _ ___| |_ ___ _ __ - | | | |/ _ \/ _ \ '_ \| | | | |______| _/ _ \/ _` | __| / _ \ \/ / __| '__/ _` |/ __| __/ _ \| '__| - | |__| | __/ __/ |_) | |___| |____ | || __/ (_| | |_ | __/> <| |_| | | (_| | (__| || (_) | | - |_____/ \___|\___| .__/|______\_____| |_| \___|\__,_|\__| \___/_/\_\\__|_| \__,_|\___|\__\___/|_| - | | - |_| - """ - - def get_aa_composition(self, file_loc): - """ - Read amino acid atomic composition and return a dictionary - - Parameters - ---------- - file_loc : str - location of the (csv) file that contains the atomic compositions of AAs. The first column must contain - the one-letter AA code. The remaining columns contain counts for each atom (each atom in seperate - column). An example is: - - aa,C,H,N,O,S - A,1,2,0,0,0 - R,4,9,3,0,0 - N,2,3,1,1,0 - - Returns - ------- - dict - dictionary that goes from one-letter AA to atomic composition - """ - return pd.read_csv(file_loc, index_col=0).T.to_dict() - - def split_seq(self, a, n): - """ - Split a list (a) into multiple chunks (n) - - Parameters - ---------- - a : list - list to split - n : list - number of chunks - - Returns - ------- - list - chunked list - """ - # since chunking is not alway possible do the modulo of residues - k, m = divmod(len(a), n) - return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) - - def calc_feats_mods(self, formula): - """ - Chemical formula to atom addition/subtraction - - Parameters - ---------- - formula : str - chemical formula - - Returns - ------- - list - atom naming - list - number of atom added/subtracted - """ - if not formula: - return [], [] - if len(str(formula)) == 0: - return [], [] - if not isinstance(formula, str): - if math.isnan(formula): - return [], [] - - new_atoms = [] - new_num_atoms = [] - for atom in formula.split(" "): - if "(" not in atom: - atom_symbol = atom - num_atom = 1 - else: - atom_symbol = atom.split("(")[0] - num_atom = atom.split("(")[1].rstrip(")") - new_atoms.append(atom_symbol) - new_num_atoms.append(int(num_atom)) - return new_atoms, new_num_atoms - - def get_feats_mods( - self, - seqs, - mods, - identifiers, - split_size=False, - atoms_order=set(["H", "C", "N", "O", "P", "S"]), - add_str="_sum", - subtract_mods=False, - ): - """ - Chemical formula to atom addition/subtraction - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : str - identifiers of the peptides; should correspond to seqs and mods - split_size : int - overwrite the set split size if needed - atoms_order : set - atoms to include and the order - add_str : str - add this substring to feature naming - subtract_mods : boolean - calculate the atom that are substracted in the PTM - - Returns - ------- - object :: pd.DataFrame - feature matrix for peptide PTMs - """ - if not split_size: - split_size = self.split_size - if self.verbose: - t0 = time.time() - mod_dict = {} - 
look_up_mod_subtract = {} - look_up_mod_add = {} - - len_init = len([ao + str(spl_s) for spl_s in range(split_size) for ao in atoms_order]) - for index_name, mod, seq in zip(identifiers, mods, seqs): - mod_dict[index_name] = dict( - zip( - [ - ao + str(spl_s) + add_str - for spl_s in range(split_size) - for ao in atoms_order - ], - [0] * len_init, - ) - ) - - if not mod: - continue - if len(str(mod)) == 0: - continue - if not isinstance(mod, str): - if math.isnan(mod): - continue - - if self.verbose: - logger.debug("Time to calculate mod features: %s seconds" % (time.time() - t0)) - df_ret = pd.DataFrame(mod_dict, dtype=int).T - del mod_dict - return df_ret - - def encode_atoms( - self, - psm_list, - indexes, - charges=[], - predict_ccs=False, - padding_length=60, - positions=set([0, 1, 2, 3, -1, -2, -3, -4]), - positions_pos=set([0, 1, 2, 3]), - positions_neg=set([-1, -2, -3, -4]), - sum_mods=2, - dict_aa={ - "K": 0, - "R": 1, - "P": 2, - "T": 3, - "N": 4, - "A": 5, - "Q": 6, - "V": 7, - "S": 8, - "G": 9, - "I": 10, - "L": 11, - "C": 12, - "M": 13, - "H": 14, - "F": 15, - "Y": 16, - "W": 17, - "E": 18, - "D": 19, - }, - dict_index_pos={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - dict_index_all={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - dict_index={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - ): - """ - Extract all features we can extract... Probably the function you want to call by default - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods_all : list - naming of the mods; should correspond to seqs and identifiers - indexes : list - identifiers of the peptides; should correspond to seqs and mods - padding_length : int - indicates padding length with 'X'-characters. Shorter sequences are padded. Longer sequences - are sliced shorter (C-terminal > than padding length will be missing) - positions : list - list of positions to include separately, for the C-terminus - provide negative indices - sum_mods : int - value that is used to feed the second head of cerberus with summed information, for example, - a value of 2 would sum the composition of each 2 AAs (i.e. 1+2, 3+4, 5+6 ...) - dict_index_pos : dict - index position of atom for positional features - dict_index_all : dict - index position of atom for overall compositional features - dict_index : dict - index position of atom for compositional features for the whole peptide (each position) - charges : list - optional list with charges, keep empty if these will not effect the predicted value - - Returns - ------- - object :: pd.DataFrame - feature matrix (np.matrix) of all positions (up till padding length) - object :: pd.DataFrame - feature matrix (np.matrix) of summed positions (up till padding length / sum_mods) - object :: pd.DataFrame - feature matrix (np.matrix) of specific positions (from positions argument) - object :: pd.DataFrame - feature matrix (np.matrix) of summed composition - """ - - # Local helper to ensure each unique warning is logged only once. 
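For reference, the warn-once pattern removed in this hunk can be written as a standalone helper. A minimal sketch, not part of this patch, that trades the explicit set below for functools.lru_cache:

import functools
import logging

logger = logging.getLogger(__name__)

@functools.lru_cache(maxsize=None)
def warn_once(message: str) -> None:
    # lru_cache short-circuits repeat calls with the same message,
    # so each unique warning is logged at most once per process.
    logger.warning(message)

The rewritten module relies on warnings.warn instead, which under the default filter also shows each distinct warning only once per call site.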
- logged_warnings = set() - - def warn_once(message): - if message not in logged_warnings: - logged_warnings.add(message) - logger.warning(message) - - # TODO param flag for CCS prediction - def rolling_sum(a, n=2): - ret = np.cumsum(a, axis=1, dtype=np.float32) - ret[:, n:] = ret[:, n:] - ret[:, :-n] - return ret[:, n - 1 :] - - ret_list = {} - ret_list["matrix"] = {} - ret_list["matrix_sum"] = {} - ret_list["matrix_all"] = {} - ret_list["pos_matrix"] = {} - ret_list["matrix_hc"] = {} - - # Iterate over all instances - for psm, row_index in zip(psm_list, indexes): - peptidoform = psm.peptidoform - seq = peptidoform.sequence - seq_len = len(seq) - charge = psm.get_precursor_charge() - - # For now anything longer than padding length is cut away - # (C-terminal cutting) - if seq_len > padding_length: - seq = seq[0:padding_length] - seq_len = len(seq) - warn_once("Truncating peptide (too long): %s" % (seq)) - - peptide_composition = [mass.std_aa_comp[aa] for aa in seq] - - # Initialize all feature matrices - matrix = np.zeros((padding_length, len(dict_index.keys())), dtype=np.float16) - matrix_hc = np.zeros((padding_length, len(dict_aa.keys())), dtype=np.float16) - matrix_pos = np.zeros((len(positions), len(dict_index.keys())), dtype=np.float16) - - for i, position_composition in enumerate(peptide_composition): - for k, v in position_composition.items(): - try: - matrix[i, dict_index[k]] = v - except IndexError: - warn_once( - f"Could not add the following value: pos {i} for atom {k} with value {v}" - ) - except KeyError: - warn_once( - f"Could not add the following value: pos {i} for atom {k} with value {v}" - ) - - for p in positions_pos: - try: - aa = seq[p] - except: - warn_once(f"Unable to get the following position: {p}") - continue - for atom, val in mass.std_aa_comp[aa].items(): - try: - matrix_pos[p, dict_index_pos[atom]] = val - except KeyError: - warn_once(f"Could not add the following atom: {atom}") - except IndexError: - warn_once(f"Could not add the following atom: {p} {atom} {val}") - - for pn in positions_neg: - try: - aa = seq[seq_len + pn] - except: - warn_once(f"Unable to get the following position: {p}") - continue - for atom, val in mass.std_aa_comp[aa].items(): - try: - matrix_pos[pn, dict_index_pos[atom]] = val - except KeyError: - warn_once(f"Could not add the following atom: {atom}") - except IndexError: - warn_once(f"Could not add the following atom: {pn} {atom} {val}") - - for i, peptide_position in enumerate(peptidoform.parsed_sequence): - try: - matrix_hc[i, dict_aa[peptide_position[0]]] = 1.0 - except KeyError: - warn_once(f"Skipping the following (not in library): {i} {peptide_position}") - except IndexError: - warn_once(f"Could not add the following atom: {i} {peptide_position}") - - if peptide_position[1] is not None: - try: - modification_composition = peptide_position[1][0].composition - except KeyError: - warn_once( - f"Skipping the following (not in library): {peptide_position[1]}" - ) - continue - except: - warn_once( - f"Skipping the following (not in library): {peptide_position[1]}" - ) - continue - - for ( - atom_position_composition, - atom_change, - ) in modification_composition.items(): - try: - matrix[i, dict_index[atom_position_composition]] += atom_change - if i in positions: - matrix_pos[ - i, - dict_index_pos[atom_position_composition], - ] += atom_change - elif i - seq_len in positions: - matrix_pos[ - i - seq_len, - dict_index_pos[atom_position_composition], - ] += atom_change - except KeyError: - try: - warn_once( - f"Could not add the 
following atom: {atom_position_composition}, attempting to replace the [] part" - ) - atom_position_composition = sub( - "\[.*?\]", "", atom_position_composition - ) - matrix[i, dict_index[atom_position_composition]] += atom_change - if i in positions: - matrix_pos[ - i, - dict_index_pos[atom_position_composition], - ] += atom_change - elif i - seq_len in positions: - matrix_pos[ - i - seq_len, - dict_index_pos[atom_position_composition], - ] += atom_change - except KeyError: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - except: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - except IndexError: - warn_once( - f"Could not add the following atom: {i} {atom_position_composition} {atom_change}" - ) - except: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - - matrix_all = np.sum(matrix, axis=0) - matrix_all = np.append(matrix_all, seq_len) - - if predict_ccs: - matrix_all = np.append(matrix_all, (seq.count("H")) / float(seq_len)) - matrix_all = np.append( - matrix_all, - (seq.count("F") + seq.count("W") + seq.count("Y")) / float(seq_len), - ) - matrix_all = np.append( - matrix_all, - (seq.count("D") + seq.count("E")) / float(seq_len), - ) - matrix_all = np.append( - matrix_all, - (seq.count("K") + seq.count("R")) / float(seq_len), - ) - matrix_all = np.append(matrix_all, charge) - - matrix_sum = rolling_sum(matrix.T, n=2)[:, ::2].T - - ret_list["matrix"][row_index] = matrix - ret_list["matrix_sum"][row_index] = matrix_sum - ret_list["matrix_all"][row_index] = matrix_all - ret_list["pos_matrix"][row_index] = matrix_pos.flatten() - ret_list["matrix_hc"][row_index] = matrix_hc - - return ret_list - - def full_feat_extract( - self, - psm_list=[], - seqs=[], - mods=[], - predict_ccs=False, - identifiers=[], - charges=[], - ): - """ - Extract all features we can extract... 
Probably the function your want to call by default - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : str - identifiers of the peptides; should correspond to seqs and mods - charges : list - optional list with charges, keep emtpy if these will not effect the predicted value - - Returns - ------- - pd.DataFrame - feature matrix - """ - # TODO Reintroduce for CCS, check CCS flag - if len(seqs) > 0: - list_of_psms = [] - for seq, mod, id in zip(seqs, mods, identifiers): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=id, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - - if self.verbose: - t0 = time.time() - - # TODO pass CCS flag - if self.add_sum_feat: - if self.verbose: - logger.debug("Extracting compositional sum features for modifications") - X_feats_sum = self.get_feats_mods(psm_list, split_size=1, add_str="_sum") - if self.ptm_add_feat: - if self.verbose: - logger.debug("Extracting compositional add features for modifications") - X_feats_add = self.get_feats_mods(psm_list, split_size=self.split_size, add_str="_add") - if self.ptm_subtract_feat: - if self.verbose: - logger.debug("Extracting compositional subtract features for modifications") - X_feats_neg = self.get_feats_mods( - psm_list, - split_size=self.split_size, - add_str="_subtract", - subtract_mods=True, - ) - if self.cnn_feats: - if self.verbose: - logger.debug("Extracting CNN features") - X_cnn = self.encode_atoms( - psm_list, - list(range(len(psm_list))), - charges=charges, - predict_ccs=predict_ccs, - ) - - if self.cnn_feats: - X = X_cnn - if self.add_sum_feat: +# fmt: off +DEFAULT_POSITIONS: set[int] = {0, 1, 2, 3, -1, -2, -3, -4} +DEFAULT_POSITIONS_POS: set[int] = {0, 1, 2, 3} +DEFAULT_POSITIONS_NEG: set[int] = {-1, -2, -3, -4} +DEFAULT_DICT_AA: dict[str, int] = { + "K": 0, "R": 1, "P": 2, "T": 3, "N": 4, "A": 5, "Q": 6, "V": 7, "S": 8, "G": 9, "I": 10, + "L": 11, "C": 12, "M": 13, "H": 14, "F": 15, "Y": 16, "W": 17, "E": 18, "D": 19, +} +DEFAULT_DICT_INDEX_POS: dict[str, int] = {"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5} +DEFAULT_DICT_INDEX: dict[str, int] = {"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5} +# fmt: on + + +def _truncate_sequence(seq: str, max_length: int) -> tuple[str, int]: + """Truncate the sequence if it exceeds the max_length.""" + if len(seq) > max_length: + warnings.warn(f"Truncating peptide (too long): {seq}", stacklevel=2) + seq = seq[:max_length] + return seq, len(seq) + + +def _fill_standard_matrix(seq: str, padding_length: int, dict_index: dict[str, int]) -> np.ndarray: + """Fill the standard composition matrix using mass.std_aa_comp.""" + mat = np.zeros((padding_length, len(dict_index)), dtype=np.float16) + for i, aa in enumerate(seq): + for atom, value in mass.std_aa_comp[aa].items(): try: - X = pd.concat([X, X_feats_sum], axis=1) - except BaseException: - X = X_feats_sum - if self.ptm_add_feat: + mat[i, dict_index[atom]] = value + except (KeyError, IndexError): + warnings.warn(f"Skipping atom {atom} at pos {i}", stacklevel=2) + return mat + + +def _fill_onehot_matrix( + parsed_seq: list, padding_length: int, dict_aa: dict[str, int] +) -> np.ndarray: + """Fill a one-hot matrix based on the parsed sequence tokens.""" + onehot = np.zeros((padding_length, len(dict_aa)), dtype=np.float16) + for i, token in enumerate(parsed_seq): + try: + onehot[i, dict_aa[token[0]]] = 1.0 + except (KeyError, 
IndexError): + warnings.warn(f"One-hot skip: {i} {token}", stacklevel=2) + return onehot + + +def _fill_pos_matrix( + seq: str, + seq_len: int, + positions_pos: set[int], + positions_neg: set[int], + dict_index: dict[str, int], + dict_index_pos: dict[str, int], +) -> np.ndarray: + """Fill positional matrix for atoms at specific positions.""" + pos_total = positions_pos.union(positions_neg) + pos_mat = np.zeros((max(pos_total) - min(pos_total) + 1, len(dict_index)), dtype=np.float16) + # For positive positions + for pos in positions_pos: + try: + aa = seq[pos] + except Exception: + warnings.warn(f"Unable to get pos {pos}", stacklevel=2) + continue + for atom, value in mass.std_aa_comp[aa].items(): try: - X = pd.concat([X, X_feats_add], axis=1) - except BaseException: - X = X_feats_add - if self.ptm_subtract_feat: + # shift index for matrix row since positions may be negative. + pos_mat[pos - min(pos_total), dict_index_pos[atom]] = value + except (KeyError, IndexError): + warnings.warn(f"Pos matrix skip: {atom} at pos {pos}", stacklevel=2) + # For negative positions + for pos in positions_neg: + try: + aa = seq[seq_len + pos] + except Exception: + warnings.warn(f"Unable to get pos {pos}", stacklevel=2) + continue + for atom, value in mass.std_aa_comp[aa].items(): try: - X = pd.concat([X, X_feats_neg], axis=1) - except BaseException: - X = X_feats_neg - - if self.verbose: - logger.debug("Time to calculate all features: %s seconds" % (time.time() - t0)) - return X - - -def main(verbose=True): - f_extractor = FeatExtractor(config_file="config.ini") - df = pd.read_csv("parse_pride/seqs_exp.csv") - df.index = ["Pep_" + str(dfi) for dfi in df.index] - print(f_extractor.full_feat_extract(df["seq"], df["modifications"], df.index)) - + pos_mat[pos - min(pos_total), dict_index_pos[atom]] = value + except (KeyError, IndexError): + warnings.warn(f"Pos matrix skip: {atom} at neg pos {pos}", stacklevel=2) + return pos_mat + + +def _apply_modifications( + mat: np.ndarray, + pos_mat: np.ndarray, + parsed_seq: list, + seq_len: int, + dict_index: dict[str, int], + dict_index_pos: dict[str, int], + positions: set[int], +) -> None: + """Apply modification changes to the matrices.""" + for i, token in enumerate(parsed_seq): + if token[1] is None: + continue + try: + mod_comp = token[1][0].composition + except Exception: + warnings.warn(f"Skipping mod at pos {i}: {token[1]}", stacklevel=2) + continue + for atom_comp, change in mod_comp.items(): + try: + mat[i, dict_index[atom_comp]] += change + if i in positions: + pos_mat[i, dict_index_pos[atom_comp]] += change + elif (i - seq_len) in positions: + pos_mat[i - seq_len, dict_index_pos[atom_comp]] += change + except KeyError: + try: + warnings.warn(f"Replacing pattern for atom: {atom_comp}", stacklevel=2) + atom_comp_clean = sub(r"\[.*?\]", "", atom_comp) + mat[i, dict_index[atom_comp_clean]] += change + if i in positions: + pos_mat[i, dict_index_pos[atom_comp_clean]] += change + elif (i - seq_len) in positions: + pos_mat[i - seq_len, dict_index_pos[atom_comp_clean]] += change + except KeyError: + warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2) + continue + except IndexError: + warnings.warn(f"Index error for atom {atom_comp} at pos {i}", stacklevel=2) + + +def _compute_rolling_sum(matrix: np.ndarray, n: int = 2) -> np.ndarray: + """Compute a rolling sum over the matrix.""" + ret = np.cumsum(matrix, axis=1, dtype=np.float32) + ret[:, n:] = ret[:, n:] - ret[:, :-n] + return ret[:, n - 1 :] + + +def encode_peptidoform( + peptidoform: Peptidoform 
| str, + predict_ccs: bool = False, + padding_length: int = 60, + positions: set[int] | None = None, + positions_pos: set[int] | None = None, + positions_neg: set[int] | None = None, + dict_aa: dict[str, int] | None = None, + dict_index_pos: dict[str, int] | None = None, + dict_index: dict[str, int] | None = None, +) -> dict[str, np.ndarray]: + """Extract features from a single peptidoform.""" + positions = positions or DEFAULT_POSITIONS + positions_pos = positions_pos or DEFAULT_POSITIONS_POS + positions_neg = positions_neg or DEFAULT_POSITIONS_NEG + dict_aa = dict_aa or DEFAULT_DICT_AA + dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS + dict_index = dict_index or DEFAULT_DICT_INDEX + + if isinstance(peptidoform, str): + peptidoform = Peptidoform(peptidoform) + seq = peptidoform.sequence + charge = peptidoform.precursor_charge + seq, seq_len = _truncate_sequence(seq, padding_length) + + std_matrix = _fill_standard_matrix(seq, padding_length, dict_index) + onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa) + pos_matrix = _fill_pos_matrix( + seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos + ) + _apply_modifications( + std_matrix, + pos_matrix, + peptidoform.parsed_sequence, + seq_len, + dict_index, + dict_index_pos, + positions, + ) + + matrix_all = np.sum(std_matrix, axis=0) + matrix_all = np.append(matrix_all, seq_len) + if predict_ccs: + matrix_all = np.append(matrix_all, (seq.count("H")) / seq_len) + matrix_all = np.append( + matrix_all, (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len + ) + matrix_all = np.append(matrix_all, (seq.count("D") + seq.count("E")) / seq_len) + matrix_all = np.append(matrix_all, (seq.count("K") + seq.count("R")) / seq_len) + matrix_all = np.append(matrix_all, charge) + + matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T + + return { + "matrix": std_matrix, + "matrix_sum": matrix_sum, + "matrix_all": matrix_all, + "pos_matrix": pos_matrix.flatten(), + "matrix_hc": onehot_matrix, + } + + +def aggregate_encodings( + encodings: list[dict[str, np.ndarray]], +) -> dict[str, dict[int, np.ndarray]]: + """Aggregate list of encodings into single dictionary.""" + return {key: {i: enc[key] for i, enc in enumerate(encodings)} for key in encodings[0]} + + +def unpack_features( + features: dict[str, np.ndarray], +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Unpack dictionary with features to numpy arrays.""" + X_sum = np.stack(list(features["matrix_sum"].values())) + X_global = np.concatenate( + ( + np.stack(list(features["matrix_all"].values())), + np.stack(list(features["pos_matrix"].values())), + ), + axis=1, + ) + X_hc = np.stack(list(features["matrix_hc"].values())) + X_main = np.stack(list(features["matrix"].values())) -if __name__ == "__main__": - main() + return X_sum, X_global, X_hc, X_main diff --git a/deeplc/trainl3.py b/deeplc/trainl3.py index f4bd276..1682485 100644 --- a/deeplc/trainl3.py +++ b/deeplc/trainl3.py @@ -28,7 +28,7 @@ _has_sklearn = True -def train_en(X, y, n_jobs=16, cv=None): +def train_elastic_net(X, y, n_jobs=16, cv=None): """ Function that trains Layer 3 of CALLC (elastic net) diff --git a/examples/deeplc_example.py b/examples/deeplc_example.py index 25a2cf4..2a44b2b 100644 --- a/examples/deeplc_example.py +++ b/examples/deeplc_example.py @@ -1,6 +1,7 @@ # Imports import pandas as pd from matplotlib import pyplot as plt + from deeplc import DeepLC, FeatExtractor if __name__ == "__main__": @@ -59,11 +60,11 @@ print("a") # Make 
predictions; calibrated and uncalibrated - preds_cal = dlc.make_preds(seq_df=df) + preds_cal = dlc._make_preds(seq_df=df) print("b") - preds_uncal = dlc.make_preds(seq_df=df, calibrate=False) + preds_uncal = dlc._make_preds(seq_df=df, calibrate=False) print("c") diff --git a/pyproject.toml b/pyproject.toml index 47b0895..b345d1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "deeplc" -version = "3.1.8" +version = "4.0.0-dev1" description = "DeepLC: Retention time prediction for (modified) peptides using Deep Learning." readme = "README.md" license = { file = "LICENSE" } diff --git a/tests/test_deepcallc.py b/tests/test_deepcallc.py index 71297db..798be6e 100644 --- a/tests/test_deepcallc.py +++ b/tests/test_deepcallc.py @@ -1,7 +1,8 @@ import pandas as pd -from deeplc import DeepLC from matplotlib import pyplot as plt +from deeplc import DeepLC + def main(): peptide_file = "temp_data/PXD005573_mcp.csv" @@ -73,7 +74,7 @@ def main(): ) dlc.calibrate_preds(seq_df=cal_df) - preds = dlc.make_preds(seq_df=cal_df) + preds = dlc._make_preds(seq_df=cal_df) plt.scatter(cal_df["tr"], preds) plt.show() diff --git a/tests/test_deeplc_lib.py b/tests/test_deeplc_lib.py index e9d6b51..c984196 100644 --- a/tests/test_deeplc_lib.py +++ b/tests/test_deeplc_lib.py @@ -1,7 +1,8 @@ import pandas as pd -from deeplc import DeepLC from matplotlib import pyplot as plt +from deeplc import DeepLC + def main(): peptide_file = "examples/datasets/test_pred.csv" @@ -18,7 +19,7 @@ def main(): # use_library="lib.csv", # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) - preds = dlc.make_preds(seq_df=cal_df) + preds = dlc._make_preds(seq_df=cal_df) plt.scatter(pep_df["tr"], preds) plt.show() diff --git a/tests/test_deeplc_retrain.py b/tests/test_deeplc_retrain.py index 8c09330..365819b 100644 --- a/tests/test_deeplc_retrain.py +++ b/tests/test_deeplc_retrain.py @@ -1,7 +1,8 @@ import pandas as pd -from deeplc import DeepLC from matplotlib import pyplot as plt +from deeplc import DeepLC + def main(): peptide_file = "examples/datasets/test_pred.csv" @@ -18,7 +19,7 @@ def main(): # use_library="lib.csv", # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) - preds = dlc.make_preds(seq_df=cal_df) + preds = dlc._make_preds(seq_df=cal_df) plt.scatter(cal_df["tr"], preds) plt.show() diff --git a/tests/test_feat_extractor.py b/tests/test_feat_extractor.py new file mode 100644 index 0000000..8ecc39e --- /dev/null +++ b/tests/test_feat_extractor.py @@ -0,0 +1,93 @@ +import numpy as np +from psm_utils.psm import Peptidoform + +from deeplc.feat_extractor import encode_peptidoform + + +def _check_result_structure(result: dict[str, np.ndarray]) -> None: + expected_keys = {"matrix", "matrix_sum", "matrix_all", "pos_matrix", "matrix_hc"} + assert expected_keys.issubset(result.keys()) + + +def test_encode_peptidoform_without_modification(): + # Test using a simple peptidoform string without modifications. + peptide = "ACDE/2" + result = encode_peptidoform(peptide) + _check_result_structure(result) + + # Default padding_length is 60 and there are 6 atoms in DEFAULT_DICT_INDEX. + assert result["matrix"].shape == (60, 6) + # matrix_all is formed by summing matrix (length 6) and appending the sequence length. + # So without CCS prediction, matrix_all length should be 7. + assert result["matrix_all"].shape == (7,) + + # The sequence is taken from the part before the '/'. + expected_seq = "ACDE" + # The last element of matrix_all should be the sequence length. 
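The 7-element expectation asserted above can also be derived from the module constants rather than hard-coded. A quick sketch, assuming the deeplc.feat_extractor layout introduced in this patch:

from deeplc.feat_extractor import DEFAULT_DICT_INDEX, encode_peptidoform

n_atoms = len(DEFAULT_DICT_INDEX)  # 6 atoms: C, H, N, O, S, P
result = encode_peptidoform("ACDE/2")
assert result["matrix_all"].shape == (n_atoms + 1,)  # atom sums + sequence length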
+ assert result["matrix_all"][-1] == len(expected_seq) + + +def test_encode_peptidoform_with_modification(): + # Test using a peptidoform string with a modification. + peptide = "AC[Carbamidomethyl]DE/2" + result = encode_peptidoform(peptide) + _check_result_structure(result) + + # Check basic structure as before. + assert result["matrix"].shape == (60, 6) + expected_seq = "ACDE" # Modification does not alter the base sequence. + assert result["matrix_all"][-1] == len(expected_seq) + + +def test_encode_peptidoform_with_ccs_prediction(): + # Test with predict_ccs=True. + peptide = "ACDE/2" + result = encode_peptidoform(peptide, predict_ccs=True) + _check_result_structure(result) + + # Without predict_ccs, matrix_all has 7 elements (6 sums + sequence length). + # With predict_ccs, 5 additional values are appended (ratios and charge), + # resulting in a total length of 12. + assert result["matrix_all"].shape == (12,) + + # Verify that the last element (charge) matches the precursor charge from Peptidoform. + pf = Peptidoform(peptide) + assert result["matrix_all"][-1] == pf.precursor_charge + + +def test_encode_peptidoform_feature_values() -> None: + """ + Test that the returned feature values match the expected results + for a known peptidoform. + """ + peptide = "ACDE/2" + result = encode_peptidoform(peptide, predict_ccs=False, padding_length=60) + + # For peptide "ACDE", using DEFAULT_DICT_INDEX = {"C":0,"H":1,"N":2,"O":3,"S":4,"P":5} + # Based on pyteomics.mass.std_aa_comp, the expected composition is: + # For 'A': { "C": 3, "H": 5, "N": 1, "O": 1 } -> row0: [3,5,1,1,0,0] + # For 'C': { "C": 3, "H": 5, "N": 1, "O": 1, "S": 1 } -> row1: [3,5,1,1,1,0] + # For 'D': { "C": 4, "H": 5, "N": 1, "O": 3 } -> row2: [4,5,1,3,0,0] + # For 'E': { "C": 5, "H": 7, "N": 1, "O": 3 } -> row3: [5,7,1,3,0,0] + # + # Sum each column over positions 0 to 3: + # C: 3+3+4+5 = 15 + # H: 5+5+5+7 = 22 + # N: 1+1+1+1 = 4 + # O: 1+1+3+3 = 8 + # S: 0+1+0+0 = 1 + # P: 0+0+0+0 = 0 + # Then sequence length appended: 4 + expected_matrix_all = np.array([15, 22, 4, 8, 1, 0, 4], dtype=np.float32) + + # Check matrix_all values using np.allclose. + assert np.allclose(result["matrix_all"], expected_matrix_all), ( + f"Expected matrix_all: {expected_matrix_all}, got: {result['matrix_all']}" + ) + + # Also check the first row of the matrix for letter 'A'. 
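The expected row for 'A' can be cross-checked against pyteomics directly. A minimal sketch using the atom order of DEFAULT_DICT_INDEX:

from pyteomics import mass

# The alanine residue is C3H5N1O1, with no sulfur or phosphorus.
row_A = [mass.std_aa_comp["A"].get(atom, 0) for atom in ("C", "H", "N", "O", "S", "P")]
assert row_A == [3, 5, 1, 1, 0, 0]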
+ # Expected for 'A': [3,5,1,1,0,0] + expected_row_A = np.array([3, 5, 1, 1, 0, 0], dtype=np.float16) + assert np.allclose(result["matrix"][0][:6], expected_row_A), ( + f"Expected row for A: {expected_row_A}, got: {result['matrix'][0][:6]}" + ) From ccc6ae33ae096d9cbb81390208aa24af18302e3d Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 24 Apr 2025 11:21:17 +0200 Subject: [PATCH 4/6] Add dask dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3a2f23b..5acfd42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ keywords = [ ] dependencies = [ + "dask>2023" "torch>=2.6.0,<3", "onnx2torch>=0.1.5", "numpy>=1.17", From 57db4c85ea9f39270f7ca66df09dff5915ee9351 Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 24 Apr 2025 11:22:29 +0200 Subject: [PATCH 5/6] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5acfd42..49df3f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ keywords = [ ] dependencies = [ - "dask>2023" + "dask>2023", "torch>=2.6.0,<3", "onnx2torch>=0.1.5", "numpy>=1.17", From cc965d684c6ef1dee92df9d03a680b0f50d2efdb Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 24 Apr 2025 11:33:29 +0200 Subject: [PATCH 6/6] Read function to prepare matrices --- deeplc/deeplc.py | 149 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 119 insertions(+), 30 deletions(-) diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index bf5af33..22f1b92 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -112,7 +112,9 @@ def __init__(self, X, X_sum, X_global, X_hc, target=None): self.X_hc = torch.from_numpy(X_hc).float() if target is not None: - self.target = torch.from_numpy(target).float() # Add target values if provided + self.target = torch.from_numpy( + target + ).float() # Add target values if provided else: self.target = None # If no target is provided, set it to None @@ -212,7 +214,8 @@ def fine_tune(self): val_loader = self.prepare_data(val_dataset, shuffle=False) optimizer = torch.optim.Adam( - filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.learning_rate + filter(lambda p: p.requires_grad, self.model.parameters()), + lr=self.learning_rate, ) loss_fn = torch.nn.L1Loss() best_model_wts = copy.deepcopy(self.model.state_dict()) @@ -241,7 +244,9 @@ def fine_tune(self): for batch in val_loader: batch_X, batch_X_sum, batch_X_global, batch_X_hc, target = batch target = target.view(-1, 1) - outputs = self.model(batch_X, batch_X_sum, batch_X_global, batch_X_hc) + outputs = self.model( + batch_X, batch_X_sum, batch_X_global, batch_X_hc + ) val_loss += loss_fn(outputs, target).item() avg_val_loss = val_loss / len(val_loader) @@ -415,7 +420,9 @@ def __str__(self): """ @staticmethod - def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) -> list[str]: + def _get_model_paths( + passed_model_path: str | None, single_model_mode: bool + ) -> list[str]: """Get the model paths based on the passed model path and the single model mode.""" if passed_model_path: return [passed_model_path] @@ -425,6 +432,35 @@ def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) -> return DEFAULT_MODELS + def _prepare_feature_matrices(self, psm_list): + """ + Extract features in parallel and assemble the four input matrices. 
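This helper mirrors unpack_features from the rewritten deeplc.feat_extractor, differing only in return order. A sketch of the equivalent composition, assuming psm_list is a psm_utils PSMList as elsewhere in this module:

from deeplc.feat_extractor import (
    aggregate_encodings,
    encode_peptidoform,
    unpack_features,
)

encodings = [encode_peptidoform(str(psm.peptidoform)) for psm in psm_list]
X_sum, X_global, X_hc, X_main = unpack_features(aggregate_encodings(encodings))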
+ + Parameters + ---------- + psm_list : list of PSM + List of peptide‐spectrum matches for which to extract features. + + Returns + ------- + X : ndarray, shape (n_peptides, n_features) + X_sum : ndarray, shape (n_peptides, n_sum_features) + X_global : ndarray, shape (n_peptides, n_global_features * 2) + X_hc : ndarray, shape (n_peptides, n_hc_features) + """ + feats = self.do_f_extraction_psm_list_parallel(psm_list) + X = np.stack(list(feats["matrix"].values())) + X_sum = np.stack(list(feats["matrix_sum"].values())) + X_global = np.concatenate( + ( + np.stack(list(feats["matrix_all"].values())), + np.stack(list(feats["pos_matrix"].values())), + ), + axis=1, + ) + X_hc = np.stack(list(feats["matrix_hc"].values())) + return X, X_sum, X_global, X_hc + def _extract_features( self, peptidoforms: list[str | Peptidoform] | PSMList, @@ -437,16 +473,21 @@ def _extract_features( logger.debug("Running feature extraction in single-threaded mode...") if self.n_jobs <= 1: encodings = [ - encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in peptidoforms + encode_peptidoform(pf, predict_ccs=self.predict_ccs) + for pf in peptidoforms ] else: logger.debug("Preparing feature extraction with Dask") # Process peptidoforms in larger chunks to reduce task overhead. - peptidoform_strings = [str(pep) for pep in peptidoforms] # Faster pickling of strings + peptidoform_strings = [ + str(pep) for pep in peptidoforms + ] # Faster pickling of strings def chunked_encode(chunk): - return [encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in chunk] + return [ + encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in chunk + ] tasks = [ delayed(chunked_encode)(peptidoform_strings[i : i + chunk_size]) @@ -454,7 +495,9 @@ def chunked_encode(chunk): ] logger.debug("Starting feature extraction with Dask") - chunks_encodings = compute(*tasks, scheduler="processes", workers=self.n_jobs) + chunks_encodings = compute( + *tasks, scheduler="processes", workers=self.n_jobs + ) # Flatten the list of lists. encodings = [enc for chunk in chunks_encodings for enc in chunk] @@ -486,7 +529,9 @@ def _apply_calibration_core( # Use spline model within the range of X within_range = (uncal_preds >= cal_min) & (uncal_preds <= cal_max) - within_range = within_range.ravel() # Ensure this is a 1D array for proper indexing + within_range = ( + within_range.ravel() + ) # Ensure this is a 1D array for proper indexing # Create a prediction array initialized with spline predictions cal_preds = np.copy(y_pred_spline) @@ -501,19 +546,27 @@ def _apply_calibration_core( else: for uncal_pred in uncal_preds: try: - slope, intercept = cal_dict[str(round(uncal_pred, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(uncal_pred, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) except KeyError: # outside of the prediction range ... 
use the last # calibration curve if uncal_pred <= cal_min: - slope, intercept = cal_dict[str(round(cal_min, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(cal_min, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) elif uncal_pred >= cal_max: - slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(cal_max, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) else: - slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(cal_max, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) return np.array(cal_preds) @@ -648,7 +701,9 @@ def make_preds( elif infile is not None: psm_list = _file_to_psm_list(infile) else: - raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.") + raise ValueError( + "Either `psm_list` or `seq_df` or `infile` must be provided." + ) if len(psm_list) == 0: logger.warning("No PSMs to predict for.") @@ -692,7 +747,9 @@ def make_preds( ) ) # Average the predictions from all models - ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds, strict=True)]) + ret_preds = np.array( + [sum(a) / len(a) for a in zip(*ret_preds, strict=True)] + ) # ret_preds = np.mean(model_predictions, axis=0) else: @@ -740,7 +797,9 @@ def _calibrate_preds_pygam( spline_model = linear_model linear_model_right = linear_model else: - spline = SplineTransformer(degree=4, n_knots=int(len(measured_tr) / 500) + 5) + spline = SplineTransformer( + degree=4, n_knots=int(len(measured_tr) / 500) + 5 + ) spline_model = make_pipeline(spline, LinearRegression()) spline_model.fit(predicted_tr.reshape(-1, 1), measured_tr) @@ -832,9 +891,13 @@ def _calibrate_preds_piecewise_linear( "peptides for fitting the calibration curve." ) if len(mtr_mean) == 0: - raise CalibrationError("The measured tr list is empty, not able to calibrate") + raise CalibrationError( + "The measured tr list is empty, not able to calibrate" + ) if len(ptr_mean) == 0: - raise CalibrationError("The predicted tr list is empty, not able to calibrate") + raise CalibrationError( + "The predicted tr list is empty, not able to calibrate" + ) # calculate calibration curves for i in range(0, len(ptr_mean)): @@ -913,7 +976,9 @@ def calibrate_preds( elif infile is not None: psm_list = _file_to_psm_list(infile) else: - raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.") + raise ValueError( + "Either `psm_list` or `seq_df` or `infile` must be provided." 
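The same three mutually exclusive inputs are accepted by make_preds and calibrate_preds alike. A usage sketch; the DataFrame, PSMList, and file name are placeholders:

from deeplc import DeepLC

dlc = DeepLC()
dlc.calibrate_preds(seq_df=cal_df)               # peprec-style DataFrame with a "tr" column
preds = dlc.make_preds(psm_list=psm_list)        # psm_utils PSMList
preds = dlc.make_preds(infile="peptides.csv")    # any file psm_utils can read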
+ ) # Getting measured retention time either from measured_tr or provided PSMs if not measured_tr: @@ -946,7 +1011,9 @@ def calibrate_preds( X, X_sum, X_global, X_hc = self._prepare_feature_matrices(psm_list) dataset = DeepLCDataset(X, X_sum, X_global, X_hc, np.array(measured_tr)) - base_model_path = self.model[0] if isinstance(self.model, list) else self.model + base_model_path = ( + self.model[0] if isinstance(self.model, list) else self.model + ) base_model = torch.load( base_model_path, weights_only=False, map_location=torch.device("cpu") ) @@ -982,29 +1049,42 @@ def calibrate_preds( for model_name in self.model: logger.debug(f"Trying out the following model: {model_name}") - predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=model_name) + predicted_tr = self.make_preds( + psm_list, calibrate=False, mod_name=model_name + ) if self.pygam_calibration: - calibrate_output = self._calibrate_preds_pygam(measured_tr, predicted_tr) + calibrate_output = self._calibrate_preds_pygam( + measured_tr, predicted_tr + ) else: calibrate_output = self._calibrate_preds_piecewise_linear( measured_tr, predicted_tr, use_median=use_median ) - self.calibrate_min, self.calibrate_max, self.calibrate_dict = calibrate_output + self.calibrate_min, self.calibrate_max, self.calibrate_dict = ( + calibrate_output + ) # TODO: Currently, calibration dict can be both a dict (linear) or a list of models # (PyGAM)... This should be handled better in the future. # Skip this model if calibrate_dict is empty # TODO: Should this do something when using PyGAM and calibrate_dict is a list? - if isinstance(self.calibrate_dict, dict) and len(self.calibrate_dict.keys()) == 0: + if ( + isinstance(self.calibrate_dict, dict) + and len(self.calibrate_dict.keys()) == 0 + ): continue m_name = model_name.split("/")[-1] # Get new predictions with calibration - preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=model_name) + preds = self.make_preds( + psm_list, calibrate=True, seq_df=seq_df, mod_name=model_name + ) - m_group_name = "deepcallc" if self.deepcallc_mod else "_".join(m_name.split("_")[:-1]) + m_group_name = ( + "deepcallc" if self.deepcallc_mod else "_".join(m_name.split("_")[:-1]) + ) m = model_name try: pred_dict[m_group_name][m] = preds @@ -1026,13 +1106,18 @@ def calibrate_preds( mod_calibrate_max_dict[m_group_name][m] = self.calibrate_max for m_name in pred_dict: - preds = [sum(a) / len(a) for a in zip(*list(pred_dict[m_name].values()), strict=True)] + preds = [ + sum(a) / len(a) + for a in zip(*list(pred_dict[m_name].values()), strict=True) + ] if len(measured_tr) == 0: perf = sum(abs(seq_df["tr"] - preds)) else: perf = sum(abs(np.array(measured_tr) - np.array(preds))) - logger.debug(f"For {m_name} model got a performance of: {perf / len(preds)}") + logger.debug( + f"For {m_name} model got a performance of: {perf / len(preds)}" + ) if perf < best_perf: m_group_name = "deepcallc" if self.deepcallc_mod else m_name @@ -1072,7 +1157,9 @@ def calibrate_preds( ], ) plotly_return_dict["scatter"] = deeplc.plot.scatter(plotly_df) - plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline(plotly_df) + plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline( + plotly_df + ) return plotly_return_dict return None @@ -1124,7 +1211,9 @@ def create_psm(args): ) args_list = list( - zip(sequences, modifications, identifiers, charges, retention_times, strict=True) + zip( + sequences, modifications, identifiers, charges, retention_times, strict=True + ) ) tasks = 
[delayed(create_psm)(args) for args in args_list] list_of_psms = list(compute(*tasks, scheduler="processes"))
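For readers unfamiliar with the Dask pattern used here and in _extract_features: delayed wraps each call into a lazy task, and compute evaluates the tasks on a process pool, returning results in task order. A self-contained sketch; the square worker is illustrative only:

from dask import compute, delayed

def square(x: int) -> int:
    return x * x

if __name__ == "__main__":
    # One lazy task per input; compute() runs them on the "processes"
    # scheduler and returns the results in submission order.
    tasks = [delayed(square)(x) for x in range(8)]
    results = list(compute(*tasks, scheduler="processes"))
    assert results == [x * x for x in range(8)]

One task per item keeps the code simple but pays scheduler overhead per PSM; _extract_features amortizes that cost by encoding chunk_size peptidoforms per task, which is usually the better trade-off for many small work items.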