From f3a996264236382686f849a4ae127d2134f3f848 Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 27 Mar 2025 13:08:36 +0100 Subject: [PATCH 1/6] Formatting with Ruff --- deeplc/__init__.py | 10 ++- deeplc/__main__.py | 112 ++++++++++++++++++---------- deeplc/_argument_parser.py | 21 ++---- deeplc/_exceptions.py | 1 + deeplc/deeplc.py | 133 +++++++++++++++------------------- deeplc/feat_extractor.py | 94 +++++++++++------------- deeplc/gui.py | 7 +- deeplc/plot.py | 25 +++---- deeplc/trainl3.py | 6 +- examples/deeplc_example.py | 32 ++++---- pyproject.toml | 10 +++ streamlit/deeplc_streamlit.py | 9 ++- streamlit/streamlit_utils.py | 2 +- tests/test_deepcallc.py | 118 ++++++++++++++++-------------- tests/test_deeplc.py | 10 ++- tests/test_deeplc_lib.py | 22 +++--- tests/test_deeplc_retrain.py | 22 +++--- 17 files changed, 327 insertions(+), 307 deletions(-) diff --git a/deeplc/__init__.py b/deeplc/__init__.py index 953aa36..0b3c79f 100644 --- a/deeplc/__init__.py +++ b/deeplc/__init__.py @@ -1,14 +1,16 @@ -import sys +__all__ = ["DeepLC", "FeatExtractor"] +import sys -if sys.version_info >= (3,8): +if sys.version_info >= (3, 8): from importlib.metadata import version - __version__ = version('deeplc') + + __version__ = version("deeplc") else: import pkg_resources + __version__ = pkg_resources.require("deeplc")[0].version from deeplc.deeplc import DeepLC from deeplc.feat_extractor import FeatExtractor - diff --git a/deeplc/__main__.py b/deeplc/__main__.py index eb6079f..6c26032 100644 --- a/deeplc/__main__.py +++ b/deeplc/__main__.py @@ -1,7 +1,12 @@ """Main command line interface to DeepLC.""" __author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__credits__ = ["Robbin Bouwmeester", "Ralf Gabriels", "Prof. Lennart Martens", "Sven Degroeve"] +__credits__ = [ + "Robbin Bouwmeester", + "Ralf Gabriels", + "Prof. Lennart Martens", + "Sven Degroeve", +] __license__ = "Apache License, Version 2.0" __maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] __email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"] @@ -26,27 +31,28 @@ def setup_logging(passed_level): log_mapping = { - 'critical': logging.CRITICAL, - 'error': logging.ERROR, - 'warning': logging.WARNING, - 'info': logging.INFO, - 'debug': logging.DEBUG, + "critical": logging.CRITICAL, + "error": logging.ERROR, + "warning": logging.WARNING, + "info": logging.INFO, + "debug": logging.DEBUG, } if passed_level.lower() not in log_mapping: print( "Invalid log level. 
Should be one of the following: ", - ', '.join(log_mapping.keys()) + ", ".join(log_mapping.keys()), ) exit(1) logging.basicConfig( stream=sys.stdout, - format='%(asctime)s - %(levelname)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=log_mapping[passed_level.lower()] + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=log_mapping[passed_level.lower()], ) + def main(gui=False): """Main function for the CLI.""" argu = parse_arguments(gui=gui) @@ -55,13 +61,13 @@ def main(gui=False): # Reset logging levels if DEBUG (see deeplc.py) if argu.log_level.lower() == "debug": - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' - logging.getLogger('tensorflow').setLevel(logging.DEBUG) - warnings.filterwarnings('default', category=DeprecationWarning) - warnings.filterwarnings('default', category=FutureWarning) - warnings.filterwarnings('default', category=UserWarning) + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" + logging.getLogger("tensorflow").setLevel(logging.DEBUG) + warnings.filterwarnings("default", category=DeprecationWarning) + warnings.filterwarnings("default", category=FutureWarning) + warnings.filterwarnings("default", category=UserWarning) else: - os.environ['KMP_WARNINGS'] = '0' + os.environ["KMP_WARNINGS"] = "0" try: run(**vars(argu)) @@ -101,13 +107,13 @@ def run( for fm in file_model: if len(sel_group) == 0: sel_group = "_".join(fm.split("_")[:-1]) - fm_dict[sel_group]= fm + fm_dict[sel_group] = fm continue m_group = "_".join(fm.split("_")[:-1]) if m_group == sel_group: fm_dict[m_group] = fm file_model = fm_dict - + with open(file_pred) as f: first_line_pred = f.readline().strip() if file_cal: @@ -118,49 +124,70 @@ def run( # Read input files df_pred = pd.read_csv(file_pred) if len(df_pred.columns) < 2: - df_pred = pd.read_csv(file_pred,sep=" ") + df_pred = pd.read_csv(file_pred, sep=" ") df_pred = df_pred.fillna("") file_pred = "" list_of_psms = [] - for seq,mod,ident in zip(df_pred["seq"],df_pred["modifications"],df_pred.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) + for seq, mod, ident in zip(df_pred["seq"], df_pred["modifications"], df_pred.index): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident)) psm_list_pred = PSMList(psm_list=list_of_psms) df_pred = None else: psm_list_pred = read_file(file_pred) if "msms" in file_pred and ".txt" in file_pred: - mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() psm_list_pred.rename_modifications(mapper) # Allow for calibration file to be empty (undefined), fill in if/elif if present psm_list_cal = [] - if "modifications" in first_line_cal.split(",") and "seq" in first_line_cal.split(",") and file_cal: + if ( + "modifications" in first_line_cal.split(",") + and "seq" in first_line_cal.split(",") + and file_cal + ): df_cal = pd.read_csv(file_cal) if len(df_cal.columns) < 2: - df_cal = pd.read_csv(df_cal,sep=" ") + df_cal = pd.read_csv(df_cal, sep=" ") df_cal = df_cal.fillna("") file_cal = "" list_of_psms = [] - for seq,mod,ident,tr in zip(df_cal["seq"],df_cal["modifications"],df_cal.index,df_cal["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + for seq, mod, ident, tr in zip( + df_cal["seq"], df_cal["modifications"], 
df_cal.index, df_cal["tr"] + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=ident, + retention_time=tr, + ) + ) psm_list_cal = PSMList(psm_list=list_of_psms) df_cal = None elif file_cal: psm_list_cal = read_file(file_cal) if "msms" in file_cal and ".txt" in file_cal: - mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() psm_list_cal.rename_modifications(mapper) # Make a feature extraction object; you can skip this if you do not want to # use the default settings for DeepLC. Here we want to use a model that does # not use RDKit features so we skip the chemical descriptor making # procedure. - f_extractor = FeatExtractor( - cnn_feats=True, - verbose=verbose - ) - + f_extractor = FeatExtractor(cnn_feats=True, verbose=verbose) + # Make the DeepLC object that will handle making predictions and calibration dlc = DeepLC( path_model=file_model, @@ -173,9 +200,9 @@ def run( batch_num=batch_num, n_jobs=n_threads, verbose=verbose, - deeplc_retrain=transfer_learning + deeplc_retrain=transfer_learning, ) - + # Calibrate the original model based on the new retention times if len(psm_list_cal) > 0: logger.info("Selecting best model and calibrating predictions...") @@ -187,14 +214,19 @@ def run( if len(psm_list_cal) > 0: preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred) else: - preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred, calibrate=False) - - #df_pred["predicted_tr"] = preds + preds = dlc.make_preds( + seq_df=df_pred, + infile=file_pred, + psm_list=psm_list_pred, + calibrate=False, + ) + + # df_pred["predicted_tr"] = preds logger.info("Writing predictions to file: %s", file_pred_out) - - file_pred_out = open(file_pred_out,"w") + + file_pred_out = open(file_pred_out, "w") file_pred_out.write("Sequence proforma,predicted retention time\n") - for psm,tr in zip(psm_list_pred,preds): + for psm, tr in zip(psm_list_pred, preds): file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n") file_pred_out.close() diff --git a/deeplc/_argument_parser.py b/deeplc/_argument_parser.py index 09875cf..e12f526 100644 --- a/deeplc/_argument_parser.py +++ b/deeplc/_argument_parser.py @@ -74,17 +74,13 @@ def parse_arguments(gui=False): parser = ArgumentParser( prog="DeepLC", - description=( - "Retention time prediction for (modified) peptides using deep " "learning." 
- ), + description=("Retention time prediction for (modified) peptides using deep learning."), usage="deeplc [OPTIONS] --file_pred ", formatter_class=lambda prog: HelpFormatter(prog, max_help_position=42), add_help=False, ) - io_args = parser.add_argument_group( - "Input and output files", **gooey_args["io_args"] - ) + io_args = parser.add_argument_group("Input and output files", **gooey_args["io_args"]) io_args.add_argument( "--file_pred", required=True, @@ -97,9 +93,7 @@ def parse_arguments(gui=False): type=str, default=None, metavar="Input peptides for calibration" if gui else "", - help=( - "path to peptide CSV file with retention times to use for " "calibration" - ), + help=("path to peptide CSV file with retention times to use for calibration"), **gooey_args["file_cal"], ) io_args.add_argument( @@ -166,10 +160,7 @@ def parse_arguments(gui=False): dest="split_cal", default=50, metavar="split cal" if gui else "", - help=( - "number of splits in the chromatogram for piecewise linear " - "calibration fit" - ), + help=("number of splits in the chromatogram for piecewise linear calibration fit"), **gooey_args["split_cal"], ) model_cal_args.add_argument( @@ -265,8 +256,6 @@ def parse_arguments(gui=False): results = parser.parse_args() if not results.file_pred_out: - results.file_pred_out = ( - os.path.splitext(results.file_pred)[0] + "_deeplc_predictions.csv" - ) + results.file_pred_out = os.path.splitext(results.file_pred)[0] + "_deeplc_predictions.csv" return results diff --git a/deeplc/_exceptions.py b/deeplc/_exceptions.py index 1be41ae..c3cde3a 100644 --- a/deeplc/_exceptions.py +++ b/deeplc/_exceptions.py @@ -1,5 +1,6 @@ """DeepLC exceptions.""" + class DeepLCError(Exception): pass diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index ab49568..dcaa146 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -16,7 +16,6 @@ "Sven Degroeve", ] - # Default models, will be used if no other is specified. If no best model is # selected during calibration, the first model in the list will be used. import os @@ -44,10 +43,9 @@ from itertools import chain from tempfile import TemporaryDirectory -from sklearn.preprocessing import SplineTransformer from sklearn.linear_model import LinearRegression from sklearn.pipeline import make_pipeline - +from sklearn.preprocessing import SplineTransformer # If CLI/GUI/frozen: disable Tensorflow info and warnings before importing IS_CLI_GUI = os.path.basename(sys.argv[0]) in ["deeplc", "deeplc-gui"] @@ -233,9 +231,7 @@ def __init__( try: tf.config.threading.set_intra_op_parallelism_threads(n_jobs) except RuntimeError: - logger.warning( - "DeepLC tried to set intra op threads, but was unable to do so." 
- ) + logger.warning("DeepLC tried to set intra op threads, but was unable to do so.") if "NUMEXPR_MAX_THREADS" not in os.environ: os.environ["NUMEXPR_MAX_THREADS"] = str(n_jobs) @@ -300,20 +296,19 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]): if not self.predict_ccs: for seq, mod, ident in zip(seqs, mods, identifiers): - list_of_psms.append( - PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident) - ) + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) else: for seq, mod, ident, z in zip(seqs, mods, identifiers, charges): list_of_psms.append( - PSM(peptide=peprec_to_proforma(seq, mod, z), spectrum_id=ident) + PSM( + peptide=peprec_to_proforma(seq, mod, z), + spectrum_id=ident, + ) ) psm_list = PSMList(psm_list=list_of_psms) - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs - ) + return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) def do_f_extraction_pd(self, df_instances, charges=[]): """ @@ -336,11 +331,11 @@ def do_f_extraction_pd(self, df_instances, charges=[]): list_of_psms = [] if len(charges) == 0: for seq, mod, ident in zip( - df_instances["seq"], df_instances["modifications"], df_instances.index + df_instances["seq"], + df_instances["modifications"], + df_instances.index, ): - list_of_psms.append( - PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident) - ) + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) else: for seq, mod, ident, z in zip( df_instances["seq"], @@ -357,9 +352,7 @@ def do_f_extraction_pd(self, df_instances, charges=[]): psm_list = PSMList(psm_list=list_of_psms) - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs - ) + return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) def do_f_extraction_pd_parallel(self, df_instances): """ @@ -414,9 +407,7 @@ def do_f_extraction_psm_list(self, psm_list): pd.DataFrame feature matrix """ - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs - ) + return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) def do_f_extraction_psm_list_parallel(self, psm_list): """ @@ -454,9 +445,7 @@ def do_f_extraction_psm_list_parallel(self, psm_list): logger.debug("got feature extraction results") else: logger.debug("start feature extraction") - all_feats_async = pool.map_async( - self.do_f_extraction_psm_list, psm_list_split - ) + all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split) logger.debug("wait for feature extraction") all_feats_async.wait() @@ -465,9 +454,7 @@ def do_f_extraction_psm_list_parallel(self, psm_list): matrix_names = res[0].keys() all_feats = { matrix_name: dict( - enumerate( - chain.from_iterable((v[matrix_name].values() for v in res)) - ) + enumerate(chain.from_iterable((v[matrix_name].values() for v in res))) ) for matrix_name in matrix_names } @@ -493,9 +480,7 @@ def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max): # Use spline model within the range of X within_range = (uncal_preds >= cal_min) & (uncal_preds <= cal_max) - within_range = ( - within_range.ravel() - ) # Ensure this is a 1D array for proper indexing + within_range = within_range.ravel() # Ensure this is a 1D array for proper indexing # Create a prediction array initialized with spline predictions cal_preds = np.copy(y_pred_spline) @@ -586,10 +571,10 @@ def make_preds_core( predictions """ if calibrate: - assert ( - self.calibrate_dict - 
), "DeepLC instance is not yet calibrated.\ + assert self.calibrate_dict, ( + "DeepLC instance is not yet calibrated.\ Calibrate before making predictions, or use calibrate=False" + ) if len(X) == 0 and len(psm_list) > 0: if self.verbose: @@ -644,7 +629,12 @@ def make_preds_core( return ret_preds def make_preds( - self, psm_list=None, infile="", calibrate=True, seq_df=None, mod_name=None + self, + psm_list=None, + infile="", + calibrate=True, + seq_df=None, + mod_name=None, ): """ Make predictions for sequences, in batches if required. @@ -690,11 +680,12 @@ def make_preds( ) ) else: - for seq, mod, ident in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index - ): + for seq, mod, ident in zip(seq_df["seq"], seq_df["modifications"], seq_df.index): list_of_psms.append( - PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident) + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=ident, + ) ) psm_list = PSMList(psm_list=list_of_psms) @@ -810,7 +801,10 @@ def calibrate_preds_func_pygam( ) else: for seq, mod, ident, tr in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"] + seq_df["seq"], + seq_df["modifications"], + seq_df.index, + seq_df["tr"], ): list_of_psms.append( PSM( @@ -828,9 +822,7 @@ def calibrate_preds_func_pygam( # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) - for mtr, ptr in sorted( - zip(measured_tr, predicted_tr), key=lambda pair: pair[1] - ) + for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1]) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort], dtype=np.float32) predicted_tr = np.array([ptr for mtr, ptr in tr_sort], dtype=np.float32) @@ -848,9 +840,7 @@ def calibrate_preds_func_pygam( spline_model = linear_model linear_model_right = linear_model else: - spline = SplineTransformer( - degree=4, n_knots=int(len(measured_tr) / 500) + 5 - ) + spline = SplineTransformer(degree=4, n_knots=int(len(measured_tr) / 500) + 5) spline_model = make_pipeline(spline, LinearRegression()) spline_model.fit(predicted_tr.reshape(-1, 1), measured_tr) @@ -946,7 +936,10 @@ def calibrate_preds_func( ) else: for seq, mod, tr, ident in zip( - seq_df["seq"], seq_df["modifications"], seq_df["tr"], seq_df.index + seq_df["seq"], + seq_df["modifications"], + seq_df["tr"], + seq_df.index, ): list_of_psms.append( PSM( @@ -964,9 +957,7 @@ def calibrate_preds_func( # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) - for mtr, ptr in sorted( - zip(measured_tr, predicted_tr), key=lambda pair: pair[1] - ) + for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1]) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort]) predicted_tr = np.array([ptr for mtr, ptr in tr_sort]) @@ -1017,18 +1008,12 @@ def calibrate_preds_func( raise CalibrationError( "Not enough measured tr ({}) for the chosen number of splits ({}). 
" "Choose a smaller split_cal parameter or provide more peptides for " - "fitting the calibration curve.".format( - len(measured_tr), self.split_cal - ) + "fitting the calibration curve.".format(len(measured_tr), self.split_cal) ) if len(mtr_mean) == 0: - raise CalibrationError( - "The measured tr list is empty, not able to calibrate" - ) + raise CalibrationError("The measured tr list is empty, not able to calibrate") if len(ptr_mean) == 0: - raise CalibrationError( - "The predicted tr list is empty, not able to calibrate" - ) + raise CalibrationError("The predicted tr list is empty, not able to calibrate") # calculate calibration curves for i in range(0, len(ptr_mean)): @@ -1051,7 +1036,10 @@ def calibrate_preds_func( calibrate_min = v if v > calibrate_max: calibrate_max = v - calibrate_dict[str(round(v, self.bin_dist))] = [slope, intercept] + calibrate_dict[str(round(v, self.bin_dist))] = [ + slope, + intercept, + ] return calibrate_min, calibrate_max, calibrate_dict @@ -1115,7 +1103,10 @@ def calibrate_preds( ) else: for seq, mod, ident, tr in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"] + seq_df["seq"], + seq_df["modifications"], + seq_df.index, + seq_df["tr"], ): list_of_psms.append( PSM( @@ -1265,10 +1256,7 @@ def calibrate_preds( perf = sum(abs(np.array(measured_tr) - np.array(preds))) if self.verbose: - logger.debug( - "For %s model got a performance of: %s" - % (m_name, perf / len(preds)) - ) + logger.debug("For %s model got a performance of: %s" % (m_name, perf / len(preds))) if perf < best_perf: if self.deepcallc_mod: @@ -1293,9 +1281,7 @@ def calibrate_preds( self.model = best_model if self.deepcallc_mod: - self.deepcallc_model = train_en( - pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"] - ) + self.deepcallc_model = train_en(pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"]) # self.n_jobs = 1 @@ -1307,12 +1293,13 @@ def calibrate_preds( plotly_return_dict = {} plotly_df = pd.DataFrame( list(zip(temp_obs, temp_pred)), - columns=["Observed retention time", "Predicted retention time"], + columns=[ + "Observed retention time", + "Predicted retention time", + ], ) plotly_return_dict["scatter"] = deeplc.plot.scatter(plotly_df) - plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline( - plotly_df - ) + plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline(plotly_df) return plotly_return_dict return {} diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py index 1b12262..087baa1 100644 --- a/deeplc/feat_extractor.py +++ b/deeplc/feat_extractor.py @@ -51,15 +51,28 @@ class FeatExtractor: def __init__( self, main_path=os.path.dirname(os.path.realpath(__file__)), - lib_path_mod=os.path.join( - os.path.dirname(os.path.realpath(__file__)), "unimod/" - ), + lib_path_mod=os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/"), lib_aa_composition=os.path.join( os.path.dirname(os.path.realpath(__file__)), "aa_comp_rel.csv" ), split_size=7, verbose=True, - include_specific_posses=[0, 1, 2, 3, 4, 5, 6, -1, -2, -3, -4, -5, -6, -7], + include_specific_posses=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + -1, + -2, + -3, + -4, + -5, + -6, + -7, + ], add_sum_feat=False, ptm_add_feat=False, ptm_subtract_feat=False, @@ -81,9 +94,7 @@ def __init__( ptm_add_feat = cparser.getboolean("featExtractor", "ptm_add_feat") ptm_subtract_feat = cparser.getboolean("featExtractor", "ptm_subtract_feat") add_rolling_feat = cparser.getboolean("featExtractor", "add_rolling_feat") - include_unnormalized = cparser.getboolean( - 
"featExtractor", "include_unnormalized" - ) + include_unnormalized = cparser.getboolean("featExtractor", "include_unnormalized") include_specific_posses = ast.literal_eval( cparser.get("featExtractor", "include_specific_posses") ) @@ -241,9 +252,7 @@ def get_feats_mods( look_up_mod_subtract = {} look_up_mod_add = {} - len_init = len( - [ao + str(spl_s) for spl_s in range(split_size) for ao in atoms_order] - ) + len_init = len([ao + str(spl_s) for spl_s in range(split_size) for ao in atoms_order]) for index_name, mod, seq in zip(identifiers, mods, seqs): mod_dict[index_name] = dict( zip( @@ -265,9 +274,7 @@ def get_feats_mods( continue if self.verbose: - logger.debug( - "Time to calculate mod features: %s seconds" % (time.time() - t0) - ) + logger.debug("Time to calculate mod features: %s seconds" % (time.time() - t0)) df_ret = pd.DataFrame(mod_dict, dtype=int).T del mod_dict return df_ret @@ -388,15 +395,9 @@ def rolling_sum(a, n=2): peptide_composition = [mass.std_aa_comp[aa] for aa in seq] # Initialize all feature matrices - matrix = np.zeros( - (padding_length, len(dict_index.keys())), dtype=np.float16 - ) - matrix_hc = np.zeros( - (padding_length, len(dict_aa.keys())), dtype=np.float16 - ) - matrix_pos = np.zeros( - (len(positions), len(dict_index.keys())), dtype=np.float16 - ) + matrix = np.zeros((padding_length, len(dict_index.keys())), dtype=np.float16) + matrix_hc = np.zeros((padding_length, len(dict_aa.keys())), dtype=np.float16) + matrix_pos = np.zeros((len(positions), len(dict_index.keys())), dtype=np.float16) for i, position_composition in enumerate(peptide_composition): for k, v in position_composition.items(): @@ -437,21 +438,15 @@ def rolling_sum(a, n=2): except KeyError: warn_once(f"Could not add the following atom: {atom}") except IndexError: - warn_once( - f"Could not add the following atom: {pn} {atom} {val}" - ) + warn_once(f"Could not add the following atom: {pn} {atom} {val}") for i, peptide_position in enumerate(peptidoform.parsed_sequence): try: matrix_hc[i, dict_aa[peptide_position[0]]] = 1.0 except KeyError: - warn_once( - f"Skipping the following (not in library): {i} {peptide_position}" - ) + warn_once(f"Skipping the following (not in library): {i} {peptide_position}") except IndexError: - warn_once( - f"Could not add the following atom: {i} {peptide_position}" - ) + warn_once(f"Could not add the following atom: {i} {peptide_position}") if peptide_position[1] is not None: try: @@ -472,12 +467,11 @@ def rolling_sum(a, n=2): atom_change, ) in modification_composition.items(): try: - matrix[ - i, dict_index[atom_position_composition] - ] += atom_change + matrix[i, dict_index[atom_position_composition]] += atom_change if i in positions: matrix_pos[ - i, dict_index_pos[atom_position_composition] + i, + dict_index_pos[atom_position_composition], ] += atom_change elif i - seq_len in positions: matrix_pos[ @@ -492,12 +486,11 @@ def rolling_sum(a, n=2): atom_position_composition = sub( "\[.*?\]", "", atom_position_composition ) - matrix[ - i, dict_index[atom_position_composition] - ] += atom_change + matrix[i, dict_index[atom_position_composition]] += atom_change if i in positions: matrix_pos[ - i, dict_index_pos[atom_position_composition] + i, + dict_index_pos[atom_position_composition], ] += atom_change elif i - seq_len in positions: matrix_pos[ @@ -534,10 +527,12 @@ def rolling_sum(a, n=2): (seq.count("F") + seq.count("W") + seq.count("Y")) / float(seq_len), ) matrix_all = np.append( - matrix_all, (seq.count("D") + seq.count("E")) / float(seq_len) + matrix_all, + 
(seq.count("D") + seq.count("E")) / float(seq_len), ) matrix_all = np.append( - matrix_all, (seq.count("K") + seq.count("R")) / float(seq_len) + matrix_all, + (seq.count("K") + seq.count("R")) / float(seq_len), ) matrix_all = np.append(matrix_all, charge) @@ -584,7 +579,10 @@ def full_feat_extract( list_of_psms = [] for seq, mod, id in zip(seqs, mods, identifiers): list_of_psms.append( - PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=id) + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=id, + ) ) psm_list = PSMList(psm_list=list_of_psms) @@ -599,14 +597,10 @@ def full_feat_extract( if self.ptm_add_feat: if self.verbose: logger.debug("Extracting compositional add features for modifications") - X_feats_add = self.get_feats_mods( - psm_list, split_size=self.split_size, add_str="_add" - ) + X_feats_add = self.get_feats_mods(psm_list, split_size=self.split_size, add_str="_add") if self.ptm_subtract_feat: if self.verbose: - logger.debug( - "Extracting compositional subtract features for modifications" - ) + logger.debug("Extracting compositional subtract features for modifications") X_feats_neg = self.get_feats_mods( psm_list, split_size=self.split_size, @@ -642,9 +636,7 @@ def full_feat_extract( X = X_feats_neg if self.verbose: - logger.debug( - "Time to calculate all features: %s seconds" % (time.time() - t0) - ) + logger.debug("Time to calculate all features: %s seconds" % (time.time() - t0)) return X diff --git a/deeplc/gui.py b/deeplc/gui.py index ae4cf27..47ee007 100644 --- a/deeplc/gui.py +++ b/deeplc/gui.py @@ -1,8 +1,10 @@ """Graphical user interface.""" + import sys from multiprocessing import freeze_support from pathlib import Path import importlib.resources + try: from gooey import Gooey, local_resource_path except ImportError: @@ -17,7 +19,7 @@ # Get path to package_data/images # Workaround with parent of specific file required for Python 3.9+ support -with importlib.resources.path(img_module, 'config_icon.png') as resource: +with importlib.resources.path(img_module, "config_icon.png") as resource: _IMG_DIR = Path(resource).parent @@ -27,12 +29,13 @@ tabbed_groups=True, default_size=(720, 480), monospace_display=True, - target=None if getattr(sys, 'frozen', False) else "deeplc-gui" + target=None if getattr(sys, "frozen", False) else "deeplc-gui", ) def start_gui(): """Run main with GUI enabled.""" freeze_support() # Required for multiprocessing with PyInstaller main(gui=True) + if __name__ == "__main__": start_gui() diff --git a/deeplc/plot.py b/deeplc/plot.py index fd337d0..a4b2a37 100644 --- a/deeplc/plot.py +++ b/deeplc/plot.py @@ -12,7 +12,7 @@ def scatter( observed_column: str = "Observed retention time", xaxis_label: str = "Observed retention time", yaxis_label: str = "Predicted retention time", - plot_title: str = "Predicted vs. observed retention times" + plot_title: str = "Predicted vs. observed retention times", ) -> go.Figure: """ Plot a scatter plot of the predicted vs. observed retention times. 
@@ -42,15 +42,15 @@ def scatter( y=predicted_column, opacity=0.3, ) - + # Draw diagonal line fig.add_scatter( x=[min(df[observed_column]), max(df[observed_column])], y=[min(df[observed_column]), max(df[observed_column])], - mode='lines', - line=dict(color='red', width=3, dash='dash'), + mode="lines", + line=dict(color="red", width=3, dash="dash"), ) - + # Hide legend fig.update_layout( title=plot_title, @@ -58,7 +58,7 @@ def scatter( xaxis_title=xaxis_label, yaxis_title=yaxis_label, ) - + return fig @@ -85,9 +85,7 @@ def distribution_baseline( """ # Get baseline data baseline_df = pd.read_csv( - Path(__file__) - .absolute() - .parent.joinpath("baseline_performance/baseline_predictions.csv") + Path(__file__).absolute().parent.joinpath("baseline_performance/baseline_predictions.csv") ) baseline_df["rel_mae_best"] = baseline_df[ ["rel_mae_transfer_learning", "rel_mae_new_model", "rel_mae_calibrate"] @@ -97,9 +95,7 @@ def distribution_baseline( # Calculate current RMAE and percentile compared to baseline mae = sum(abs(df[observed_column] - df[predicted_column])) / len(df.index) mae_rel = (mae / max(df[observed_column])) * 100 - percentile = round( - (baseline_df["rel_mae_transfer_learning"] < mae_rel).mean() * 100, 1 - ) + percentile = round((baseline_df["rel_mae_transfer_learning"] < mae_rel).mean() * 100, 1) # Calculate x-axis range with 5% padding all_values = np.append(baseline_df["rel_mae_transfer_learning"].values, mae_rel) @@ -140,10 +136,7 @@ def distribution_baseline( ) fig.update_xaxes(range=[x_min, x_max]) fig.update_layout( - title=( - f"Current DeepLC performance compared to {len(baseline_df.index)} " - "datasets" - ), + title=(f"Current DeepLC performance compared to {len(baseline_df.index)} datasets"), xaxis_title="Relative mean absolute error (%)", ) diff --git a/deeplc/trainl3.py b/deeplc/trainl3.py index a28e083..f4bd276 100644 --- a/deeplc/trainl3.py +++ b/deeplc/trainl3.py @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. -This project was made possible by MASSTRPLAN. MASSTRPLAN received funding -from the Marie Sklodowska-Curie EU Framework for Research and Innovation +This project was made possible by MASSTRPLAN. MASSTRPLAN received funding +from the Marie Sklodowska-Curie EU Framework for Research and Innovation Horizon 2020, under Grant Agreement No. 675132. """ @@ -99,7 +99,7 @@ def train_en(X, y, n_jobs=16, cv=None): grid.fit(X, y) crossv_mod.set_params(**grid.best_params_) - + ret_mod.set_params(**grid.best_params_) ret_mod.fit(X, y) diff --git a/examples/deeplc_example.py b/examples/deeplc_example.py index 95a5c5d..25a2cf4 100644 --- a/examples/deeplc_example.py +++ b/examples/deeplc_example.py @@ -3,7 +3,7 @@ from matplotlib import pyplot as plt from deeplc import DeepLC, FeatExtractor -if __name__ == '__main__': +if __name__ == "__main__": # Input files peptide_file = "examples/datasets/dia.csv" @@ -14,14 +14,14 @@ df = df.fillna("") # Generate some identifiers, any kind of identifiers will do - df.index = ["Pep_"+str(dfi) for dfi in df.index] + df.index = ["Pep_" + str(dfi) for dfi in df.index] # Make a feature extraction object. # This step can be skipped if you want to use the default feature extraction # settings. In this example we will use a model that does not use RDKit features # so we skip the chemical descriptor making procedure. 
- #f_extractor = FeatExtractor(chem_descr_feat=False,verbose=False) + # f_extractor = FeatExtractor(chem_descr_feat=False,verbose=False) f_extractor = FeatExtractor( add_sum_feat=False, ptm_add_feat=False, @@ -30,26 +30,28 @@ chem_descr_feat=False, add_comp_feat=False, cnn_feats=True, - verbose=True + verbose=True, ) # Initiate a DeepLC instance that will perform the calibration and predictions dlc = DeepLC( - path_model=["deeplc/mods/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "deeplc/mods/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "deeplc/mods/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "deeplc/mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5"], + path_model=[ + "deeplc/mods/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "deeplc/mods/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "deeplc/mods/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "deeplc/mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + ], cnn_model=True, f_extractor=f_extractor, verbose=True, write_library=True, - use_library="deeplc/library/library.csv" + use_library="deeplc/library/library.csv", ) df["tr"] = list(range(len(df.index))) # To demonstrate DeepLC's callibration, we'll induce some an artificial # transformation into the retention times - df["tr"] = df["tr"]**0.85 + df["tr"] = df["tr"] ** 0.85 # Calibrate the original model based on the new retention times dlc.calibrate_preds(seq_df=df) @@ -66,10 +68,10 @@ print("c") # Compare calibrated and uncalibrated predictions - #print("Predictions (calibrated): ", preds_cal) - #print("Predictions (uncalibrated): ", preds_uncal) + # print("Predictions (calibrated): ", preds_cal) + # print("Predictions (uncalibrated): ", preds_uncal) - plt.scatter(df["tr"],preds_cal,label="Calibrated",s=1) - plt.scatter(df["tr"],preds_uncal,label="Uncalibrated",s=1) + plt.scatter(df["tr"], preds_cal, label="Calibrated", s=1) + plt.scatter(df["tr"], preds_uncal, label="Uncalibrated", s=1) plt.legend() - plt.savefig('deeplc_calibrated_vs_uncalibrated.png') + plt.savefig("deeplc_calibrated_vs_uncalibrated.png") diff --git a/pyproject.toml b/pyproject.toml index 31e8fe3..47b0895 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,3 +61,13 @@ build-backend = "setuptools.build_meta" [tool.setuptools] packages = ["deeplc"] include-package-data = true + +[tool.ruff] +line-length = 99 +target-version = 'py311' + +[tool.ruff.lint] +select = ["E", "F", "UP", "B", "SIM", "I"] + +[tool.ruff.format] +docstring-code-format = true diff --git a/streamlit/deeplc_streamlit.py b/streamlit/deeplc_streamlit.py index 399dc5d..fcd574d 100644 --- a/streamlit/deeplc_streamlit.py +++ b/streamlit/deeplc_streamlit.py @@ -109,7 +109,8 @@ def _main_page(self): st.subheader("Prediction speed boost") self.user_input["use_library"] = st.checkbox( - "Use prediction library for speed-up", help=self.texts.Help.use_library + "Use prediction library for speed-up", + help=self.texts.Help.use_library, ) st.markdown(self.texts.Help.use_library_agreement) @@ -334,9 +335,9 @@ class Sidebar: --- Currently using the following package versions:
- [![DeepLC](https://img.shields.io/badge/deeplc-{version('deeplc')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/compomics/deeplc) - [![Tensorflow](https://img.shields.io/badge/tensorflow-{version('tensorflow')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/tensorflow/tensorflow) - [![Streamlit](https://img.shields.io/badge/streamlit-{version('streamlit')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/streamlit/streamlit) + [![DeepLC](https://img.shields.io/badge/deeplc-{version("deeplc")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/compomics/deeplc) + [![Tensorflow](https://img.shields.io/badge/tensorflow-{version("tensorflow")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/tensorflow/tensorflow) + [![Streamlit](https://img.shields.io/badge/streamlit-{version("streamlit")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/streamlit/streamlit) Latest DeepLC version:
![PyPI](https://img.shields.io/pypi/v/deeplc?style=flat-square&logoColor=white&logo=pypi) diff --git a/streamlit/streamlit_utils.py b/streamlit/streamlit_utils.py index 3e6bec5..c3a7df7 100644 --- a/streamlit/streamlit_utils.py +++ b/streamlit/streamlit_utils.py @@ -80,4 +80,4 @@ def hide_streamlit_menu(): @st.cache def save_dataframe(df): """Save dataframe to file object, with streamlit cache.""" - return df.to_csv().encode('utf-8') + return df.to_csv().encode("utf-8") diff --git a/tests/test_deepcallc.py b/tests/test_deepcallc.py index 5505b51..71297db 100644 --- a/tests/test_deepcallc.py +++ b/tests/test_deepcallc.py @@ -2,76 +2,82 @@ from deeplc import DeepLC from matplotlib import pyplot as plt + def main(): peptide_file = "temp_data/PXD005573_mcp.csv" calibration_file = "temp_data/PXD005573_mcp.csv" pep_df = pd.read_csv(peptide_file, sep=",") - pep_df['modifications'] = pep_df['modifications'].fillna("") + pep_df["modifications"] = pep_df["modifications"].fillna("") cal_df = pd.read_csv(calibration_file, sep=",") - cal_df['modifications'] = cal_df['modifications'].fillna("") - + cal_df["modifications"] = cal_df["modifications"].fillna("") + pep_df = pep_df.sample(50) cal_df = cal_df.sample(50) - mods = ["C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_1fd8363d9af9dcad3be7553c39396960.hdf5", - #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_8c22d89667368f2f02ad996469ba157e.hdf5", - #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_5ee8aaa41d387bfffb8cda966348937c.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_f3c75e74dd7b16180edf6f6f0d78a4a6.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_5ee8aaa41d387bfffb8cda966348937c.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - 
"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5"] - - dlc = DeepLC(write_library=False, - use_library="", - pygam_calibration=True, - deepcallc_mod=True, - path_model=mods, - reload_library=False) + mods = [ + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + # "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_1fd8363d9af9dcad3be7553c39396960.hdf5", + # 
"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_8c22d89667368f2f02ad996469ba157e.hdf5", + # "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_5ee8aaa41d387bfffb8cda966348937c.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_f3c75e74dd7b16180edf6f6f0d78a4a6.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_5ee8aaa41d387bfffb8cda966348937c.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + 
"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + ] + + dlc = DeepLC( + write_library=False, + use_library="", + pygam_calibration=True, + deepcallc_mod=True, + path_model=mods, + reload_library=False, + ) dlc.calibrate_preds(seq_df=cal_df) preds = dlc.make_preds(seq_df=cal_df) - plt.scatter(cal_df["tr"],preds) + plt.scatter(cal_df["tr"], preds) plt.show() + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/test_deeplc.py b/tests/test_deeplc.py index 4a5af2f..ed60f92 100644 --- a/tests/test_deeplc.py +++ b/tests/test_deeplc.py @@ -22,9 +22,9 @@ def _r2_score(y_true, y_pred): def test_cli_basic(): """Test command line interface help message.""" - assert ( - subprocess.getstatusoutput("deeplc -h")[0] == 0 - ), "`deeplc -h` returned non-zero status code" + assert subprocess.getstatusoutput("deeplc -h")[0] == 0, ( + "`deeplc -h` returned non-zero status code" + ) def test_cli_full(): @@ -48,8 +48,10 @@ def test_cli_full(): train_df = pd.read_csv(file_path_pred) model_r2 = _r2_score(train_df["tr"], preds_df["predicted retention time"]) logging.info("DeepLC R2 score on %s: %f", file_path_pred, model_r2) - assert model_r2 > 0.90, f"DeepLC R2 score on {file_path_pred} below 0.9 \ + assert model_r2 > 0.90, ( + f"DeepLC R2 score on {file_path_pred} below 0.9 \ (was {model_r2})" + ) if __name__ == "__main__": diff --git a/tests/test_deeplc_lib.py b/tests/test_deeplc_lib.py index a17748c..e9d6b51 100644 --- a/tests/test_deeplc_lib.py +++ b/tests/test_deeplc_lib.py @@ -2,27 +2,27 @@ from deeplc import DeepLC from matplotlib import pyplot as plt + def main(): peptide_file = "examples/datasets/test_pred.csv" calibration_file = "examples/datasets/test_train.csv" pep_df = pd.read_csv(peptide_file, sep=",") - pep_df['modifications'] = pep_df['modifications'].fillna("") + pep_df["modifications"] = pep_df["modifications"].fillna("") cal_df = pd.read_csv(calibration_file, sep=",") - cal_df['modifications'] = cal_df['modifications'].fillna("") - - 
dlc = DeepLC(write_library=False, - use_library="", - reload_library=False) - #write_library=True, - #use_library="lib.csv", - #reload_library=True) + cal_df["modifications"] = cal_df["modifications"].fillna("") + + dlc = DeepLC(write_library=False, use_library="", reload_library=False) + # write_library=True, + # use_library="lib.csv", + # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) preds = dlc.make_preds(seq_df=cal_df) - plt.scatter(pep_df["tr"],preds) + plt.scatter(pep_df["tr"], preds) plt.show() + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/test_deeplc_retrain.py b/tests/test_deeplc_retrain.py index a67c7a4..8c09330 100644 --- a/tests/test_deeplc_retrain.py +++ b/tests/test_deeplc_retrain.py @@ -2,27 +2,27 @@ from deeplc import DeepLC from matplotlib import pyplot as plt + def main(): peptide_file = "examples/datasets/test_pred.csv" calibration_file = "examples/datasets/test_train.csv" pep_df = pd.read_csv(peptide_file, sep=",") - pep_df['modifications'] = pep_df['modifications'].fillna("") + pep_df["modifications"] = pep_df["modifications"].fillna("") cal_df = pd.read_csv(calibration_file, sep=",") - cal_df['modifications'] = cal_df['modifications'].fillna("") - - dlc = DeepLC(write_library=False, - deeplc_retrain=True, - reload_library=False) - #write_library=True, - #use_library="lib.csv", - #reload_library=True) + cal_df["modifications"] = cal_df["modifications"].fillna("") + + dlc = DeepLC(write_library=False, deeplc_retrain=True, reload_library=False) + # write_library=True, + # use_library="lib.csv", + # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) preds = dlc.make_preds(seq_df=cal_df) - plt.scatter(cal_df["tr"],preds) + plt.scatter(cal_df["tr"], preds) plt.show() + if __name__ == "__main__": - main() \ No newline at end of file + main() From a5ec35c2957c9cac247d255e68040ea1de0bb36f Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 27 Mar 2025 16:48:35 +0100 Subject: [PATCH 2/6] First refactoring of do_f_extraction* methods --- deeplc/deeplc.py | 497 ++++++++++++++++++++++++----------------------- 1 file changed, 256 insertions(+), 241 deletions(-) diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index dcaa146..9bb4051 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -4,6 +4,8 @@ This provides the main interface. For the library versions see the .yml file """ +from __future__ import annotations + __author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] __license__ = "Apache License, Version 2.0" __maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] @@ -16,26 +18,13 @@ "Sven Degroeve", ] -# Default models, will be used if no other is specified. If no best model is -# selected during calibration, the first model in the list will be used. 
-import os - -deeplc_dir = os.path.dirname(os.path.realpath(__file__)) -DEFAULT_MODELS = [ - "mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras", - "mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras", - "mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras", -] -DEFAULT_MODELS = [os.path.join(deeplc_dir, dm) for dm in DEFAULT_MODELS] - -LIBRARY = {} import copy import gc import logging -import math import multiprocessing import multiprocessing.dummy +import os import random import sys import warnings @@ -43,10 +32,28 @@ from itertools import chain from tempfile import TemporaryDirectory +import numpy as np +import pandas as pd +from psm_utils import PSM, PSMList +from psm_utils.io import read_file +from psm_utils.io.peptide_record import peprec_to_proforma from sklearn.linear_model import LinearRegression from sklearn.pipeline import make_pipeline from sklearn.preprocessing import SplineTransformer +# Default models, will be used if no other is specified. If no best model is +# selected during calibration, the first model in the list will be used. +DEEPLC_DIR = os.path.dirname(os.path.realpath(__file__)) +DEFAULT_MODELS = [ + "mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras", + "mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras", + "mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras", +] +DEFAULT_MODELS = [os.path.join(DEEPLC_DIR, dm) for dm in DEFAULT_MODELS] + +LIBRARY = {} + + # If CLI/GUI/frozen: disable Tensorflow info and warnings before importing IS_CLI_GUI = os.path.basename(sys.argv[0]) in ["deeplc", "deeplc-gui"] IS_FROZEN = getattr(sys, "frozen", False) @@ -57,34 +64,27 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=UserWarning) -# Supress warnings (or at least try...) +# Suppress warnings (or at least try...) logging.getLogger("tensorflow").setLevel(logging.ERROR) os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" -import numpy as np -import pandas as pd +# ruff: noqa: E402 import tensorflow as tf from deeplcretrainer import deeplcretrainer -from psm_utils.io import read_file -from psm_utils.io.peptide_record import peprec_to_proforma -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList try: from tensorflow.keras.models import load_model -except: +except Exception: from tensorflow.python.keras.models import load_model from tensorflow.python.eager import context from deeplc._exceptions import CalibrationError +from deeplc.feat_extractor import FeatExtractor from deeplc.trainl3 import train_en # Set to force CPU calculations os.environ["CUDA_VISIBLE_DEVICES"] = "-1" -# Feature extraction -from deeplc.feat_extractor import FeatExtractor - def warn(*args, **kwargs): pass @@ -124,78 +124,92 @@ class DeepLC: """ DeepLC predictor. - Parameters - ---------- - main_path : str - main path of module - path_model : str, optional - path to prediction model(s); leave empty to select the best of the - default models based on the calibration peptides - verbose : bool, default=True - turn logging on/off - bin_dist : float, default=2 - TODO - dict_cal_divider : int, default=50 - sets precision for fast-lookup of retention times for calibration; e.g. 
- 10 means a precision of 0.1 between the calibration anchor points - split_cal : int, default=50 - number of splits in the chromatogram for piecewise linear calibration - fit - n_jobs : int, optional - number of CPU threads to use - config_file : str, optional - path to configuration file - f_extractor : object :: deeplc.FeatExtractor, optional - deeplc.FeatExtractor object to use - cnn_model : bool, default=True - use CNN model or not - batch_num : int, default=250000 - prediction batch size (in peptides); lower to decrease memory footprint - write_library : bool, default=False - append new predictions to library for faster future results; requires - `use_library` option - use_library : str, optional - library file with previous predictions for faster results to read from, - or to write to - reload_library : bool, default=False - reload prediction library - Methods ------- - calibrate_preds(seqs=[], mods=[], identifiers=[], measured_tr=[], correction_factor=1.0, seq_df=None, use_median=True) + calibrate_preds Find best model and calibrate - make_preds(seqs=[], mods=[], identifiers=[], calibrate=True, seq_df=None, correction_factor=1.0, mod_name=None) + make_preds Make predictions """ library = {} - # TODO have a CCS flag here def __init__( self, - main_path=os.path.dirname(os.path.realpath(__file__)), - path_model=None, - verbose=True, - bin_dist=2, - dict_cal_divider=50, - split_cal=50, - n_jobs=None, - config_file=None, - f_extractor=None, - cnn_model=True, - batch_num=250000, - batch_num_tf=1024, - write_library=False, - use_library=None, - reload_library=False, - pygam_calibration=True, - deepcallc_mod=False, - deeplc_retrain=False, - predict_ccs=False, - n_epochs=20, - single_model_mode=True, + main_path: str | None = None, + path_model: str | None = None, + verbose: bool = True, + bin_dist: float = 2.0, + dict_cal_divider: int = 50, + split_cal: int = 50, + n_jobs: int | None = None, + config_file: str | None = None, + f_extractor: FeatExtractor | None = None, + cnn_model: bool = True, + batch_num: int = 250000, + batch_num_tf: int = 1024, + write_library: bool = False, + use_library: str | None = None, + reload_library: bool = False, + pygam_calibration: bool = True, + deepcallc_mod: bool = False, + deeplc_retrain: bool = False, + predict_ccs: bool = False, + n_epochs: int = 20, + single_model_mode: bool = True, ): + """ + Initialize the DeepLC predictor. + + Parameters + ---------- + main_path + Main path of the module. + path_model + Path to prediction model(s); if not provided, the best default model is selected based + on calibration peptides. + verbose + Turn logging on/off. + bin_dist + Precision factor for calibration lookup. + dict_cal_divider + Divider that sets the precision for fast lookup of retention times in calibration; e.g. + 10 means a precision of 0.1 between the calibration anchor points + split_cal + Number of splits in the chromatogram for piecewise linear calibration. + n_jobs + Number of CPU threads to use. + config_file + Path to a configuration file. + f_extractor + deeplc.FeatExtractor object to use. + cnn_model + Flag indicating whether to use the CNN model. + batch_num + Prediction batch size (in peptides); lower values reduce memory footprint. + batch_num_tf + Batch size for TensorFlow predictions. + write_library + Whether to append new predictions to a library for faster future access. + use_library + Library file to read from or write to for prediction caching. + reload_library + Whether to reload the prediction library. 
+ pygam_calibration + Flag to enable calibration using PyGAM. + deepcallc_mod + Flag specific to deepcallc mode. + deeplc_retrain + Flag indicating whether to perform retraining (transfer learning) of prediction models. + predict_ccs + Flag to control prediction of CCS values. + n_epochs + Number of epochs used in retraining if deeplc_retrain is enabled. + single_model_mode + Flag to use a single model instead of multiple default models. + + """ # if a config file is defined overwrite standard parameters if config_file: cparser = ConfigParser() @@ -204,29 +218,31 @@ def __init__( split_cal = cparser.getint("DeepLC", "split_cal") n_jobs = cparser.getint("DeepLC", "n_jobs") - self.main_path = main_path + self.main_path = main_path or os.path.dirname(os.path.realpath(__file__)) + self.path_model = self._get_model_paths(path_model, single_model_mode) self.verbose = verbose self.bin_dist = bin_dist - self.calibrate_dict = {} - self.calibrate_min = float("inf") - self.calibrate_max = 0 - self.n_epochs = n_epochs + self.dict_cal_divider = dict_cal_divider + self.split_cal = split_cal + self.n_jobs = multiprocessing.cpu_count() if n_jobs is None else n_jobs + self.config_file = config_file + self.f_extractor = f_extractor or FeatExtractor() self.cnn_model = cnn_model - self.batch_num = batch_num self.batch_num_tf = batch_num_tf - self.dict_cal_divider = dict_cal_divider - self.split_cal = split_cal - self.n_jobs = n_jobs - - if self.n_jobs == None: - max_threads = multiprocessing.cpu_count() - self.n_jobs = max_threads - - self.use_library = use_library self.write_library = write_library - + self.use_library = use_library self.reload_library = reload_library + self.pygam_calibration = pygam_calibration + self.deepcallc_mod = deepcallc_mod + self.deeplc_retrain = deeplc_retrain + self.predict_ccs = predict_ccs + self.n_epochs = n_epochs + + # Calibration variables + self.calibrate_dict = {} + self.calibrate_min = float("inf") + self.calibrate_max = 0 try: tf.config.threading.set_intra_op_parallelism_threads(n_jobs) @@ -236,26 +252,6 @@ def __init__( if "NUMEXPR_MAX_THREADS" not in os.environ: os.environ["NUMEXPR_MAX_THREADS"] = str(n_jobs) - if path_model: - self.model = path_model - else: - if single_model_mode: - self.model = [DEFAULT_MODELS[0]] - else: - self.model = DEFAULT_MODELS - - if f_extractor: - self.f_extractor = f_extractor - else: - self.f_extractor = FeatExtractor() - - self.pygam_calibration = pygam_calibration - self.deeplc_retrain = deeplc_retrain - - self.deepcallc_mod = deepcallc_mod - - self.predict_ccs = predict_ccs - if self.deepcallc_mod: self.write_library = False self.use_library = None @@ -273,139 +269,110 @@ def __str__(self): |_| """ - def do_f_extraction(self, seqs, mods, identifiers, charges=[]): + @staticmethod + def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) -> list[str]: + """Get the model paths based on the passed model path and the single model mode.""" + if passed_model_path: + return [passed_model_path] + + if single_model_mode: + return [DEFAULT_MODELS[0]] + + return DEFAULT_MODELS + + def do_f_extraction( + self, + seqs: list[str], + mods: list[str | None], + identifiers: list[str], + charges: list[int] | None = None, + ): """ - Extract all features we can extract; without parallelization; use if you - want to run feature extraction with a single core + Extract all features we can extract; without parallelization; use if you want to run + feature extraction with a single core. 
Parameters ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods + seqs + List of stripped peptide sequences for each entry. + mods + List of PEPREC-formatted modifications for each entry. + identifiers + List of identifiers for each entry. + charges + List of charges for each entry. Only required if CCS prediction is enabled. Returns ------- pd.DataFrame - feature matrix + Feature matrix. """ - list_of_psms = [] - - if not self.predict_ccs: - for seq, mod, ident in zip(seqs, mods, identifiers): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) - else: - for seq, mod, ident, z in zip(seqs, mods, identifiers, charges): - list_of_psms.append( - PSM( - peptide=peprec_to_proforma(seq, mod, z), - spectrum_id=ident, - ) - ) - - psm_list = PSMList(psm_list=list_of_psms) - + psm_list = _lists_to_psm_list(seqs, mods, identifiers, charges, n_jobs=self.n_jobs) return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) - def do_f_extraction_pd(self, df_instances, charges=[]): + def do_f_extraction_pd(self, dataframe: pd.DataFrame, charges: list[int] | None = None): """ - Extract all features we can extract; without parallelization; use if - you want to run feature extraction with a single thread; and use a - defined dataframe + Extract all features we can extract; without parallelization; use if you want to run + feature extraction with a single thread; and use a defined dataframe. Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + dataframe + Dataframe containing the sequences (column:seq), modifications (column:modifications), + identifiers (index), and optionally charges (column:charge). + charges + List of charges for each entry. Only required if CCS prediction is enabled and not + present in the dataframe. Returns ------- pd.DataFrame - feature matrix - """ - - list_of_psms = [] - if len(charges) == 0: - for seq, mod, ident in zip( - df_instances["seq"], - df_instances["modifications"], - df_instances.index, - ): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)) - else: - for seq, mod, ident, z in zip( - df_instances["seq"], - df_instances["modifications"], - df_instances.index, - charges=df_instances["charges"], - ): - list_of_psms.append( - PSM( - peptide=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - ) - ) - - psm_list = PSMList(psm_list=list_of_psms) + Feature matrix. + """ + psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs) return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) - def do_f_extraction_pd_parallel(self, df_instances): + def do_f_extraction_pd_parallel( + self, dataframe: pd.DataFrame, charges: list[int] | None = None + ): """ - Extract all features we can extract; with parallelization; use if you - want to run feature extraction with multiple threads; and use a defined - dataframe + Extract all features we can extract; with parallelization; use if you want to run feature + extraction with multiple threads; and use a defined dataframe. 
Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + dataframe + Dataframe containing the sequences (column:seq), modifications (column:modifications), + identifiers (index), and optionally charges (column:charge). + charges + List of charges for each entry. Only required if CCS prediction is enabled and not + present in the dataframe. Returns ------- pd.DataFrame - feature matrix - """ - # self.n_jobs = 1 - - df_instances_split = np.array_split(df_instances, math.ceil(self.n_jobs / 4.0)) - if multiprocessing.current_process().daemon: - logger.warning( - "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." - ) - pool = multiprocessing.dummy.Pool(1) - else: - pool = multiprocessing.Pool(math.ceil(self.n_jobs / 4.0)) + Feature matrix. - if self.n_jobs == 1: - df = self.do_f_extraction_pd(df_instances) - else: - df = pd.concat(pool.map(self.do_f_extraction_pd, df_instances_split)) - pool.close() - pool.join() - return df + """ + psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs) + return self.do_f_extraction_psm_list_parallel(psm_list) def do_f_extraction_psm_list(self, psm_list): """ - Extract all features we can extract; without parallelization; use if - you want to run feature extraction with a single thread; and use a - defined dataframe + Extract all features we can extract; without parallelization; use if you want to run + feature extraction with a single thread; and use a defined dataframe Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + psm_list + PSMList with PSMs to extract features for. Returns ------- pd.DataFrame feature matrix + """ return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs) @@ -417,36 +384,22 @@ def do_f_extraction_psm_list_parallel(self, psm_list): Parameters ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + psm_list + PSMList with PSMs to extract features for. Returns ------- pd.DataFrame feature matrix + """ - # TODO for multiproc I am still expecting a pd dataframe, this is not the case anymore, they are dicts - # self.n_jobs = 1 logger.debug("prepare feature extraction") - if multiprocessing.current_process().daemon: - logger.warning( - "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." 
-            )
-            psm_list_split = split_list(psm_list, self.n_jobs)
-            pool = multiprocessing.dummy.Pool(1)
-        elif self.n_jobs > 1:
-            psm_list_split = split_list(psm_list, self.n_jobs)
-            pool = multiprocessing.Pool(self.n_jobs)
-
-        if self.n_jobs == 1:
-            logger.debug("start feature extraction")
-            all_feats = self.do_f_extraction_psm_list(psm_list)
-            logger.debug("got feature extraction results")
-        else:
-            logger.debug("start feature extraction")
-            all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split)
+        psm_list_split = split_list(psm_list, self.n_jobs)
+
+        logger.debug("start feature extraction")
+        with multiprocessing.Pool(self.n_jobs) as pool:
+            all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split)
             logger.debug("wait for feature extraction")
             all_feats_async.wait()
             logger.debug("get feature extraction results")
@@ -454,18 +407,12 @@ def do_f_extraction_psm_list_parallel(self, psm_list):
             matrix_names = res[0].keys()
             all_feats = {
                 matrix_name: dict(
-                    enumerate(chain.from_iterable((v[matrix_name].values() for v in res)))
+                    enumerate(chain.from_iterable(v[matrix_name].values() for v in res))
                 )
                 for matrix_name in matrix_names
             }
-
-            # all_feats = pd.concat(all_feats_async.get())
-
             logger.debug("got feature extraction results")
-            pool.close()
-            pool.join()
-
         return all_feats

     def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max):
@@ -1325,3 +1272,71 @@ def split_seq(self, a, n):
         k, m = divmod(len(a), n)
         result = (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
         return result
+
+
+def _get_pool(n_jobs: int) -> multiprocessing.pool.Pool | multiprocessing.dummy.Pool:
+    """Get a Pool object for parallel processing."""
+    if multiprocessing.current_process().daemon:
+        logger.warning(
+            "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic "
+            "processes can't have children."
+        )
+        return multiprocessing.dummy.Pool(1)
+    elif n_jobs == 1:
+        return multiprocessing.dummy.Pool(1)
+    else:
+        max_n_jobs = multiprocessing.cpu_count()
+        if n_jobs > max_n_jobs:
+            logger.warning(
+                f"Number of jobs ({n_jobs}) exceeds the number of CPUs ({max_n_jobs}). "
+                f"Setting number of jobs to {max_n_jobs}."
+            )
+            return multiprocessing.Pool(max_n_jobs)
+        else:
+            return multiprocessing.Pool(n_jobs)
+
+
+def _lists_to_psm_list(
+    sequences: list[str],
+    modifications: list[str | None],
+    identifiers: list[str],
+    charges: list[int] | None,
+    n_jobs: int = 1,
+) -> PSMList:
+    """Convert lists of sequences, modifications, identifiers, and charges into a PSMList."""
+    if not charges:
+        charges = [None] * len(sequences)
+
+    def create_psm(args):
+        sequence, modifications, identifier, charge = args
+        return PSM(
+            peptidoform=peprec_to_proforma(sequence, modifications, charge=charge),
+            spectrum_id=identifier,
+        )
+
+    args_list = list(zip(sequences, modifications, identifiers, charges, strict=True))
+    with _get_pool(n_jobs) as pool:
+        list_of_psms = pool.map(create_psm, args_list)
+    return PSMList(psm_list=list_of_psms)
+
+
+# TODO: I'm not sure what the expected behavior was for charges; they were parsed
+# from the dataframe, while the passed list was used to check whether they should be
+# parsed. I'll allow both, with priority for the passed charges (see the sketch below).
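+# The precedence rule described in the TODO above amounts to the following
+# standalone sketch. Illustrative only; `resolve_charges` is a hypothetical
+# name and is not used anywhere in this module:
+#
+#     def resolve_charges(passed_charges, dataframe):
+#         """Prefer explicitly passed charges over a 'charge' column."""
+#         if passed_charges:
+#             return list(passed_charges)
+#         if "charge" in dataframe.columns:
+#             return list(dataframe["charge"])
+#         return [None] * len(dataframe)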
+def _dataframe_to_psm_list(
+    dataframe: pd.DataFrame,
+    charges: list[int] | None,
+    n_jobs: int = 1,
+) -> PSMList:
+    """Convert a DataFrame with sequences, modifications, and identifiers into a PSMList."""
+    sequences = dataframe["seq"]
+    modifications = dataframe["modifications"]
+    identifiers = dataframe.index
+
+    if not charges:
+        if "charge" in dataframe.columns:
+            charges = dataframe["charge"]
+        else:
+            charges = [None] * len(sequences)
+
+    return _lists_to_psm_list(sequences, modifications, identifiers, charges, n_jobs=n_jobs)

From 6588d4d2341d319547646ab28264534c0e1654cf Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 23 Apr 2025 17:34:37 +0200
Subject: [PATCH 3/6] Further cleanup; refactor feature generation; try dask
 for multiprocessing

---
 deeplc/__init__.py           |  14 +-
 deeplc/__main__.py           |  14 +-
 deeplc/deeplc.py             | 978 ++++++++++++-----------------------
 deeplc/feat_extractor.py     | 856 ++++++++----------------------
 deeplc/trainl3.py            |   2 +-
 examples/deeplc_example.py   |   5 +-
 pyproject.toml               |   2 +-
 tests/test_deepcallc.py      |   5 +-
 tests/test_deeplc_lib.py     |   5 +-
 tests/test_deeplc_retrain.py |   5 +-
 tests/test_feat_extractor.py |  93 ++++
 11 files changed, 660 insertions(+), 1319 deletions(-)
 create mode 100644 tests/test_feat_extractor.py

diff --git a/deeplc/__init__.py b/deeplc/__init__.py
index 0b3c79f..e4fb659 100644
--- a/deeplc/__init__.py
+++ b/deeplc/__init__.py
@@ -1,16 +1,8 @@
-__all__ = ["DeepLC", "FeatExtractor"]
+__all__ = ["DeepLC"]

-import sys
+from importlib.metadata import version

-if sys.version_info >= (3, 8):
-    from importlib.metadata import version
-
-    __version__ = version("deeplc")
-else:
-    import pkg_resources
-
-    __version__ = pkg_resources.require("deeplc")[0].version
+__version__ = version("deeplc")

 from deeplc.deeplc import DeepLC
-from deeplc.feat_extractor import FeatExtractor

diff --git a/deeplc/__main__.py b/deeplc/__main__.py
index 6c26032..e76278b 100644
--- a/deeplc/__main__.py
+++ b/deeplc/__main__.py
@@ -17,12 +17,12 @@
 import warnings

 import pandas as pd
+from psm_utils.io import read_file
 from psm_utils.io.peptide_record import peprec_to_proforma
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
-from psm_utils.io import read_file

-from deeplc import __version__, DeepLC, FeatExtractor
+from deeplc import DeepLC, __version__
 from deeplc._argument_parser import parse_arguments
 from deeplc._exceptions import DeepLCError

@@ -182,16 +182,10 @@ def run(
             index_col=0,
         )["value"].to_dict()
         psm_list_cal.rename_modifications(mapper)
-    # Make a feature extraction object; you can skip this if you do not want to
-    # use the default settings for DeepLC. Here we want to use a model that does
-    # not use RDKit features so we skip the chemical descriptor making
-    # procedure.
-    f_extractor = FeatExtractor(cnn_feats=True, verbose=verbose)

     # Make the DeepLC object that will handle making predictions and calibration
     dlc = DeepLC(
         path_model=file_model,
-        f_extractor=f_extractor,
         cnn_model=True,
         split_cal=split_cal,
         dict_cal_divider=dict_divider,
@@ -212,9 +206,9 @@ def run(
     # Make predictions; calibrated or uncalibrated
     logger.info("Making predictions using model: %s", dlc.model)
     if len(psm_list_cal) > 0:
         preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
     else:
         preds = dlc.make_preds(
             seq_df=df_pred,
             infile=file_pred,
             psm_list=psm_list_pred,
diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py
index 9bb4051..f91cedd 100644
--- a/deeplc/deeplc.py
+++ b/deeplc/deeplc.py
@@ -29,14 +29,16 @@
 import sys
 import warnings
 from configparser import ConfigParser
-from itertools import chain
+from pathlib import Path
 from tempfile import TemporaryDirectory

 import numpy as np
 import pandas as pd
-from psm_utils import PSM, PSMList
+from dask import compute, delayed
+from psm_utils import PSM, Peptidoform, PSMList
 from psm_utils.io import read_file
 from psm_utils.io.peptide_record import peprec_to_proforma
+from sklearn.base import BaseEstimator
 from sklearn.linear_model import LinearRegression
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import SplineTransformer
@@ -79,8 +81,8 @@
 from tensorflow.python.eager import context

 from deeplc._exceptions import CalibrationError
-from deeplc.feat_extractor import FeatExtractor
-from deeplc.trainl3 import train_en
+from deeplc.feat_extractor import aggregate_encodings, encode_peptidoform, unpack_features
+from deeplc.trainl3 import train_elastic_net

 # Set to force CPU calculations
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -93,7 +95,6 @@ def warn(*args, **kwargs):
     import warnings

     warnings.warn = warn
-

 warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
@@ -105,9 +106,10 @@ def split_list(a, n):
     return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


-def divide_chunks(l, n):
-    for i in range(0, len(l), n):
-        yield l[i : i + n]
+def divide_chunks(list_, chunk_size):
+    """Yield successive chunks of size ``chunk_size`` from ``list_``."""
+    for i in range(0, len(list_), chunk_size):
+        yield list_[i : i + chunk_size]


 def reset_keras():
@@ -140,14 +142,15 @@ def __init__(
         main_path: str | None = None,
         path_model: str | None = None,
         verbose: bool = True,
-        bin_dist: int = 2,
+        bin_distance: int = 2,
         dict_cal_divider: int = 50,
         split_cal: int = 50,
         n_jobs: int | None = None,
         config_file: str | None = None,
-        f_extractor: FeatExtractor | None = None,
+        f_extractor: None = None,
         cnn_model: bool = True,
-        batch_num: int = 250000,
+        batch_num: int = int(1e6),
         batch_num_tf: int = 1024,
         write_library: bool = False,
         use_library: str | None = None,
@@ -183,7 +186,7 @@ def __init__(
         config_file
             Path to a configuration file.
         f_extractor
-            deeplc.FeatExtractor object to use.
+            Deprecated.
         cnn_model
             Flag indicating whether to use the CNN model.
         batch_num
@@ -221,12 +224,11 @@ def __init__(
         self.main_path = main_path or os.path.dirname(os.path.realpath(__file__))
         self.path_model = self._get_model_paths(path_model, single_model_mode)
         self.verbose = verbose
-        self.bin_dist = bin_dist
+        self.bin_distance = bin_distance
         self.dict_cal_divider = dict_cal_divider
         self.split_cal = split_cal
         self.n_jobs = multiprocessing.cpu_count() if n_jobs is None else n_jobs
         self.config_file = config_file
-        self.f_extractor = f_extractor or FeatExtractor()
         self.cnn_model = cnn_model
         self.batch_num = batch_num
         self.batch_num_tf = batch_num_tf
@@ -239,6 +241,15 @@ def __init__(
         self.predict_ccs = predict_ccs
         self.n_epochs = n_epochs

+        # Keep `self.model` as an alias of `self.path_model`; the rest of the class
+        # still refers to `self.model`.
+        self.model = self.path_model
+
+        if f_extractor:
+            warnings.warn("f_extractor argument is deprecated.", DeprecationWarning)
+
+        # TODO REMOVE!!!
+        self.verbose = True
+
         # Calibration variables
         self.calibrate_dict = {}
         self.calibrate_min = float("inf")
@@ -280,145 +291,59 @@ def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) ->

         return DEFAULT_MODELS

-    def do_f_extraction(
+    def _extract_features(
         self,
-        seqs: list[str],
-        mods: list[str | None],
-        identifiers: list[str],
-        charges: list[int] | None = None,
-    ):
-        """
-        Extract all features we can extract; without parallelization; use if you want to run
-        feature extraction with a single core.
-
-        Parameters
-        ----------
-        seqs
-            List of stripped peptide sequences for each entry.
-        mods
-            List of PEPREC-formatted modifications for each entry.
-        identifiers
-            List of identifiers for each entry.
-        charges
-            List of charges for each entry. Only required if CCS prediction is enabled.
-
-        Returns
-        -------
-        pd.DataFrame
-            Feature matrix.
-        """
-        psm_list = _lists_to_psm_list(seqs, mods, identifiers, charges, n_jobs=self.n_jobs)
-        return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs)
-
-    def do_f_extraction_pd(self, dataframe: pd.DataFrame, charges: list[int] | None = None):
-        """
-        Extract all features we can extract; without parallelization; use if you want to run
-        feature extraction with a single thread; and use a defined dataframe.
-
-        Parameters
-        ----------
-        dataframe
-            Dataframe containing the sequences (column:seq), modifications (column:modifications),
-            identifiers (index), and optionally charges (column:charge).
-        charges
-            List of charges for each entry. Only required if CCS prediction is enabled and not
-            present in the dataframe.
-
-        Returns
-        -------
-        pd.DataFrame
-            Feature matrix.
-
-        """
-        psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs)
-        return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs)
-
-    def do_f_extraction_pd_parallel(
-        self, dataframe: pd.DataFrame, charges: list[int] | None = None
-    ):
-        """
-        Extract all features we can extract; with parallelization; use if you want to run feature
-        extraction with multiple threads; and use a defined dataframe.
+        peptidoforms: list[str | Peptidoform] | PSMList,
+        chunk_size: int = 10000,
+    ) -> dict[str, dict[int, np.ndarray]]:
+        """Extract features for all peptidoforms."""
+        if isinstance(peptidoforms, PSMList):
+            peptidoforms = [psm.peptidoform for psm in peptidoforms]
+
+        if self.n_jobs <= 1:
+            logger.debug("Running feature extraction in single-threaded mode...")
+            encodings = [
+                encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in peptidoforms
+            ]

-        Parameters
-        ----------
-        dataframe
-            Dataframe containing the sequences (column:seq), modifications (column:modifications),
-            identifiers (index), and optionally charges (column:charge).
-        charges
-            List of charges for each entry. Only required if CCS prediction is enabled and not
-            present in the dataframe.
+        else:
+            logger.debug("Preparing feature extraction with Dask")
+            # Process peptidoforms in larger chunks to reduce task overhead.
+            peptidoform_strings = [str(pep) for pep in peptidoforms]  # Faster pickling of strings

-        Returns
-        -------
-        pd.DataFrame
-            Feature matrix.
+            def chunked_encode(chunk):
+                return [encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in chunk]

-        """
-        psm_list = _dataframe_to_psm_list(dataframe, charges, n_jobs=self.n_jobs)
-        return self.do_f_extraction_psm_list_parallel(psm_list)
+            tasks = [
+                delayed(chunked_encode)(peptidoform_strings[i : i + chunk_size])
+                for i in range(0, len(peptidoform_strings), chunk_size)
+            ]

-    def do_f_extraction_psm_list(self, psm_list):
-        """
-        Extract all features we can extract; without parallelization; use if you want to run
-        feature extraction with a single thread; and use a defined dataframe
+            logger.debug("Starting feature extraction with Dask")
+            # `num_workers` (not `workers`) sets the pool size of the local processes scheduler
+            chunks_encodings = compute(*tasks, scheduler="processes", num_workers=self.n_jobs)

-        Parameters
-        ----------
-        psm_list
-            PSMList with PSMs to extract features for.
+            # Flatten the list of lists.
+            encodings = [enc for chunk in chunks_encodings for enc in chunk]

-        Returns
-        -------
-        pd.DataFrame
-            feature matrix
+        # Aggregate the encodings into a single dictionary.
+        aggregated_encodings = aggregate_encodings(encodings)

-        """
-        return self.f_extractor.full_feat_extract(psm_list, predict_ccs=self.predict_ccs)
+        logger.debug("Finished feature extraction")

-    def do_f_extraction_psm_list_parallel(self, psm_list):
-        """
-        Extract all features we can extract; without parallelization; use if
-        you want to run feature extraction with a single thread; and use a
-        defined dataframe
+        return aggregated_encodings

-        Parameters
-        ----------
-        psm_list
-            PSMList with PSMs to extract features for.
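+    # A standalone, illustrative version of the chunked dask.delayed pattern used
+    # in `_extract_features` above (toy encode function, not DeepLC's):
+    #
+    #     from dask import compute, delayed
+    #
+    #     def encode_chunk(chunk):
+    #         return [len(item) for item in chunk]  # stand-in for encode_peptidoform
+    #
+    #     tasks = [
+    #         delayed(encode_chunk)(items[i : i + 1000])
+    #         for i in range(0, len(items), 1000)
+    #     ]
+    #     chunked = compute(*tasks, scheduler="processes", num_workers=4)
+    #     flat = [value for chunk in chunked for value in chunk]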
-
-        Returns
-        -------
-        pd.DataFrame
-            feature matrix
-
-        """
-        logger.debug("prepare feature extraction")
-
-        psm_list_split = split_list(psm_list, self.n_jobs)
-
-        logger.debug("start feature extraction")
-        with multiprocessing.Pool(self.n_jobs) as pool:
-            all_feats_async = pool.map_async(self.do_f_extraction_psm_list, psm_list_split)
-            logger.debug("wait for feature extraction")
-            all_feats_async.wait()
-            logger.debug("get feature extraction results")
-            res = all_feats_async.get()
-            matrix_names = res[0].keys()
-            all_feats = {
-                matrix_name: dict(
-                    enumerate(chain.from_iterable(v[matrix_name].values() for v in res))
-                )
-                for matrix_name in matrix_names
-            }
-            logger.debug("got feature extraction results")
-        return all_feats
+    def _apply_calibration_core(
+        self,
+        uncal_preds: np.ndarray,
+        cal_dict: dict | list[BaseEstimator],
+        cal_min: float,
+        cal_max: float,
+    ) -> np.ndarray:
+        """Apply calibration to the predictions."""
+        if len(uncal_preds) == 0:
+            return np.array([])

-    def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max):
         cal_preds = []
-        if len(uncal_preds) == 0:
-            return np.array(cal_preds)
         if self.pygam_calibration:
             linear_model_left, spline_model, linear_model_right = cal_dict
             y_pred_spline = spline_model.predict(uncal_preds.reshape(-1, 1))
@@ -442,341 +367,223 @@ def calibrate_preds_func(
         else:
             for uncal_pred in uncal_preds:
                 try:
-                    slope, intercept = cal_dict[str(round(uncal_pred, self.bin_dist))]
+                    slope, intercept = cal_dict[str(round(uncal_pred, self.bin_distance))]
                     cal_preds.append(slope * (uncal_pred) + intercept)
                 except KeyError:
                     # outside of the prediction range ... use the last
                     # calibration curve
                     if uncal_pred <= cal_min:
-                        slope, intercept = cal_dict[str(round(cal_min, self.bin_dist))]
+                        slope, intercept = cal_dict[str(round(cal_min, self.bin_distance))]
                         cal_preds.append(slope * (uncal_pred) + intercept)
                     elif uncal_pred >= cal_max:
-                        slope, intercept = cal_dict[str(round(cal_max, self.bin_dist))]
+                        slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))]
                         cal_preds.append(slope * (uncal_pred) + intercept)
                     else:
-                        slope, intercept = cal_dict[str(round(cal_max, self.bin_dist))]
+                        slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))]
                         cal_preds.append(slope * (uncal_pred) + intercept)
+
         return np.array(cal_preds)

-    def make_preds_core_library(self, psm_list=[], calibrate=True, mod_name=None):
+    def _make_preds_core_library(self, psm_list=None, calibrate=True, mod_name=None):
+        """Get predictions stored in library and calibrate them if needed."""
+        psm_list = [] if psm_list is None else psm_list
         ret_preds = []
         for psm in psm_list:
             ret_preds.append(LIBRARY[psm.peptidoform.proforma + "|" + mod_name])

         if calibrate:
-            try:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            if isinstance(self.calibrate_min, dict):
+                # if multiple models are used, use the model name to get the
+                # calibration values (DeepCallC mode)
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict[mod_name],
                     self.calibrate_min[mod_name],
                     self.calibrate_max[mod_name],
                 )
-            except:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            else:
+                # if only one model is used, use the same calibration values
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict,
                     self.calibrate_min,
                     self.calibrate_max,
                 )
+            # Convert to an array first: the PyGAM branch of _apply_calibration_core
+            # calls .reshape on its input
+            ret_preds = self._apply_calibration_core(
+                np.asarray(ret_preds), calibrate_dict, calibrate_min, calibrate_max
+            )
+
         return ret_preds

-    def make_preds_core(
+    def _make_preds_core(
         self,
-        X=[],
-        X_sum=[],
-        X_global=[],
-        X_hc=[],
-        psm_list=[],
+        X: np.ndarray | None = None,
+        X_sum: np.ndarray | None = None,
+        X_global: np.ndarray | None = None,
+        X_hc: np.ndarray | None = None,
         calibrate=True,
         mod_name=None,
-    ):
-        """
-        Make predictions for sequences
-        Parameters
-        ----------
-        seq_df : object :: pd.DataFrame
-            dataframe containing the sequences (column:seq), modifications
-            (column:modifications) and naming (column:index); will use parallel
-            by default!
-        seqs : list
-            peptide sequence list; should correspond to mods and identifiers
-        mods : list
-            naming of the mods; should correspond to seqs and identifiers
-        identifiers : list
-            identifiers of the peptides; should correspond to seqs and mods
-        calibrate : boolean
-            calibrate predictions or just return the predictions
-        correction_factor : float
-            correction factor to apply to predictions
-        mod_name : str or None
-            specify a model to use instead of the model assigned originally to
-            this instance of the object
-        Returns
-        -------
-        np.array
-            predictions
-        """
+    ) -> np.ndarray:
+        """Make predictions."""
+        # Check calibration state
         if calibrate:
             assert self.calibrate_dict, (
-                "DeepLC instance is not yet calibrated.\
-                Calibrate before making predictions, or use calibrate=False"
+                "DeepLC instance is not yet calibrated. Calibrate before making predictions, or "
+                "use `calibrate=False`"
             )

-        if len(X) == 0 and len(psm_list) > 0:
-            if self.verbose:
-                logger.debug("Extracting features for the CNN model ...")
-            X = self.do_f_extraction_psm_list_parallel(psm_list)
-
-            X_sum = np.stack(list(X["matrix_sum"].values()))
-            X_global = np.concatenate(
-                (
-                    np.stack(list(X["matrix_all"].values())),
-                    np.stack(list(X["pos_matrix"].values())),
-                ),
-                axis=1,
-            )
-            X_hc = np.stack(list(X["matrix_hc"].values()))
-            X = np.stack(list(X["matrix"].values()))
-        elif len(X) == 0 and len(psm_list) == 0:
-            return []
+        if X is None or len(X) == 0:
+            return np.array([])

         ret_preds = []
-
-        mod = load_model(mod_name)
-        try:
-            X
-            ret_preds = mod.predict(
-                [X, X_sum, X_global, X_hc],
-                batch_size=self.batch_num_tf,
-                verbose=int(self.verbose),
-            ).flatten()
-        except UnboundLocalError:
-            logger.debug("X is empty, skipping...")
-            ret_preds = []
+        model = load_model(mod_name)
+        ret_preds = model.predict(
+            [X, X_sum, X_global, X_hc],
+            batch_size=self.batch_num_tf,
+            verbose=int(self.verbose),
+        ).flatten()

         if calibrate:
-            try:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            if isinstance(self.calibrate_min, dict):
+                # if multiple models are used, use the model name to get the
+                # calibration values (DeepCallC mode)
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict[mod_name],
                     self.calibrate_min[mod_name],
                     self.calibrate_max[mod_name],
                 )
-            except:
-                ret_preds = self.calibration_core(
-                    ret_preds,
+            else:
+                # if only one model is used, use the same calibration values
+                calibrate_dict, calibrate_min, calibrate_max = (
                     self.calibrate_dict,
                     self.calibrate_min,
                     self.calibrate_max,
                 )
-        # clear_session()
+            ret_preds = self._apply_calibration_core(
+                ret_preds, calibrate_dict, calibrate_min, calibrate_max
+            )
+
         gc.collect()
         return ret_preds

     def make_preds(
         self,
-        psm_list=None,
-        infile="",
-        calibrate=True,
-        seq_df=None,
-        mod_name=None,
+        psm_list: PSMList | None = None,
+        infile: str | Path | None = None,
+        seq_df: pd.DataFrame | None = None,
+        calibrate: bool = True,
+        mod_name: str | None = None,
     ):
         """
         Make predictions for sequences, in batches if required.
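+
+        An illustrative usage sketch (assumes a calibrated `DeepLC` instance `dlc`
+        and a `psm_utils` `PSMList` named `psm_list`; not an exact transcript):
+
+        >>> preds = dlc.make_preds(psm_list=psm_list)
+        >>> len(preds) == len(psm_list)
+        True
+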
Parameters ---------- - seq_df : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index); will use parallel - by default! - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods - calibrate : boolean - calibrate predictions or just return the predictions - correction_factor : float - correction factor to apply to predictions - mod_name : str or None - specify a model to use instead of the model assigned originally to - this instance of the object + psm_list + PSMList object containing the peptidoforms to predict for. + infile + Path to a file containing the peptidoforms to predict for. + seq_df + DataFrame containing the sequences (column:seq), modifications + (column:modifications) and naming (column:index). + calibrate + calibrate predictions or just return the predictions. + mod_name + specify a model to use instead of the model assigned originally to this instance of the + object. Returns ------- np.array predictions """ - if type(seq_df) == pd.core.frame.DataFrame: - list_of_psms = [] - if self.predict_ccs: - for seq, mod, ident, z in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df.index, - seq_df["charge"], - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - ) - ) + if psm_list is None: + if seq_df is not None: + psm_list = _dataframe_to_psm_list(seq_df) + elif infile is not None: + psm_list = _file_to_psm_list(infile) else: - for seq, mod, ident in zip(seq_df["seq"], seq_df["modifications"], seq_df.index): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=ident, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - - if len(infile) > 0: - psm_list = read_file(infile) - if "msms" in infile and ".txt" in infile: - mapper = pd.read_csv( - os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "unimod/map_mq_file.csv", - ), - index_col=0, - )["value"].to_dict() - psm_list.rename_modifications(mapper) + raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.") + + if len(psm_list) == 0: + logger.warning("No PSMs to predict for.") + return [] ret_preds_batches = [] for psm_list_t in divide_chunks(psm_list, self.batch_num): ret_preds = [] - if len(psm_list_t) > 0: - if self.verbose: - logger.debug("Extracting features for the CNN model ...") - - X = self.do_f_extraction_psm_list_parallel(psm_list_t) - X_sum = np.stack(list(X["matrix_sum"].values())) - X_global = np.concatenate( - ( - np.stack(list(X["matrix_all"].values())), - np.stack(list(X["pos_matrix"].values())), - ), - axis=1, - ) - X_hc = np.stack(list(X["matrix_hc"].values())) - X = np.stack(list(X["matrix"].values())) - else: - return [] - if isinstance(self.model, dict): - for m_group_name, m_name in self.model.items(): - ret_preds.append( - self.make_preds_core( - X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=m_name, - ) - ) - ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)]) - elif mod_name is not None: - ret_preds = self.make_preds_core( - X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=mod_name, - ) + # Extract features for the CNN model + features = self._extract_features(psm_list_t) + X_sum, X_global, X_hc, X = 
unpack_features(features)
+
+            # Check if model was provided, and if not, whether multiple models are selected in
+            # the DeepLC object or not.
+            if mod_name:
+                model_names = [mod_name]
+            elif isinstance(self.model, dict):
+                model_names = [m_name for m_group_name, m_name in self.model.items()]
             elif isinstance(self.model, list):
-                for m_name in self.model:
-                    ret_preds.append(
-                        self.make_preds_core(
+                model_names = self.model
+            elif isinstance(self.model, str):
+                model_names = [self.model]
+            else:
+                raise ValueError("Invalid model name provided.")
+
+            # Get predictions
+            if len(model_names) > 1:
+                # Iterate over models if multiple were selected
+                model_predictions = []
+                for model_name in model_names:
+                    model_predictions.append(
+                        self._make_preds_core(
                             X=X,
                             X_sum=X_sum,
                             X_global=X_global,
                             X_hc=X_hc,
                             calibrate=calibrate,
-                            mod_name=m_name,
+                            mod_name=model_name,
                         )
                     )
-                ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)])
+                # Average the predictions from all models
+                ret_preds = np.mean(model_predictions, axis=0)
+
             else:
-                ret_preds = self.make_preds_core(
+                # Use the single model
+                ret_preds = self._make_preds_core(
                     X=X,
                     X_sum=X_sum,
                     X_global=X_global,
                     X_hc=X_hc,
                     calibrate=calibrate,
-                    mod_name=self.model,
+                    mod_name=model_names[0],
                 )
-            ret_preds_batches.extend(ret_preds)
-        return ret_preds_batches
-        # TODO make this multithreaded
-        # should be possible with the batched list
+            ret_preds_batches.append(ret_preds)

-    def calibrate_preds_func_pygam(
-        self,
-        psm_list=None,
-        correction_factor=1.0,
-        seq_df=None,
-        measured_tr=None,
-        use_median=True,
-        mod_name=None,
-    ):
-        if type(seq_df) == pd.core.frame.DataFrame:
-            list_of_psms = []
-            # TODO include charge here
-            if self.predict_ccs:
-                for seq, mod, ident, tr, z in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                    seq_df["charge"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod, charge=z),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
-            else:
-                for seq, mod, ident, tr in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
-            psm_list = PSMList(psm_list=list_of_psms)
+        all_ret_preds = np.concatenate(ret_preds_batches, axis=0)

-        measured_tr = [psm.retention_time for psm in psm_list]
+        return all_ret_preds

-        predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name)
+    def _calibrate_preds_pygam(
+        self,
+        measured_tr: np.ndarray,
+        predicted_tr: np.ndarray,
+    ) -> tuple[float, float, list[BaseEstimator]]:
+        """Make calibration curve for predictions using PyGAM."""
+        logger.debug("Getting predictions for calibration...")
         # sort two lists, predicted and observed based on measured tr
         tr_sort = [
             (mtr, ptr)
-            for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1])
+            for mtr, ptr in sorted(
+                zip(measured_tr, predicted_tr, strict=True), key=lambda pair: pair[1]
+            )
         ]
         measured_tr = np.array([mtr for mtr, ptr in tr_sort], dtype=np.float32)
         predicted_tr = np.array([ptr for mtr, ptr in tr_sort], dtype=np.float32)

-        # predicted_tr = list(predicted_tr)
-        # measured_tr = list(measured_tr)
-
         # Fit a SplineTransformer model
         if self.deeplc_retrain:
             spline = SplineTransformer(degree=2, n_knots=10)
@@ -815,96 +622,19 @@ def calibrate_preds_func_pygam(
             [linear_model_left, spline_model, linear_model_right],
         )

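+    # For illustration, the [linear_left, spline, linear_right] triple returned
+    # above can be reproduced on toy data roughly as follows. Sketch only: the toy
+    # data and the 10% edge windows are assumptions, not DeepLC's exact fitting
+    # procedure (imports match this module's):
+    #
+    #     rng = np.random.default_rng(42)
+    #     predicted = np.sort(rng.uniform(0, 60, 200)).reshape(-1, 1)
+    #     measured = 1.3 * predicted.ravel() + 5 + rng.normal(0, 0.3, 200)
+    #
+    #     spline_model = make_pipeline(
+    #         SplineTransformer(degree=2, n_knots=10), LinearRegression()
+    #     )
+    #     spline_model.fit(predicted, measured)
+    #
+    #     # Linear fits on the outermost points take the role of linear_model_left
+    #     # and linear_model_right: extrapolation beyond the calibrated range.
+    #     n_edge = len(predicted) // 10
+    #     linear_left = LinearRegression().fit(predicted[:n_edge], measured[:n_edge])
+    #     linear_right = LinearRegression().fit(predicted[-n_edge:], measured[-n_edge:])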
- def calibrate_preds_func( + def _calibrate_preds_piecewise_linear( self, - psm_list=None, - correction_factor=1.0, - seq_df=None, - use_median=True, - mod_name=None, - ): - """ - Make calibration curve for predictions - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods - measured_tr : list - measured tr of the peptides; should correspond to seqs, identifiers, - and mods - correction_factor : float - correction factor that needs to be applied to the supplied measured - trs - seq_df : object :: pd.DataFrame - a pd.DataFrame that contains the sequences, modifications and - observed retention times to fit a calibration curve - use_median : boolean - flag to indicate we need to use the median valuein a window to - perform calibration - mod_name - specify a model to use instead of the model assigned originally to - this instance of the object - - Returns - ------- - float - the minimum value where a calibration curve was fitted, lower values - will be extrapolated from the minimum fit of the calibration curve - float - the maximum value where a calibration curve was fitted, higher values - will be extrapolated from the maximum fit of the calibration curve - dict - dictionary with keys for rounded tr, and the values concern a linear - model that should be applied to do calibration (!!! what is the - shape of this?) - """ - if type(seq_df) == pd.core.frame.DataFrame: - list_of_psms = [] - # TODO include charge here - if self.predict_ccs: - for seq, mod, tr, ident, z in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df["tr"], - seq_df.index, - seq_df["charge"], - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - retention_time=tr, - ) - ) - else: - for seq, mod, tr, ident in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df["tr"], - seq_df.index, - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=ident, - retention_time=tr, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - - measured_tr = [psm.retention_time for psm in psm_list] - - predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name) - + measured_tr: np.ndarray, + predicted_tr: np.ndarray, + use_median: bool = True, + ) -> tuple[float, float, dict[str, tuple[float]]]: + """Make calibration curve for predictions.""" # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) - for mtr, ptr in sorted(zip(measured_tr, predicted_tr), key=lambda pair: pair[1]) + for mtr, ptr in sorted( + zip(measured_tr, predicted_tr, strict=False), key=lambda pair: pair[1] + ) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort]) predicted_tr = np.array([ptr for mtr, ptr in tr_sort]) @@ -916,10 +646,9 @@ def calibrate_preds_func( calibrate_min = float("inf") calibrate_max = 0 - if self.verbose: - logger.debug( - "Selecting the data points for calibration (used to fit the linear models between)" - ) + logger.debug( + "Selecting the data points for calibration (used to fit the linear models between)" + ) # smooth between observed and predicted split_val = predicted_tr[-1] / self.split_cal @@ -948,14 +677,13 @@ def calibrate_preds_func( mtr_mean.append(sum(mtr) / len(mtr)) ptr_mean.append(sum(ptr) / len(ptr)) - if self.verbose: - logger.debug("Fitting the linear models between the 
points") + logger.debug("Fitting the linear models between the points") if self.split_cal >= len(measured_tr): raise CalibrationError( - "Not enough measured tr ({}) for the chosen number of splits ({}). " - "Choose a smaller split_cal parameter or provide more peptides for " - "fitting the calibration curve.".format(len(measured_tr), self.split_cal) + f"Not enough measured tr ({len(measured_tr)}) for the chosen number of splits " + f"({self.split_cal}). Choose a smaller split_cal parameter or provide more " + "peptides for fitting the calibration curve." ) if len(mtr_mean) == 0: raise CalibrationError("The measured tr list is empty, not able to calibrate") @@ -975,118 +703,84 @@ def calibrate_preds_func( # optimized predictions using a dict to find calibration curve very # fast for v in np.arange( - round(ptr_mean[i], self.bin_dist), - round(ptr_mean[i + 1], self.bin_dist), - 1 / ((self.bin_dist) * self.dict_cal_divider), + round(ptr_mean[i], self.bin_distance), + round(ptr_mean[i + 1], self.bin_distance), + 1 / ((self.bin_distance) * self.dict_cal_divider), ): if v < calibrate_min: calibrate_min = v if v > calibrate_max: calibrate_max = v - calibrate_dict[str(round(v, self.bin_dist))] = [ - slope, - intercept, - ] + calibrate_dict[str(round(v, self.bin_distance))] = (slope, intercept) return calibrate_min, calibrate_max, calibrate_dict def calibrate_preds( self, - psm_list=None, - infile="", - measured_tr=[], - correction_factor=1.0, - location_retraining_models="", - psm_utils_obj=None, - sample_for_calibration_curve=None, - seq_df=None, - use_median=True, + psm_list: PSMList | None = None, + infile: str | Path | None = None, + seq_df: pd.DataFrame | None = None, + measured_tr: np.ndarray | None = None, + location_retraining_models: str = "", + sample_for_calibration_curve: int | None = None, + use_median: bool = True, return_plotly_report=False, - ): + ) -> dict | None: """ Find best model and calibrate. Parameters ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods + psm_list + PSMList object containing the peptidoforms to predict for. + infile + Path to a file containing the peptidoforms to predict for. + seq_df + DataFrame containing the sequences (column:seq), modifications (column:modifications), + naming (column:index), and optionally charge (column:charge) and measured retention + time (column:tr). measured_tr : list - measured tr of the peptides; should correspond to seqs, identifiers, - and mods + Measured retention time used for calibration. Should correspond to the PSMs in the + provided PSMs. If None, the measured retention time is taken from the PSMs. correction_factor : float - correction factor that needs to be applied to the supplied measured - trs - seq_df : object :: pd.DataFrame - a pd.DataFrame that contains the sequences, modifications and - observed retention times to fit a calibration curve - use_median : boolean - flag to indicate we need to use the median valuein a window to - perform calibration + correction factor that needs to be applied to the supplied measured tr's + location_retraining_models + Location to save the retraining models; if None, a temporary directory is used. + sample_for_calibration_curve + Number of PSMs to sample for calibration curve; if None, all provided PSMs are used. 
+        use_median
+            Whether to use the median value in a window to perform calibration; only applies to
+            piecewise linear calibration, not to PyGAM calibration.
+        return_plotly_report
+            Whether to return a plotly report with the calibration results.

         Returns
         -------
+        dict | None
+            Dictionary with plotly report information or None.

         """
-        if type(seq_df) == pd.core.frame.DataFrame:
-            list_of_psms = []
-            if self.predict_ccs:
-                for seq, mod, ident, tr, z in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                    seq_df["charge"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod, charge=z),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
+        # Getting PSMs either from psm_list, seq_df, or infile
+        if psm_list is None:
+            if seq_df is not None:
+                psm_list = _dataframe_to_psm_list(seq_df)
+            elif infile is not None:
+                psm_list = _file_to_psm_list(infile)
             else:
-                for seq, mod, ident, tr in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
-            psm_list = PSMList(psm_list=list_of_psms)
-        elif psm_utils_obj:
-            psm_list = psm_utils_obj
+                raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.")

+        # Getting measured retention time either from measured_tr or provided PSMs
+        if measured_tr is None:
+            measured_tr = [psm.retention_time for psm in psm_list]
+            if None in measured_tr:
+                raise ValueError("Not all PSMs have an observed retention time.")
+
+        # Ensuring self.model is list of strings
         if isinstance(self.model, str):
             self.model = [self.model]

-        if len(infile) > 0:
-            psm_list = read_file(infile)
-            if "msms" in infile and ".txt" in infile:
-                mapper = pd.read_csv(
-                    os.path.join(
-                        os.path.dirname(os.path.realpath(__file__)),
-                        "unimod/map_mq_file.csv",
-                    ),
-                    index_col=0,
-                )["value"].to_dict()
-                psm_list.rename_modifications(mapper)
-
-        measured_tr = [psm.retention_time for psm in psm_list]
-
-        if self.verbose:
-            logger.debug("Start to calibrate predictions ...")
-        if self.verbose:
-            logger.debug("Ready to find the best model out of: %s" % (self.model))
+        logger.debug("Start to calibrate predictions ...")
+        logger.debug(f"Ready to find the best model out of: {self.model}")

         best_perf = float("inf")
         best_calibrate_min = 0.0
@@ -1101,7 +795,8 @@ def calibrate_preds(
         temp_pred = []

         if self.deeplc_retrain:
-            # The following code is not required in most cases, but here it is used to clear variables that might cause problems
+            # The following code is not required in most cases, but here it is used to clear
+            # variables that might cause problems
            _ = tf.Variable([1])

             context._context = None
@@ -1109,17 +804,14 @@ def calibrate_preds(

             tf.config.threading.set_inter_op_parallelism_threads(1)

-            if len(location_retraining_models) > 0:
+            if location_retraining_models:
+                t_dir_models = location_retraining_models
+                os.makedirs(t_dir_models, exist_ok=True)
+            else:
                 t_dir_models = TemporaryDirectory().name
                 os.mkdir(t_dir_models)
-            else:
-                t_dir_models = location_retraining_models
-                try:
-                    os.mkdir(t_dir_models)
-                except:
-                    pass

-            # Here we will apply transfer learning we specify previously trained models in the 'mods_transfer_learning'
+            # Here we apply transfer learning, specifying previously trained models in
+            # 'mods_transfer_learning'
             models = deeplcretrainer.retrain(
                 {"deeplc_transferlearn": psm_list},
                 outpath=t_dir_models,
@@ -1132,50 +824,37 @@ def calibrate_preds(

         self.model = models

-        if
isinstance(sample_for_calibration_curve, int): + # Limit calibration to a subset of PSMs if specified + if sample_for_calibration_curve: psm_list = random.sample(list(psm_list), sample_for_calibration_curve) measured_tr = [psm.retention_time for psm in psm_list] - for m in self.model: - if self.verbose: - logger.debug("Trying out the following model: %s" % (m)) + for model_name in self.model: + logger.debug(f"Trying out the following model: {model_name}") + predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=model_name) + if self.pygam_calibration: - calibrate_output = self.calibrate_preds_func_pygam( - psm_list, - measured_tr=measured_tr, - correction_factor=correction_factor, - seq_df=seq_df, - use_median=use_median, - mod_name=m, - ) + calibrate_output = self._calibrate_preds_pygam(measured_tr, predicted_tr) else: - calibrate_output = self.calibrate_preds_func( - psm_list, - correction_factor=correction_factor, - seq_df=seq_df, - use_median=use_median, - mod_name=m, + calibrate_output = self._calibrate_preds_piecewise_linear( + measured_tr, predicted_tr, use_median=use_median ) + self.calibrate_min, self.calibrate_max, self.calibrate_dict = calibrate_output + # TODO: Currently, calibration dict can be both a dict (linear) or a list of models + # (PyGAM)... This should be handled better in the future. - ( - self.calibrate_min, - self.calibrate_max, - self.calibrate_dict, - ) = calibrate_output - - if type(self.calibrate_dict) == dict: - if len(self.calibrate_dict.keys()) == 0: - continue - - m_name = m.split("/")[-1] + # Skip this model if calibrate_dict is empty + # TODO: Should this do something when using PyGAM and calibrate_dict is a list? + if isinstance(self.calibrate_dict, dict) and len(self.calibrate_dict.keys()) == 0: + continue - preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=m) + m_name = model_name.split("/")[-1] - if self.deepcallc_mod: - m_group_name = "deepcallc" - else: - m_group_name = "_".join(m_name.split("_")[:-1]) + # Get new predictions with calibration + preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=model_name) + m_group_name = "deepcallc" if self.deepcallc_mod else "_".join(m_name.split("_")[:-1]) + m = model_name try: pred_dict[m_group_name][m] = preds mod_dict[m_group_name][m] = m @@ -1195,23 +874,19 @@ def calibrate_preds( mod_calibrate_min_dict[m_group_name][m] = self.calibrate_min mod_calibrate_max_dict[m_group_name][m] = self.calibrate_max - for m_name in pred_dict.keys(): - preds = [sum(a) / len(a) for a in zip(*list(pred_dict[m_name].values()))] + for m_name in pred_dict: + preds = [sum(a) / len(a) for a in zip(*list(pred_dict[m_name].values()), strict=True)] if len(measured_tr) == 0: perf = sum(abs(seq_df["tr"] - preds)) else: perf = sum(abs(np.array(measured_tr) - np.array(preds))) - if self.verbose: - logger.debug("For %s model got a performance of: %s" % (m_name, perf / len(preds))) + logger.debug(f"For {m_name} model got a performance of: {perf / len(preds)}") if perf < best_perf: - if self.deepcallc_mod: - m_group_name = "deepcallc" - else: - m_group_name = m_name - # TODO is deepcopy really required? + m_group_name = "deepcallc" if self.deepcallc_mod else m_name + # TODO is deepcopy really required? 
+                best_calibrate_dict = copy.deepcopy(mod_calibrate_dict[m_group_name])
                best_calibrate_min = copy.deepcopy(mod_calibrate_min_dict[m_group_name])
                best_calibrate_max = copy.deepcopy(mod_calibrate_max_dict[m_group_name])
@@ -1228,18 +903,18 @@ def calibrate_preds(
         self.model = best_model

         if self.deepcallc_mod:
-            self.deepcallc_model = train_en(pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"])
-
-        # self.n_jobs = 1
+            self.deepcallc_model = train_elastic_net(
+                pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"]
+            )

-        logger.debug("Model with the best performance got selected: %s" % (best_model))
+        logger.debug(f"Model with the best performance got selected: {best_model}")

         if return_plotly_report:
             import deeplc.plot

             plotly_return_dict = {}

             plotly_df = pd.DataFrame(
-                list(zip(temp_obs, temp_pred)),
+                list(zip(temp_obs, temp_pred, strict=True)),
                 columns=[
                     "Observed retention time",
                     "Predicted retention time",
@@ -1249,32 +924,10 @@ def calibrate_preds(
             plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline(plotly_df)

             return plotly_return_dict
-        return {}
-
-    def split_seq(self, a, n):
-        """
-        Split a list (a) into multiple chunks (n)
-
-        Parameters
-        ----------
-        a : list
-            list to split
-        n : list
-            number of chunks
-
-        Returns
-        -------
-        list
-            chunked list
-        """
-
-        # since chunking is not alway possible do the modulo of residues
-        k, m = divmod(len(a), n)
-        result = (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
-        return result
+        return None


-def _get_pool(n_jobs: int) -> multiprocessing.pool.Pool | multiprocessing.dummy.Pool:
+def _get_pool(n_jobs: int) -> multiprocessing.Pool | multiprocessing.dummy.Pool:  # type: ignore
     """Get a Pool object for parallel processing."""
     if multiprocessing.current_process().daemon:
         logger.warning(
@@ -1301,22 +954,29 @@ def _lists_to_psm_list(
     modifications: list[str | None],
     identifiers: list[str],
     charges: list[int] | None,
+    retention_times: list[float] | None = None,
     n_jobs: int = 1,
 ) -> PSMList:
-    """Convert lists of sequences, modifications, identifiers, and charges into a PSMList."""
+    """Convert lists into a PSMList using Dask for parallel processing."""
     if not charges:
         charges = [None] * len(sequences)

+    if not retention_times:
+        retention_times = [None] * len(sequences)
+
     def create_psm(args):
-        sequence, modifications, identifier, charge = args
+        sequence, modifications, identifier, charge, retention_time = args
         return PSM(
             peptidoform=peprec_to_proforma(sequence, modifications, charge=charge),
             spectrum_id=identifier,
+            retention_time=retention_time,
         )

-    args_list = list(zip(sequences, modifications, identifiers, charges, strict=True))
-    with _get_pool(n_jobs) as pool:
-        list_of_psms = pool.map(create_psm, args_list)
+    args_list = list(
+        zip(sequences, modifications, identifiers, charges, retention_times, strict=True)
+    )
+    tasks = [delayed(create_psm)(args) for args in args_list]
+    # `num_workers` keeps the processes scheduler bounded by the requested n_jobs
+    list_of_psms = list(compute(*tasks, scheduler="processes", num_workers=n_jobs))
     return PSMList(psm_list=list_of_psms)


@@ -1332,11 +992,27 @@ def _dataframe_to_psm_list(
     sequences = dataframe["seq"]
     modifications = dataframe["modifications"]
     identifiers = dataframe.index
-
-    if not charges:
-        if "charge" in dataframe.columns:
-            charges = dataframe["charge"]
-        else:
-            charges = [None] * len(sequences)
-
-    return _lists_to_psm_list(sequences, modifications, identifiers, charges, n_jobs=n_jobs)
+    retention_times = dataframe["tr"] if "tr" in dataframe.columns else None
+
+    if not charges and "charge" in dataframe.columns:
+        charges = dataframe["charge"]
dataframe["charge"] + + return _lists_to_psm_list( + sequences, modifications, identifiers, charges, retention_times, n_jobs=n_jobs + ) + + +def _file_to_psm_list(input_file: str | Path) -> PSMList: + """Read a file into a PSMList, optionally mapping MaxQuant modifications labels.""" + psm_list = read_file(input_file) + if "msms" in input_file and ".txt" in input_file: + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() + psm_list.rename_modifications(mapper) + + return psm_list diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py index 087baa1..ecda640 100644 --- a/deeplc/feat_extractor.py +++ b/deeplc/feat_extractor.py @@ -1,651 +1,233 @@ -""" -This code is used to extract features for peptides used in DeepLC (or seperately -if required). +"""Feature extraction for DeepLC.""" -For the library versions see the .yml file -""" +from __future__ import annotations -__author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__credits__ = [ - "Robbin Bouwmeester", - "Ralf Gabriels", - "Arthur Declercq", - "Prof. Lennart Martens", - "Sven Degroeve", -] -__license__ = "Apache License, Version 2.0" -__maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"] - -# Native imports -import os -import math -import time -from configparser import ConfigParser -import ast -from re import sub import logging +import warnings +from re import sub import numpy as np -import pandas as pd -from psm_utils.io.peptide_record import peprec_to_proforma -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList +from psm_utils.psm import Peptidoform from pyteomics import mass logger = logging.getLogger(__name__) -class FeatExtractor: - """ - Place holder, fill later - - Parameters - ---------- - - Returns - ------- - - """ - - def __init__( - self, - main_path=os.path.dirname(os.path.realpath(__file__)), - lib_path_mod=os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/"), - lib_aa_composition=os.path.join( - os.path.dirname(os.path.realpath(__file__)), "aa_comp_rel.csv" - ), - split_size=7, - verbose=True, - include_specific_posses=[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - -1, - -2, - -3, - -4, - -5, - -6, - -7, - ], - add_sum_feat=False, - ptm_add_feat=False, - ptm_subtract_feat=False, - add_rolling_feat=False, - include_unnormalized=True, - add_comp_feat=False, - cnn_feats=True, - ignore_mods=False, - config_file=None, - ): - # if a config file is defined overwrite standard parameters - if config_file: - cparser = ConfigParser() - cparser.read(config_file) - lib_path_mod = cparser.get("featExtractor", "lib_path_mod").strip('"') - split_size = cparser.getint("featExtractor", "split_size") - verbose = cparser.getboolean("featExtractor", "verbose") - add_sum_feat = cparser.getboolean("featExtractor", "add_sum_feat") - ptm_add_feat = cparser.getboolean("featExtractor", "ptm_add_feat") - ptm_subtract_feat = cparser.getboolean("featExtractor", "ptm_subtract_feat") - add_rolling_feat = cparser.getboolean("featExtractor", "add_rolling_feat") - include_unnormalized = cparser.getboolean("featExtractor", "include_unnormalized") - include_specific_posses = ast.literal_eval( - cparser.get("featExtractor", "include_specific_posses") - ) - - self.main_path = main_path - - # Get the atomic composition of AAs - self.lib_aa_composition = self.get_aa_composition(lib_aa_composition) - - self.split_size = split_size - self.verbose = 
verbose - - self.add_sum_feat = add_sum_feat - self.ptm_add_feat = ptm_add_feat - self.ptm_subtract_feat = ptm_subtract_feat - self.add_rolling_feat = add_rolling_feat - self.cnn_feats = cnn_feats - self.include_unnormalized = include_unnormalized - self.include_specific_posses = include_specific_posses - self.add_comp_feat = add_comp_feat - self.ignore_mods = ignore_mods - - def __str__(self): - return """ - _____ _ _____ __ _ _ _ - | __ \ | | / ____| / _| | | | | | | - | | | | ___ ___ _ __ | | | | ______| |_ ___ __ _| |_ _____ _| |_ _ __ __ _ ___| |_ ___ _ __ - | | | |/ _ \/ _ \ '_ \| | | | |______| _/ _ \/ _` | __| / _ \ \/ / __| '__/ _` |/ __| __/ _ \| '__| - | |__| | __/ __/ |_) | |___| |____ | || __/ (_| | |_ | __/> <| |_| | | (_| | (__| || (_) | | - |_____/ \___|\___| .__/|______\_____| |_| \___|\__,_|\__| \___/_/\_\\__|_| \__,_|\___|\__\___/|_| - | | - |_| - """ - - def get_aa_composition(self, file_loc): - """ - Read amino acid atomic composition and return a dictionary - - Parameters - ---------- - file_loc : str - location of the (csv) file that contains the atomic compositions of AAs. The first column must contain - the one-letter AA code. The remaining columns contain counts for each atom (each atom in seperate - column). An example is: - - aa,C,H,N,O,S - A,1,2,0,0,0 - R,4,9,3,0,0 - N,2,3,1,1,0 - - Returns - ------- - dict - dictionary that goes from one-letter AA to atomic composition - """ - return pd.read_csv(file_loc, index_col=0).T.to_dict() - - def split_seq(self, a, n): - """ - Split a list (a) into multiple chunks (n) - - Parameters - ---------- - a : list - list to split - n : list - number of chunks - - Returns - ------- - list - chunked list - """ - # since chunking is not alway possible do the modulo of residues - k, m = divmod(len(a), n) - return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) - - def calc_feats_mods(self, formula): - """ - Chemical formula to atom addition/subtraction - - Parameters - ---------- - formula : str - chemical formula - - Returns - ------- - list - atom naming - list - number of atom added/subtracted - """ - if not formula: - return [], [] - if len(str(formula)) == 0: - return [], [] - if not isinstance(formula, str): - if math.isnan(formula): - return [], [] - - new_atoms = [] - new_num_atoms = [] - for atom in formula.split(" "): - if "(" not in atom: - atom_symbol = atom - num_atom = 1 - else: - atom_symbol = atom.split("(")[0] - num_atom = atom.split("(")[1].rstrip(")") - new_atoms.append(atom_symbol) - new_num_atoms.append(int(num_atom)) - return new_atoms, new_num_atoms - - def get_feats_mods( - self, - seqs, - mods, - identifiers, - split_size=False, - atoms_order=set(["H", "C", "N", "O", "P", "S"]), - add_str="_sum", - subtract_mods=False, - ): - """ - Chemical formula to atom addition/subtraction - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : str - identifiers of the peptides; should correspond to seqs and mods - split_size : int - overwrite the set split size if needed - atoms_order : set - atoms to include and the order - add_str : str - add this substring to feature naming - subtract_mods : boolean - calculate the atom that are substracted in the PTM - - Returns - ------- - object :: pd.DataFrame - feature matrix for peptide PTMs - """ - if not split_size: - split_size = self.split_size - if self.verbose: - t0 = time.time() - mod_dict = {} - 
look_up_mod_subtract = {} - look_up_mod_add = {} - - len_init = len([ao + str(spl_s) for spl_s in range(split_size) for ao in atoms_order]) - for index_name, mod, seq in zip(identifiers, mods, seqs): - mod_dict[index_name] = dict( - zip( - [ - ao + str(spl_s) + add_str - for spl_s in range(split_size) - for ao in atoms_order - ], - [0] * len_init, - ) - ) - - if not mod: - continue - if len(str(mod)) == 0: - continue - if not isinstance(mod, str): - if math.isnan(mod): - continue - - if self.verbose: - logger.debug("Time to calculate mod features: %s seconds" % (time.time() - t0)) - df_ret = pd.DataFrame(mod_dict, dtype=int).T - del mod_dict - return df_ret - - def encode_atoms( - self, - psm_list, - indexes, - charges=[], - predict_ccs=False, - padding_length=60, - positions=set([0, 1, 2, 3, -1, -2, -3, -4]), - positions_pos=set([0, 1, 2, 3]), - positions_neg=set([-1, -2, -3, -4]), - sum_mods=2, - dict_aa={ - "K": 0, - "R": 1, - "P": 2, - "T": 3, - "N": 4, - "A": 5, - "Q": 6, - "V": 7, - "S": 8, - "G": 9, - "I": 10, - "L": 11, - "C": 12, - "M": 13, - "H": 14, - "F": 15, - "Y": 16, - "W": 17, - "E": 18, - "D": 19, - }, - dict_index_pos={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - dict_index_all={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - dict_index={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - ): - """ - Extract all features we can extract... Probably the function you want to call by default - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods_all : list - naming of the mods; should correspond to seqs and identifiers - indexes : list - identifiers of the peptides; should correspond to seqs and mods - padding_length : int - indicates padding length with 'X'-characters. Shorter sequences are padded. Longer sequences - are sliced shorter (C-terminal > than padding length will be missing) - positions : list - list of positions to include separately, for the C-terminus - provide negative indices - sum_mods : int - value that is used to feed the second head of cerberus with summed information, for example, - a value of 2 would sum the composition of each 2 AAs (i.e. 1+2, 3+4, 5+6 ...) - dict_index_pos : dict - index position of atom for positional features - dict_index_all : dict - index position of atom for overall compositional features - dict_index : dict - index position of atom for compositional features for the whole peptide (each position) - charges : list - optional list with charges, keep empty if these will not effect the predicted value - - Returns - ------- - object :: pd.DataFrame - feature matrix (np.matrix) of all positions (up till padding length) - object :: pd.DataFrame - feature matrix (np.matrix) of summed positions (up till padding length / sum_mods) - object :: pd.DataFrame - feature matrix (np.matrix) of specific positions (from positions argument) - object :: pd.DataFrame - feature matrix (np.matrix) of summed composition - """ - - # Local helper to ensure each unique warning is logged only once. 
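For reference, the warn-once pattern removed in this hunk can be written as a standalone helper. A minimal sketch, not part of this patch, that trades the explicit set below for functools.lru_cache:

import functools
import logging

logger = logging.getLogger(__name__)

@functools.lru_cache(maxsize=None)
def warn_once(message: str) -> None:
    # lru_cache short-circuits repeat calls with the same message,
    # so each unique warning is logged at most once per process.
    logger.warning(message)

The rewritten module relies on warnings.warn instead, which under the default filter also shows each distinct warning only once per call site.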
- logged_warnings = set() - - def warn_once(message): - if message not in logged_warnings: - logged_warnings.add(message) - logger.warning(message) - - # TODO param flag for CCS prediction - def rolling_sum(a, n=2): - ret = np.cumsum(a, axis=1, dtype=np.float32) - ret[:, n:] = ret[:, n:] - ret[:, :-n] - return ret[:, n - 1 :] - - ret_list = {} - ret_list["matrix"] = {} - ret_list["matrix_sum"] = {} - ret_list["matrix_all"] = {} - ret_list["pos_matrix"] = {} - ret_list["matrix_hc"] = {} - - # Iterate over all instances - for psm, row_index in zip(psm_list, indexes): - peptidoform = psm.peptidoform - seq = peptidoform.sequence - seq_len = len(seq) - charge = psm.get_precursor_charge() - - # For now anything longer than padding length is cut away - # (C-terminal cutting) - if seq_len > padding_length: - seq = seq[0:padding_length] - seq_len = len(seq) - warn_once("Truncating peptide (too long): %s" % (seq)) - - peptide_composition = [mass.std_aa_comp[aa] for aa in seq] - - # Initialize all feature matrices - matrix = np.zeros((padding_length, len(dict_index.keys())), dtype=np.float16) - matrix_hc = np.zeros((padding_length, len(dict_aa.keys())), dtype=np.float16) - matrix_pos = np.zeros((len(positions), len(dict_index.keys())), dtype=np.float16) - - for i, position_composition in enumerate(peptide_composition): - for k, v in position_composition.items(): - try: - matrix[i, dict_index[k]] = v - except IndexError: - warn_once( - f"Could not add the following value: pos {i} for atom {k} with value {v}" - ) - except KeyError: - warn_once( - f"Could not add the following value: pos {i} for atom {k} with value {v}" - ) - - for p in positions_pos: - try: - aa = seq[p] - except: - warn_once(f"Unable to get the following position: {p}") - continue - for atom, val in mass.std_aa_comp[aa].items(): - try: - matrix_pos[p, dict_index_pos[atom]] = val - except KeyError: - warn_once(f"Could not add the following atom: {atom}") - except IndexError: - warn_once(f"Could not add the following atom: {p} {atom} {val}") - - for pn in positions_neg: - try: - aa = seq[seq_len + pn] - except: - warn_once(f"Unable to get the following position: {p}") - continue - for atom, val in mass.std_aa_comp[aa].items(): - try: - matrix_pos[pn, dict_index_pos[atom]] = val - except KeyError: - warn_once(f"Could not add the following atom: {atom}") - except IndexError: - warn_once(f"Could not add the following atom: {pn} {atom} {val}") - - for i, peptide_position in enumerate(peptidoform.parsed_sequence): - try: - matrix_hc[i, dict_aa[peptide_position[0]]] = 1.0 - except KeyError: - warn_once(f"Skipping the following (not in library): {i} {peptide_position}") - except IndexError: - warn_once(f"Could not add the following atom: {i} {peptide_position}") - - if peptide_position[1] is not None: - try: - modification_composition = peptide_position[1][0].composition - except KeyError: - warn_once( - f"Skipping the following (not in library): {peptide_position[1]}" - ) - continue - except: - warn_once( - f"Skipping the following (not in library): {peptide_position[1]}" - ) - continue - - for ( - atom_position_composition, - atom_change, - ) in modification_composition.items(): - try: - matrix[i, dict_index[atom_position_composition]] += atom_change - if i in positions: - matrix_pos[ - i, - dict_index_pos[atom_position_composition], - ] += atom_change - elif i - seq_len in positions: - matrix_pos[ - i - seq_len, - dict_index_pos[atom_position_composition], - ] += atom_change - except KeyError: - try: - warn_once( - f"Could not add the 
following atom: {atom_position_composition}, attempting to replace the [] part" - ) - atom_position_composition = sub( - "\[.*?\]", "", atom_position_composition - ) - matrix[i, dict_index[atom_position_composition]] += atom_change - if i in positions: - matrix_pos[ - i, - dict_index_pos[atom_position_composition], - ] += atom_change - elif i - seq_len in positions: - matrix_pos[ - i - seq_len, - dict_index_pos[atom_position_composition], - ] += atom_change - except KeyError: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - except: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - except IndexError: - warn_once( - f"Could not add the following atom: {i} {atom_position_composition} {atom_change}" - ) - except: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - - matrix_all = np.sum(matrix, axis=0) - matrix_all = np.append(matrix_all, seq_len) - - if predict_ccs: - matrix_all = np.append(matrix_all, (seq.count("H")) / float(seq_len)) - matrix_all = np.append( - matrix_all, - (seq.count("F") + seq.count("W") + seq.count("Y")) / float(seq_len), - ) - matrix_all = np.append( - matrix_all, - (seq.count("D") + seq.count("E")) / float(seq_len), - ) - matrix_all = np.append( - matrix_all, - (seq.count("K") + seq.count("R")) / float(seq_len), - ) - matrix_all = np.append(matrix_all, charge) - - matrix_sum = rolling_sum(matrix.T, n=2)[:, ::2].T - - ret_list["matrix"][row_index] = matrix - ret_list["matrix_sum"][row_index] = matrix_sum - ret_list["matrix_all"][row_index] = matrix_all - ret_list["pos_matrix"][row_index] = matrix_pos.flatten() - ret_list["matrix_hc"][row_index] = matrix_hc - - return ret_list - - def full_feat_extract( - self, - psm_list=[], - seqs=[], - mods=[], - predict_ccs=False, - identifiers=[], - charges=[], - ): - """ - Extract all features we can extract... 
Probably the function your want to call by default - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : str - identifiers of the peptides; should correspond to seqs and mods - charges : list - optional list with charges, keep emtpy if these will not effect the predicted value - - Returns - ------- - pd.DataFrame - feature matrix - """ - # TODO Reintroduce for CCS, check CCS flag - if len(seqs) > 0: - list_of_psms = [] - for seq, mod, id in zip(seqs, mods, identifiers): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=id, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - - if self.verbose: - t0 = time.time() - - # TODO pass CCS flag - if self.add_sum_feat: - if self.verbose: - logger.debug("Extracting compositional sum features for modifications") - X_feats_sum = self.get_feats_mods(psm_list, split_size=1, add_str="_sum") - if self.ptm_add_feat: - if self.verbose: - logger.debug("Extracting compositional add features for modifications") - X_feats_add = self.get_feats_mods(psm_list, split_size=self.split_size, add_str="_add") - if self.ptm_subtract_feat: - if self.verbose: - logger.debug("Extracting compositional subtract features for modifications") - X_feats_neg = self.get_feats_mods( - psm_list, - split_size=self.split_size, - add_str="_subtract", - subtract_mods=True, - ) - if self.cnn_feats: - if self.verbose: - logger.debug("Extracting CNN features") - X_cnn = self.encode_atoms( - psm_list, - list(range(len(psm_list))), - charges=charges, - predict_ccs=predict_ccs, - ) - - if self.cnn_feats: - X = X_cnn - if self.add_sum_feat: +# fmt: off +DEFAULT_POSITIONS: set[int] = {0, 1, 2, 3, -1, -2, -3, -4} +DEFAULT_POSITIONS_POS: set[int] = {0, 1, 2, 3} +DEFAULT_POSITIONS_NEG: set[int] = {-1, -2, -3, -4} +DEFAULT_DICT_AA: dict[str, int] = { + "K": 0, "R": 1, "P": 2, "T": 3, "N": 4, "A": 5, "Q": 6, "V": 7, "S": 8, "G": 9, "I": 10, + "L": 11, "C": 12, "M": 13, "H": 14, "F": 15, "Y": 16, "W": 17, "E": 18, "D": 19, +} +DEFAULT_DICT_INDEX_POS: dict[str, int] = {"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5} +DEFAULT_DICT_INDEX: dict[str, int] = {"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5} +# fmt: on + + +def _truncate_sequence(seq: str, max_length: int) -> tuple[str, int]: + """Truncate the sequence if it exceeds the max_length.""" + if len(seq) > max_length: + warnings.warn(f"Truncating peptide (too long): {seq}", stacklevel=2) + seq = seq[:max_length] + return seq, len(seq) + + +def _fill_standard_matrix(seq: str, padding_length: int, dict_index: dict[str, int]) -> np.ndarray: + """Fill the standard composition matrix using mass.std_aa_comp.""" + mat = np.zeros((padding_length, len(dict_index)), dtype=np.float16) + for i, aa in enumerate(seq): + for atom, value in mass.std_aa_comp[aa].items(): try: - X = pd.concat([X, X_feats_sum], axis=1) - except BaseException: - X = X_feats_sum - if self.ptm_add_feat: + mat[i, dict_index[atom]] = value + except (KeyError, IndexError): + warnings.warn(f"Skipping atom {atom} at pos {i}", stacklevel=2) + return mat + + +def _fill_onehot_matrix( + parsed_seq: list, padding_length: int, dict_aa: dict[str, int] +) -> np.ndarray: + """Fill a one-hot matrix based on the parsed sequence tokens.""" + onehot = np.zeros((padding_length, len(dict_aa)), dtype=np.float16) + for i, token in enumerate(parsed_seq): + try: + onehot[i, dict_aa[token[0]]] = 1.0 + except (KeyError, 
IndexError): + warnings.warn(f"One-hot skip: {i} {token}", stacklevel=2) + return onehot + + +def _fill_pos_matrix( + seq: str, + seq_len: int, + positions_pos: set[int], + positions_neg: set[int], + dict_index: dict[str, int], + dict_index_pos: dict[str, int], +) -> np.ndarray: + """Fill positional matrix for atoms at specific positions.""" + pos_total = positions_pos.union(positions_neg) + pos_mat = np.zeros((max(pos_total) - min(pos_total) + 1, len(dict_index)), dtype=np.float16) + # For positive positions + for pos in positions_pos: + try: + aa = seq[pos] + except Exception: + warnings.warn(f"Unable to get pos {pos}", stacklevel=2) + continue + for atom, value in mass.std_aa_comp[aa].items(): try: - X = pd.concat([X, X_feats_add], axis=1) - except BaseException: - X = X_feats_add - if self.ptm_subtract_feat: + # shift index for matrix row since positions may be negative. + pos_mat[pos - min(pos_total), dict_index_pos[atom]] = value + except (KeyError, IndexError): + warnings.warn(f"Pos matrix skip: {atom} at pos {pos}", stacklevel=2) + # For negative positions + for pos in positions_neg: + try: + aa = seq[seq_len + pos] + except Exception: + warnings.warn(f"Unable to get pos {pos}", stacklevel=2) + continue + for atom, value in mass.std_aa_comp[aa].items(): try: - X = pd.concat([X, X_feats_neg], axis=1) - except BaseException: - X = X_feats_neg - - if self.verbose: - logger.debug("Time to calculate all features: %s seconds" % (time.time() - t0)) - return X - - -def main(verbose=True): - f_extractor = FeatExtractor(config_file="config.ini") - df = pd.read_csv("parse_pride/seqs_exp.csv") - df.index = ["Pep_" + str(dfi) for dfi in df.index] - print(f_extractor.full_feat_extract(df["seq"], df["modifications"], df.index)) - + pos_mat[pos - min(pos_total), dict_index_pos[atom]] = value + except (KeyError, IndexError): + warnings.warn(f"Pos matrix skip: {atom} at neg pos {pos}", stacklevel=2) + return pos_mat + + +def _apply_modifications( + mat: np.ndarray, + pos_mat: np.ndarray, + parsed_seq: list, + seq_len: int, + dict_index: dict[str, int], + dict_index_pos: dict[str, int], + positions: set[int], +) -> None: + """Apply modification changes to the matrices.""" + for i, token in enumerate(parsed_seq): + if token[1] is None: + continue + try: + mod_comp = token[1][0].composition + except Exception: + warnings.warn(f"Skipping mod at pos {i}: {token[1]}", stacklevel=2) + continue + for atom_comp, change in mod_comp.items(): + try: + mat[i, dict_index[atom_comp]] += change + if i in positions: + pos_mat[i, dict_index_pos[atom_comp]] += change + elif (i - seq_len) in positions: + pos_mat[i - seq_len, dict_index_pos[atom_comp]] += change + except KeyError: + try: + warnings.warn(f"Replacing pattern for atom: {atom_comp}", stacklevel=2) + atom_comp_clean = sub(r"\[.*?\]", "", atom_comp) + mat[i, dict_index[atom_comp_clean]] += change + if i in positions: + pos_mat[i, dict_index_pos[atom_comp_clean]] += change + elif (i - seq_len) in positions: + pos_mat[i - seq_len, dict_index_pos[atom_comp_clean]] += change + except KeyError: + warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2) + continue + except IndexError: + warnings.warn(f"Index error for atom {atom_comp} at pos {i}", stacklevel=2) + + +def _compute_rolling_sum(matrix: np.ndarray, n: int = 2) -> np.ndarray: + """Compute a rolling sum over the matrix.""" + ret = np.cumsum(matrix, axis=1, dtype=np.float32) + ret[:, n:] = ret[:, n:] - ret[:, :-n] + return ret[:, n - 1 :] + + +def encode_peptidoform( + peptidoform: Peptidoform 
| str, + predict_ccs: bool = False, + padding_length: int = 60, + positions: set[int] | None = None, + positions_pos: set[int] | None = None, + positions_neg: set[int] | None = None, + dict_aa: dict[str, int] | None = None, + dict_index_pos: dict[str, int] | None = None, + dict_index: dict[str, int] | None = None, +) -> dict[str, np.ndarray]: + """Extract features from a single peptidoform.""" + positions = positions or DEFAULT_POSITIONS + positions_pos = positions_pos or DEFAULT_POSITIONS_POS + positions_neg = positions_neg or DEFAULT_POSITIONS_NEG + dict_aa = dict_aa or DEFAULT_DICT_AA + dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS + dict_index = dict_index or DEFAULT_DICT_INDEX + + if isinstance(peptidoform, str): + peptidoform = Peptidoform(peptidoform) + seq = peptidoform.sequence + charge = peptidoform.precursor_charge + seq, seq_len = _truncate_sequence(seq, padding_length) + + std_matrix = _fill_standard_matrix(seq, padding_length, dict_index) + onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa) + pos_matrix = _fill_pos_matrix( + seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos + ) + _apply_modifications( + std_matrix, + pos_matrix, + peptidoform.parsed_sequence, + seq_len, + dict_index, + dict_index_pos, + positions, + ) + + matrix_all = np.sum(std_matrix, axis=0) + matrix_all = np.append(matrix_all, seq_len) + if predict_ccs: + matrix_all = np.append(matrix_all, (seq.count("H")) / seq_len) + matrix_all = np.append( + matrix_all, (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len + ) + matrix_all = np.append(matrix_all, (seq.count("D") + seq.count("E")) / seq_len) + matrix_all = np.append(matrix_all, (seq.count("K") + seq.count("R")) / seq_len) + matrix_all = np.append(matrix_all, charge) + + matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T + + return { + "matrix": std_matrix, + "matrix_sum": matrix_sum, + "matrix_all": matrix_all, + "pos_matrix": pos_matrix.flatten(), + "matrix_hc": onehot_matrix, + } + + +def aggregate_encodings( + encodings: list[dict[str, np.ndarray]], +) -> dict[str, dict[int, np.ndarray]]: + """Aggregate list of encodings into single dictionary.""" + return {key: {i: enc[key] for i, enc in enumerate(encodings)} for key in encodings[0]} + + +def unpack_features( + features: dict[str, np.ndarray], +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Unpack dictionary with features to numpy arrays.""" + X_sum = np.stack(list(features["matrix_sum"].values())) + X_global = np.concatenate( + ( + np.stack(list(features["matrix_all"].values())), + np.stack(list(features["pos_matrix"].values())), + ), + axis=1, + ) + X_hc = np.stack(list(features["matrix_hc"].values())) + X_main = np.stack(list(features["matrix"].values())) -if __name__ == "__main__": - main() + return X_sum, X_global, X_hc, X_main diff --git a/deeplc/trainl3.py b/deeplc/trainl3.py index f4bd276..1682485 100644 --- a/deeplc/trainl3.py +++ b/deeplc/trainl3.py @@ -28,7 +28,7 @@ _has_sklearn = True -def train_en(X, y, n_jobs=16, cv=None): +def train_elastic_net(X, y, n_jobs=16, cv=None): """ Function that trains Layer 3 of CALLC (elastic net) diff --git a/examples/deeplc_example.py b/examples/deeplc_example.py index 25a2cf4..2a44b2b 100644 --- a/examples/deeplc_example.py +++ b/examples/deeplc_example.py @@ -1,6 +1,7 @@ # Imports import pandas as pd from matplotlib import pyplot as plt + from deeplc import DeepLC, FeatExtractor if __name__ == "__main__": @@ -59,11 +60,11 @@ print("a") # Make 
predictions; calibrated and uncalibrated - preds_cal = dlc.make_preds(seq_df=df) + preds_cal = dlc._make_preds(seq_df=df) print("b") - preds_uncal = dlc.make_preds(seq_df=df, calibrate=False) + preds_uncal = dlc._make_preds(seq_df=df, calibrate=False) print("c") diff --git a/pyproject.toml b/pyproject.toml index 47b0895..b345d1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "deeplc" -version = "3.1.8" +version = "4.0.0-dev1" description = "DeepLC: Retention time prediction for (modified) peptides using Deep Learning." readme = "README.md" license = { file = "LICENSE" } diff --git a/tests/test_deepcallc.py b/tests/test_deepcallc.py index 71297db..798be6e 100644 --- a/tests/test_deepcallc.py +++ b/tests/test_deepcallc.py @@ -1,7 +1,8 @@ import pandas as pd -from deeplc import DeepLC from matplotlib import pyplot as plt +from deeplc import DeepLC + def main(): peptide_file = "temp_data/PXD005573_mcp.csv" @@ -73,7 +74,7 @@ def main(): ) dlc.calibrate_preds(seq_df=cal_df) - preds = dlc.make_preds(seq_df=cal_df) + preds = dlc._make_preds(seq_df=cal_df) plt.scatter(cal_df["tr"], preds) plt.show() diff --git a/tests/test_deeplc_lib.py b/tests/test_deeplc_lib.py index e9d6b51..c984196 100644 --- a/tests/test_deeplc_lib.py +++ b/tests/test_deeplc_lib.py @@ -1,7 +1,8 @@ import pandas as pd -from deeplc import DeepLC from matplotlib import pyplot as plt +from deeplc import DeepLC + def main(): peptide_file = "examples/datasets/test_pred.csv" @@ -18,7 +19,7 @@ def main(): # use_library="lib.csv", # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) - preds = dlc.make_preds(seq_df=cal_df) + preds = dlc._make_preds(seq_df=cal_df) plt.scatter(pep_df["tr"], preds) plt.show() diff --git a/tests/test_deeplc_retrain.py b/tests/test_deeplc_retrain.py index 8c09330..365819b 100644 --- a/tests/test_deeplc_retrain.py +++ b/tests/test_deeplc_retrain.py @@ -1,7 +1,8 @@ import pandas as pd -from deeplc import DeepLC from matplotlib import pyplot as plt +from deeplc import DeepLC + def main(): peptide_file = "examples/datasets/test_pred.csv" @@ -18,7 +19,7 @@ def main(): # use_library="lib.csv", # reload_library=True) dlc.calibrate_preds(seq_df=cal_df) - preds = dlc.make_preds(seq_df=cal_df) + preds = dlc._make_preds(seq_df=cal_df) plt.scatter(cal_df["tr"], preds) plt.show() diff --git a/tests/test_feat_extractor.py b/tests/test_feat_extractor.py new file mode 100644 index 0000000..8ecc39e --- /dev/null +++ b/tests/test_feat_extractor.py @@ -0,0 +1,93 @@ +import numpy as np +from psm_utils.psm import Peptidoform + +from deeplc.feat_extractor import encode_peptidoform + + +def _check_result_structure(result: dict[str, np.ndarray]) -> None: + expected_keys = {"matrix", "matrix_sum", "matrix_all", "pos_matrix", "matrix_hc"} + assert expected_keys.issubset(result.keys()) + + +def test_encode_peptidoform_without_modification(): + # Test using a simple peptidoform string without modifications. + peptide = "ACDE/2" + result = encode_peptidoform(peptide) + _check_result_structure(result) + + # Default padding_length is 60 and there are 6 atoms in DEFAULT_DICT_INDEX. + assert result["matrix"].shape == (60, 6) + # matrix_all is formed by summing matrix (length 6) and appending the sequence length. + # So without CCS prediction, matrix_all length should be 7. + assert result["matrix_all"].shape == (7,) + + # The sequence is taken from the part before the '/'. + expected_seq = "ACDE" + # The last element of matrix_all should be the sequence length. 
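The 7-element expectation asserted above can also be derived from the module constants rather than hard-coded. A quick sketch, assuming the deeplc.feat_extractor layout introduced in this patch:

from deeplc.feat_extractor import DEFAULT_DICT_INDEX, encode_peptidoform

n_atoms = len(DEFAULT_DICT_INDEX)  # 6 atoms: C, H, N, O, S, P
result = encode_peptidoform("ACDE/2")
assert result["matrix_all"].shape == (n_atoms + 1,)  # atom sums + sequence length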
+ assert result["matrix_all"][-1] == len(expected_seq) + + +def test_encode_peptidoform_with_modification(): + # Test using a peptidoform string with a modification. + peptide = "AC[Carbamidomethyl]DE/2" + result = encode_peptidoform(peptide) + _check_result_structure(result) + + # Check basic structure as before. + assert result["matrix"].shape == (60, 6) + expected_seq = "ACDE" # Modification does not alter the base sequence. + assert result["matrix_all"][-1] == len(expected_seq) + + +def test_encode_peptidoform_with_ccs_prediction(): + # Test with predict_ccs=True. + peptide = "ACDE/2" + result = encode_peptidoform(peptide, predict_ccs=True) + _check_result_structure(result) + + # Without predict_ccs, matrix_all has 7 elements (6 sums + sequence length). + # With predict_ccs, 5 additional values are appended (ratios and charge), + # resulting in a total length of 12. + assert result["matrix_all"].shape == (12,) + + # Verify that the last element (charge) matches the precursor charge from Peptidoform. + pf = Peptidoform(peptide) + assert result["matrix_all"][-1] == pf.precursor_charge + + +def test_encode_peptidoform_feature_values() -> None: + """ + Test that the returned feature values match the expected results + for a known peptidoform. + """ + peptide = "ACDE/2" + result = encode_peptidoform(peptide, predict_ccs=False, padding_length=60) + + # For peptide "ACDE", using DEFAULT_DICT_INDEX = {"C":0,"H":1,"N":2,"O":3,"S":4,"P":5} + # Based on pyteomics.mass.std_aa_comp, the expected composition is: + # For 'A': { "C": 3, "H": 5, "N": 1, "O": 1 } -> row0: [3,5,1,1,0,0] + # For 'C': { "C": 3, "H": 5, "N": 1, "O": 1, "S": 1 } -> row1: [3,5,1,1,1,0] + # For 'D': { "C": 4, "H": 5, "N": 1, "O": 3 } -> row2: [4,5,1,3,0,0] + # For 'E': { "C": 5, "H": 7, "N": 1, "O": 3 } -> row3: [5,7,1,3,0,0] + # + # Sum each column over positions 0 to 3: + # C: 3+3+4+5 = 15 + # H: 5+5+5+7 = 22 + # N: 1+1+1+1 = 4 + # O: 1+1+3+3 = 8 + # S: 0+1+0+0 = 1 + # P: 0+0+0+0 = 0 + # Then sequence length appended: 4 + expected_matrix_all = np.array([15, 22, 4, 8, 1, 0, 4], dtype=np.float32) + + # Check matrix_all values using np.allclose. + assert np.allclose(result["matrix_all"], expected_matrix_all), ( + f"Expected matrix_all: {expected_matrix_all}, got: {result['matrix_all']}" + ) + + # Also check the first row of the matrix for letter 'A'. 
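The expected row for 'A' can be cross-checked against pyteomics directly. A minimal sketch using the atom order of DEFAULT_DICT_INDEX:

from pyteomics import mass

# The alanine residue is C3H5N1O1, with no sulfur or phosphorus.
row_A = [mass.std_aa_comp["A"].get(atom, 0) for atom in ("C", "H", "N", "O", "S", "P")]
assert row_A == [3, 5, 1, 1, 0, 0]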
+ # Expected for 'A': [3,5,1,1,0,0] + expected_row_A = np.array([3, 5, 1, 1, 0, 0], dtype=np.float16) + assert np.allclose(result["matrix"][0][:6], expected_row_A), ( + f"Expected row for A: {expected_row_A}, got: {result['matrix'][0][:6]}" + ) From ccc6ae33ae096d9cbb81390208aa24af18302e3d Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 24 Apr 2025 11:21:17 +0200 Subject: [PATCH 4/6] Add dask dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3a2f23b..5acfd42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ keywords = [ ] dependencies = [ + "dask>2023" "torch>=2.6.0,<3", "onnx2torch>=0.1.5", "numpy>=1.17", From 57db4c85ea9f39270f7ca66df09dff5915ee9351 Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 24 Apr 2025 11:22:29 +0200 Subject: [PATCH 5/6] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5acfd42..49df3f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ keywords = [ ] dependencies = [ - "dask>2023" + "dask>2023", "torch>=2.6.0,<3", "onnx2torch>=0.1.5", "numpy>=1.17", From cc965d684c6ef1dee92df9d03a680b0f50d2efdb Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 24 Apr 2025 11:33:29 +0200 Subject: [PATCH 6/6] Read function to prepare matrices --- deeplc/deeplc.py | 149 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 119 insertions(+), 30 deletions(-) diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index bf5af33..22f1b92 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -112,7 +112,9 @@ def __init__(self, X, X_sum, X_global, X_hc, target=None): self.X_hc = torch.from_numpy(X_hc).float() if target is not None: - self.target = torch.from_numpy(target).float() # Add target values if provided + self.target = torch.from_numpy( + target + ).float() # Add target values if provided else: self.target = None # If no target is provided, set it to None @@ -212,7 +214,8 @@ def fine_tune(self): val_loader = self.prepare_data(val_dataset, shuffle=False) optimizer = torch.optim.Adam( - filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.learning_rate + filter(lambda p: p.requires_grad, self.model.parameters()), + lr=self.learning_rate, ) loss_fn = torch.nn.L1Loss() best_model_wts = copy.deepcopy(self.model.state_dict()) @@ -241,7 +244,9 @@ def fine_tune(self): for batch in val_loader: batch_X, batch_X_sum, batch_X_global, batch_X_hc, target = batch target = target.view(-1, 1) - outputs = self.model(batch_X, batch_X_sum, batch_X_global, batch_X_hc) + outputs = self.model( + batch_X, batch_X_sum, batch_X_global, batch_X_hc + ) val_loss += loss_fn(outputs, target).item() avg_val_loss = val_loss / len(val_loader) @@ -415,7 +420,9 @@ def __str__(self): """ @staticmethod - def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) -> list[str]: + def _get_model_paths( + passed_model_path: str | None, single_model_mode: bool + ) -> list[str]: """Get the model paths based on the passed model path and the single model mode.""" if passed_model_path: return [passed_model_path] @@ -425,6 +432,35 @@ def _get_model_paths(passed_model_path: str | None, single_model_mode: bool) -> return DEFAULT_MODELS + def _prepare_feature_matrices(self, psm_list): + """ + Extract features in parallel and assemble the four input matrices. 
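This helper mirrors unpack_features from the rewritten deeplc.feat_extractor, differing only in return order. A sketch of the equivalent composition, assuming psm_list is a psm_utils PSMList as elsewhere in this module:

from deeplc.feat_extractor import (
    aggregate_encodings,
    encode_peptidoform,
    unpack_features,
)

encodings = [encode_peptidoform(str(psm.peptidoform)) for psm in psm_list]
X_sum, X_global, X_hc, X_main = unpack_features(aggregate_encodings(encodings))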
+ + Parameters + ---------- + psm_list : list of PSM + List of peptide‐spectrum matches for which to extract features. + + Returns + ------- + X : ndarray, shape (n_peptides, n_features) + X_sum : ndarray, shape (n_peptides, n_sum_features) + X_global : ndarray, shape (n_peptides, n_global_features * 2) + X_hc : ndarray, shape (n_peptides, n_hc_features) + """ + feats = self.do_f_extraction_psm_list_parallel(psm_list) + X = np.stack(list(feats["matrix"].values())) + X_sum = np.stack(list(feats["matrix_sum"].values())) + X_global = np.concatenate( + ( + np.stack(list(feats["matrix_all"].values())), + np.stack(list(feats["pos_matrix"].values())), + ), + axis=1, + ) + X_hc = np.stack(list(feats["matrix_hc"].values())) + return X, X_sum, X_global, X_hc + def _extract_features( self, peptidoforms: list[str | Peptidoform] | PSMList, @@ -437,16 +473,21 @@ def _extract_features( logger.debug("Running feature extraction in single-threaded mode...") if self.n_jobs <= 1: encodings = [ - encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in peptidoforms + encode_peptidoform(pf, predict_ccs=self.predict_ccs) + for pf in peptidoforms ] else: logger.debug("Preparing feature extraction with Dask") # Process peptidoforms in larger chunks to reduce task overhead. - peptidoform_strings = [str(pep) for pep in peptidoforms] # Faster pickling of strings + peptidoform_strings = [ + str(pep) for pep in peptidoforms + ] # Faster pickling of strings def chunked_encode(chunk): - return [encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in chunk] + return [ + encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in chunk + ] tasks = [ delayed(chunked_encode)(peptidoform_strings[i : i + chunk_size]) @@ -454,7 +495,9 @@ def chunked_encode(chunk): ] logger.debug("Starting feature extraction with Dask") - chunks_encodings = compute(*tasks, scheduler="processes", workers=self.n_jobs) + chunks_encodings = compute( + *tasks, scheduler="processes", workers=self.n_jobs + ) # Flatten the list of lists. encodings = [enc for chunk in chunks_encodings for enc in chunk] @@ -486,7 +529,9 @@ def _apply_calibration_core( # Use spline model within the range of X within_range = (uncal_preds >= cal_min) & (uncal_preds <= cal_max) - within_range = within_range.ravel() # Ensure this is a 1D array for proper indexing + within_range = ( + within_range.ravel() + ) # Ensure this is a 1D array for proper indexing # Create a prediction array initialized with spline predictions cal_preds = np.copy(y_pred_spline) @@ -501,19 +546,27 @@ def _apply_calibration_core( else: for uncal_pred in uncal_preds: try: - slope, intercept = cal_dict[str(round(uncal_pred, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(uncal_pred, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) except KeyError: # outside of the prediction range ... 
use the last # calibration curve if uncal_pred <= cal_min: - slope, intercept = cal_dict[str(round(cal_min, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(cal_min, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) elif uncal_pred >= cal_max: - slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(cal_max, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) else: - slope, intercept = cal_dict[str(round(cal_max, self.bin_distance))] + slope, intercept = cal_dict[ + str(round(cal_max, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) return np.array(cal_preds) @@ -648,7 +701,9 @@ def make_preds( elif infile is not None: psm_list = _file_to_psm_list(infile) else: - raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.") + raise ValueError( + "Either `psm_list` or `seq_df` or `infile` must be provided." + ) if len(psm_list) == 0: logger.warning("No PSMs to predict for.") @@ -692,7 +747,9 @@ def make_preds( ) ) # Average the predictions from all models - ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds, strict=True)]) + ret_preds = np.array( + [sum(a) / len(a) for a in zip(*ret_preds, strict=True)] + ) # ret_preds = np.mean(model_predictions, axis=0) else: @@ -740,7 +797,9 @@ def _calibrate_preds_pygam( spline_model = linear_model linear_model_right = linear_model else: - spline = SplineTransformer(degree=4, n_knots=int(len(measured_tr) / 500) + 5) + spline = SplineTransformer( + degree=4, n_knots=int(len(measured_tr) / 500) + 5 + ) spline_model = make_pipeline(spline, LinearRegression()) spline_model.fit(predicted_tr.reshape(-1, 1), measured_tr) @@ -832,9 +891,13 @@ def _calibrate_preds_piecewise_linear( "peptides for fitting the calibration curve." ) if len(mtr_mean) == 0: - raise CalibrationError("The measured tr list is empty, not able to calibrate") + raise CalibrationError( + "The measured tr list is empty, not able to calibrate" + ) if len(ptr_mean) == 0: - raise CalibrationError("The predicted tr list is empty, not able to calibrate") + raise CalibrationError( + "The predicted tr list is empty, not able to calibrate" + ) # calculate calibration curves for i in range(0, len(ptr_mean)): @@ -913,7 +976,9 @@ def calibrate_preds( elif infile is not None: psm_list = _file_to_psm_list(infile) else: - raise ValueError("Either `psm_list` or `seq_df` or `infile` must be provided.") + raise ValueError( + "Either `psm_list` or `seq_df` or `infile` must be provided." 
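The same three mutually exclusive inputs are accepted by make_preds and calibrate_preds alike. A usage sketch; the DataFrame, PSMList, and file name are placeholders:

from deeplc import DeepLC

dlc = DeepLC()
dlc.calibrate_preds(seq_df=cal_df)               # peprec-style DataFrame with a "tr" column
preds = dlc.make_preds(psm_list=psm_list)        # psm_utils PSMList
preds = dlc.make_preds(infile="peptides.csv")    # any file psm_utils can read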
+ ) # Getting measured retention time either from measured_tr or provided PSMs if not measured_tr: @@ -946,7 +1011,9 @@ def calibrate_preds( X, X_sum, X_global, X_hc = self._prepare_feature_matrices(psm_list) dataset = DeepLCDataset(X, X_sum, X_global, X_hc, np.array(measured_tr)) - base_model_path = self.model[0] if isinstance(self.model, list) else self.model + base_model_path = ( + self.model[0] if isinstance(self.model, list) else self.model + ) base_model = torch.load( base_model_path, weights_only=False, map_location=torch.device("cpu") ) @@ -982,29 +1049,42 @@ def calibrate_preds( for model_name in self.model: logger.debug(f"Trying out the following model: {model_name}") - predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=model_name) + predicted_tr = self.make_preds( + psm_list, calibrate=False, mod_name=model_name + ) if self.pygam_calibration: - calibrate_output = self._calibrate_preds_pygam(measured_tr, predicted_tr) + calibrate_output = self._calibrate_preds_pygam( + measured_tr, predicted_tr + ) else: calibrate_output = self._calibrate_preds_piecewise_linear( measured_tr, predicted_tr, use_median=use_median ) - self.calibrate_min, self.calibrate_max, self.calibrate_dict = calibrate_output + self.calibrate_min, self.calibrate_max, self.calibrate_dict = ( + calibrate_output + ) # TODO: Currently, calibration dict can be both a dict (linear) or a list of models # (PyGAM)... This should be handled better in the future. # Skip this model if calibrate_dict is empty # TODO: Should this do something when using PyGAM and calibrate_dict is a list? - if isinstance(self.calibrate_dict, dict) and len(self.calibrate_dict.keys()) == 0: + if ( + isinstance(self.calibrate_dict, dict) + and len(self.calibrate_dict.keys()) == 0 + ): continue m_name = model_name.split("/")[-1] # Get new predictions with calibration - preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=model_name) + preds = self.make_preds( + psm_list, calibrate=True, seq_df=seq_df, mod_name=model_name + ) - m_group_name = "deepcallc" if self.deepcallc_mod else "_".join(m_name.split("_")[:-1]) + m_group_name = ( + "deepcallc" if self.deepcallc_mod else "_".join(m_name.split("_")[:-1]) + ) m = model_name try: pred_dict[m_group_name][m] = preds @@ -1026,13 +1106,18 @@ def calibrate_preds( mod_calibrate_max_dict[m_group_name][m] = self.calibrate_max for m_name in pred_dict: - preds = [sum(a) / len(a) for a in zip(*list(pred_dict[m_name].values()), strict=True)] + preds = [ + sum(a) / len(a) + for a in zip(*list(pred_dict[m_name].values()), strict=True) + ] if len(measured_tr) == 0: perf = sum(abs(seq_df["tr"] - preds)) else: perf = sum(abs(np.array(measured_tr) - np.array(preds))) - logger.debug(f"For {m_name} model got a performance of: {perf / len(preds)}") + logger.debug( + f"For {m_name} model got a performance of: {perf / len(preds)}" + ) if perf < best_perf: m_group_name = "deepcallc" if self.deepcallc_mod else m_name @@ -1072,7 +1157,9 @@ def calibrate_preds( ], ) plotly_return_dict["scatter"] = deeplc.plot.scatter(plotly_df) - plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline(plotly_df) + plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline( + plotly_df + ) return plotly_return_dict return None @@ -1124,7 +1211,9 @@ def create_psm(args): ) args_list = list( - zip(sequences, modifications, identifiers, charges, retention_times, strict=True) + zip( + sequences, modifications, identifiers, charges, retention_times, strict=True + ) ) tasks = 
[delayed(create_psm)(args) for args in args_list] list_of_psms = list(compute(*tasks, scheduler="processes"))
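For readers unfamiliar with the Dask pattern used here and in _extract_features: delayed wraps each call into a lazy task, and compute evaluates the tasks on a process pool, returning results in task order. A self-contained sketch; the square worker is illustrative only:

from dask import compute, delayed

def square(x: int) -> int:
    return x * x

if __name__ == "__main__":
    # One lazy task per input; compute() runs them on the "processes"
    # scheduler and returns the results in submission order.
    tasks = [delayed(square)(x) for x in range(8)]
    results = list(compute(*tasks, scheduler="processes"))
    assert results == [x * x for x in range(8)]

One task per item keeps the code simple but pays scheduler overhead per PSM; _extract_features amortizes that cost by encoding chunk_size peptidoforms per task, which is usually the better trade-off for many small work items.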