Skip to content

Core

Config

config

logger = logging.getLogger(__name__) module-attribute

DEFAULT_CONFIG = {'outputDir': './results', 'inputType': 'pepxml', 'inputFile': [], 'decoyPrefix': 'DECOY_', 'visualization': True, 'saveModels': True, 'toFlashLFQ': True, 'allele': [], 'numProcesses': 4, 'removePreNxtAA': False, 'showProgress': True, 'logLevel': 'INFO', 'rescore': {'testFDR': 0.01, 'trainFDR': 0.01, 'model': 'Percolator', 'numJobs': 1}} module-attribute

Config(config_source=None)

Configuration manager for optiMHC pipeline.

This class handles loading, validating, and providing access to configuration parameters from YAML files or dictionaries. It implements a fail-fast validation strategy to ensure configuration correctness before pipeline execution.

Parameters:

Name Type Description Default
config_source str or dict or None

Path to YAML file, dictionary with configuration, or None for default config. If None, uses DEFAULT_CONFIG.

None

Attributes:

Name Type Description
_config dict

The internal configuration dictionary.

Raises:

Type Description
ValueError

If configuration is invalid or required parameters are missing.

FileNotFoundError

If specified YAML file does not exist.

YAMLError

If YAML file is malformed.

Examples:

>>> # Load from YAML file
>>> config = Config("path/to/config.yaml")
>>> # Load from dictionary
>>> config_dict = {
...     "inputType": "pepxml",
...     "inputFile": ["data.pep.xml"],
...     "outputDir": "./results",
...     "rescore": {"testFDR": 0.01, "model": "Percolator"}
... }
>>> config = Config(config_dict)
>>> # Use default configuration
>>> config = Config()
>>> # Access configuration values
>>> input_type = config["inputType"]
>>> output_dir = config.get("outputDir", "./default")
>>> # Save configuration to file
>>> config.save("output_config.yaml")

Initialize Config from a YAML file path or a dictionary.

Parameters:

Name Type Description Default
config_source str or dict or None

Path to YAML file, dictionary with configuration, or None for default config. If None, uses DEFAULT_CONFIG.

None

Raises:

Type Description
ValueError

If config_source is neither a string, dict, nor None.

FileNotFoundError

If specified YAML file does not exist.

YAMLError

If YAML file is malformed.

Source code in optimhc/core/config.py
def __init__(self, config_source=None):
    """
    Initialize Config from a YAML file path or a dictionary.

    Parameters
    ----------
    config_source : str or dict or None, optional
        Path to YAML file, dictionary with configuration, or None for default config.
        If None, uses DEFAULT_CONFIG.

    Raises
    ------
    ValueError
        If config_source is neither a string, dict, nor None.
    FileNotFoundError
        If specified YAML file does not exist.
    yaml.YAMLError
        If YAML file is malformed.
    """
    if config_source is None:
        self._config = deepcopy(DEFAULT_CONFIG)
    elif isinstance(config_source, str):
        with open(config_source, "r") as f:
            user_config = yaml.safe_load(f)
        self._config = _deep_merge(DEFAULT_CONFIG, user_config)
    elif isinstance(config_source, dict):
        self._config = _deep_merge(DEFAULT_CONFIG, config_source)
    else:
        raise ValueError("Config source must be a file path, dict, or None.")

validate()

Validate the configuration using a fail-fast strategy.

This method performs comprehensive validation of the configuration, including required fields, data types, file existence, and feature generator configurations.

Raises:

Type Description
ValueError

If any validation check fails. The error message will indicate the specific validation failure.

Notes

The validation includes checks for: - Required fields (inputType, inputFile, outputDir, rescore) - Input file existence and type - Output directory creation - Rescore configuration validity (TODO) - Feature generator configuration primitive validity (TODO) - Optional parameter validity (TODO): we should validate 'allele' first !!!

Source code in optimhc/core/config.py
def validate(self):
    """
    Validate the configuration using a fail-fast strategy.

    This method performs comprehensive validation of the configuration,
    including required fields, data types, file existence, and feature
    generator configurations.

    Raises
    ------
    ValueError
        If any validation check fails. The error message will indicate
        the specific validation failure.

    Notes
    -----
    The validation includes checks for:
    - Required fields (inputType, inputFile, outputDir, rescore)
    - Input file existence and type
    - Output directory creation
    - Rescore configuration validity (TODO)
    - Feature generator configuration primitive validity (TODO)
    - Optional parameter validity (TODO): we should validate 'allele' first !!!
    """
    if not isinstance(self._config, dict):
        logger.error("Configuration must be a dictionary")
        raise ValueError("Configuration must be a dictionary")

    required_fields = ["inputType", "inputFile", "outputDir", "rescore"]
    for field in required_fields:
        if field not in self._config:
            logger.error(f"Missing required configuration: '{field}'")
            raise ValueError(f"Missing required configuration: '{field}'")

        if field == "inputFile" and self._config[field] == []:
            logger.error("inputFile list cannot be empty")
            raise ValueError("inputFile list cannot be empty")
        elif self._config[field] in (None, "", []):
            logger.error(f"Required configuration '{field}' cannot be empty")
            raise ValueError(f"Required configuration '{field}' cannot be empty")

    if self._config["inputType"] not in ("pepxml", "pin"):
        logger.error("inputType must be 'pepxml' or 'pin'")
        raise ValueError("inputType must be 'pepxml' or 'pin'")

    if (
        self._config["inputType"] == "pin"
        and self._config.get("retentionTimeColumn", None) is None
    ):
        logger.error("retentionTimeColumn must be specified when inputType is 'pin'")
        raise ValueError("retentionTimeColumn must be specified when inputType is 'pin'")

    input_files = self._config["inputFile"]
    if not isinstance(input_files, (list, tuple)):
        logger.debug(f"inputFile is not a list or tuple: {input_files}. Converting to list.")
        self._config["inputFile"] = [input_files]
        input_files = self._config["inputFile"]
    if not input_files:
        logger.error("inputFile list cannot be empty")
        raise ValueError("inputFile list cannot be empty")

    for file_path in input_files:
        if not os.path.exists(file_path):
            logger.error(f"Input file does not exist: {file_path}")
            raise ValueError(f"Input file does not exist: {file_path}")

    output_dir = self._config["outputDir"]
    if not isinstance(output_dir, str):
        logger.error("outputDir must be a string")
        raise ValueError("outputDir must be a string")
    if not output_dir:
        logger.error("outputDir is required")
        raise ValueError("outputDir is required")
    os.makedirs(output_dir, exist_ok=True)

    # TODO: Validate feature generator configuration
    valid_generators = {
        "Basic",
        "OverlappingPeptide",
        "PWM",
        "MHCflurry",
        "NetMHCpan",
        "NetMHCIIpan",
        "DeepLC",
        "SpectralSimilarity",
    }

    if self._config["featureGenerator"] is None:
        logger.warning("No feature generators specified.")
        self._config["featureGenerator"] = []
    for fg in self._config["featureGenerator"]:
        if fg["name"] not in valid_generators:
            logger.error(f"Invalid feature generator: {fg['name']}")
            raise ValueError(f"Invalid feature generator: {fg['name']}")

    valid_sources = valid_generators | {"Original", "ContigFeatures"}
    if self._config.get("experiments", None) is not None:  # experiment mode
        if not isinstance(self._config["experiments"], list):
            logger.error("experiments must be a list")
            raise ValueError("experiments must be a list")
        for exp in self._config["experiments"]:
            for source in exp.get("source", []):
                if source not in valid_sources:
                    logger.error(f"Invalid source in experiments: {source}")
                    raise ValueError(f"Invalid source in experiments: {source}")

_deep_merge(default, override)

Deep merge two dictionaries. The override dictionary values take precedence.

Parameters:

Name Type Description Default
default dict

Default dictionary.

required
override dict

Dictionary with values to override defaults.

required

Returns:

Type Description
dict

Merged dictionary.

Source code in optimhc/core/config.py
def _deep_merge(default, override):
    """
    Deep merge two dictionaries. The override dictionary values take precedence.

    Parameters
    ----------
    default : dict
        Default dictionary.
    override : dict
        Dictionary with values to override defaults.

    Returns
    -------
    dict
        Merged dictionary.
    """
    result = deepcopy(default)

    if not isinstance(override, dict):
        return override

    for key, value in override.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = _deep_merge(result[key], value)
        else:
            result[key] = value

    return result

load_config(config_path)

Load and parse a configuration file using YAML. Merges loaded config with default configuration.

Parameters:

Name Type Description Default
config_path str

Path to the YAML configuration file.

required

Returns:

Type Description
dict

A dictionary containing all configurations.

Source code in optimhc/core/config.py
def load_config(config_path):
    """
    Load and parse a configuration file using YAML.
    Merges loaded config with default configuration.

    Parameters
    ----------
    config_path : str
        Path to the YAML configuration file.

    Returns
    -------
    dict
        A dictionary containing all configurations.
    """
    logger.info(f"Loading configuration from {config_path}")
    with open(config_path, "r") as f:
        user_config = yaml.safe_load(f)

    config = _deep_merge(DEFAULT_CONFIG, user_config)

    return config

Pipeline

pipeline

rescore_model_factory = RescoreModelFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

Config(config_source=None)

Configuration manager for optiMHC pipeline.

This class handles loading, validating, and providing access to configuration parameters from YAML files or dictionaries. It implements a fail-fast validation strategy to ensure configuration correctness before pipeline execution.

Parameters:

Name Type Description Default
config_source str or dict or None

Path to YAML file, dictionary with configuration, or None for default config. If None, uses DEFAULT_CONFIG.

None

Attributes:

Name Type Description
_config dict

The internal configuration dictionary.

Raises:

Type Description
ValueError

If configuration is invalid or required parameters are missing.

FileNotFoundError

If specified YAML file does not exist.

YAMLError

If YAML file is malformed.

Examples:

>>> # Load from YAML file
>>> config = Config("path/to/config.yaml")
>>> # Load from dictionary
>>> config_dict = {
...     "inputType": "pepxml",
...     "inputFile": ["data.pep.xml"],
...     "outputDir": "./results",
...     "rescore": {"testFDR": 0.01, "model": "Percolator"}
... }
>>> config = Config(config_dict)
>>> # Use default configuration
>>> config = Config()
>>> # Access configuration values
>>> input_type = config["inputType"]
>>> output_dir = config.get("outputDir", "./default")
>>> # Save configuration to file
>>> config.save("output_config.yaml")

Initialize Config from a YAML file path or a dictionary.

Parameters:

Name Type Description Default
config_source str or dict or None

Path to YAML file, dictionary with configuration, or None for default config. If None, uses DEFAULT_CONFIG.

None

Raises:

Type Description
ValueError

If config_source is neither a string, dict, nor None.

FileNotFoundError

If specified YAML file does not exist.

YAMLError

If YAML file is malformed.

Source code in optimhc/core/config.py
def __init__(self, config_source=None):
    """
    Initialize Config from a YAML file path or a dictionary.

    Parameters
    ----------
    config_source : str or dict or None, optional
        Path to YAML file, dictionary with configuration, or None for default config.
        If None, uses DEFAULT_CONFIG.

    Raises
    ------
    ValueError
        If config_source is neither a string, dict, nor None.
    FileNotFoundError
        If specified YAML file does not exist.
    yaml.YAMLError
        If YAML file is malformed.
    """
    if config_source is None:
        self._config = deepcopy(DEFAULT_CONFIG)
    elif isinstance(config_source, str):
        with open(config_source, "r") as f:
            user_config = yaml.safe_load(f)
        self._config = _deep_merge(DEFAULT_CONFIG, user_config)
    elif isinstance(config_source, dict):
        self._config = _deep_merge(DEFAULT_CONFIG, config_source)
    else:
        raise ValueError("Config source must be a file path, dict, or None.")

validate()

Validate the configuration using a fail-fast strategy.

This method performs comprehensive validation of the configuration, including required fields, data types, file existence, and feature generator configurations.

Raises:

Type Description
ValueError

If any validation check fails. The error message will indicate the specific validation failure.

Notes

The validation includes checks for: - Required fields (inputType, inputFile, outputDir, rescore) - Input file existence and type - Output directory creation - Rescore configuration validity (TODO) - Feature generator configuration primitive validity (TODO) - Optional parameter validity (TODO): we should validate 'allele' first !!!

Source code in optimhc/core/config.py
def validate(self):
    """
    Validate the configuration using a fail-fast strategy.

    This method performs comprehensive validation of the configuration,
    including required fields, data types, file existence, and feature
    generator configurations.

    Raises
    ------
    ValueError
        If any validation check fails. The error message will indicate
        the specific validation failure.

    Notes
    -----
    The validation includes checks for:
    - Required fields (inputType, inputFile, outputDir, rescore)
    - Input file existence and type
    - Output directory creation
    - Rescore configuration validity (TODO)
    - Feature generator configuration primitive validity (TODO)
    - Optional parameter validity (TODO): we should validate 'allele' first !!!
    """
    if not isinstance(self._config, dict):
        logger.error("Configuration must be a dictionary")
        raise ValueError("Configuration must be a dictionary")

    required_fields = ["inputType", "inputFile", "outputDir", "rescore"]
    for field in required_fields:
        if field not in self._config:
            logger.error(f"Missing required configuration: '{field}'")
            raise ValueError(f"Missing required configuration: '{field}'")

        if field == "inputFile" and self._config[field] == []:
            logger.error("inputFile list cannot be empty")
            raise ValueError("inputFile list cannot be empty")
        elif self._config[field] in (None, "", []):
            logger.error(f"Required configuration '{field}' cannot be empty")
            raise ValueError(f"Required configuration '{field}' cannot be empty")

    if self._config["inputType"] not in ("pepxml", "pin"):
        logger.error("inputType must be 'pepxml' or 'pin'")
        raise ValueError("inputType must be 'pepxml' or 'pin'")

    if (
        self._config["inputType"] == "pin"
        and self._config.get("retentionTimeColumn", None) is None
    ):
        logger.error("retentionTimeColumn must be specified when inputType is 'pin'")
        raise ValueError("retentionTimeColumn must be specified when inputType is 'pin'")

    input_files = self._config["inputFile"]
    if not isinstance(input_files, (list, tuple)):
        logger.debug(f"inputFile is not a list or tuple: {input_files}. Converting to list.")
        self._config["inputFile"] = [input_files]
        input_files = self._config["inputFile"]
    if not input_files:
        logger.error("inputFile list cannot be empty")
        raise ValueError("inputFile list cannot be empty")

    for file_path in input_files:
        if not os.path.exists(file_path):
            logger.error(f"Input file does not exist: {file_path}")
            raise ValueError(f"Input file does not exist: {file_path}")

    output_dir = self._config["outputDir"]
    if not isinstance(output_dir, str):
        logger.error("outputDir must be a string")
        raise ValueError("outputDir must be a string")
    if not output_dir:
        logger.error("outputDir is required")
        raise ValueError("outputDir is required")
    os.makedirs(output_dir, exist_ok=True)

    # TODO: Validate feature generator configuration
    valid_generators = {
        "Basic",
        "OverlappingPeptide",
        "PWM",
        "MHCflurry",
        "NetMHCpan",
        "NetMHCIIpan",
        "DeepLC",
        "SpectralSimilarity",
    }

    if self._config["featureGenerator"] is None:
        logger.warning("No feature generators specified.")
        self._config["featureGenerator"] = []
    for fg in self._config["featureGenerator"]:
        if fg["name"] not in valid_generators:
            logger.error(f"Invalid feature generator: {fg['name']}")
            raise ValueError(f"Invalid feature generator: {fg['name']}")

    valid_sources = valid_generators | {"Original", "ContigFeatures"}
    if self._config.get("experiments", None) is not None:  # experiment mode
        if not isinstance(self._config["experiments"], list):
            logger.error("experiments must be a list")
            raise ValueError("experiments must be a list")
        for exp in self._config["experiments"]:
            for source in exp.get("source", []):
                if source not in valid_sources:
                    logger.error(f"Invalid source in experiments: {source}")
                    raise ValueError(f"Invalid source in experiments: {source}")

Pipeline(config)

Main pipeline class for optiMHC, encapsulating the full data processing workflow.

This class orchestrates input parsing, feature generation, rescoring, result saving, and visualization. It supports both single-run and experiment modes (multiple feature/model combinations).

Parameters:

Name Type Description Default
config str, dict, or Config

Path to YAML config, dict, or Config object.

required

Examples:

>>> from optimhc.core import Pipeline
>>> pipeline = Pipeline(config)
>>> pipeline.run()

Initialize the pipeline with a configuration file, dict, or Config object.

Parameters:

Name Type Description Default
config str, dict, or Config

Path to YAML config, dict, or Config object.

required
Source code in optimhc/core/pipeline.py
def __init__(self, config):
    """
    Initialize the pipeline with a configuration file, dict, or Config object.

    Parameters
    ----------
    config : str, dict, or Config
        Path to YAML config, dict, or Config object.
    """
    logger.debug(f"config: {config}")
    if isinstance(config, Config):
        self.config = config
    else:
        self.config = Config(config)
    self.config.validate()
    self.experiment = self.config.get("experimentName", "optimhc_experiment")
    self.output_dir = os.path.join(self.config["outputDir"], self.experiment)
    os.makedirs(self.output_dir, exist_ok=True)

    self.visualization_enabled = self.config.get("visualization", True)
    self.save_models = self.config.get("saveModels", True)
    self.to_flashlfq = self.config.get("toFlashLFQ", True)
    self.test_fdr = self.config.get("rescore", {}).get("testFDR", 0.01)
    self.train_fdr = self.config.get("rescore", {}).get("trainFDR", 0.01)
    self.model_type = self.config.get("rescore", {}).get("model", "Percolator")
    self.n_jobs = self.config.get("rescore", {}).get("numJobs", 1)

read_input()

Read input PSMs based on configuration.

Returns:

Type Description
PsmContainer

Object containing loaded PSMs.

Raises:

Type Description
ValueError

If input type is unsupported.

Exception

If file reading fails.

Source code in optimhc/core/pipeline.py
def read_input(self):
    """
    Read input PSMs based on configuration.

    Returns
    -------
    PsmContainer
        Object containing loaded PSMs.

    Raises
    ------
    ValueError
        If input type is unsupported.
    Exception
        If file reading fails.
    """
    input_type = self.config["inputType"]
    input_files = self.config["inputFile"]
    if not isinstance(input_files, list):
        input_files = [input_files]

    try:
        if input_type == "pepxml":
            psms = read_pepxml(input_files, decoy_prefix=self.config["decoyPrefix"])
        elif input_type == "pin":
            psms = read_pin(
                input_files,
                retention_time_column=self.config.get("retentionTimeColumn"),
            )
        else:
            raise ValueError(f"Unsupported input type: {input_type}")
        return psms
    except Exception as e:
        logger.error(f"Failed to read input files: {e}")
        raise

rescore(psms, model_type=None, n_jobs=None, test_fdr=None, rescoring_features=None)

Perform rescoring on the PSMs using the specified or configured model.

Parameters:

Name Type Description Default
psms PsmContainer

PSM container object.

required
model_type str

Model type ('XGBoost', 'RandomForest', 'Percolator').

None
n_jobs int

Number of parallel jobs.

None
test_fdr float

FDR threshold. List of features to use for rescoring.

None

Returns:

Name Type Description
results Results

Rescoring results.

models list

Trained models.

Notes

Rescoring logic is adapted from mokapot (https://mokapot.readthedocs.io/)

Source code in optimhc/core/pipeline.py
def rescore(self, psms, model_type=None, n_jobs=None, test_fdr=None, rescoring_features=None):
    """
    Perform rescoring on the PSMs using the specified or configured model.

    Parameters
    ----------
    psms : PsmContainer
        PSM container object.
    model_type : str, optional
        Model type ('XGBoost', 'RandomForest', 'Percolator').
    n_jobs : int, optional
        Number of parallel jobs.
    test_fdr : float, optional
        FDR threshold.
        List of features to use for rescoring.

    Returns
    -------
    results : mokapot.Results
        Rescoring results.
    models : list
        Trained models.

    Notes
    -----
    Rescoring logic is adapted from mokapot (https://mokapot.readthedocs.io/)
    """
    test_fdr = test_fdr if test_fdr is not None else self.test_fdr
    model_type = model_type if model_type is not None else self.model_type
    n_jobs = n_jobs if n_jobs is not None else self.n_jobs

    train_fdr = getattr(self, "train_fdr", 0.01)
    model_cls = rescore_model_factory.get_model(model_type)
    model = model_cls.from_config(self._build_model_config(train_fdr=train_fdr, n_jobs=n_jobs))

    kwargs = {}
    if rescoring_features is not None:
        kwargs["rescoring_features"] = rescoring_features

    results, models = mokapot.rescore(psms, model=model, test_fdr=test_fdr, **kwargs)
    return results, models

save_results(psms, results, models, output_dir=None, file_root='optimhc')

Save rescoring results, PSM data, and trained models to disk.

Parameters:

Name Type Description Default
psms PsmContainer

PSM container object.

required
results Results

Rescoring results.

required
models list

Trained models.

required
output_dir str

Output directory.

None
file_root str

Root name for output files.

'optimhc'
Source code in optimhc/core/pipeline.py
def save_results(self, psms, results, models, output_dir=None, file_root="optimhc"):
    """
    Save rescoring results, PSM data, and trained models to disk.

    Parameters
    ----------
    psms : PsmContainer
        PSM container object.
    results : mokapot.Results
        Rescoring results.
    models : list
        Trained models.
    output_dir : str, optional
        Output directory.
    file_root : str, optional
        Root name for output files.
    """
    output_dir = output_dir if output_dir is not None else self.output_dir

    results.to_txt(dest_dir=output_dir, file_root=file_root, decoys=True)
    psms.write_pin(os.path.join(output_dir, f"{file_root}.pin"))

    if self.save_models:
        model_dir = os.path.join(output_dir, "models")
        os.makedirs(model_dir, exist_ok=True)
        logger.info(f"Saving models to {model_dir}")
        for i, model in enumerate(models):
            model.save(os.path.join(model_dir, f"{file_root}.model{i}"))

    if self.to_flashlfq:
        mokapot.to_flashLFQ(results, output_dir, file_name=f"{file_root}.FlashLFQ.txt")

visualize_results(psms, results, models, output_dir=None, sources=None)

Generate and save visualizations for the analysis results.

Parameters:

Name Type Description Default
psms PsmContainer

PSM container object.

required
results Results

Rescoring results.

required
models list

Trained models.

required
output_dir str

Output directory.

None
sources list

Feature sources to include in visualizations.

None
Source code in optimhc/core/pipeline.py
def visualize_results(self, psms, results, models, output_dir=None, sources=None):
    """
    Generate and save visualizations for the analysis results.

    Parameters
    ----------
    psms : PsmContainer
        PSM container object.
    results : mokapot.Results
        Rescoring results.
    models : list
        Trained models.
    output_dir : str, optional
        Output directory.
    sources : list, optional
        Feature sources to include in visualizations.
    """
    if not self.visualization_enabled:
        logger.info("Visualization is disabled. Skipping...")
        return

    output_dir = output_dir if output_dir is not None else self.output_dir
    fig_dir = os.path.join(output_dir, "figures")
    os.makedirs(fig_dir, exist_ok=True)

    plot_qvalues(
        results,
        save_path=os.path.join(fig_dir, "qvalues.png"),
        threshold=0.05,
    )

    if sources:
        rescoring_features = {k: v for k, v in psms.rescoring_features.items() if k in sources}
    else:
        rescoring_features = psms.rescoring_features

    plot_feature_importance(
        models,
        rescoring_features,
        save_path=os.path.join(fig_dir, "feature_importance.png"),
    )
    visualize_feature_correlation(
        psms,
        save_path=os.path.join(fig_dir, "feature_correlation.png"),
    )
    visualize_target_decoy_features(
        psms,
        num_cols=4,
        save_path=os.path.join(fig_dir, "target_decoy_histogram.png"),
    )

run()

Run the complete optiMHC pipeline (single run mode).

This method executes the full workflow: input parsing, feature generation, rescoring, saving, and visualization.

Returns:

Name Type Description
psms PsmContainer

PSM container object.

results Results

Rescoring results.

models list

Trained models.

Source code in optimhc/core/pipeline.py
def run(self):
    """
    Run the complete optiMHC pipeline (single run mode).

    This method executes the full workflow: input parsing, feature generation, rescoring, saving, and visualization.

    Returns
    -------
    psms : PsmContainer
        PSM container object.
    results : mokapot.Results
        Rescoring results.
    models : list
        Trained models.
    """
    logger.info("Starting analysis pipeline")

    psms = self.read_input()
    psms = self._generate_features(psms)
    results, models = self.rescore(psms)
    self.save_results(psms, results, models)
    self.visualize_results(psms, results, models)

    logger.info(f"Analysis pipeline completed, results saved to {self.output_dir}")
    return psms, results, models

run_experiments()

Run experiments with different feature/model combinations using multiprocessing.

Each experiment is executed in its own process for complete resource isolation. The experiment configurations must be provided in the config under the 'experiments' key.

Returns:

Type Description
None
Source code in optimhc/core/pipeline.py
def run_experiments(self):
    """
    Run experiments with different feature/model combinations using multiprocessing.

    Each experiment is executed in its own process for complete resource isolation.
    The experiment configurations must be provided in the config under the 'experiments' key.

    Returns
    -------
    None
    """
    logger.info("Starting experiment mode with multiple feature combinations")

    psms = self.read_input()
    psms = self._generate_features(psms)
    pin_path = os.path.join(self.output_dir, f"optimhc.{self.experiment}.pin")
    psms.write_pin(pin_path)
    fig_summary_dir = os.path.join(self.output_dir, "figures")
    os.makedirs(fig_summary_dir, exist_ok=True)
    visualize_feature_correlation(
        psms,
        save_path=os.path.join(fig_summary_dir, "feature_correlation.png"),
    )
    # visualize_target_decoy_features(
    #     psms,
    #     num_cols=4,
    #     save_path=os.path.join(fig_summary_dir, 'target_decoy_histogram.png'),
    # )

    experiment_configs = self.config.get("experiments", [])
    processes = []
    for i, exp_config in enumerate(experiment_configs):
        exp_name = exp_config.get("name", f"Experiment_{i + 1}")
        exp_dir = os.path.join(self.output_dir, exp_name)

        logger.info(f"Starting experiment '{exp_name}' in a separate process")
        p = Process(
            target=self._run_single_experiment,
            args=(psms, exp_config, exp_name, exp_dir),
        )
        p.start()
        processes.append(p)

    # Wait for all experiment processes to finish
    for p in processes:
        p.join()

    logger.info("All experiments completed")

generate_features(psms, config)

Generate features from different generators according to the configuration.

Parameters:

Name Type Description Default
psms PsmContainer

A container object holding PSMs and relevant data.

required
config dict

Configuration dictionary loaded from YAML or CLI.

required
Source code in optimhc/core/feature_generation.py
def generate_features(psms, config):
    """
    Generate features from different generators according to the configuration.

    Parameters
    ----------
    psms : PsmContainer
        A container object holding PSMs and relevant data.
    config : dict
        Configuration dictionary loaded from YAML or CLI.
    """
    feature_generators = config.get("featureGenerator", None)
    if not feature_generators:
        return

    for generator_config in feature_generators:
        if not isinstance(generator_config, dict):
            logger.warning("Feature generator config is not a dictionary, skipping...")
            continue

        name = generator_config.get("name")
        params = generator_config.get("params", {})

        logger.info(f"Generating features with {name}...")
        generator_cls = feature_generator_factory.get_generator(name)
        generator = generator_cls.from_config(psms, config, params)
        generator.apply(psms, source=name)
        gc.collect()

read_pepxml(pepxml_files, decoy_prefix='DECOY_')

Read PSMs from a list of PepXML files.

Parameters:

Name Type Description Default
pepxml_files Union[str, List[str]]

The file path to the PepXML file or a list of file paths.

required
decoy_prefix str

The prefix used to indicate a decoy protein in the description lines of the FASTA file. Default is "DECOY_".

'DECOY_'

Returns:

Type Description
PsmContainer

A PsmContainer object containing the PSM data.

Raises:

Type Description
ValueError

If the PepXML files were generated by Percolator or PeptideProphet.

Notes

This function: 1. Reads and parses PepXML files 2. Calculates mass difference features 3. Processes matched ions and complementary ions 4. Creates charge columns 5. Log-transforms p-values 6. Returns a PsmContainer with the processed data

Source code in optimhc/parser/pepxml.py
def read_pepxml(pepxml_files, decoy_prefix="DECOY_"):
    """
    Read PSMs from a list of PepXML files.

    Parameters
    ----------
    pepxml_files : Union[str, List[str]]
        The file path to the PepXML file or a list of file paths.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the description lines
        of the FASTA file. Default is "DECOY_".

    Returns
    -------
    PsmContainer
        A PsmContainer object containing the PSM data.

    Raises
    ------
    ValueError
        If the PepXML files were generated by Percolator or PeptideProphet.

    Notes
    -----
    This function:
    1. Reads and parses PepXML files
    2. Calculates mass difference features
    3. Processes matched ions and complementary ions
    4. Creates charge columns
    5. Log-transforms p-values
    6. Returns a PsmContainer with the processed data
    """
    proton = 1.00727646677
    if isinstance(pepxml_files, str):
        pepxml_files = [pepxml_files]
    psms = pd.concat([_parse_pepxml(f, decoy_prefix) for f in pepxml_files])

    # Check that these PSMs are not from Percolator or PeptideProphet:
    illegal_cols = {
        "Percolator q-Value",
        "Percolator PEP",
        "Percolator SVMScore",
    }

    if illegal_cols.intersection(set(psms.columns)):
        raise ValueError(
            "The PepXML files appear to have generated by Percolator or "
            "PeptideProphet; hence, they should not be analyzed with mokapot."
        )

    # For open modification searches:
    psms["mass_diff"] = psms["exp_mass"] - psms["calc_mass"]
    # Calculate massdiff features
    exp_mz = psms["exp_mass"] / psms["charge"] + proton
    calc_mz = psms["calc_mass"] / psms["charge"] + proton
    psms["abs_mz_diff"] = (exp_mz - calc_mz).abs()

    # Calculate matched ions and complementary ions
    if "num_matched_ions" in psms.columns and "tot_num_ions" in psms.columns:
        if (psms["tot_num_ions"] != 0).all():
            psms["matched_ions_ratio"] = psms["num_matched_ions"] / psms["tot_num_ions"]

    # Log number of candidates:
    if "num_matched_peptides" in psms.columns:
        psms["num_matched_peptides"] = np.log10(psms["num_matched_peptides"])

    # Create charge columns:
    psms = pd.concat([psms, pd.get_dummies(psms["charge"], prefix="charge")], axis=1)

    # psms = psms.drop("charge", axis=1)
    # -log10 p-values
    nonfeat_cols = [
        "ms_data_file",
        "scan",
        "spectrum",
        "label",
        "calc_mass",  # Retain calc_mass for FlashLFQ compatibility
        "peptide",
        "proteins",
        "charge",
        "retention_time",
    ]

    feat_cols = [c for c in psms.columns if c not in nonfeat_cols]
    psms = psms.apply(_log_features, features=feat_cols)
    rescoring_features = {"Original": feat_cols}

    return PsmContainer(
        psms=psms,
        label_column="label",
        scan_column="scan",
        spectrum_column="spectrum",
        ms_data_file_column="ms_data_file",
        peptide_column="peptide",
        protein_column="proteins",
        charge_column="charge",
        rescoring_features=rescoring_features,
        hit_rank_column="rank",
        retention_time_column="retention_time",
        calculated_mass_column="calc_mass",
    )

read_pin(pin_files, retention_time_column=None, remove_pre_nxt_aa=False)

Read PSMs from a Percolator INput (PIN) file.

Parameters:

Name Type Description Default
pin_files Union[str, List[str]]

The file path to the PIN file or a list of file paths.

required
retention_time_column Optional[str]

The column containing the retention time. If None, no retention time will be included.

None

Returns:

Type Description
PsmContainer

A PsmContainer object containing the PSM data.

Notes

This function: 1. Reads PIN file(s) into a DataFrame 2. Identifies required columns (case-insensitive) 3. Processes scan IDs and hit ranks (Only support FragPipe PIN) 4. Converts data types appropriately 5. Creates a PsmContainer with the processed data

Source code in optimhc/parser/pin.py
def read_pin(
    pin_files: Union[str, List[str]],
    retention_time_column: Optional[str] = None,
    remove_pre_nxt_aa: bool = False,
) -> PsmContainer:
    """
    Read PSMs from a Percolator INput (PIN) file.

    Parameters
    ----------
    pin_files : Union[str, List[str]]
        The file path to the PIN file or a list of file paths.
    retention_time_column : Optional[str], optional
        The column containing the retention time. If None, no retention time
        will be included.

    Returns
    -------
    PsmContainer
        A PsmContainer object containing the PSM data.

    Notes
    -----
    This function:
    1. Reads PIN file(s) into a DataFrame
    2. Identifies required columns (case-insensitive)
    3. Processes scan IDs and hit ranks (Only support FragPipe PIN)
    4. Converts data types appropriately
    5. Creates a PsmContainer with the processed data
    """
    logger.info("Reading PIN file(s) into PsmContainer.")
    if isinstance(pin_files, str):
        pin_files = [pin_files]

    pin_df = pd.concat([_read_single_pin_as_df(pin_file) for pin_file in pin_files])
    logger.info(f"Read {len(pin_df)} PSMs from {len(pin_files)} PIN files.")
    logger.debug(pin_df.head())
    logger.debug(pin_df.columns)
    logger.debug(pin_df.iloc[0])

    def find_required_columns(col: str, columns: List[str]) -> str:
        """
        Case-insensitive search for a column in the DataFrame.
        Returns the matching column name with original casing.
        """
        col_lower = col.lower()
        column_map = {c.lower(): c for c in columns}
        if col_lower not in column_map:
            raise ValueError(f"Column '{col}' not found in PSM data (case-insensitive).")
        return column_map[col_lower]

    # non-feature columns (case-insensitive search)
    label = find_required_columns("Label", pin_df.columns)
    scan = find_required_columns("ScanNr", pin_df.columns)
    specid = find_required_columns("SpecId", pin_df.columns)
    peptide = find_required_columns("Peptide", pin_df.columns)
    protein = find_required_columns("Proteins", pin_df.columns)

    # Comet: P2PI20160713_pilling_C1RA2_BB72_P1_31_3_1
    # Fragpipe: P2PI20160713_pilling_C1RA2_BB72_P1.3104.3104.2_1

    # Try to parse rank from SpecId
    def parse_specid(specid: str) -> Tuple[str, int]:
        if "_" in specid:
            parts = specid.rsplit("_", 1)
            if len(parts) != 2:
                logger.warning(f"SpecId format unexpected: {specid}, using default rank 1")
                return 1
            try:
                hit_rank = int(parts[1])
                return hit_rank
            except ValueError:
                logger.warning(f"Could not parse rank from SpecId: {specid}, using default rank 1")
                return 1
        else:
            return 1

    hit_rank = "rank"
    if "rank" in [c.lower() for c in pin_df.columns]:
        pass
    else:
        # Parse SpecId to extract hit rank and update both columns
        pin_df["rank"] = pin_df[specid].apply(parse_specid)

    retention_time_column = (
        find_required_columns(retention_time_column, pin_df.columns)
        if retention_time_column
        else None
    )

    # col: charge_[1,2,3,...] = 0, 1
    charge_map = {
        col: int(re.search(r"(\d+)", col).group(1))
        for col in pin_df.columns
        if re.search(r"charge[_]?(\d+)", col, re.IGNORECASE)
    }

    def extract_charge(row):
        for col, num in charge_map.items():
            if int(float(row[col])) == 1:
                return num
        return None

    pin_df["Charge"] = pin_df.apply(extract_charge, axis=1)

    # feature columns: columns that are not non-feature columns
    non_feature_columns = [label, scan, specid, peptide, protein, hit_rank, "Charge"]
    feature_columns = [col for col in pin_df.columns if col not in non_feature_columns]

    logger.info(
        f"Columns: label={label}, scan={scan}, specid={specid}, peptide={peptide}, "
        f"protein={protein}, hit_rank={hit_rank}, retention_time={retention_time_column}, "
        f"features={feature_columns}"
    )

    pin_df[scan] = pin_df[scan].astype(int)
    pin_df[specid] = pin_df[specid].astype(str)
    pin_df[peptide] = pin_df[peptide].astype(str)
    pin_df[protein] = pin_df[protein].astype(str)
    pin_df[hit_rank] = pin_df[hit_rank].astype(float).astype(int)
    pin_df["Charge"] = pin_df["Charge"].astype(float).astype(int)
    if retention_time_column:
        pin_df[retention_time_column] = pin_df[retention_time_column].astype(float)
    for col in feature_columns:
        pin_df[col] = pin_df[col].astype(float)

    # label = 1 for target, -1 for decoy. Convert to Boolean.
    pin_df[label] = pin_df[label] == "1"
    rescoring_features = {"Original": feature_columns}

    return PsmContainer(
        psms=pin_df,
        label_column=label,
        scan_column=scan,
        spectrum_column=specid,
        ms_data_file_column=None,
        peptide_column=peptide,
        protein_column=protein,
        charge_column="Charge",
        rescoring_features=rescoring_features,
        hit_rank_column=hit_rank,
        retention_time_column=retention_time_column,
    )

plot_feature_importance(models, rescoring_features, save_path=None, sort=False, error=False, **kwargs)

Unified function to plot average feature importance across multiple models.

This function supports: - Linear models (e.g., Linear SVR) which provide an 'estimator' attribute with a 'coef_'. The absolute value of the coefficients is used for importance, and hatch patterns are applied to differentiate between positive and negative coefficients. - XGBoost models which provide a 'feature_importances_' attribute. Since these values are always positive, no hatch patterns are applied.

Parameters:

Name Type Description Default
models list

A list of model objects. For linear models, each model should have an 'estimator' with 'coef_'. For XGBoost models, each model should have a 'feature_importances_' attribute.

required
rescoring_features dict

A dictionary where keys are sources and values are lists of features.

required
save_path str

If provided, saves the plot to the specified path.

None
sort bool

If True, sorts the features by their importance in descending order. Default is False.

False
error bool

If True, adds error bars to the plot. Default is False.

False
**kwargs dict

Additional plotting parameters: - 'figsize' : tuple, default (15, 10) Figure size in inches (width, height). - 'dpi' : int, default 300 Resolution in dots per inch. - 'palette' : str, default 'crest' Seaborn color palette name. Options include 'crest', 'flare', 'mako', 'rocket', 'tab10', 'husl', 'Set2', etc.

{}
Notes

The function automatically detects the model type based on the presence of the corresponding attribute. For linear models, it uses hatch patterns to differentiate between positive and negative coefficients. For XGBoost models, it uses solid bars since the importances are always positive.

The color palette is automatically scaled to match the number of feature sources, ensuring consistent colors between the bars and legend.

Examples:

>>> # Use default crest palette
>>> plot_feature_importance(models, rescoring_features, save_path='importance.png')
>>> # Use a different palette
>>> plot_feature_importance(models, rescoring_features, palette='flare', sort=True, error=True)
Source code in optimhc/visualization/plot_features.py
def plot_feature_importance(
    models, rescoring_features, save_path=None, sort=False, error=False, **kwargs
):
    """
    Unified function to plot average feature importance across multiple models.

    This function supports:
      - Linear models (e.g., Linear SVR) which provide an 'estimator' attribute with a 'coef_'.
        The absolute value of the coefficients is used for importance, and hatch patterns are applied
        to differentiate between positive and negative coefficients.
      - XGBoost models which provide a 'feature_importances_' attribute. Since these values are
        always positive, no hatch patterns are applied.

    Parameters
    ----------
    models : list
        A list of model objects.
        For linear models, each model should have an 'estimator' with 'coef_'.
        For XGBoost models, each model should have a 'feature_importances_' attribute.
    rescoring_features : dict
        A dictionary where keys are sources and values are lists of features.
    save_path : str, optional
        If provided, saves the plot to the specified path.
    sort : bool, optional
        If True, sorts the features by their importance in descending order.
        Default is False.
    error : bool, optional
        If True, adds error bars to the plot. Default is False.
    **kwargs : dict
        Additional plotting parameters:
        - 'figsize' : tuple, default (15, 10)
            Figure size in inches (width, height).
        - 'dpi' : int, default 300
            Resolution in dots per inch.
        - 'palette' : str, default 'crest'
            Seaborn color palette name. Options include 'crest', 'flare', 'mako',
            'rocket', 'tab10', 'husl', 'Set2', etc.

    Notes
    -----
    The function automatically detects the model type based on the presence of the corresponding attribute.
    For linear models, it uses hatch patterns to differentiate between positive and negative coefficients.
    For XGBoost models, it uses solid bars since the importances are always positive.

    The color palette is automatically scaled to match the number of feature sources, ensuring
    consistent colors between the bars and legend.

    Examples
    --------
    >>> # Use default crest palette
    >>> plot_feature_importance(models, rescoring_features, save_path='importance.png')

    >>> # Use a different palette
    >>> plot_feature_importance(models, rescoring_features, palette='flare', sort=True, error=True)
    """
    # Determine the model type based on the first model in the list.
    if hasattr(models[0].estimator, "coef_"):
        model_type = "linear"
    elif hasattr(models[0].estimator, "feature_importances_"):
        model_type = "xgb"
    else:
        raise ValueError(
            "Model type not recognized. Model must have 'estimator.coef_' for linear models or "
            "'estimator.feature_importances_' for XGBoost models."
        )

    if model_type == "linear":
        feature_importances = []
        for model in models:
            coefficients = model.estimator.coef_
            feature_importances.append(np.abs(coefficients).mean(axis=0))
            logger.debug(f"Model coefficients shape: {coefficients.shape}")

        average_feature_importance = np.mean(feature_importances, axis=0)
        std_feature_importance = np.std(feature_importances, axis=0)
        feature_signs = np.mean([model.estimator.coef_.mean(axis=0) for model in models], axis=0)

    elif model_type == "xgb":
        feature_importances = []
        for model in models:
            # Use the XGBoost feature importances directly as they are always positive
            imp = model.estimator.feature_importances_
            feature_importances.append(imp)
            logger.debug(f"Model feature importances shape: {imp.shape}")

        average_feature_importance = np.mean(feature_importances, axis=0)
        std_feature_importance = np.std(feature_importances, axis=0)
        feature_signs = np.ones_like(average_feature_importance)

    logger.debug(f"Total rescoring features: {len(sum(rescoring_features.values(), []))}")
    logger.debug(f"Average feature importance length: {len(average_feature_importance)}")
    logger.debug(f"Features: {sum(rescoring_features.values(), [])}")

    # Extract plotting parameters
    figsize = kwargs.get("figsize", (15, 10))
    dpi = kwargs.get("dpi", 300)
    palette_name = kwargs.get("palette", "Set2")

    all_features = []
    all_importances = []
    all_errors = []
    all_colors = []
    all_hatches = []  # Hatch patterns will be applied only for linear models.

    n_sources = len(rescoring_features)
    colors = sns.color_palette(palette_name, n_colors=n_sources)
    source_colors = dict(zip(rescoring_features.keys(), colors))

    for source, features in rescoring_features.items():
        color = source_colors[source]
        indices = [
            i for i, name in enumerate(sum(rescoring_features.values(), [])) if name in features
        ]
        source_importances = average_feature_importance[indices]
        source_std = std_feature_importance[indices]

        if model_type == "linear":
            source_signs = feature_signs[indices]

        if sort:
            sorted_indices = np.argsort(-source_importances)
        else:
            sorted_indices = np.arange(len(features))

        sorted_features = [features[i] for i in sorted_indices]
        sorted_importances = source_importances[sorted_indices]
        sorted_std = source_std[sorted_indices]

        all_features.extend(sorted_features)
        all_importances.extend(sorted_importances)
        all_errors.extend(sorted_std)
        all_colors.extend([color] * len(sorted_features))

        if model_type == "linear":
            # For linear models, use hatch patterns to differentiate positive and negative coefficients.
            # An empty hatch ('') for positive and '\\' for negative coefficients.
            sorted_signs = source_signs[sorted_indices]
            all_hatches.extend(["" if sign >= 0 else "\\\\" for sign in sorted_signs])
        else:
            all_hatches.extend([""] * len(sorted_features))

    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    if error:
        bars = ax.barh(all_features, all_importances, xerr=all_errors, color=all_colors, capsize=5)
    else:
        bars = ax.barh(all_features, all_importances, color=all_colors)

    if model_type == "linear":
        for bar, hatch in zip(bars, all_hatches):
            bar.set_hatch(hatch)
        legend_hatches = [
            Patch(facecolor="white", edgecolor="black", hatch="", label="Positive"),
            Patch(facecolor="white", edgecolor="black", hatch="\\\\", label="Negative"),
        ]
        legend_colors = [
            Patch(facecolor=source_colors[source], edgecolor="black", label=source)
            for source in rescoring_features.keys()
        ]
        ax.legend(handles=legend_hatches + legend_colors, loc="best")
    else:
        legend_colors = [
            Patch(facecolor=source_colors[source], edgecolor="black", label=source)
            for source in rescoring_features.keys()
        ]
        ax.legend(handles=legend_colors, loc="best")

    ax.set_xlabel("Average Feature Importance")
    ax.set_ylabel("Feature")

    save_or_show_plot(save_path, logger)

plot_qvalues(results, save_path=None, dpi=300, figsize=(15, 10), threshold=0.05, colors=None, **kwargs)

Plot q-values for the given results.

Parameters:

Name Type Description Default
results object or list

A list of results objects or a single result object. Each result object should have a method plot_qvalues.

required
save_path str

If provided, saves the plot to the specified path.

None
dpi int

The resolution of the plot. Default is 300.

300
figsize tuple

The size of the figure. Default is (15, 10).

(15, 10)
threshold float

The q-value threshold for plotting. Default is 0.05.

0.05
colors list

A list of colors for the plots. If not provided, uses default colors.

None
**kwargs dict

Additional plotting parameters.

{}

Returns:

Type Description
None

The function displays or saves the plot.

Notes

This function: 1. Creates a figure with two subplots for PSMs and peptides 2. Plots q-values for each result with different colors 3. Adds legends and titles to each subplot 4. Saves or displays the plot based on save_path

Source code in optimhc/visualization/plot_roc.py
def plot_qvalues(
    results,
    save_path=None,
    dpi=300,
    figsize=(15, 10),
    threshold=0.05,
    colors=None,
    **kwargs,
):
    """
    Plot q-values for the given results.

    Parameters
    ----------
    results : object or list
        A list of results objects or a single result object.
        Each result object should have a method `plot_qvalues`.
    save_path : str, optional
        If provided, saves the plot to the specified path.
    dpi : int, optional
        The resolution of the plot. Default is 300.
    figsize : tuple, optional
        The size of the figure. Default is (15, 10).
    threshold : float, optional
        The q-value threshold for plotting. Default is 0.05.
    colors : list, optional
        A list of colors for the plots. If not provided, uses default colors.
    **kwargs : dict
        Additional plotting parameters.

    Returns
    -------
    None
        The function displays or saves the plot.

    Notes
    -----
    This function:
    1. Creates a figure with two subplots for PSMs and peptides
    2. Plots q-values for each result with different colors
    3. Adds legends and titles to each subplot
    4. Saves or displays the plot based on save_path
    """
    if not isinstance(results, list):
        results = [results]

    if colors is None:
        colors = [
            "#1f77b4",
            "#ff7f0e",
            "#2ca02c",
            "#d62728",
            "#9467bd",
            "#8c564b",
            "#e377c2",
            "#7f7f7f",
            "#bcbd22",
            "#17becf",
        ]

    fig, axs = plt.subplots(1, 2, figsize=figsize, dpi=dpi)

    for i, result in enumerate(results):
        for ax, level in zip(axs, ["psms", "peptides"]):
            result.plot_qvalues(
                level=level,
                c=colors[i % len(colors)],
                ax=ax,
                threshold=threshold,
                label=f"Result {i + 1}" if len(results) > 1 else "Results",
                linewidth=1,
                **kwargs,
            )
            ax.legend(frameon=False)
            ax.set_title(f"{level}")

    plt.tight_layout()
    return save_or_show_plot(save_path, logger)

visualize_feature_correlation(psms, save_path=None, **kwargs)

Visualize the correlation between features in a DataFrame using a scatter plot heatmap.

Parameters:

Name Type Description Default
psms PsmContainer

A PsmContainer object containing the features to visualize.

required
save_path str

The file path to save the plot. If not provided, the plot is displayed.

None
**kwargs dict

Additional plotting parameters such as figsize, dpi, height, etc.

{}
Source code in optimhc/visualization/plot_features.py
def visualize_feature_correlation(psms: PsmContainer, save_path=None, **kwargs):
    """
    Visualize the correlation between features in a DataFrame using a scatter plot heatmap.

    Parameters
    ----------
    psms : PsmContainer
        A PsmContainer object containing the features to visualize.
    save_path : str, optional
        The file path to save the plot. If not provided, the plot is displayed.
    **kwargs : dict
        Additional plotting parameters such as `figsize`, `dpi`, `height`, etc.
    """
    rescoring_features = [item for sublist in psms.rescoring_features.values() for item in sublist]
    n_features = len(rescoring_features)

    default_height = max(8, min(20, 8 + n_features * 0.15))
    height = kwargs.get("height", default_height)
    dpi = kwargs.get("dpi", 300)

    corr = psms.psms[rescoring_features].corr()
    corr_mat = corr.stack().reset_index(name="correlation")

    g = sns.relplot(
        data=corr_mat,
        x="level_0",
        y="level_1",
        hue="correlation",
        size="correlation",
        palette="vlag",
        hue_norm=(-1, 1),
        edgecolor=".7",
        height=height,
        sizes=(50, 250),
        size_norm=(-0.2, 0.8),
        **{k: v for k, v in kwargs.items() if k not in ["height", "dpi", "figsize"]},
    )

    g.set(xlabel="", ylabel="", aspect="equal")
    g.despine(left=True, bottom=True)
    g.ax.margins(0.02)

    for label in g.ax.get_xticklabels():
        label.set_rotation(90)

    if n_features > 30:
        fontsize = max(6, 10 - n_features * 0.05)
        g.ax.tick_params(labelsize=fontsize)

    g.figure.suptitle("Feature Correlation Matrix", y=1.01, fontsize=14)
    plt.tight_layout()
    g.figure.dpi = dpi

    save_or_show_plot(save_path, logger)

visualize_target_decoy_features(psms, num_cols=5, save_path=None, **kwargs)

Visualize the distribution of features in a DataFrame using kernel density estimation plots.

Parameters:

Name Type Description Default
psms PsmContainer

A PsmContainer object containing the features to visualize.

required
num_cols int

The number of columns in the plot grid. Default is 5.

5
save_path str

The file path to save the plot. If not provided, the plot is displayed.

None
**kwargs dict

Additional plotting parameters such as figsize and dpi, etc.

{}
Notes

This function: 1. Extracts rescoring features from the PsmContainer 2. Filters out features with only one unique value 3. Creates a grid of plots showing the distribution of each feature 4. Separates target and decoy PSMs in each plot 5. Uses kernel density estimation to show the distribution shape

Source code in optimhc/visualization/plot_tdc_distribution.py
def visualize_target_decoy_features(psms: PsmContainer, num_cols=5, save_path=None, **kwargs):
    """
    Visualize the distribution of features in a DataFrame using kernel density estimation plots.

    Parameters
    ----------
    psms : PsmContainer
        A PsmContainer object containing the features to visualize.
    num_cols : int, optional
        The number of columns in the plot grid. Default is 5.
    save_path : str, optional
        The file path to save the plot. If not provided, the plot is displayed.
    **kwargs : dict
        Additional plotting parameters such as `figsize` and `dpi`, etc.

    Notes
    -----
    This function:
    1. Extracts rescoring features from the PsmContainer
    2. Filters out features with only one unique value
    3. Creates a grid of plots showing the distribution of each feature
    4. Separates target and decoy PSMs in each plot
    5. Uses kernel density estimation to show the distribution shape
    """
    rescoring_features = [
        item
        for sublist in psms.rescoring_features.values()
        for item in sublist
        if item != psms.hit_rank_column
    ]

    # drop features that only have one value
    rescoring_features = [
        feature for feature in rescoring_features if len(psms.psms[feature].unique()) > 1
    ]

    num_features = len(rescoring_features)
    num_rows = (num_features + num_cols - 1) // num_cols

    figsize = kwargs.get("figsize", (15, num_rows * 15 / num_cols))
    dpi = kwargs.get("dpi", 300)

    fig, axes = plt.subplots(num_rows, num_cols, figsize=figsize, dpi=dpi)
    axes = axes.flatten()

    psms_top_hits = psms.psms[psms.psms[psms.hit_rank_column] == 1].copy()
    num_true_hits = len(psms_top_hits[psms_top_hits[psms.label_column]])
    num_decoys = len(psms_top_hits[~psms_top_hits[psms.label_column]])
    logger.debug(f"Number of true hits: {num_true_hits}")
    logger.debug(f"Number of decoys: {num_decoys}")
    psms_top_hits[psms.label_column] = psms_top_hits[psms.label_column].map(
        {True: "Target", False: "Decoy"}
    )

    for i, feature in enumerate(rescoring_features):
        try:
            ax = axes[i]
            sns.histplot(
                data=psms_top_hits,
                x=feature,
                hue=psms.label_column,
                ax=ax,
                bins="auto",
                common_bins=True,
                multiple="dodge",
                fill=True,
                alpha=0.3,
                stat="frequency",
                kde=True,
                linewidth=0,
            )
            ax.set_title(feature)
            ax.set_xlabel("")
            ax.set_ylabel("")

        except Exception as e:
            logger.error(f"Error plotting feature {feature}: {e}")
            ax.set_visible(False)

    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    save_or_show_plot(save_path, logger)

Feature Generation

feature_generation

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

generate_features(psms, config)

Generate features from different generators according to the configuration.

Parameters:

Name Type Description Default
psms PsmContainer

A container object holding PSMs and relevant data.

required
config dict

Configuration dictionary loaded from YAML or CLI.

required
Source code in optimhc/core/feature_generation.py
def generate_features(psms, config):
    """
    Generate features from different generators according to the configuration.

    Parameters
    ----------
    psms : PsmContainer
        A container object holding PSMs and relevant data.
    config : dict
        Configuration dictionary loaded from YAML or CLI.
    """
    feature_generators = config.get("featureGenerator", None)
    if not feature_generators:
        return

    for generator_config in feature_generators:
        if not isinstance(generator_config, dict):
            logger.warning("Feature generator config is not a dictionary, skipping...")
            continue

        name = generator_config.get("name")
        params = generator_config.get("params", {})

        logger.info(f"Generating features with {name}...")
        generator_cls = feature_generator_factory.get_generator(name)
        generator = generator_cls.from_config(psms, config, params)
        generator.apply(psms, source=name)
        gc.collect()

PSM Container

psm_container

logger = logging.getLogger(__name__) module-attribute

PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)

A container for managing peptide-spectrum matches (PSMs) in immunopeptidomics rescoring pipelines.

Parameters:

Name Type Description Default
psms DataFrame

DataFrame containing the PSM data.

required
label_column str

Column containing the label (True for target, False for decoy).

required
scan_column str

Column containing the scan number.

required
spectrum_column str

Column containing the spectrum identifier.

required
ms_data_file_column str

Column containing the MS data file that the PSM originated from.

required
peptide_column str

Column containing the peptide sequence.

required
protein_column str

Column containing the protein accessions.

required
rescoring_features dict of str to list of str

Dictionary of feature columns for rescoring.

required
hit_rank_column str

Column containing the hit rank.

None
charge_column str

Column containing the charge state.

None
retention_time_column str

Column containing the retention time.

None
calculated_mass_column str

Column containing the calculated mass.

None
metadata_column str

Column containing metadata.

None

Attributes:

Name Type Description
psms DataFrame

Copy of the DataFrame containing the PSM data.

target_psms DataFrame

DataFrame containing only target PSMs (label = True).

decoy_psms DataFrame

DataFrame containing only decoy PSMs (label = False).

peptides list of str

List containing all peptides from the PSM data.

columns list of str

List of column names in the PSM DataFrame.

rescoring_features dict of str to list of str

Dictionary of rescoring feature columns in the PSM DataFrame.

Source code in optimhc/psm_container.py
def __init__(
    self,
    psms: pd.DataFrame,
    label_column: str,
    scan_column: str,
    spectrum_column: str,
    ms_data_file_column: str,
    peptide_column: str,
    protein_column: str,
    rescoring_features: Dict[str, List[str]],
    hit_rank_column: Optional[str] = None,
    charge_column: Optional[str] = None,
    retention_time_column: Optional[str] = None,
    calculated_mass_column: Optional[str] = None,
    metadata_column: Optional[str] = None,
):
    self._psms = psms.copy()
    self._psms.reset_index(drop=True, inplace=True)
    self.label_column = label_column
    self.scan_column = scan_column
    self.spectrum_column = spectrum_column
    self.ms_data_file_column = ms_data_file_column
    self.peptide_column = peptide_column
    self.protein_column = protein_column
    self.hit_rank_column = hit_rank_column
    self.retention_time_column = retention_time_column
    self.metadata_column = metadata_column
    self.rescoring_features = rescoring_features
    self.charge_column = charge_column
    self.calculated_mass_column = calculated_mass_column
    # rescore result column
    self.rescore_result_column = None

    # check if the columns are in the dataframe
    def check_column(col):
        if col and col not in psms.columns:
            raise ValueError(f"Column '{col}' not found in PSM data.")

    check_column(label_column)
    check_column(scan_column)
    check_column(spectrum_column)
    check_column(ms_data_file_column)
    check_column(peptide_column)
    check_column(protein_column)
    check_column(hit_rank_column)
    check_column(retention_time_column)
    check_column(charge_column)
    check_column(calculated_mass_column)

    # ensure the label column is boolean
    if psms[label_column].dtype != "bool":
        raise ValueError(f"Column '{label_column}' must be boolean.")

    if psms[label_column].nunique() == 1 and psms[label_column].iloc[0]:
        raise ValueError("All PSMs are labeled as target. No decoy PSMs found.")
    elif psms[label_column].nunique() == 1 and not psms[label_column].iloc[0]:
        raise ValueError("All PSMs are labeled as decoy. No target PSMs found.")

    def check_metadata_column(col):
        # check the type is Dict[str, Dict[str, str]]
        if col and col not in psms.columns:
            raise ValueError(f"Column '{col}' not found in PSM data.")
        if not all(isinstance(x, dict) for x in self._psms[col]):
            raise ValueError(f"Column '{col}' must contain dictionaries.")

    if metadata_column:
        check_metadata_column(metadata_column)

    def check_rescoring_features(features: Dict[str, List[str]]):
        for key, cols in features.items():
            for col in cols:
                if col not in psms.columns:
                    raise ValueError(
                        f"Column '{col}' not found in PSM data for feature '{key}'."
                    )

    check_rescoring_features(rescoring_features)

    # check if the number of decoy psms is not 0
    if len(self.decoy_psms) == 0:
        logger.error("No decoy PSMs found. Please check the decoy prefix.")
        raise ValueError("No decoy PSMs found.")

    logger.info("PsmContainer initialized with %d PSM entries.", len(self._psms))
    if self.ms_data_file_column:
        logger.info(
            "PSMs originated from %d MS data file(s).",
            len(self._psms[ms_data_file_column].unique()),
        )
    logger.info("target psms: %d", len(self.target_psms))
    logger.info("decoy psms: %d", len(self.decoy_psms))
    logger.info("unique peptides: %d", len(np.unique(self.peptides)))
    logger.info("rescoring features: %s", rescoring_features)

psms property

Get a copy of the PSM DataFrame to prevent external modification.

Returns:

Type Description
DataFrame

A copy of the PSM DataFrame.

target_psms property

Get a DataFrame containing only target PSMs.

Returns:

Type Description
DataFrame

DataFrame with only target PSMs (label = True).

decoy_psms property

Get a DataFrame containing only decoy PSMs.

Returns:

Type Description
DataFrame

DataFrame with only decoy PSMs (label = False).

columns property

Get the column names of the PSM DataFrame.

Returns:

Type Description
list of str

List of column names.

feature_columns property

Get a list of all feature columns in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature column names.

feature_sources property

Get a list of all feature sources in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature source names.

peptides property

Get the peptide sequences from the PSM data.

Returns:

Type Description
list of str

List of peptide sequences.

ms_data_files property

Get the MS data files from the PSM data.

Returns:

Type Description
list of str

List of MS data file names.

scan_ids property

Get the scan numbers from the PSM data.

Returns:

Type Description
list of int

List of scan numbers.

charges property

Get the charge states from the PSM data.

Returns:

Type Description
list of int

List of charge states.

metadata property

Get the metadata from the PSM data.

Returns:

Type Description
Series

Series containing metadata for each PSM.

spectrum_ids property

Get the spectrum identifiers from the PSM data.

Returns:

Type Description
list of str

List of spectrum identifiers.

identifier_columns property

Get the columns that uniquely identify each PSM.

Returns:

Type Description
list of str

List of identifier column names.

__len__()

Get the number of PSMs in the container.

Returns:

Type Description
int

Number of PSMs.

Source code in optimhc/psm_container.py
def __len__(self) -> int:
    """
    Get the number of PSMs in the container.

    Returns
    -------
    int
        Number of PSMs.
    """
    return len(self._psms)

copy()

Return a deep copy of the PsmContainer object.

Returns:

Type Description
PsmContainer

A deep copy of the current PsmContainer.

Source code in optimhc/psm_container.py
def copy(self) -> "PsmContainer":
    """
    Return a deep copy of the PsmContainer object.

    Returns
    -------
    PsmContainer
        A deep copy of the current PsmContainer.
    """
    import copy

    return copy.deepcopy(self)

__repr__()

Return a string representation of the PsmContainer.

Returns:

Type Description
str

String summary of the PsmContainer.

Source code in optimhc/psm_container.py
def __repr__(self) -> str:
    """
    Return a string representation of the PsmContainer.

    Returns
    -------
    str
        String summary of the PsmContainer.
    """
    return (
        f"PsmContainer with {len(self)} PSMs\n"
        f"\t - Target PSMs: {len(self.target_psms)}\n"
        f"\t - Decoy PSMs: {len(self.decoy_psms)}\n"
        f"\t - Unique Peptides: {len(np.unique(self.peptides))}\n"
        f"\t - Unique Spectra: {len(self._psms[self.spectrum_column].unique())}\n"
        f"\t - Rescoring Features: {self.rescoring_features}\n"
    )

drop_features(features)

Drop specified features from the PSM DataFrame.

Parameters:

Name Type Description Default
features list of str

List of feature column names to drop.

required

Raises:

Type Description
ValueError

If any of the features do not exist in the DataFrame.

Source code in optimhc/psm_container.py
def drop_features(self, features: List[str]) -> None:
    """
    Drop specified features from the PSM DataFrame.

    Parameters
    ----------
    features : list of str
        List of feature column names to drop.

    Raises
    ------
    ValueError
        If any of the features do not exist in the DataFrame.
    """
    missing_features = [f for f in features if f not in self._psms.columns]
    if missing_features:
        raise ValueError(f"Features not found in PSM data: {missing_features}")

    self._psms.drop(columns=features, inplace=True)
    # Create a list of sources to update
    sources_to_update = []
    for source, cols in self.rescoring_features.items():
        self.rescoring_features[source] = [col for col in cols if col not in features]
        if not self.rescoring_features[source]:
            sources_to_update.append(source)

    logger.info(
        f"Sources to be removed: {sources_to_update}. Because all the features are removed."
    )
    # Remove sources with no features left
    for source in sources_to_update:
        del self.rescoring_features[source]

drop_source(source)

Drop all features associated with a specific source from the PSM DataFrame.

Parameters:

Name Type Description Default
source str

Name of the source to drop.

required

Raises:

Type Description
ValueError

If the source does not exist in the rescoring features.

Source code in optimhc/psm_container.py
def drop_source(self, source: str) -> None:
    """
    Drop all features associated with a specific source from the PSM DataFrame.

    Parameters
    ----------
    source : str
        Name of the source to drop.

    Raises
    ------
    ValueError
        If the source does not exist in the rescoring features.
    """
    if source not in self.rescoring_features:
        raise ValueError(f"Source '{source}' not found in rescoring features.")
    self.drop_features(self.rescoring_features[source])

add_metadata(metadata_df, psms_key, metadata_key, source)

Merge new metadata into the PSM DataFrame based on specified columns. Metadata from the specified source is stored as a nested dictionary inside the metadata column.

Parameters:

Name Type Description Default
metadata_df DataFrame

DataFrame containing new metadata to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
metadata_key str or list of str

Column name(s) in the metadata data to merge on.

required
source str

Name of the source of the new metadata.

required
Source code in optimhc/psm_container.py
def add_metadata(
    self,
    metadata_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    metadata_key: Union[str, List[str]],
    source,
) -> None:
    """
    Merge new metadata into the PSM DataFrame based on specified columns.
    Metadata from the specified source is stored as a nested dictionary inside the metadata column.

    Parameters
    ----------
    metadata_df : pd.DataFrame
        DataFrame containing new metadata to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    metadata_key : str or list of str
        Column name(s) in the metadata data to merge on.
    source : str
        Name of the source of the new metadata.
    """
    if self.metadata_column is None:
        logger.info("No existing metadata column. Creating new metadata column.")
        self.metadata_column = "metadata"
        self._psms["metadata"] = [{} for _ in range(len(self._psms))]

    metadata_cols = [col for col in metadata_df.columns if col not in metadata_key]
    merged_df = self.psms.merge(
        metadata_df, left_on=psms_key, right_on=metadata_key, how="left"
    )
    if source in self._psms["metadata"]:
        logger.warning(f"{source} already exists in metadata. Overwriting.")
    for col in metadata_cols:
        merged_df["metadata"] = merged_df.apply(
            lambda row: {
                **row["metadata"],
                source: (
                    {col: row[col]}
                    if source not in row["metadata"]
                    else {**row["metadata"][source], col: row[col]}
                ),
            },
            axis=1,
        )

    self._psms["metadata"] = merged_df["metadata"]

get_top_hits(n=1)

Get the top n hits based on the hit rank column. If the hit rank column is not specified, returns the original PSMs.

Parameters:

Name Type Description Default
n int

The number of top hits to return. Default is 1.

1

Returns:

Type Description
PsmContainer

A new PsmContainer object containing the top n hits.

Source code in optimhc/psm_container.py
def get_top_hits(self, n: int = 1):
    """
    Get the top n hits based on the hit rank column.
    If the hit rank column is not specified, returns the original PSMs.

    Parameters
    ----------
    n : int, optional
        The number of top hits to return. Default is 1.

    Returns
    -------
    PsmContainer
        A new PsmContainer object containing the top n hits.
    """
    if self.hit_rank_column is None:
        logger.warning("Rank column not specified. Return the original PSMs.")
        return self.copy()

    psms = self.copy()
    psms._psms = psms._psms[psms._psms[self.hit_rank_column] <= n]
    return psms

add_features(features_df, psms_key, feature_key, source, suffix=None)

Merge new features into the PSM DataFrame based on specified columns.

This method performs a left join between the PSM data and feature data, ensuring that all PSMs are preserved while adding new features. It handles column name conflicts through optional suffixing and maintains feature source tracking.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
feature_key str or list of str

Column name(s) in the features data to merge on.

required
source str

Name of the source of the new features (e.g., 'deeplc', 'netmhc').

required
suffix str

Suffix to add to the new columns if there's a name conflict. Required when new feature columns have the same names as existing columns. For example, if adding features from different sources (e.g., 'score' from DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.

None

Returns:

Type Description
None

Raises:

Type Description
ValueError

If duplicate columns exist without suffix. If merging features changes the number of PSMs.

Notes

The method follows these steps: 1. Validates input and prepares merge keys 2. Checks for column name conflicts 3. Manages feature source: if the source already exists, it will be overwritten 4. Performs left join merge 5. Verifies data integrity

Suffix Usage

The suffix parameter is used to handle column name conflicts: - When adding features from different sources that might have the same column names - When you want to keep both the original and new features with the same name - When you need to track the source of features in the column names

If suffix is not provided and there are duplicate column names: - The method will raise a ValueError - You must either provide a suffix or rename the columns before adding

Examples:

>>> container = PsmContainer(...)
>>> # Adding features without suffix (no conflicts)
>>> features_df1 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'feature1': [0.1, 0.2, 0.3],
...     'feature2': [0.4, 0.5, 0.6]
... })
>>> container.add_features(
...     features_df1,
...     psms_key='scan',
...     feature_key='scan',
...     source='source1'
... )
>>> # Adding features with suffix (handling conflicts)
>>> features_df2 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
...     'feature3': [0.7, 0.8, 0.9]
... })
>>> container.add_features(
...     features_df2,
...     psms_key='scan',
...     feature_key='scan',
...     source='source2',
...     suffix='_new'  # 'score' becomes 'score_new'
... )
Source code in optimhc/psm_container.py
def add_features(
    self,
    features_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    feature_key: Union[str, List[str]],
    source: str,
    suffix: Optional[str] = None,
) -> None:
    """Merge new features into the PSM DataFrame based on specified columns.

    This method performs a left join between the PSM data and feature data,
    ensuring that all PSMs are preserved while adding new features. It handles
    column name conflicts through optional suffixing and maintains feature source
    tracking.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    feature_key : str or list of str
        Column name(s) in the features data to merge on.
    source : str
        Name of the source of the new features (e.g., 'deeplc', 'netmhc').
    suffix : str, optional
        Suffix to add to the new columns if there's a name conflict.
        Required when new feature columns have the same names as existing columns.
        For example, if adding features from different sources (e.g., 'score' from
        DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If duplicate columns exist without suffix.
        If merging features changes the number of PSMs.

    Notes
    -----
    The method follows these steps:
    1. Validates input and prepares merge keys
    2. Checks for column name conflicts
    3. Manages feature source: if the source already exists, it will be overwritten
    4. Performs left join merge
    5. Verifies data integrity

    Suffix Usage
    -----------
    The suffix parameter is used to handle column name conflicts:
    - When adding features from different sources that might have the same column names
    - When you want to keep both the original and new features with the same name
    - When you need to track the source of features in the column names

    If suffix is not provided and there are duplicate column names:
    - The method will raise a ValueError
    - You must either provide a suffix or rename the columns before adding

    Examples
    --------
    >>> container = PsmContainer(...)
    >>> # Adding features without suffix (no conflicts)
    >>> features_df1 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'feature1': [0.1, 0.2, 0.3],
    ...     'feature2': [0.4, 0.5, 0.6]
    ... })
    >>> container.add_features(
    ...     features_df1,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source1'
    ... )
    >>> # Adding features with suffix (handling conflicts)
    >>> features_df2 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
    ...     'feature3': [0.7, 0.8, 0.9]
    ... })
    >>> container.add_features(
    ...     features_df2,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source2',
    ...     suffix='_new'  # 'score' becomes 'score_new'
    ... )
    """
    if isinstance(psms_key, str):
        psms_key = [psms_key]

    if isinstance(feature_key, str):
        feature_key = [feature_key]

    new_feature_cols = [col for col in features_df.columns if col not in feature_key]

    for cols in new_feature_cols:
        if cols in self._psms.columns:
            logger.warning(f"Column '{cols}' already exists in PSM data.")
            if suffix is None:
                logger.warning("No suffix provided. Using default suffix ")
                raise ValueError("Duplicate columns exist. No suffix provided.")
            else:
                logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")
    logger.info(f"Adding {len(new_feature_cols)} new features from {source}.")

    if not new_feature_cols:
        logger.warning("No new features to add. Check the feature key and PSMs key.")
        logger.warning(f"Feature key: {feature_key}; PSMs key: {psms_key}")

    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    # TODO: reluctant logic
    if suffix is None:
        suffixes = ("", "")
    else:
        suffixes = ("", suffix)

    self.rescoring_features[source] = [col + suffixes[1] for col in new_feature_cols]
    features_df = features_df.rename(
        columns={col: col + suffixes[1] for col in new_feature_cols}
    )
    original_len = len(self._psms)
    # avoid merge the right key to the psms
    self._psms = self._psms.merge(
        features_df, left_on=psms_key, right_on=feature_key, how="left"
    )

    if feature_key != psms_key:
        cols_to_drop = [
            col for col in feature_key if col not in psms_key and col in self._psms.columns
        ]
        if cols_to_drop:
            logger.debug(f"Dropping columns from feature_key not in psms_key: {cols_to_drop}")
            self._psms.drop(columns=cols_to_drop, inplace=True)

    if len(self._psms) != original_len:
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate keys."
        )

add_features_by_index(features_df, source, suffix=None)

Merge new features into the PSM DataFrame based on the DataFrame index.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
source str

Name of the source of the new features.

required
suffix str

Suffix to add to the new columns if there's a name conflict.

None
Source code in optimhc/psm_container.py
def add_features_by_index(
    self, features_df: pd.DataFrame, source: str, suffix: Optional[str] = None
) -> None:
    """
    Merge new features into the PSM DataFrame based on the DataFrame index.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add.
    source : str
        Name of the source of the new features.
    suffix : str, optional
        Suffix to add to the new columns if there's a name conflict.
    """
    new_feature_cols = [col for col in features_df.columns]
    for col in new_feature_cols:
        if col in self._psms.columns:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                logger.warning("No suffix provided. Using default suffix.")
                raise ValueError("Duplicate columns exist. No suffix provided.")
            else:
                logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")

    logger.info(f"Adding {len(new_feature_cols)} new features from {source} by index.")

    if not new_feature_cols:
        logger.warning("No new features to add.")
        raise ValueError("No new features to add.")

    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    if suffix is None:
        suffixes = ("", "")
    else:
        suffixes = ("", suffix)

    self.rescoring_features[source] = [col + suffixes[1] for col in new_feature_cols]
    features_df.rename(
        columns={col: col + suffixes[1] for col in new_feature_cols}, inplace=True
    )
    original_len = len(self._psms)
    self._psms = self._psms.merge(
        features_df,
        left_index=True,
        right_index=True,
        how="left",  # Perform a left join to preserve all original PSM data
    )

    # Ensure that the merge did not change the number of rows in the PSM DataFrame
    if len(self._psms) != original_len:
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate indices."
        )

add_results(results_df, psms_key, result_key)

Add results of rescore engine to the PSM DataFrame based on specified columns.

Parameters:

Name Type Description Default
results_df DataFrame

DataFrame containing new results to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
result_key str or list of str

Column name(s) in the results data to merge on.

required
Source code in optimhc/psm_container.py
def add_results(
    self,
    results_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    result_key: Union[str, List[str]],
) -> None:
    """
    Add results of rescore engine to the PSM DataFrame based on specified columns.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing new results to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    result_key : str or list of str
        Column name(s) in the results data to merge on.
    """
    if self.rescore_result_column is not None:
        logger.warning("Rescore result column already exists. Overwriting.")

    if set(self._psms.columns) & set(results_df.columns):
        raise ValueError(
            "Duplicate columns exist. Please rename the columns in the results data."
        )

    self.rescore_result_column = result_key
    self._psms = self._psms.merge(
        results_df,
        left_on=psms_key,
        right_on=result_key,
        how="left",
        validate="one_to_one",
    )
    self._psms.drop(columns=result_key, inplace=True)
    logger.info("Added rescore results to PSM data.")

write_pin(output_file, style='default', source=None)

Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats. The style parameter is actually used to output a unified pin format file to benchmark the performance of different rescoring methods.

Parameters:

Name Type Description Default
output_file str

Path to the output PIN file.

required
style str

If set to 'msbooster', outputs only the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins). If set to 'default', outputs all features specified in rescoring_features, plus required Percolator columns.

'default'
source list of str

List of feature sources to include. If None, includes all sources.

None

Returns:

Type Description
DataFrame

The DataFrame written to the PIN file.

Notes
  • The first three columns are always: SpecID, Label, ScanNr.
  • For 'msbooster' style, the columns are: SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins.
  • If hit_rank_column is not specified, rank is set to 1 for all rows.
  • Either 'hyperscore' or 'expect' must be present in features; for 'expect', the column is written as 'log10_evalue'.
  • The 'log10_evalue' column should contain the base-10 logarithm of the e-value.
  • The 'Peptide' column is formatted with underscores (e.g., _.PEPTIDE._).
  • For standard format, all features from rescoring_features are appended between ScanNr and Peptide columns.
  • The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
  • Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.

Example output (default style): SpecId Label ScanNr feature1 feature2 ... Peptide Proteins

Example output (msbooster style): SpecId Label ScanNr retentiontime rank hyperscore Peptide Proteins or SpecId Label ScanNr retentiontime rank log10_evalue Peptide Proteins

Raises:

Type Description
ValueError

If required columns are missing for the selected style.

Source code in optimhc/psm_container.py
def write_pin(
    self, output_file: str, style: str = "default", source: List[str] = None
) -> None:
    """
    Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats.
    The style parameter is actually used to output a unified pin format file to benchmark the performance of different rescoring methods.

    Parameters
    ----------
    output_file : str
        Path to the output PIN file.
    style : str, optional
        If set to 'msbooster', outputs only the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins).
        If set to 'default', outputs all features specified in `rescoring_features`, plus required Percolator columns.
    source : list of str, optional
        List of feature sources to include. If None, includes all sources.

    Returns
    -------
    pd.DataFrame
        The DataFrame written to the PIN file.

    Notes
    -----
    - The first three columns are always: SpecID, Label, ScanNr.
    - For 'msbooster' style, the columns are: SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins.
    - If `hit_rank_column` is not specified, rank is set to 1 for all rows.
    - Either 'hyperscore' or 'expect' must be present in features; for 'expect', the column is written as 'log10_evalue'.
    - The 'log10_evalue' column should contain the base-10 logarithm of the e-value.
    - The 'Peptide' column is formatted with underscores (e.g., `_.PEPTIDE._`).
    - For standard format, all features from `rescoring_features` are appended between ScanNr and Peptide columns.
    - The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
    - Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.

    Example output (default style):
        SpecId	Label	ScanNr	feature1	feature2	...	Peptide	Proteins

    Example output (msbooster style):
        SpecId	Label	ScanNr	retentiontime	rank	hyperscore	Peptide	Proteins
        or
        SpecId	Label	ScanNr	retentiontime	rank	log10_evalue	Peptide	Proteins

    Raises
    ------
    ValueError
        If required columns are missing for the selected style.
    """
    df = self._psms.copy()
    # Check if the label column is str
    # Case1: label column is str
    if df[self.label_column].dtype == "str":
        df["PercolatorLabel"] = df[self.label_column].map({"True": 1, "False": -1})
    # Case2: label column is bool
    elif df[self.label_column].dtype == "bool":
        df["PercolatorLabel"] = df[self.label_column].map({True: 1, False: -1})
    else:
        # try to convert to bool
        logger.warning("Label column is not str or bool. Converting to bool.")
        df["PercolatorLabel"] = df[self.label_column].astype(bool).map({True: 1, False: -1})
    logger.info("Writing PIN file to %s", output_file)
    logger.info("Using style: %s", style)

    feature_cols = []
    if source is None:
        for _, cols in self.rescoring_features.items():
            feature_cols.extend(cols)
    else:
        for s in source:
            if s not in self.rescoring_features:
                raise ValueError(f"Source '{s}' not found in rescoring features.")
            feature_cols.extend(self.rescoring_features[s])

    pin_df = pd.DataFrame()
    pin_df["SpecId"] = df[self.spectrum_column]
    pin_df["Label"] = df["PercolatorLabel"]
    pin_df["ScanNr"] = df[self.scan_column]

    if style == "msbooster":
        if self.retention_time_column:
            pin_df["retentiontime"] = df[self.retention_time_column]
        else:
            raise ValueError("Retention time column is required for msbooster style.")

        pin_df["rank"] = df[self.hit_rank_column].astype(int) if self.hit_rank_column else 1
        if "hyperscore" in self.feature_columns:
            pin_df["hyperscore"] = df["hyperscore"]
        elif "expect" in self.feature_columns:
            pin_df["log10_evalue"] = df["expect"]
        else:
            raise ValueError(
                "Either 'hyperscore' or 'expect' column is required for msbooster style."
            )

        # Add other features and jump the hyperscore or expect column
        for col in feature_cols:
            if col not in [
                "hyperscore",
                "expect",
                self.hit_rank_column,
                self.retention_time_column,
            ]:
                pin_df[col] = df[col]

        # PEPTIDE -> _.PEPTIDE._
        # Add _. at the front and ._ at the end of the peptide column
        pin_df["Peptide"] = df[self.peptide_column].apply(
            lambda x: f"_.{x}._" if isinstance(x, str) else x
        )

    elif style == "default":
        for col in feature_cols:
            pin_df[col] = df[col]
        pin_df["Peptide"] = df[self.peptide_column]
    else:
        raise ValueError(f"Unknown style: {style}. Use 'msbooster' or 'default'.")

    pin_df["Proteins"] = df[self.protein_column].apply(
        lambda x: ";".join(x) if isinstance(x, (list, tuple)) else x
    )
    pin_df = self._convert_float_to_int(pin_df)
    pin_df.to_csv(output_file, sep="\t", index=False)
    logger.info("PIN file written to %s", output_file)
    return pin_df

Utilities

utils

logger = getLogger(__name__) module-attribute

convert_pfm_to_pwm(pfm_filename, pseudocount=0.8, background_freqs=None)

Convert a Position Frequency Matrix (PFM) file to a Position Weight Matrix (PWM).

Parameters:

Name Type Description Default
pfm_filename str

The file path to the PFM file.

required
pseudocount float

The pseudocount to add to the PFM to avoid zero probabilities. Default is 0.8.

0.8
background_freqs dict

Dictionary containing the background frequencies for each amino acid. If None, uses 1/20 for all.

None

Returns:

Type Description
DataFrame

DataFrame representation of the PWM.

Notes

The conversion process involves: 1. Adding pseudocounts to the PFM 2. Converting to Position Probability Matrix (PPM) 3. Converting to PWM using log2(PPM/background_freqs)

Source code in optimhc/utils.py
def convert_pfm_to_pwm(pfm_filename, pseudocount=0.8, background_freqs=None):
    """
    Convert a Position Frequency Matrix (PFM) file to a Position Weight Matrix (PWM).

    Parameters
    ----------
    pfm_filename : str
        The file path to the PFM file.
    pseudocount : float, optional
        The pseudocount to add to the PFM to avoid zero probabilities. Default is 0.8.
    background_freqs : dict, optional
        Dictionary containing the background frequencies for each amino acid.
        If None, uses 1/20 for all.

    Returns
    -------
    pd.DataFrame
        DataFrame representation of the PWM.

    Notes
    -----
    The conversion process involves:
    1. Adding pseudocounts to the PFM
    2. Converting to Position Probability Matrix (PPM)
    3. Converting to PWM using log2(PPM/background_freqs)
    """
    # Default background frequencies if not provided
    if background_freqs is None:
        background_freqs = 1 / 20

    pfm = pd.read_csv(pfm_filename, sep="\t", header=None, index_col=0)
    pfm.drop(pfm.columns[-1], axis=1, inplace=True)  # Drop any extraneous columns
    pfm_pseudo = pfm + pseudocount
    ppm = pfm_pseudo.div(pfm_pseudo.sum(axis=0), axis=1)

    # Handle both scalar and dictionary background frequencies
    if isinstance(background_freqs, dict):
        pwm = np.log2(ppm.div(list(background_freqs.values())))
    else:
        pwm = np.log2(ppm / background_freqs)

    return pwm

strip_flanking_and_charge(peptide)

Remove the pre and next amino acids from a peptide sequence. Also when there is a charge state at the end of the peptide, remove it.

Parameters:

Name Type Description Default
peptide str

The peptide sequence with flanking amino acids. Example: '.AANDAGYFNDEMAPIEVKTK.' Example: 'F.VTVQGRAIC[119.0041]SDPNNKRVKN4.A' Example: '-.RRVEHHDHAVVSGR4.L'

required

Returns:

Type Description
str

The peptide sequence with flanking amino acids removed. Example: 'AANDAGYFNDEMAPIEVKTK' Example: 'VTVQGRAIC[119.0041]SDPNNKRVKN' Example: 'RRVEHHDHAVVSGR'

Notes

This function removes any amino acids before the first '.' and after the last '.' in the peptide sequence.

Source code in optimhc/utils.py
def strip_flanking_and_charge(peptide: str) -> str:
    """
    Remove the pre and next amino acids from a peptide sequence.
    Also when there is a charge state at the end of the peptide, remove it.

    Parameters
    ----------
    peptide : str
        The peptide sequence with flanking amino acids.
        Example: '.AANDAGYFNDEMAPIEVKTK.'
        Example: 'F.VTVQGRAIC[119.0041]SDPNNKRVKN4.A'
        Example: '-.RRVEHHDHAVVSGR4.L'

    Returns
    -------
    str
        The peptide sequence with flanking amino acids removed.
        Example: 'AANDAGYFNDEMAPIEVKTK'
        Example: 'VTVQGRAIC[119.0041]SDPNNKRVKN'
        Example: 'RRVEHHDHAVVSGR'

    Notes
    -----
    This function removes any amino acids before the first '.' and after the last '.'
    in the peptide sequence.
    """
    peptide = re.sub(r"^[^.]*\.|\.[^.]*$", "", peptide)

    # Some PIN may have charge state at the end of the peptide, e.g., R.RRVEHHDHAVVSGR4.L
    # We should remove it to get the correct peptide sequence
    # Fragpipe: F.VTVQGRAIC[119.0041]SDPNNKRVKN4.A, remove charge state '4' in the peptide

    if peptide and peptide[-1].isdigit():
        peptide = peptide[:-1]

    return peptide

remove_modifications(peptide, keep_modification=None)

Remove modifications from a peptide sequence, with an option to keep specific modifications.

Parameters:

Name Type Description Default
peptide str

The peptide sequence with modifications in brackets. Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'

required
keep_modification str or list

The modification(s) to keep. If provided, only these modifications will be preserved in the output sequence. Default is None.

None

Returns:

Type Description
str

The peptide sequence with modifications removed or kept. Example: 'AANDAGYFNDEMAPIEVKTK' (if keep_modification is None) Example: 'AANDAGYFNDEM[15.9949]APIEVKTK' (if keep_modification=['15.9949'])

Notes

Modifications are specified in square brackets after the amino acid. If keep_modification is provided, only those specific modifications will be preserved in the output sequence.

Source code in optimhc/utils.py
def remove_modifications(peptide: str, keep_modification=None) -> str:
    """
    Remove modifications from a peptide sequence, with an option to keep specific modifications.

    Parameters
    ----------
    peptide : str
        The peptide sequence with modifications in brackets.
        Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'
    keep_modification : str or list, optional
        The modification(s) to keep. If provided, only these modifications will be
        preserved in the output sequence. Default is None.

    Returns
    -------
    str
        The peptide sequence with modifications removed or kept.
        Example: 'AANDAGYFNDEMAPIEVKTK' (if keep_modification is None)
        Example: 'AANDAGYFNDEM[15.9949]APIEVKTK' (if keep_modification=['15.9949'])

    Notes
    -----
    Modifications are specified in square brackets after the amino acid.
    If keep_modification is provided, only those specific modifications will be
    preserved in the output sequence.
    """
    if keep_modification is None:
        return re.sub(r"\[.*?\]", "", peptide)
    else:
        if not isinstance(keep_modification, list):
            keep_modification = [keep_modification]

        def replace_mod(match):
            mod = match.group(0)
            if any(keep in mod for keep in keep_modification):
                return mod
            return ""

        return re.sub(r"\[.*?\]", replace_mod, peptide)

preprocess_peptide(peptide)

Preprocess the peptide sequence by removing flanking regions and modifications.

Parameters:

Name Type Description Default
peptide str

Original peptide sequence with possible flanking regions and modifications. Example: '.AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK.'

required

Returns:

Type Description
str

Cleaned peptide sequence without flanking regions and modifications. Example: 'AANDAGYFNDEMAPIEVKTK'

Notes

This function performs two operations in sequence: 1. Removes flanking amino acids using remove_pre_and_nxt_aa 2. Removes all modifications using remove_modifications

Source code in optimhc/utils.py
def preprocess_peptide(peptide: str) -> str:
    """
    Preprocess the peptide sequence by removing flanking regions and modifications.

    Parameters
    ----------
    peptide : str
        Original peptide sequence with possible flanking regions and modifications.
        Example: '.AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK.'

    Returns
    -------
    str
        Cleaned peptide sequence without flanking regions and modifications.
        Example: 'AANDAGYFNDEMAPIEVKTK'

    Notes
    -----
    This function performs two operations in sequence:
    1. Removes flanking amino acids using remove_pre_and_nxt_aa
    2. Removes all modifications using remove_modifications
    """
    peptide = strip_flanking_and_charge(peptide)
    peptide = remove_modifications(peptide)
    return peptide

list_all_files_in_directory(directory_path)

Retrieve all files in the specified directory and return a list of file paths.

Parameters:

Name Type Description Default
directory_path str

The path to the directory to search in. Example: '/path/to/directory'

required

Returns:

Type Description
list of str

List of absolute file paths found in the directory and its subdirectories. Example: ['/path/to/directory/file1.txt', '/path/to/directory/subdir/file2.txt']

Notes

This function recursively searches through all subdirectories and returns absolute paths for all files found.

Source code in optimhc/utils.py
def list_all_files_in_directory(directory_path: str) -> List[str]:
    """
    Retrieve all files in the specified directory and return a list of file paths.

    Parameters
    ----------
    directory_path : str
        The path to the directory to search in.
        Example: '/path/to/directory'

    Returns
    -------
    list of str
        List of absolute file paths found in the directory and its subdirectories.
        Example: ['/path/to/directory/file1.txt', '/path/to/directory/subdir/file2.txt']

    Notes
    -----
    This function recursively searches through all subdirectories and returns
    absolute paths for all files found.
    """
    path = Path(directory_path)
    file_list = [str(file) for file in path.rglob("*") if file.is_file()]
    return file_list

extract_unimod_from_peptidoform(peptide, mod_dict)

Convert a modified peptide sequence into DeepLC format.

Parameters:

Name Type Description Default
peptide str

The input peptide sequence with modifications in brackets. Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'

required
mod_dict dict

Dictionary mapping modification names (in peptide) to corresponding Unimod names. Example: {'15.9949': 'Oxidation', '42.0106': 'Acetyl'}

required

Returns:

Type Description
tuple

(seq, modifications): seq : str The unmodified peptide sequence. modifications : str String of modifications formatted as position|UnimodName, separated by pipes |.

Source code in optimhc/utils.py
def extract_unimod_from_peptidoform(peptide: str, mod_dict: dict) -> tuple:
    """
    Convert a modified peptide sequence into DeepLC format.

    Parameters
    ----------
    peptide : str
        The input peptide sequence with modifications in brackets.
        Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'
    mod_dict : dict
        Dictionary mapping modification names (in peptide) to corresponding Unimod names.
        Example: {'15.9949': 'Oxidation', '42.0106': 'Acetyl'}

    Returns
    -------
    tuple
        (seq, modifications):
            seq : str
                The unmodified peptide sequence.
            modifications : str
                String of modifications formatted as `position|UnimodName`, separated by pipes `|`.
    """
    clean_sequence = ""
    modifications = []
    current_position = 0
    i = 0
    while i < len(peptide):
        if peptide[i] == "[":
            end = peptide.find("]", i)
            if end == -1:
                raise ValueError(
                    f"Invalid modification format in {peptide}: missing closing bracket."
                )
            mod_name = peptide[i + 1 : end]
            if mod_name not in mod_dict:
                raise ValueError(f"Modification '{mod_name}' not found in the dictionary.")
            modifications.append(f"{current_position}|{mod_dict[mod_name]}")
            i = end + 1
        else:
            clean_sequence += peptide[i]
            current_position += 1
            i += 1

    modification_str = "|".join(modifications)
    logger.debug(f"Original peptide: {peptide}")
    logger.debug(f"Output clean_sequence: {clean_sequence}")
    logger.debug(f"Output modifications: {modification_str}")
    return clean_sequence, modification_str

convert_to_unimod_format(peptide, mod_dict)

Convert a modified peptide sequence into Unimod format.

Parameters:

Name Type Description Default
peptide str

The input peptide sequence with modifications in brackets. Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'

required
mod_dict dict

Dictionary mapping modification names (in peptide) to corresponding Unimod names. Example: {'15.9949': 'UNIMOD:4', '42.0106': 'UNIMOD:1'}

required

Returns:

Type Description
str

The peptide sequence formatted for Unimod. Example: 'AANDAGYFNDEM[UNIMOD:4]APIEVK[UNIMOD:1]TK'

Notes

This function replaces the modification names in brackets with their corresponding Unimod identifiers while preserving the peptide sequence structure.

Source code in optimhc/utils.py
def convert_to_unimod_format(peptide: str, mod_dict: dict) -> str:
    """
    Convert a modified peptide sequence into Unimod format.

    Parameters
    ----------
    peptide : str
        The input peptide sequence with modifications in brackets.
        Example: 'AANDAGYFNDEM[15.9949]APIEVK[42.0106]TK'
    mod_dict : dict
        Dictionary mapping modification names (in peptide) to corresponding Unimod names.
        Example: {'15.9949': 'UNIMOD:4', '42.0106': 'UNIMOD:1'}

    Returns
    -------
    str
        The peptide sequence formatted for Unimod.
        Example: 'AANDAGYFNDEM[UNIMOD:4]APIEVK[UNIMOD:1]TK'

    Notes
    -----
    This function replaces the modification names in brackets with their
    corresponding Unimod identifiers while preserving the peptide sequence
    structure.
    """
    res = peptide
    for key, value in mod_dict.items():
        res = res.replace(f"[{key}]", f"[{value}]")
    logger.debug(f"Original peptide: {peptide}")
    logger.debug(f"Output peptide: {res}")
    return res