Skip to content

Feature Generators

Base Class

base_feature_generator

PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)

A container for managing peptide-spectrum matches (PSMs) in immunopeptidomics rescoring pipelines.

Parameters:

Name Type Description Default
psms DataFrame

DataFrame containing the PSM data.

required
label_column str

Column containing the label (True for target, False for decoy).

required
scan_column str

Column containing the scan number.

required
spectrum_column str

Column containing the spectrum identifier.

required
ms_data_file_column str

Column containing the MS data file that the PSM originated from.

required
peptide_column str

Column containing the peptide sequence.

required
protein_column str

Column containing the protein accessions.

required
rescoring_features dict of str to list of str

Dictionary of feature columns for rescoring.

required
hit_rank_column str

Column containing the hit rank.

None
charge_column str

Column containing the charge state.

None
retention_time_column str

Column containing the retention time.

None
calculated_mass_column str

Column containing the calculated mass.

None
metadata_column str

Column containing metadata.

None

Attributes:

Name Type Description
psms DataFrame

Copy of the DataFrame containing the PSM data.

target_psms DataFrame

DataFrame containing only target PSMs (label = True).

decoy_psms DataFrame

DataFrame containing only decoy PSMs (label = False).

peptides list of str

List containing all peptides from the PSM data.

columns list of str

List of column names in the PSM DataFrame.

rescoring_features dict of str to list of str

Dictionary of rescoring feature columns in the PSM DataFrame.

Source code in optimhc/psm_container.py
def __init__(
    self,
    psms: pd.DataFrame,
    label_column: str,
    scan_column: str,
    spectrum_column: str,
    ms_data_file_column: str,
    peptide_column: str,
    protein_column: str,
    rescoring_features: Dict[str, List[str]],
    hit_rank_column: Optional[str] = None,
    charge_column: Optional[str] = None,
    retention_time_column: Optional[str] = None,
    calculated_mass_column: Optional[str] = None,
    metadata_column: Optional[str] = None,
):
    """
    Store a re-indexed copy of the PSM table and validate the column mapping.

    Raises
    ------
    ValueError
        If a named column or rescoring feature column is missing from `psms`,
        if the label column is not boolean, if every PSM carries the same
        label, if the metadata column holds non-dictionary values, or if no
        decoy PSMs are present.
    """
    # Keep a private copy with a fresh 0..n-1 index; the caller's frame is untouched.
    frame = psms.copy()
    frame.reset_index(drop=True, inplace=True)
    self._psms = frame

    # Column-name bookkeeping.
    self.label_column = label_column
    self.scan_column = scan_column
    self.spectrum_column = spectrum_column
    self.ms_data_file_column = ms_data_file_column
    self.peptide_column = peptide_column
    self.protein_column = protein_column
    self.hit_rank_column = hit_rank_column
    self.retention_time_column = retention_time_column
    self.metadata_column = metadata_column
    self.rescoring_features = rescoring_features
    self.charge_column = charge_column
    self.calculated_mass_column = calculated_mass_column
    # Filled in once rescoring results have been attached.
    self.rescore_result_column = None

    available = psms.columns

    # Every configured (truthy) column name has to exist in the input frame.
    for name in (
        label_column,
        scan_column,
        spectrum_column,
        ms_data_file_column,
        peptide_column,
        protein_column,
        hit_rank_column,
        retention_time_column,
        charge_column,
        calculated_mass_column,
    ):
        if name and name not in available:
            raise ValueError(f"Column '{name}' not found in PSM data.")

    # Labels must be booleans and must contain both classes.
    labels = psms[label_column]
    if labels.dtype != "bool":
        raise ValueError(f"Column '{label_column}' must be boolean.")

    if labels.nunique() == 1:
        if labels.iloc[0]:
            raise ValueError("All PSMs are labeled as target. No decoy PSMs found.")
        raise ValueError("All PSMs are labeled as decoy. No target PSMs found.")

    # Optional metadata column: must exist and hold one dict per PSM.
    if metadata_column:
        if metadata_column not in available:
            raise ValueError(f"Column '{metadata_column}' not found in PSM data.")
        if any(not isinstance(entry, dict) for entry in self._psms[metadata_column]):
            raise ValueError(f"Column '{metadata_column}' must contain dictionaries.")

    # Every feature column listed under every source has to exist.
    for feature_source, feature_cols in rescoring_features.items():
        for feature_col in feature_cols:
            if feature_col in available:
                continue
            raise ValueError(
                f"Column '{feature_col}' not found in PSM data for feature '{feature_source}'."
            )

    # Final guard: the decoy view itself must be non-empty.
    if len(self.decoy_psms) == 0:
        logger.error("No decoy PSMs found. Please check the decoy prefix.")
        raise ValueError("No decoy PSMs found.")

    logger.info("PsmContainer initialized with %d PSM entries.", len(self._psms))
    if self.ms_data_file_column:
        n_files = len(self._psms[ms_data_file_column].unique())
        logger.info("PSMs originated from %d MS data file(s).", n_files)
    logger.info("target psms: %d", len(self.target_psms))
    logger.info("decoy psms: %d", len(self.decoy_psms))
    logger.info("unique peptides: %d", len(np.unique(self.peptides)))
    logger.info("rescoring features: %s", rescoring_features)

psms property

Get a copy of the PSM DataFrame to prevent external modification.

Returns:

Type Description
DataFrame

A copy of the PSM DataFrame.

target_psms property

Get a DataFrame containing only target PSMs.

Returns:

Type Description
DataFrame

DataFrame with only target PSMs (label = True).

decoy_psms property

Get a DataFrame containing only decoy PSMs.

Returns:

Type Description
DataFrame

DataFrame with only decoy PSMs (label = False).

columns property

Get the column names of the PSM DataFrame.

Returns:

Type Description
list of str

List of column names.

feature_columns property

Get a list of all feature columns in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature column names.

feature_sources property

Get a list of all feature sources in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature source names.

peptides property

Get the peptide sequences from the PSM data.

Returns:

Type Description
list of str

List of peptide sequences.

ms_data_files property

Get the MS data files from the PSM data.

Returns:

Type Description
list of str

List of MS data file names.

scan_ids property

Get the scan numbers from the PSM data.

Returns:

Type Description
list of int

List of scan numbers.

charges property

Get the charge states from the PSM data.

Returns:

Type Description
list of int

List of charge states.

metadata property

Get the metadata from the PSM data.

Returns:

Type Description
Series

Series containing metadata for each PSM.

spectrum_ids property

Get the spectrum identifiers from the PSM data.

Returns:

Type Description
list of str

List of spectrum identifiers.

identifier_columns property

Get the columns that uniquely identify each PSM.

Returns:

Type Description
list of str

List of identifier column names.

__len__()

Get the number of PSMs in the container.

Returns:

Type Description
int

Number of PSMs.

Source code in optimhc/psm_container.py
def __len__(self) -> int:
    """
    Report how many PSM rows the container currently holds.

    Returns
    -------
    int
        Row count of the internal PSM DataFrame.
    """
    row_index = self._psms.index
    return len(row_index)

copy()

Return a deep copy of the PsmContainer object.

Returns:

Type Description
PsmContainer

A deep copy of the current PsmContainer.

Source code in optimhc/psm_container.py
def copy(self) -> "PsmContainer":
    """
    Create an independent duplicate of this container.

    Returns
    -------
    PsmContainer
        A deep copy; mutating it does not affect the original.
    """
    # Imported locally because this method's own name is `copy`.
    from copy import deepcopy

    return deepcopy(self)

__repr__()

Return a string representation of the PsmContainer.

Returns:

Type Description
str

String summary of the PsmContainer.

Source code in optimhc/psm_container.py
def __repr__(self) -> str:
    """
    Build a multi-line, human-readable summary of the container.

    Returns
    -------
    str
        Total, target and decoy PSM counts, the number of unique peptides and
        spectra, and the rescoring feature mapping, one item per line.
    """
    summary_parts = [
        f"PsmContainer with {len(self)} PSMs\n",
        f"\t - Target PSMs: {len(self.target_psms)}\n",
        f"\t - Decoy PSMs: {len(self.decoy_psms)}\n",
        f"\t - Unique Peptides: {len(np.unique(self.peptides))}\n",
        f"\t - Unique Spectra: {len(self._psms[self.spectrum_column].unique())}\n",
        f"\t - Rescoring Features: {self.rescoring_features}\n",
    ]
    return "".join(summary_parts)

drop_features(features)

Drop specified features from the PSM DataFrame.

Parameters:

Name Type Description Default
features list of str

List of feature column names to drop.

required

Raises:

Type Description
ValueError

If any of the features do not exist in the DataFrame.

Source code in optimhc/psm_container.py
def drop_features(self, features: List[str]) -> None:
    """
    Drop specified features from the PSM DataFrame.

    The columns are removed from the internal DataFrame and from every source
    listed in `rescoring_features`. Sources left without any feature column
    are removed from `rescoring_features` as well.

    Parameters
    ----------
    features : list of str
        List of feature column names to drop.

    Raises
    ------
    ValueError
        If any of the features do not exist in the DataFrame. Nothing is
        modified in that case.
    """
    missing_features = [f for f in features if f not in self._psms.columns]
    if missing_features:
        raise ValueError(f"Features not found in PSM data: {missing_features}")

    self._psms.drop(columns=features, inplace=True)

    # Snapshot as a set: cheap membership tests, and safe even when `features`
    # is one of the lists stored in `rescoring_features` (as drop_source passes).
    dropped = set(features)
    emptied_sources = []
    for source, cols in self.rescoring_features.items():
        kept = [col for col in cols if col not in dropped]
        # Re-binding an existing key does not resize the dict, so this is
        # safe while iterating over items().
        self.rescoring_features[source] = kept
        if not kept:
            emptied_sources.append(source)

    # Only report when a source really lost all of its features.
    if emptied_sources:
        logger.info(
            "Sources to be removed: %s. Because all the features are removed.",
            emptied_sources,
        )
    # Deletion happens after the loop; a dict cannot shrink during iteration.
    for source in emptied_sources:
        del self.rescoring_features[source]

drop_source(source)

Drop all features associated with a specific source from the PSM DataFrame.

Parameters:

Name Type Description Default
source str

Name of the source to drop.

required

Raises:

Type Description
ValueError

If the source does not exist in the rescoring features.

Source code in optimhc/psm_container.py
def drop_source(self, source: str) -> None:
    """
    Remove every feature column registered under one feature source.

    The actual removal is delegated to `drop_features`, called with the
    column list stored for `source`.

    Parameters
    ----------
    source : str
        Key in `rescoring_features` whose columns should be dropped.

    Raises
    ------
    ValueError
        If `source` is not a key of `rescoring_features`.
    """
    registered = self.rescoring_features
    if source in registered:
        self.drop_features(registered[source])
        return
    raise ValueError(f"Source '{source}' not found in rescoring features.")

add_metadata(metadata_df, psms_key, metadata_key, source)

Merge new metadata into the PSM DataFrame based on specified columns. Metadata from the specified source is stored as a nested dictionary inside the metadata column.

Parameters:

Name Type Description Default
metadata_df DataFrame

DataFrame containing new metadata to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
metadata_key str or list of str

Column name(s) in the metadata data to merge on.

required
source str

Name of the source of the new metadata.

required
Source code in optimhc/psm_container.py
def add_metadata(
    self,
    metadata_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    metadata_key: Union[str, List[str]],
    source,
) -> None:
    """
    Merge new metadata into the PSM DataFrame based on specified columns.
    Metadata from the specified source is stored as a nested dictionary inside the metadata column.

    Each PSM's metadata dictionary gains (or updates) the key `source`, whose
    value maps every non-key column of `metadata_df` to the value matched for
    that PSM. PSMs without a match get the missing-value marker produced by
    the left join. If `source` is already present for a PSM, the new values
    are merged into the existing dictionary, overriding equal keys.

    Parameters
    ----------
    metadata_df : pd.DataFrame
        DataFrame containing new metadata to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    metadata_key : str or list of str
        Column name(s) in the metadata data to merge on.
    source : str
        Name of the source of the new metadata.

    Raises
    ------
    ValueError
        If the join changes the number of PSMs (duplicate keys in
        `metadata_df`).
    """
    if self.metadata_column is None:
        logger.info("No existing metadata column. Creating new metadata column.")
        self.metadata_column = "metadata"
        self._psms["metadata"] = [{} for _ in range(len(self._psms))]
    # Honour a user-configured column name instead of assuming "metadata".
    column = self.metadata_column

    # Normalise keys to lists; with a bare string, `col not in key` would be a
    # substring test and could silently discard columns such as "id" for the
    # key "pep_id".
    psms_keys = [psms_key] if isinstance(psms_key, str) else list(psms_key)
    metadata_keys = [metadata_key] if isinstance(metadata_key, str) else list(metadata_key)
    metadata_cols = [col for col in metadata_df.columns if col not in metadata_keys]

    current = self._psms[column].tolist()
    # Inspect the stored dictionaries themselves (`in` on a Series would test
    # the index labels instead).
    if any(source in entry for entry in current):
        logger.warning(f"{source} already exists in metadata. Overwriting.")

    if not metadata_cols:
        # Nothing but key columns was supplied; there is nothing to store.
        return

    # Join only the key columns and give the value columns placeholder names,
    # so they can never collide with (and be suffixed against) PSM columns.
    placeholders = {
        col: f"__metadata_value_{position}__" for position, col in enumerate(metadata_cols)
    }
    lookup = metadata_df.rename(columns=placeholders)
    merged = self._psms[psms_keys].merge(
        lookup, left_on=psms_keys, right_on=metadata_keys, how="left"
    )
    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging metadata resulted in a change in the number of PSMs. "
            "Check for duplicate keys."
        )

    value_lists = [merged[placeholders[col]].tolist() for col in metadata_cols]
    updated = []
    for row_number, entry in enumerate(current):
        source_values = dict(entry.get(source, {}))
        for col, values in zip(metadata_cols, value_lists):
            source_values[col] = values[row_number]
        updated.append({**entry, source: source_values})

    # Assigning a plain list is positional, so it stays correct even when the
    # PSM index is not 0..n-1.
    self._psms[column] = updated

get_top_hits(n=1)

Get the top n hits based on the hit rank column. If the hit rank column is not specified, a copy of the container with all PSMs is returned.

Parameters:

Name Type Description Default
n int

The number of top hits to return. Default is 1.

1

Returns:

Type Description
PsmContainer

A new PsmContainer object containing the top n hits.

Source code in optimhc/psm_container.py
def get_top_hits(self, n: int = 1):
    """
    Build a new container restricted to PSMs whose hit rank is at most `n`.

    When no hit rank column is configured, a warning is logged and an
    unfiltered copy of this container is returned.

    Parameters
    ----------
    n : int, optional
        Largest hit rank to keep. Default is 1.

    Returns
    -------
    PsmContainer
        A copy of this container holding only the rows with rank <= n.
    """
    rank_column = self.hit_rank_column
    if rank_column is not None:
        # Filter the copy, never the container the method was called on.
        container = self.copy()
        within_top_n = container._psms[rank_column] <= n
        container._psms = container._psms[within_top_n]
        return container

    logger.warning("Rank column not specified. Return the original PSMs.")
    return self.copy()

add_features(features_df, psms_key, feature_key, source, suffix=None)

Merge new features into the PSM DataFrame based on specified columns.

This method performs a left join between the PSM data and feature data, ensuring that all PSMs are preserved while adding new features. It handles column name conflicts through optional suffixing and maintains feature source tracking.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
feature_key str or list of str

Column name(s) in the features data to merge on.

required
source str

Name of the source of the new features (e.g., 'deeplc', 'netmhc').

required
suffix str

Suffix to add to the new columns if there's a name conflict. Required when new feature columns have the same names as existing columns. For example, if adding features from different sources (e.g., 'score' from DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.

None

Returns:

Type Description
None

Raises:

Type Description
ValueError

If duplicate columns exist without suffix. If merging features changes the number of PSMs.

Notes

The method follows these steps: 1. Validates input and prepares merge keys 2. Checks for column name conflicts 3. Manages feature source: if the source already exists, it will be overwritten 4. Performs left join merge 5. Verifies data integrity

Suffix Usage

The suffix parameter is used to handle column name conflicts: - When adding features from different sources that might have the same column names - When you want to keep both the original and new features with the same name - When you need to track the source of features in the column names

If suffix is not provided and there are duplicate column names: - The method will raise a ValueError - You must either provide a suffix or rename the columns before adding

Examples:

>>> container = PsmContainer(...)
>>> # Adding features without suffix (no conflicts)
>>> features_df1 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'feature1': [0.1, 0.2, 0.3],
...     'feature2': [0.4, 0.5, 0.6]
... })
>>> container.add_features(
...     features_df1,
...     psms_key='scan',
...     feature_key='scan',
...     source='source1'
... )
>>> # Adding features with suffix (handling conflicts)
>>> features_df2 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
...     'feature3': [0.7, 0.8, 0.9]
... })
>>> container.add_features(
...     features_df2,
...     psms_key='scan',
...     feature_key='scan',
...     source='source2',
...     suffix='_new'  # 'score' becomes 'score_new'
... )
Source code in optimhc/psm_container.py
def add_features(
    self,
    features_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    feature_key: Union[str, List[str]],
    source: str,
    suffix: Optional[str] = None,
) -> None:
    """Merge new features into the PSM DataFrame based on specified columns.

    This method performs a left join between the PSM data and feature data,
    ensuring that all PSMs are preserved while adding new features. It handles
    column name conflicts through optional suffixing and maintains feature source
    tracking. The container is only modified once all checks have passed, and
    `features_df` itself is never modified.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    feature_key : str or list of str
        Column name(s) in the features data to merge on.
    source : str
        Name of the source of the new features (e.g., 'deeplc', 'netmhc').
    suffix : str, optional
        Suffix appended to every new feature column (not only the conflicting
        ones). Required when new feature columns have the same names as
        existing columns. For example, if adding features from different
        sources (e.g., 'score' from DeepLC and NetMHC), use suffixes like
        '_deeplc' or '_netmhc' to distinguish them.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If duplicate columns exist without suffix.
        If a suffixed column name collides with a column that stays in the
        PSM data.
        If merging features changes the number of PSMs.

    Notes
    -----
    The method follows these steps:
    1. Validates input and prepares merge keys
    2. Checks for column name conflicts (before and after applying the suffix)
    3. Performs the left join on a working copy and verifies the row count
    4. Manages feature source: if the source already exists, it will be overwritten
    5. Stores the merged data and registers the new feature columns

    Examples
    --------
    >>> container = PsmContainer(...)
    >>> # Adding features without suffix (no conflicts)
    >>> features_df1 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'feature1': [0.1, 0.2, 0.3],
    ...     'feature2': [0.4, 0.5, 0.6]
    ... })
    >>> container.add_features(
    ...     features_df1,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source1'
    ... )
    >>> # Adding features with suffix (handling conflicts)
    >>> features_df2 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
    ...     'feature3': [0.7, 0.8, 0.9]
    ... })
    >>> container.add_features(
    ...     features_df2,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source2',
    ...     suffix='_new'  # columns become 'score_new' and 'feature3_new'
    ... )
    """
    if isinstance(psms_key, str):
        psms_key = [psms_key]

    if isinstance(feature_key, str):
        feature_key = [feature_key]

    new_feature_cols = [col for col in features_df.columns if col not in feature_key]

    existing_cols = set(self._psms.columns)
    for col in new_feature_cols:
        if col in existing_cols:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")
    logger.info(f"Adding {len(new_feature_cols)} new features from {source}.")

    if not new_feature_cols:
        logger.warning("No new features to add. Check the feature key and PSMs key.")
        logger.warning(f"Feature key: {feature_key}; PSMs key: {psms_key}")

    applied_suffix = "" if suffix is None else suffix
    renamed_cols = [col + applied_suffix for col in new_feature_cols]

    # Columns of an already registered `source` are about to be replaced, so
    # they do not count as collisions for the final (suffixed) names.
    replacing = source in self.rescoring_features
    stale_cols = set(self.rescoring_features[source]) if replacing else set()
    surviving_cols = existing_cols - stale_cols
    clashes = [col for col in renamed_cols if col in surviving_cols]
    if clashes:
        # Without this check pandas would silently emit '_x'/'_y' columns and
        # `rescoring_features` would reference names that do not exist.
        raise ValueError(
            f"Duplicate columns exist after applying suffix {suffix!r}: {clashes}."
        )

    renamed_df = features_df.rename(columns=dict(zip(new_feature_cols, renamed_cols)))

    # Merge on a working frame so a failed check leaves the container untouched.
    base = self._psms
    if stale_cols:
        base = base.drop(columns=[col for col in base.columns if col in stale_cols])
    merged = base.merge(renamed_df, left_on=psms_key, right_on=feature_key, how="left")

    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate keys."
        )

    # Everything validated: commit.
    if replacing:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    # Avoid carrying the right-hand key columns over into the PSM data.
    if feature_key != psms_key:
        cols_to_drop = [
            col for col in feature_key if col not in psms_key and col in merged.columns
        ]
        if cols_to_drop:
            logger.debug(f"Dropping columns from feature_key not in psms_key: {cols_to_drop}")
            merged = merged.drop(columns=cols_to_drop)

    self._psms = merged
    self.rescoring_features[source] = renamed_cols

add_features_by_index(features_df, source, suffix=None)

Merge new features into the PSM DataFrame based on the DataFrame index.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
source str

Name of the source of the new features.

required
suffix str

Suffix to add to the new columns if there's a name conflict.

None
Source code in optimhc/psm_container.py
def add_features_by_index(
    self, features_df: pd.DataFrame, source: str, suffix: Optional[str] = None
) -> None:
    """
    Merge new features into the PSM DataFrame based on the DataFrame index.

    Every column of `features_df` is treated as a feature column. The join is
    a left join on the index, so all PSMs are kept. The container is only
    modified once all checks have passed, and `features_df` itself is never
    modified.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add.
    source : str
        Name of the source of the new features. If it is already registered,
        its previous feature columns are dropped and replaced.
    suffix : str, optional
        Suffix appended to every new column. Required when a new column has
        the same name as an existing PSM column.

    Raises
    ------
    ValueError
        If duplicate columns exist without suffix.
        If `features_df` has no columns.
        If a suffixed column name collides with a column that stays in the
        PSM data.
        If merging features changes the number of PSMs.
    """
    new_feature_cols = list(features_df.columns)
    existing_cols = set(self._psms.columns)
    for col in new_feature_cols:
        if col in existing_cols:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")

    logger.info(f"Adding {len(new_feature_cols)} new features from {source} by index.")

    if not new_feature_cols:
        logger.warning("No new features to add.")
        raise ValueError("No new features to add.")

    applied_suffix = "" if suffix is None else suffix
    renamed_cols = [col + applied_suffix for col in new_feature_cols]

    # Columns of an already registered `source` are about to be replaced, so
    # they do not count as collisions for the final (suffixed) names.
    replacing = source in self.rescoring_features
    stale_cols = set(self.rescoring_features[source]) if replacing else set()
    surviving_cols = existing_cols - stale_cols
    clashes = [col for col in renamed_cols if col in surviving_cols]
    if clashes:
        # Without this check pandas would silently emit '_x'/'_y' columns and
        # `rescoring_features` would reference names that do not exist.
        raise ValueError(
            f"Duplicate columns exist after applying suffix {suffix!r}: {clashes}."
        )

    # Rename on a copy: the caller's DataFrame must keep its column names.
    renamed_df = features_df.rename(columns=dict(zip(new_feature_cols, renamed_cols)))

    # Merge on a working frame so a failed check leaves the container untouched.
    base = self._psms
    if stale_cols:
        base = base.drop(columns=[col for col in base.columns if col in stale_cols])
    merged = base.merge(
        renamed_df,
        left_index=True,
        right_index=True,
        how="left",  # Perform a left join to preserve all original PSM data
    )

    # Ensure that the merge did not change the number of rows in the PSM DataFrame
    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate indices."
        )

    # Everything validated: commit.
    if replacing:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    self._psms = merged
    self.rescoring_features[source] = renamed_cols

add_results(results_df, psms_key, result_key)

Add results of rescore engine to the PSM DataFrame based on specified columns.

Parameters:

Name Type Description Default
results_df DataFrame

DataFrame containing new results to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
result_key str or list of str

Column name(s) in the results data to merge on.

required
Source code in optimhc/psm_container.py
def add_results(
    self,
    results_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    result_key: Union[str, List[str]],
) -> None:
    """
    Add results of rescore engine to the PSM DataFrame based on specified columns.

    The results are left-joined onto the PSM data with a one-to-one key
    check, and the key column(s) coming from `results_df` are dropped again
    after the join. The container is only modified once the join succeeded.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing new results to add. It must not share any column
        name with the PSM data, including the key column(s).
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    result_key : str or list of str
        Column name(s) in the results data to merge on.

    Raises
    ------
    ValueError
        If `results_df` shares a column name with the PSM data, or if the
        merge keys are not unique on both sides (pandas reports the latter as
        `MergeError`, a `ValueError` subclass).
    """
    if self.rescore_result_column is not None:
        logger.warning("Rescore result column already exists. Overwriting.")

    if set(self._psms.columns) & set(results_df.columns):
        raise ValueError(
            "Duplicate columns exist. Please rename the columns in the results data."
        )

    # Build the new table first; the key check may raise, and the container
    # must not be left half-updated in that case.
    merged = self._psms.merge(
        results_df,
        left_on=psms_key,
        right_on=result_key,
        how="left",
        validate="one_to_one",
    )
    merged = merged.drop(columns=result_key)

    self._psms = merged
    # NOTE(review): this records the join key of the results table, which is
    # dropped just above, rather than the name of a result column - confirm
    # against the code that reads `rescore_result_column`.
    self.rescore_result_column = result_key
    logger.info("Added rescore results to PSM data.")

write_pin(output_file, style='default', source=None)

Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats. The style parameter selects the output layout, so that a unified PIN file can be produced for benchmarking different rescoring methods.

Parameters:

Name Type Description Default
output_file str

Path to the output PIN file.

required
style str

If set to 'msbooster', outputs only the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins). If set to 'default', outputs all features specified in rescoring_features, plus required Percolator columns.

'default'
source list of str

List of feature sources to include. If None, includes all sources.

None

Returns:

Type Description
DataFrame

The DataFrame written to the PIN file.

Notes
  • The first three columns are always: SpecId, Label, ScanNr.
  • For 'msbooster' style, the columns are: SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins.
  • If hit_rank_column is not specified, rank is set to 1 for all rows.
  • Either 'hyperscore' or 'expect' must be present in features; for 'expect', the column is written as 'log10_evalue'.
  • The 'log10_evalue' column should contain the base-10 logarithm of the e-value.
  • The 'Peptide' column is formatted with underscores (e.g., _.PEPTIDE._).
  • For standard format, all features from rescoring_features are appended between ScanNr and Peptide columns.
  • The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
  • Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.

Example output (default style): SpecId Label ScanNr feature1 feature2 ... Peptide Proteins

Example output (msbooster style): SpecId Label ScanNr retentiontime rank hyperscore Peptide Proteins or SpecId Label ScanNr retentiontime rank log10_evalue Peptide Proteins

Raises:

Type Description
ValueError

If required columns are missing for the selected style.

Source code in optimhc/psm_container.py
def write_pin(
    self, output_file: str, style: str = "default", source: List[str] = None
) -> pd.DataFrame:
    """
    Write the PSM data to a tab-separated Percolator PIN file.

    Two layouts are supported: a generic Percolator layout and an
    MSBooster-compatible one.  The ``style`` switch exists so that a unified
    PIN file can be produced when benchmarking different rescoring methods.

    Parameters
    ----------
    output_file : str
        Path to the output PIN file.
    style : str, optional
        ``'default'`` writes SpecId, Label, ScanNr, every selected feature,
        Peptide and Proteins.  ``'msbooster'`` writes SpecId, Label, ScanNr,
        retentiontime, rank, then hyperscore or log10_evalue, then the
        remaining selected features, Peptide and Proteins.
    source : list of str, optional
        Names of the feature sources (keys of ``rescoring_features``) to
        include. If None, includes all sources.

    Returns
    -------
    pd.DataFrame
        The DataFrame written to the PIN file.

    Raises
    ------
    ValueError
        If ``source`` names an unknown feature source, if ``style`` is not
        'default' or 'msbooster', if the msbooster style has no retention
        time column or neither 'hyperscore' nor 'expect', or if a textual
        label is neither 'True' nor 'False' (case-insensitive).

    Notes
    -----
    - The first three columns are always: SpecId, Label, ScanNr.
    - Label is written as 1 (target) and -1 (decoy), as required by Percolator.
      Boolean labels and the strings 'True'/'False' are recognised; any other
      dtype is converted by truthiness after a warning.
    - If `hit_rank_column` is not specified, rank is set to 1 for all rows.
    - For the msbooster style, 'hyperscore' is preferred; otherwise the
      'expect' column is copied unchanged into 'log10_evalue', so it should
      already hold the base-10 logarithm of the e-value.
    - For the msbooster style, 'Peptide' is wrapped as `_.PEPTIDE._`.
    - 'Proteins' is joined with semicolons when stored as a list or tuple.

    Example output (default style):
        SpecId  Label  ScanNr  feature1  feature2  ...  Peptide  Proteins

    Example output (msbooster style):
        SpecId  Label  ScanNr  retentiontime  rank  hyperscore  ...  Peptide  Proteins
        or
        SpecId  Label  ScanNr  retentiontime  rank  log10_evalue  ...  Peptide  Proteins
    """
    df = self._psms.copy()
    labels = df[self.label_column]

    def _label_from_value(value):
        # Parse text by content: plain truthiness would turn the non-empty
        # string "False" into a target.
        if isinstance(value, str):
            text = value.strip().lower()
            if text == "true":
                return 1
            if text == "false":
                return -1
            raise ValueError(
                f"Unrecognized label value {value!r}; expected 'True' or 'False'."
            )
        return 1 if value else -1

    if pd.api.types.is_bool_dtype(labels.dtype):
        df["PercolatorLabel"] = labels.map({True: 1, False: -1})
    elif labels.dtype == object or pd.api.types.is_string_dtype(labels.dtype):
        # String columns are usually stored with object dtype, which a
        # comparison against the literal "str" does not match.
        df["PercolatorLabel"] = labels.map(_label_from_value)
    else:
        # Best effort for numeric and other dtypes: truthiness decides.
        logger.warning("Label column is not str or bool. Converting to bool.")
        df["PercolatorLabel"] = labels.astype(bool).map({True: 1, False: -1})
    logger.info("Writing PIN file to %s", output_file)
    logger.info("Using style: %s", style)

    if source is None:
        feature_cols = [
            col for cols in self.rescoring_features.values() for col in cols
        ]
    else:
        feature_cols = []
        for s in source:
            if s not in self.rescoring_features:
                raise ValueError(f"Source '{s}' not found in rescoring features.")
            feature_cols.extend(self.rescoring_features[s])

    pin_df = pd.DataFrame()
    pin_df["SpecId"] = df[self.spectrum_column]
    pin_df["Label"] = df["PercolatorLabel"]
    pin_df["ScanNr"] = df[self.scan_column]

    if style == "msbooster":
        if not self.retention_time_column:
            raise ValueError("Retention time column is required for msbooster style.")
        pin_df["retentiontime"] = df[self.retention_time_column]

        pin_df["rank"] = df[self.hit_rank_column].astype(int) if self.hit_rank_column else 1
        if "hyperscore" in self.feature_columns:
            pin_df["hyperscore"] = df["hyperscore"]
        elif "expect" in self.feature_columns:
            # Copied as-is; the values are expected to be log10 already.
            pin_df["log10_evalue"] = df["expect"]
        else:
            raise ValueError(
                "Either 'hyperscore' or 'expect' column is required for msbooster style."
            )

        # Remaining features follow; columns already written above are skipped.
        already_written = [
            "hyperscore",
            "expect",
            self.hit_rank_column,
            self.retention_time_column,
        ]
        for col in feature_cols:
            if col not in already_written:
                pin_df[col] = df[col]

        # PEPTIDE -> _.PEPTIDE._ (non-string entries are passed through).
        pin_df["Peptide"] = df[self.peptide_column].apply(
            lambda x: f"_.{x}._" if isinstance(x, str) else x
        )

    elif style == "default":
        for col in feature_cols:
            pin_df[col] = df[col]
        pin_df["Peptide"] = df[self.peptide_column]
    else:
        raise ValueError(f"Unknown style: {style}. Use 'msbooster' or 'default'.")

    pin_df["Proteins"] = df[self.protein_column].apply(
        lambda x: ";".join(x) if isinstance(x, (list, tuple)) else x
    )
    pin_df = self._convert_float_to_int(pin_df)
    pin_df.to_csv(output_file, sep="\t", index=False)
    logger.info("PIN file written to %s", output_file)
    return pin_df

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """Generate features and return them as a DataFrame.

    Abstract hook: this base version has no implementation and concrete
    generators are expected to override it.

    Returns
    -------
    pd.DataFrame
        The computed features.  The default ``apply()`` passes this frame to
        ``PsmContainer.add_features()`` with ``feature_key=self.id_column``,
        so it presumably needs to carry the ``id_column`` key(s) -- confirm
        against ``add_features``.
    """
    ...

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Construct a generator instance from pipeline configuration.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Returns
    -------
    BaseFeatureGenerator
        A configured generator instance (as declared by the annotation; this
        base version never returns).

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.  The message names the class
        the method was called on.
    """
    # Not marked abstract, so a subclass that forgets to override this only
    # fails here, at call time.
    raise NotImplementedError(f"{cls.__name__} must implement from_config()")

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Generate features and merge them into the PsmContainer.

    The default implementation merges by peptide column using
    ``add_features()``.  Override for different merge strategies
    (index-based, composite key) or additional post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).

    Returns
    -------
    None
        The result of ``add_features()`` is not propagated.
    """
    # Compute first, then merge.
    features = self.generate_features()
    # Container side joins on its peptide column; feature side joins on
    # whatever key(s) this generator declares through ``id_column``.
    psms.add_features(
        features,
        psms_key=psms.peptide_column,
        feature_key=self.id_column,
        source=source,
    )

Basic

basic

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """Generate features and return them as a DataFrame.

    Abstract hook: this base version has no implementation and concrete
    generators are expected to override it.

    Returns
    -------
    pd.DataFrame
        The computed features.  The default ``apply()`` passes this frame to
        ``PsmContainer.add_features()`` with ``feature_key=self.id_column``,
        so it presumably needs to carry the ``id_column`` key(s) -- confirm
        against ``add_features``.
    """
    ...

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Construct a generator instance from pipeline configuration.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Returns
    -------
    BaseFeatureGenerator
        A configured generator instance (as declared by the annotation; this
        base version never returns).

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.  The message names the class
        the method was called on.
    """
    # Not marked abstract, so a subclass that forgets to override this only
    # fails here, at call time.
    raise NotImplementedError(f"{cls.__name__} must implement from_config()")

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Generate features and merge them into the PsmContainer.

    The default implementation merges by peptide column using
    ``add_features()``.  Override for different merge strategies
    (index-based, composite key) or additional post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).

    Returns
    -------
    None
        The result of ``add_features()`` is not propagated.
    """
    # Compute first, then merge.
    features = self.generate_features()
    # Container side joins on its peptide column; feature side joins on
    # whatever key(s) this generator declares through ``id_column``.
    psms.add_features(
        features,
        psms_key=psms.peptide_column,
        feature_key=self.id_column,
        source=source,
    )

BasicFeatureGenerator(peptides, remove_pre_nxt_aa=True, remove_modification=True, *args, **kwargs)

Bases: BaseFeatureGenerator

Feature generator that generates basic features from peptide sequences.

This generator calculates features such as peptide length, proportion of unique amino acids, Shannon entropy of amino acid distribution, difference between peptide length and average peptide length, and count of unique amino acids.

Parameters:

Name Type Description Default
peptides List[str]

List of peptide sequences to generate features for.

required
remove_pre_nxt_aa bool

Whether to remove the amino acids adjacent to the peptide. If True, removes them. Default is True.

True
remove_modification bool

Whether to remove modifications in the peptide sequences. If True, removes them. Default is True.

True
Notes

The generated features include: - length_diff_from_avg: Difference between peptide length and average length - abs_length_diff_from_avg: Absolute difference between peptide length and average length - unique_aa_count: Number of unique amino acids in the peptide - unique_aa_proportion: Proportion of unique amino acids in the peptide - shannon_entropy: Shannon entropy of amino acid distribution

Source code in optimhc/feature/basic.py
def __init__(
    self,
    peptides: List[str],
    remove_pre_nxt_aa: bool = True,
    remove_modification: bool = True,
    *args,
    **kwargs,
):
    """Store the peptide list and the preprocessing options.

    No features are computed here; that happens in ``generate_features()``.

    Parameters
    ----------
    peptides : List[str]
        Peptide sequences to generate features for.
    remove_pre_nxt_aa : bool, optional
        Stored on the instance.  Presumably read by ``_preprocess_peptide``
        to strip the flanking amino acids -- that helper is not shown here.
    remove_modification : bool, optional
        Stored on the instance.  Presumably read by ``_preprocess_peptide``
        to strip modifications -- that helper is not shown here.
    *args, **kwargs
        Forwarded unchanged to the parent class initializer.
    """
    super().__init__(*args, **kwargs)
    self.peptides = peptides
    self.remove_pre_nxt_aa = remove_pre_nxt_aa
    self.remove_modification = remove_modification
    # Set by generate_features() to the mean length of the cleaned peptides.
    self.avg_length = None
    logger.info(f"Initialized BasicFeatureGenerator with {len(peptides)} peptides.")

feature_columns property

Return the list of generated feature column names.

id_column property

Return the list of input columns required for feature generation.

Returns:

Type Description
List[str]

List of input column names required for feature generation. Currently only requires 'Peptide' column.

generate_features()

Generate basic features for the provided peptides.

Returns:

Type Description
DataFrame

DataFrame containing peptides and their computed features: - length_diff_from_avg: Difference from average peptide length - abs_length_diff_from_avg: Absolute difference from average length - unique_aa_count: Number of unique amino acids - unique_aa_proportion: Proportion of unique amino acids - shannon_entropy: Shannon entropy of amino acid distribution

Raises:

Type Description
ValueError

If NaN values are found in the generated features.

Notes

All features are converted to float type before returning. The method calculates average peptide length across all peptides and uses it as a reference for length-based features.

Source code in optimhc/feature/basic.py
def generate_features(self) -> pd.DataFrame:
    """
    Compute the basic sequence-derived features for every stored peptide.

    Each peptide is cleaned with ``_preprocess_peptide`` and the derived
    columns below are computed from the cleaned sequence.  The mean cleaned
    length is stored on ``self.avg_length`` as a side effect.

    Returns
    -------
    pd.DataFrame
        The original 'Peptide' column followed by the columns named in
        ``feature_columns``, each cast to float.  Columns computed here:
        - length_diff_from_avg: Difference from average peptide length
        - abs_length_diff_from_avg: Absolute difference from average length
        - unique_aa_count: Number of unique amino acids
        - unique_aa_proportion: Proportion of unique amino acids
        - shannon_entropy: Shannon entropy of amino acid distribution

    Raises
    ------
    ValueError
        If NaN values are found in the generated features.
    """
    logger.info("Generating basic features.")
    table = pd.DataFrame(self.peptides, columns=["Peptide"])

    cleaned = table["Peptide"].apply(self._preprocess_peptide)
    lengths = cleaned.apply(len)
    # The mean over all peptides is the reference for the length features.
    self.avg_length = lengths.mean()
    offset = lengths - self.avg_length
    distinct = cleaned.apply(lambda seq: len(set(seq)))

    table = table.assign(
        clean_peptide=cleaned,
        peptide_length=lengths,
        length_diff_from_avg=offset,
        abs_length_diff_from_avg=offset.abs(),
        unique_aa_count=distinct,
        unique_aa_proportion=distinct / lengths,
        shannon_entropy=cleaned.apply(self._shannon_entropy),
    )

    # Explicit copy so the casts below act on an independent frame.
    features_df = table[["Peptide"] + self.feature_columns].copy()
    for name in self.feature_columns:
        features_df[name] = features_df[name].astype(float)

    if features_df.isna().to_numpy().any():
        logger.error("NaN values found in the generated features.")
        raise ValueError("NaN values found in the generated features.")

    logger.info(f"Generated basic features for {len(features_df)} peptides.")
    return features_df

Spectral Similarity

spectral_similarity

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """Generate features and return them as a DataFrame.

    Abstract hook: this base version has no implementation and concrete
    generators are expected to override it.

    Returns
    -------
    pd.DataFrame
        The computed features.  The default ``apply()`` passes this frame to
        ``PsmContainer.add_features()`` with ``feature_key=self.id_column``,
        so it presumably needs to carry the ``id_column`` key(s) -- confirm
        against ``add_features``.
    """
    ...

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Construct a generator instance from pipeline configuration.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Returns
    -------
    BaseFeatureGenerator
        A configured generator instance (as declared by the annotation; this
        base version never returns).

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.  The message names the class
        the method was called on.
    """
    # Not marked abstract, so a subclass that forgets to override this only
    # fails here, at call time.
    raise NotImplementedError(f"{cls.__name__} must implement from_config()")

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Generate features and merge them into the PsmContainer.

    The default implementation merges by peptide column using
    ``add_features()``.  Override for different merge strategies
    (index-based, composite key) or additional post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).

    Returns
    -------
    None
        The result of ``add_features()`` is not propagated.
    """
    # Compute first, then merge.
    features = self.generate_features()
    # Container side joins on its peptide column; feature side joins on
    # whatever key(s) this generator declares through ``id_column``.
    psms.add_features(
        features,
        psms_key=psms.peptide_column,
        feature_key=self.id_column,
        source=source,
    )

SpectralSimilarityFeatureGenerator(spectrum_ids, peptides, charges, scan_ids, mz_file_paths, model_type, collision_energies=None, instruments=None, fragmentation_types=None, remove_pre_nxt_aa=False, mod_dict=None, url='koina.wilhelmlab.org:443', ssl=True, top_n=36, tolerance_ppm=20)

Bases: BaseFeatureGenerator

Feature generator for calculating similarity between experimental and predicted spectra.

This class works through the following steps: 1. Extract experimental spectral data from mzML files 2. Use Koina for theoretical spectra prediction 3. Align experimental and predicted spectra 4. Calculate similarity metrics as features

Parameters:

Name Type Description Default
peptides list of str

List of peptide sequences.

required
charges list of int

List of charge states.

required
scan_ids list of int

List of scan IDs.

required
mz_file_paths list of str

List of mzML file paths.

required
model_type str

Prediction model type, either "HCD" or "CID".

required
collision_energies list of float

List of collision energies, required when model_type is "HCD".

None
remove_pre_nxt_aa bool

Whether to remove preceding and next amino acids, default is False.

False
remove_modification bool

Whether to remove modifications, default is True. Note: the constructor signature shown on this page does not accept this parameter.

required
url str

Koina server URL, default is "koina.wilhelmlab.org:443".

'koina.wilhelmlab.org:443'
top_n int

Number of top peaks to use for alignment, default is 36.

36
tolerance_ppm float

Mass tolerance for alignment in ppm, default is 20.

20
Source code in optimhc/feature/spectral_similarity.py
def __init__(
    self,
    spectrum_ids: List[str],
    peptides: List[str],
    charges: List[int],
    scan_ids: List[int],
    mz_file_paths: List[str],
    model_type: str,
    collision_energies: Optional[List[float]] = None,
    instruments: Optional[List[str]] = None,
    fragmentation_types: Optional[List[str]] = None,
    remove_pre_nxt_aa: bool = False,
    mod_dict: Optional[Dict[str, str]] = None,
    url: str = "koina.wilhelmlab.org:443",
    ssl: bool = True,
    top_n: int = 36,
    tolerance_ppm: float = 20,
):
    """Store the inputs and assemble the per-PSM input table.

    Every argument is kept on the instance under the same name.  The five
    per-PSM lists are combined into ``self.df`` and each peptide is passed
    through ``_preprocess_peptide``.  No spectra are read and no prediction
    is requested here.

    Parameters
    ----------
    spectrum_ids : List[str]
        Spectrum identifiers; become the ``spectrum_id`` column.
    peptides : List[str]
        Peptide sequences; become the ``peptide`` column.
    charges : List[int]
        Charge states; become the ``charge`` column.
    scan_ids : List[int]
        Scan IDs; become the ``scan`` column.
    mz_file_paths : List[str]
        Spectrum file paths; become the ``mz_file_path`` column.
    model_type : str
        Prediction model identifier (logged at construction).
    collision_energies, instruments, fragmentation_types : list, optional
        Stored as given; presumably per-PSM inputs for the prediction
        model -- not used in this constructor.
    remove_pre_nxt_aa : bool, optional
        Stored as given; presumably read by ``_preprocess_peptide``.
    mod_dict : dict of str to str, optional
        Stored as given; presumably a modification mapping -- TODO confirm.
    url : str, optional
        Prediction server address.
    ssl : bool, optional
        Stored as given; presumably toggles TLS for the server connection.
    top_n : int, optional
        Stored as given (default 36).
    tolerance_ppm : float, optional
        Stored as given (default 20).
    """
    self.spectrum_ids = spectrum_ids
    self.peptides = peptides
    self.charges = charges
    self.scan_ids = scan_ids
    self.mz_file_paths = mz_file_paths
    self.model_type = model_type
    self.collision_energies = collision_energies
    self.instruments = instruments
    self.fragmentation_types = fragmentation_types
    self.remove_pre_nxt_aa = remove_pre_nxt_aa
    self.mod_dict = mod_dict
    self.url = url
    self.ssl = ssl
    self.top_n = top_n
    self.tolerance_ppm = tolerance_ppm
    # Cache filled by generate_features(); None means "not computed yet".
    self.results = None
    self._raw_predictions = None

    logger.info(f"Initializing SpectralSimilarityFeatureGenerator with {len(peptides)} PSMs")
    logger.info(f"Using model: {self.model_type}")

    # One row per PSM; the five input lists must therefore be equally long.
    self.df = pd.DataFrame(
        {
            "spectrum_id": self.spectrum_ids,
            "scan": self.scan_ids,
            "peptide": self.peptides,
            "charge": self.charges,
            "mz_file_path": self.mz_file_paths,
        }
    )

    self.df["processed_peptide"] = self.df["peptide"].apply(self._preprocess_peptide)
    logger.info(f"Received {len(self.df)} PSMs for spectral similarity feature generation")

id_column property

Returns a list of input columns required for the feature generator.

feature_columns property

Returns a list of feature columns generated by the feature generator.

raw_predictions property

Returns the raw prediction results from Koina.

Returns:

Type Description
DataFrame

Raw prediction results DataFrame.

input_df()

Return the input DataFrame assembled by the constructor (one row per PSM).

Returns:

Type Description
DataFrame

DataFrame containing the per-PSM inputs (spectrum_id, scan, peptide, charge, mz_file_path, processed_peptide).

Source code in optimhc/feature/spectral_similarity.py
def input_df(self) -> pd.DataFrame:
    """
    Return the input table stored on ``self.df``.

    Returns
    -------
    pd.DataFrame
        The same object as ``self.df`` (not a copy), so changes made by the
        caller are visible to the generator.
    """
    return self.df

get_raw_predictions()

Get the raw prediction results DataFrame from Koina.

Returns:

Type Description
DataFrame

Raw prediction results DataFrame.

Source code in optimhc/feature/spectral_similarity.py
def get_raw_predictions(self) -> pd.DataFrame:
    """
    Get the raw prediction results DataFrame from Koina.

    Method-style accessor that returns whatever the ``raw_predictions``
    attribute currently holds.

    Returns
    -------
    pd.DataFrame
        Raw prediction results DataFrame.  ``save_raw_predictions`` treats
        ``None`` as "nothing available", so this may presumably be ``None``
        before any prediction has been made.
    """
    return self.raw_predictions

save_raw_predictions(file_path, **kwargs)

Save the raw prediction results to a file.

Parameters:

Name Type Description Default
file_path str

Path to save the file.

required
**kwargs

Other parameters passed to pandas.DataFrame.to_csv.

{}
Source code in optimhc/feature/spectral_similarity.py
def save_raw_predictions(self, file_path: str, **kwargs) -> None:
    """
    Write the raw prediction results to ``file_path`` as delimited text.

    If no raw predictions are available, a warning is logged and no file
    is written.

    Parameters
    ----------
    file_path : str
        Destination path.
    **kwargs
        Passed through to ``pandas.DataFrame.to_csv``.  ``index`` defaults
        to ``False`` unless the caller supplies it.
    """
    # Leave the DataFrame index out of the file unless explicitly requested.
    kwargs.setdefault("index", False)

    if self.raw_predictions is None:
        logger.warning("No raw prediction results available to save.")
        return

    self.raw_predictions.to_csv(file_path, **kwargs)
    logger.info(f"Raw prediction results saved to: {file_path}")

generate_features()

Public interface for generating spectral similarity features.

Returns:

Type Description
DataFrame

DataFrame containing the generated features.

Notes

This method is a wrapper around _generate_features that ensures the results are cached and only computed once.

Source code in optimhc/feature/spectral_similarity.py
def generate_features(self) -> pd.DataFrame:
    """
    Return the spectral similarity features, computing them on first use.

    The full result of ``_generate_features`` is cached on ``self.results``;
    later calls reuse the cache instead of recomputing.

    Returns
    -------
    pd.DataFrame
        The ``id_column`` key(s) followed by the ``feature_columns``,
        selected from the cached full result.
    """
    full_table = self.results
    if full_table is None:
        # First call: run the pipeline once and remember the full table.
        full_table = self._generate_features()
        self.results = full_table

    wanted_columns = self.id_column + self.feature_columns
    return full_table[wanted_columns]

get_full_data()

Return the full DataFrame with all columns.

Returns:

Type Description
DataFrame

Full DataFrame with all columns.

Notes

This method returns the complete DataFrame including all intermediate results and raw data used in feature generation.

Source code in optimhc/feature/spectral_similarity.py
def get_full_data(self) -> Optional[pd.DataFrame]:
    """
    Return the full cached result table with all columns.

    Returns
    -------
    pd.DataFrame or None
        Whatever is stored on ``self.results``: the complete table cached by
        ``generate_features()``, or ``None`` if features have not been
        generated yet.  Calling this method never triggers the computation.

    Notes
    -----
    Unlike ``generate_features()``, no column selection is applied, so
    intermediate columns are included.
    """
    return self.results

align_peaks(exp_mz_sorted, exp_intensity_sorted, pred_mz_sorted, tolerance_ppm)

Align sorted experimental peaks to sorted predicted peaks using ppm tolerance.

For each predicted peak, find the experimental peak within the tolerance window that has the highest intensity.

Returns:

Name Type Description
aligned_exp_intensity float64 array of length n_pred
matched_exp_indices int64 array of length n_pred (-1 = no match)
Source code in optimhc/feature/numba_utils.py
@nb.njit(cache=True)
def align_peaks(
    exp_mz_sorted: np.ndarray,
    exp_intensity_sorted: np.ndarray,
    pred_mz_sorted: np.ndarray,
    tolerance_ppm: float,
):
    """
    Align sorted experimental peaks to sorted predicted peaks using ppm tolerance.

    For each predicted peak, find the experimental peak within the tolerance window
    that has the highest intensity.

    Parameters
    ----------
    exp_mz_sorted : np.ndarray
        Experimental m/z values.  The forward-only scan below assumes they
        are in ascending order.
    exp_intensity_sorted : np.ndarray
        Experimental intensities, indexed in step with ``exp_mz_sorted``.
    pred_mz_sorted : np.ndarray
        Predicted m/z values, likewise assumed ascending.
    tolerance_ppm : float
        Half-width of the matching window in parts per million of the
        predicted m/z.

    Returns
    -------
    aligned_exp_intensity : float64 array of length n_pred
        Intensity of the matched experimental peak, 0.0 where nothing matched.
    matched_exp_indices   : int64 array of length n_pred (-1 = no match)
        Position of the matched peak in the experimental arrays.
    """
    n_pred = len(pred_mz_sorted)
    n_exp = len(exp_mz_sorted)

    aligned_exp_intensity = np.zeros(n_pred, dtype=np.float64)
    matched_exp_indices = np.full(n_pred, -1, dtype=np.int64)

    # Carried across predicted peaks: experimental peaks below the current
    # window are never looked at again.
    start_pos = 0

    for i in range(n_pred):
        pred_peak_mz = pred_mz_sorted[i]
        fragment_min = pred_peak_mz * (1.0 - tolerance_ppm / 1e6)
        fragment_max = pred_peak_mz * (1.0 + tolerance_ppm / 1e6)

        # A match requires an intensity strictly above 0.0, so zero-intensity
        # peaks inside the window leave the index at -1.
        matched_int = 0.0
        # Offset inside the current window; reset per predicted peak so that
        # overlapping windows can re-examine shared experimental peaks.
        past_start = 0

        while start_pos + past_start < n_exp:
            exp_peak_mz = exp_mz_sorted[start_pos + past_start]
            if exp_peak_mz < fragment_min:
                # Below the window: drop it permanently.
                start_pos += 1
            elif exp_peak_mz <= fragment_max:
                exp_peak_int = exp_intensity_sorted[start_pos + past_start]
                # Strict '>' keeps the first peak when intensities tie.
                if exp_peak_int > matched_int:
                    matched_int = exp_peak_int
                    matched_exp_indices[i] = start_pos + past_start
                past_start += 1
            else:
                # Above the window: nothing further can match this peak.
                break

        aligned_exp_intensity[i] = matched_int

    return aligned_exp_intensity, matched_exp_indices

compute_similarity_features(exp_vector, pred_vector)

Compute all similarity metrics between two aligned intensity vectors.

Returns:

Type Description
tuple of 8 float64 values:

(spectral_angle_similarity, cosine_similarity, pearson_correlation, spearman_correlation, mean_squared_error, unweighted_entropy_similarity, predicted_seen_nonzero, predicted_not_seen)

Source code in optimhc/feature/numba_utils.py
@nb.njit(cache=True)
def compute_similarity_features(exp_vector, pred_vector):
    """
    Compute all similarity metrics between two aligned intensity vectors.

    Parameters
    ----------
    exp_vector : array-like of float
        Experimental intensities.
    pred_vector : array-like of float
        Predicted intensities; indexed in step with ``exp_vector`` over
        ``len(exp_vector)`` positions.

    Returns
    -------
    tuple of 8 float64 values:
        (spectral_angle_similarity, cosine_similarity,
         pearson_correlation, spearman_correlation,
         mean_squared_error, unweighted_entropy_similarity,
         predicted_seen_nonzero, predicted_not_seen)

    Notes
    -----
    Degenerate inputs (zero norm, zero sum, zero variance, empty vectors) do
    not raise; the affected metric falls back to a fixed value, as noted
    inline.
    """
    n = len(exp_vector)

    # Single pass for the sums shared by several metrics.
    exp_sq = 0.0
    pred_sq = 0.0
    dot = 0.0
    exp_sum = 0.0
    pred_sum = 0.0
    for i in range(n):
        e = exp_vector[i]
        p = pred_vector[i]
        exp_sq += e * e
        pred_sq += p * p
        dot += e * p
        exp_sum += e
        pred_sum += p

    exp_l2 = np.sqrt(exp_sq)
    pred_l2 = np.sqrt(pred_sq)

    # Spectral angle similarity: 1 - 2*angle/pi; 0.0 if either norm is zero.
    if exp_l2 > 0.0 and pred_l2 > 0.0:
        cos_val = dot / (exp_l2 * pred_l2)
        # Clamp so rounding error cannot push arccos outside its domain.
        cos_val = max(-1.0, min(1.0, cos_val))
        spectral_angle = 1.0 - 2.0 * np.arccos(cos_val) / np.pi
    else:
        spectral_angle = 0.0

    # Cosine similarity (not clamped); 0.0 if either vector sums to zero or
    # has zero norm.
    if exp_sum == 0.0 or pred_sum == 0.0:
        cosine_sim = 0.0
    elif exp_l2 > 0.0 and pred_l2 > 0.0:
        cosine_sim = dot / (exp_l2 * pred_l2)
    else:
        cosine_sim = 0.0

    # MSE on L2-normalized vectors; a zero-norm vector is treated as all zeros.
    mse = 0.0
    for i in range(n):
        e = exp_vector[i] / exp_l2 if exp_l2 > 0.0 else 0.0
        p = pred_vector[i] / pred_l2 if pred_l2 > 0.0 else 0.0
        diff = e - p
        mse += diff * diff
    mse /= n if n > 0 else 1.0

    # Pearson correlation; 0.0 when either vector is constant.
    exp_mean = exp_sum / n if n > 0 else 0.0
    pred_mean = pred_sum / n if n > 0 else 0.0
    num_p = 0.0
    exp_var = 0.0
    pred_var = 0.0
    for i in range(n):
        de = exp_vector[i] - exp_mean
        dp = pred_vector[i] - pred_mean
        num_p += de * dp
        exp_var += de * de
        pred_var += dp * dp
    if exp_var > 0.0 and pred_var > 0.0:
        pearson = num_p / np.sqrt(exp_var * pred_var)
    else:
        pearson = 0.0

    # Spearman correlation: the Pearson formula applied to the ranks from
    # _rankdata_average (presumably average ranks for ties -- helper not
    # shown here).  Skipped (0.0) when either raw vector is constant.
    if exp_var > 0.0 and pred_var > 0.0:
        exp_ranks = _rankdata_average(exp_vector)
        pred_ranks = _rankdata_average(pred_vector)
        rmean_e = 0.0
        rmean_p = 0.0
        for i in range(n):
            rmean_e += exp_ranks[i]
            rmean_p += pred_ranks[i]
        rmean_e /= n
        rmean_p /= n
        snum = 0.0
        svar_e = 0.0
        svar_p = 0.0
        for i in range(n):
            de = exp_ranks[i] - rmean_e
            dp = pred_ranks[i] - rmean_p
            snum += de * dp
            svar_e += de * de
            svar_p += dp * dp
        if svar_e > 0.0 and svar_p > 0.0:
            spearman = snum / np.sqrt(svar_e * svar_p)
        else:
            spearman = 0.0
    else:
        spearman = 0.0

    # Unweighted entropy similarity on sum-normalized vectors:
    # 1 - (2*S_mixed - S_exp - S_pred) / ln(4), where S is Shannon entropy
    # and "mixed" is the element-wise average of the two distributions.
    # NOTE(review): with non-negative intensities this evaluates to 0.5 when
    # exactly one vector sums to zero and to 1.0 when both do (including
    # n == 0), whereas the other metrics return 0.0 there -- confirm intended.
    s_exp = 0.0
    s_pred = 0.0
    s_mixed = 0.0
    for i in range(n):
        ep = exp_vector[i] / exp_sum if exp_sum > 0.0 else 0.0
        pp = pred_vector[i] / pred_sum if pred_sum > 0.0 else 0.0
        if ep > 0.0:
            s_exp -= ep * np.log(ep)
        if pp > 0.0:
            s_pred -= pp * np.log(pp)
        mp = 0.5 * (ep + pp)
        if mp > 0.0:
            s_mixed -= mp * np.log(mp)
    entropy_sim = 1.0 - (2.0 * s_mixed - s_exp - s_pred) / np.log(4.0)

    # Counts (kept as floats) over positions with a positive prediction:
    # how many have a positive experimental intensity and how many do not.
    seen = 0.0
    not_seen = 0.0
    for i in range(n):
        if pred_vector[i] > 0.0:
            if exp_vector[i] > 0.0:
                seen += 1.0
            else:
                not_seen += 1.0

    return (
        spectral_angle,
        cosine_sim,
        pearson,
        spearman,
        mse,
        entropy_sim,
        seen,
        not_seen,
    )

extract_mzml_data(mzml_filename, scan_ids=None)

Extract scan data from an mzML file.

Parameters:

Name Type Description Default
mzml_filename str

The path to the mzML file.

required
scan_ids list[int] or None

A list of scan IDs to extract. If None, extracts all scans.

None

Returns:

Type Description
DataFrame

A DataFrame containing the extracted scan data with columns: - source: The source file name - scan: The scan ID - mz: The m/z values array - intensity: The intensity values array - charge: The charge state - retention_time: The retention time

Notes

This function: 1. Reads the mzML file using pyteomics 2. Extracts scan data including retention time, charge state, m/z values, and intensities 3. Filters scans based on provided scan IDs if specified 4. Returns a DataFrame with the extracted data

Source code in optimhc/parser/mzml.py
def extract_mzml_data(mzml_filename, scan_ids=None):
    """
    Extract scan data from an mzML file.

    Parameters
    ----------
    mzml_filename : str
        The path to the mzML file.
    scan_ids : list[int] or None, optional
        A list of scan IDs to extract. If None, extracts all scans.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the extracted scan data with columns:
        - source: The source file name
        - scan: The scan ID
        - mz: The m/z values array
        - intensity: The intensity values array
        - charge: The charge state
        - retention_time: The retention time

    Raises
    ------
    RuntimeError
        If reading the mzML file fails. The underlying exception is chained
        as the cause.

    Notes
    -----
    This function:
    1. Reads the mzML file using pyteomics
    2. Extracts scan data including retention time, charge state, m/z values, and intensities
    3. Filters scans based on provided scan IDs if specified
    4. Returns a DataFrame with the extracted data

    A spectrum that fails while being processed is logged and skipped; it
    does not abort the extraction of the remaining spectra.
    """
    # Source name: last "/"-separated path component with ".mzML" removed.
    filename = mzml_filename.split("/")[-1].replace(".mzML", "")
    logger.info(f"Extracting scans from {mzml_filename}")

    # A set gives constant-time membership tests inside the spectrum loop.
    scan_ids = set(scan_ids) if scan_ids is not None else None

    extracted_scan_ids = []
    mzml_filenames = []
    intensities = []
    mz_values = []
    charges = []
    retention_times = []

    try:
        with mzml.read(mzml_filename) as reader:
            for spectrum in reader:
                # Reset per spectrum so the warning below never reports an
                # unbound name or the id of a previously processed scan.
                scan_id = None
                try:
                    # The scan number is the text following "scan=" in the id.
                    scan_id = int(spectrum["id"].split("scan=")[-1])

                    if scan_ids is not None and scan_id not in scan_ids:
                        continue

                    mz_array = np.array(spectrum.get("m/z array", []))
                    intensity_array = np.array(spectrum.get("intensity array", []))

                    # Charge stays None when the precursor entry is missing
                    # or cannot be converted to int.
                    charge = None
                    try:
                        charge = int(
                            spectrum["precursorList"]["precursor"][0]["selectedIonList"][
                                "selectedIon"
                            ][0]["charge state"]
                        )
                    except (KeyError, ValueError, IndexError):
                        pass

                    # Retention time stays None when the scan entry is missing
                    # or cannot be converted to float.
                    retention_time = None
                    try:
                        retention_time = float(spectrum["scanList"]["scan"][0]["scan start time"])
                    except (KeyError, ValueError, IndexError):
                        pass

                    extracted_scan_ids.append(scan_id)
                    mzml_filenames.append(filename)
                    intensities.append(intensity_array)
                    mz_values.append(mz_array)
                    charges.append(charge)
                    retention_times.append(retention_time)

                except Exception as e:
                    # Best effort: one bad spectrum must not abort the file.
                    scan_label = scan_id if scan_id is not None else "<unparsed id>"
                    logger.warning(f"Skipping scan {scan_label} due to error: {e}")

    except Exception as e:
        logger.error(f"Failed to parse mzML file {mzml_filename}: {e}")
        # Chain the original exception so its traceback is preserved.
        raise RuntimeError(f"Error processing mzML file {mzml_filename}: {e}") from e

    data_dict = {
        "source": mzml_filenames,
        "scan": extracted_scan_ids,
        "mz": mz_values,
        "intensity": intensities,
        "charge": charges,
        "retention_time": retention_times,
    }

    scans_df = pd.DataFrame(data_dict)
    # Keep a single row per (source, scan) pair.
    scans_df = scans_df.drop_duplicates(subset=["source", "scan"])

    logger.info(f"Successfully extracted {len(scans_df)} scans from {mzml_filename}")

    return scans_df

DeepLC

deeplc

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """Generate features and return them as a DataFrame.

    Declared abstract: this base definition has no body, so each concrete
    generator supplies its own implementation.

    Returns
    -------
    pd.DataFrame
        The generated features.
    """
    ...

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Build a generator instance from the pipeline configuration.

    This base version never returns: it always raises, naming the class it
    was called on.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Raises
    ------
    NotImplementedError
        Always, with a message naming ``cls``.
    """
    message = f"{cls.__name__} must implement from_config()"
    raise NotImplementedError(message)

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Compute this generator's features and attach them to ``psms``.

    The features returned by ``generate_features()`` are handed to
    ``psms.add_features()``, keyed on the container's peptide column on the
    PSM side and on ``self.id_column`` on the feature side.  Subclasses can
    override this for other merge strategies (index-based, composite key)
    or for extra post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).
    """
    generated = self.generate_features()
    merge_options = {
        "psms_key": psms.peptide_column,
        "feature_key": self.id_column,
        "source": source,
    }
    psms.add_features(generated, **merge_options)

PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)

A container for managing peptide-spectrum matches (PSMs) in immunopeptidomics rescoring pipelines.

Parameters:

Name Type Description Default
psms DataFrame

DataFrame containing the PSM data.

required
label_column str

Column containing the label (True for target, False for decoy).

required
scan_column str

Column containing the scan number.

required
spectrum_column str

Column containing the spectrum identifier.

required
ms_data_file_column str

Column containing the MS data file that the PSM originated from.

required
peptide_column str

Column containing the peptide sequence.

required
protein_column str

Column containing the protein accessions.

required
rescoring_features dict of str to list of str

Dictionary of feature columns for rescoring.

required
hit_rank_column str

Column containing the hit rank.

None
charge_column str

Column containing the charge state.

None
retention_time_column str

Column containing the retention time.

None
calculated_mass_column str

Column containing the calculated mass.

None
metadata_column str

Column containing metadata.

None

Attributes:

Name Type Description
psms DataFrame

Copy of the DataFrame containing the PSM data.

target_psms DataFrame

DataFrame containing only target PSMs (label = True).

decoy_psms DataFrame

DataFrame containing only decoy PSMs (label = False).

peptides list of str

List containing all peptides from the PSM data.

columns list of str

List of column names in the PSM DataFrame.

rescoring_features dict of str to list of str

Dictionary of rescoring feature columns in the PSM DataFrame.

Source code in optimhc/psm_container.py
def __init__(
    self,
    psms: pd.DataFrame,
    label_column: str,
    scan_column: str,
    spectrum_column: str,
    ms_data_file_column: str,
    peptide_column: str,
    protein_column: str,
    rescoring_features: Dict[str, List[str]],
    hit_rank_column: Optional[str] = None,
    charge_column: Optional[str] = None,
    retention_time_column: Optional[str] = None,
    calculated_mass_column: Optional[str] = None,
    metadata_column: Optional[str] = None,
):
    # Keep a private copy with a fresh 0..n-1 index; the caller's frame is
    # only read from here on.
    self._psms = psms.copy()
    self._psms.reset_index(drop=True, inplace=True)

    # Column-name bookkeeping.
    self.label_column = label_column
    self.scan_column = scan_column
    self.spectrum_column = spectrum_column
    self.ms_data_file_column = ms_data_file_column
    self.peptide_column = peptide_column
    self.protein_column = protein_column
    self.hit_rank_column = hit_rank_column
    self.retention_time_column = retention_time_column
    self.metadata_column = metadata_column
    self.rescoring_features = rescoring_features
    self.charge_column = charge_column
    self.calculated_mass_column = calculated_mass_column
    # No rescoring result has been attached yet.
    self.rescore_result_column = None

    def require_column(name):
        # Unset optional columns (None / empty) are not checked.
        if name and name not in psms.columns:
            raise ValueError(f"Column '{name}' not found in PSM data.")

    # Every named column must be present in the input frame.
    for name in (
        label_column,
        scan_column,
        spectrum_column,
        ms_data_file_column,
        peptide_column,
        protein_column,
        hit_rank_column,
        retention_time_column,
        charge_column,
        calculated_mass_column,
    ):
        require_column(name)

    # Labels must be boolean and must contain both targets and decoys.
    labels = psms[label_column]
    if labels.dtype != "bool":
        raise ValueError(f"Column '{label_column}' must be boolean.")

    if labels.nunique() == 1:
        if labels.iloc[0]:
            raise ValueError("All PSMs are labeled as target. No decoy PSMs found.")
        raise ValueError("All PSMs are labeled as decoy. No target PSMs found.")

    # A metadata column, when given, must exist and hold one dict per row.
    if metadata_column:
        if metadata_column not in psms.columns:
            raise ValueError(f"Column '{metadata_column}' not found in PSM data.")
        if not all(isinstance(entry, dict) for entry in self._psms[metadata_column]):
            raise ValueError(f"Column '{metadata_column}' must contain dictionaries.")

    # Every registered rescoring feature must be an existing column.
    for feature_source, feature_cols in rescoring_features.items():
        for col in feature_cols:
            if col not in psms.columns:
                raise ValueError(
                    f"Column '{col}' not found in PSM data for feature '{feature_source}'."
                )

    # Guard against a container without any decoys.
    if len(self.decoy_psms) == 0:
        logger.error("No decoy PSMs found. Please check the decoy prefix.")
        raise ValueError("No decoy PSMs found.")

    # Summary of what was loaded.
    logger.info("PsmContainer initialized with %d PSM entries.", len(self._psms))
    if self.ms_data_file_column:
        logger.info(
            "PSMs originated from %d MS data file(s).",
            len(self._psms[ms_data_file_column].unique()),
        )
    logger.info("target psms: %d", len(self.target_psms))
    logger.info("decoy psms: %d", len(self.decoy_psms))
    logger.info("unique peptides: %d", len(np.unique(self.peptides)))
    logger.info("rescoring features: %s", rescoring_features)

psms property

Get a copy of the PSM DataFrame to prevent external modification.

Returns:

Type Description
DataFrame

A copy of the PSM DataFrame.

target_psms property

Get a DataFrame containing only target PSMs.

Returns:

Type Description
DataFrame

DataFrame with only target PSMs (label = True).

decoy_psms property

Get a DataFrame containing only decoy PSMs.

Returns:

Type Description
DataFrame

DataFrame with only decoy PSMs (label = False).

columns property

Get the column names of the PSM DataFrame.

Returns:

Type Description
list of str

List of column names.

feature_columns property

Get a list of all feature columns in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature column names.

feature_sources property

Get a list of all feature sources in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature source names.

peptides property

Get the peptide sequences from the PSM data.

Returns:

Type Description
list of str

List of peptide sequences.

ms_data_files property

Get the MS data files from the PSM data.

Returns:

Type Description
list of str

List of MS data file names.

scan_ids property

Get the scan numbers from the PSM data.

Returns:

Type Description
list of int

List of scan numbers.

charges property

Get the charge states from the PSM data.

Returns:

Type Description
list of int

List of charge states.

metadata property

Get the metadata from the PSM data.

Returns:

Type Description
Series

Series containing metadata for each PSM.

spectrum_ids property

Get the spectrum identifiers from the PSM data.

Returns:

Type Description
list of str

List of spectrum identifiers.

identifier_columns property

Get the columns that uniquely identify each PSM.

Returns:

Type Description
list of str

List of identifier column names.

__len__()

Get the number of PSMs in the container.

Returns:

Type Description
int

Number of PSMs.

Source code in optimhc/psm_container.py
def __len__(self) -> int:
    """
    Report how many PSMs the container currently holds.

    Returns
    -------
    int
        Length of the internal PSM DataFrame.
    """
    psm_count = len(self._psms)
    return psm_count

copy()

Return a deep copy of the PsmContainer object.

Returns:

Type Description
PsmContainer

A deep copy of the current PsmContainer.

Source code in optimhc/psm_container.py
def copy(self) -> "PsmContainer":
    """
    Create an independent duplicate of this container.

    Returns
    -------
    PsmContainer
        A deep copy of the current PsmContainer.
    """
    # Imported locally, as in the original.
    from copy import deepcopy

    return deepcopy(self)

__repr__()

Return a string representation of the PsmContainer.

Returns:

Type Description
str

String summary of the PsmContainer.

Source code in optimhc/psm_container.py
def __repr__(self) -> str:
    """
    Build a multi-line, human-readable summary of the container.

    Returns
    -------
    str
        Total, target and decoy PSM counts, the number of unique peptides
        and spectra, and the rescoring feature mapping, one item per line.
    """
    summary_lines = [
        f"PsmContainer with {len(self)} PSMs",
        f"\t - Target PSMs: {len(self.target_psms)}",
        f"\t - Decoy PSMs: {len(self.decoy_psms)}",
        f"\t - Unique Peptides: {len(np.unique(self.peptides))}",
        f"\t - Unique Spectra: {len(self._psms[self.spectrum_column].unique())}",
        f"\t - Rescoring Features: {self.rescoring_features}",
    ]
    # Each line, including the last, ends with a newline.
    return "\n".join(summary_lines) + "\n"

drop_features(features)

Drop specified features from the PSM DataFrame.

Parameters:

Name Type Description Default
features list of str

List of feature column names to drop.

required

Raises:

Type Description
ValueError

If any of the features do not exist in the DataFrame.

Source code in optimhc/psm_container.py
def drop_features(self, features: List[str]) -> None:
    """
    Drop specified features from the PSM DataFrame.

    The columns are removed from the PSM data and from every source listed
    in ``rescoring_features``. A source left without any feature is removed
    from ``rescoring_features`` as well.

    Parameters
    ----------
    features : list of str
        List of feature column names to drop.

    Raises
    ------
    ValueError
        If any of the features do not exist in the DataFrame.
    """
    missing_features = [f for f in features if f not in self._psms.columns]
    if missing_features:
        raise ValueError(f"Features not found in PSM data: {missing_features}")

    self._psms.drop(columns=features, inplace=True)

    # Snapshot as a set: constant-time lookups, and safe even when
    # `features` is one of the lists stored in rescoring_features.
    dropped = set(features)
    emptied_sources = []
    for source, cols in self.rescoring_features.items():
        remaining = [col for col in cols if col not in dropped]
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over items().
        self.rescoring_features[source] = remaining
        if not remaining:
            emptied_sources.append(source)

    # Only report (and remove) sources when some actually became empty.
    if emptied_sources:
        logger.info(
            f"Sources to be removed: {emptied_sources}. Because all the features are removed."
        )
        for source in emptied_sources:
            del self.rescoring_features[source]

drop_source(source)

Drop all features associated with a specific source from the PSM DataFrame.

Parameters:

Name Type Description Default
source str

Name of the source to drop.

required

Raises:

Type Description
ValueError

If the source does not exist in the rescoring features.

Source code in optimhc/psm_container.py
def drop_source(self, source: str) -> None:
    """
    Remove every feature registered under one source.

    The feature list stored for ``source`` in ``rescoring_features`` is
    passed to ``drop_features``.

    Parameters
    ----------
    source : str
        Name of the source to drop.

    Raises
    ------
    ValueError
        If the source does not exist in the rescoring features.
    """
    registered = self.rescoring_features
    if source in registered:
        self.drop_features(registered[source])
        return
    raise ValueError(f"Source '{source}' not found in rescoring features.")

add_metadata(metadata_df, psms_key, metadata_key, source)

Merge new metadata into the PSM DataFrame based on specified columns. Metadata from the specified source is stored as a nested dictionary inside the metadata column.

Parameters:

Name Type Description Default
metadata_df DataFrame

DataFrame containing new metadata to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
metadata_key str or list of str

Column name(s) in the metadata data to merge on.

required
source str

Name of the source of the new metadata.

required
Source code in optimhc/psm_container.py
def add_metadata(
    self,
    metadata_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    metadata_key: Union[str, List[str]],
    source,
) -> None:
    """
    Merge new metadata into the PSM DataFrame based on specified columns.
    Metadata from the specified source is stored as a nested dictionary inside the metadata column.

    If the container has no metadata column yet, a column named
    ``"metadata"`` holding one empty dict per PSM is created first.
    Otherwise the existing ``self.metadata_column`` is used. Values are
    added under ``metadata[source]``; keys already stored for that source
    are kept unless a new column has the same name.

    Parameters
    ----------
    metadata_df : pd.DataFrame
        DataFrame containing new metadata to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    metadata_key : str or list of str
        Column name(s) in the metadata data to merge on.
    source : str
        Name of the source of the new metadata.

    Raises
    ------
    ValueError
        If merging the metadata changes the number of PSMs (duplicate keys
        in ``metadata_df``).
    """
    if self.metadata_column is None:
        logger.info("No existing metadata column. Creating new metadata column.")
        self.metadata_column = "metadata"
        self._psms["metadata"] = [{} for _ in range(len(self._psms))]
    # Always address the column by its configured name, which need not be
    # the literal "metadata".
    meta_col = self.metadata_column

    # Normalise the key to a list: `col in "some_string"` would be a
    # substring test and could silently discard metadata columns.
    key_cols = [metadata_key] if isinstance(metadata_key, str) else list(metadata_key)
    metadata_cols = [col for col in metadata_df.columns if col not in key_cols]

    merged_df = self._psms.merge(
        metadata_df, left_on=psms_key, right_on=metadata_key, how="left"
    )
    # A left join only grows when the right side has duplicate keys; writing
    # such a result back would attach metadata to the wrong PSMs.
    if len(merged_df) != len(self._psms):
        raise ValueError(
            "Merging metadata resulted in a change in the number of PSMs. Check for duplicate keys."
        )

    # Inspect the per-row dictionaries, not the Series index labels.
    if any(source in entry for entry in self._psms[meta_col]):
        logger.warning(f"{source} already exists in metadata. Overwriting.")

    if not metadata_cols:
        # Nothing but key columns: leave the stored metadata untouched.
        return

    new_values = merged_df[metadata_cols].to_dict("records")
    # Assign a plain list so values are written by position. The merge
    # result has a fresh 0..n-1 index that may not match self._psms.index.
    self._psms[meta_col] = [
        {**existing, source: {**existing.get(source, {}), **values}}
        for existing, values in zip(merged_df[meta_col], new_values)
    ]

get_top_hits(n=1)

Get the top n hits based on the hit rank column. If the hit rank column is not specified, returns the original PSMs.

Parameters:

Name Type Description Default
n int

The number of top hits to return. Default is 1.

1

Returns:

Type Description
PsmContainer

A new PsmContainer object containing the top n hits.

Source code in optimhc/psm_container.py
def get_top_hits(self, n: int = 1):
    """
    Return a copy of the container restricted to the best-ranked hits.

    Rows are kept when their hit-rank value is less than or equal to ``n``.
    When no hit rank column is configured, a warning is logged and an
    unfiltered copy is returned.

    Parameters
    ----------
    n : int, optional
        The number of top hits to return. Default is 1.

    Returns
    -------
    PsmContainer
        A new PsmContainer object containing the top n hits.
    """
    rank_column = self.hit_rank_column
    if rank_column is None:
        logger.warning("Rank column not specified. Return the original PSMs.")
        return self.copy()

    top_hits = self.copy()
    within_top_n = top_hits._psms[rank_column] <= n
    top_hits._psms = top_hits._psms[within_top_n]
    return top_hits

add_features(features_df, psms_key, feature_key, source, suffix=None)

Merge new features into the PSM DataFrame based on specified columns.

This method performs a left join between the PSM data and feature data, ensuring that all PSMs are preserved while adding new features. It handles column name conflicts through optional suffixing and maintains feature source tracking.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
feature_key str or list of str

Column name(s) in the features data to merge on.

required
source str

Name of the source of the new features (e.g., 'deeplc', 'netmhc').

required
suffix str

Suffix to add to the new columns if there's a name conflict. Required when new feature columns have the same names as existing columns. For example, if adding features from different sources (e.g., 'score' from DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.

None

Returns:

Type Description
None

Raises:

Type Description
ValueError

If duplicate columns exist and no suffix is provided, or if merging the features changes the number of PSMs.

Notes

The method follows these steps: 1. Validates input and prepares merge keys 2. Checks for column name conflicts 3. Manages feature source: if the source already exists, it will be overwritten 4. Performs left join merge 5. Verifies data integrity

Suffix Usage

The suffix parameter is used to handle column name conflicts: - When adding features from different sources that might have the same column names - When you want to keep both the original and new features with the same name - When you need to track the source of features in the column names

If suffix is not provided and there are duplicate column names: - The method will raise a ValueError - You must either provide a suffix or rename the columns before adding

Examples:

>>> container = PsmContainer(...)
>>> # Adding features without suffix (no conflicts)
>>> features_df1 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'feature1': [0.1, 0.2, 0.3],
...     'feature2': [0.4, 0.5, 0.6]
... })
>>> container.add_features(
...     features_df1,
...     psms_key='scan',
...     feature_key='scan',
...     source='source1'
... )
>>> # Adding features with suffix (handling conflicts)
>>> features_df2 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
...     'feature3': [0.7, 0.8, 0.9]
... })
>>> container.add_features(
...     features_df2,
...     psms_key='scan',
...     feature_key='scan',
...     source='source2',
...     suffix='_new'  # 'score' becomes 'score_new'
... )
Source code in optimhc/psm_container.py
def add_features(
    self,
    features_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    feature_key: Union[str, List[str]],
    source: str,
    suffix: Optional[str] = None,
) -> None:
    """Merge new features into the PSM DataFrame based on specified columns.

    This method performs a left join between the PSM data and feature data,
    ensuring that all PSMs are preserved while adding new features. It handles
    column name conflicts through optional suffixing and maintains feature source
    tracking.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add. It is not modified.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    feature_key : str or list of str
        Column name(s) in the features data to merge on.
    source : str
        Name of the source of the new features (e.g., 'deeplc', 'netmhc').
    suffix : str, optional
        Suffix to add to the new columns if there's a name conflict.
        Required when new feature columns have the same names as existing columns.
        For example, if adding features from different sources (e.g., 'score' from
        DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.
        When given, the suffix is appended to every new feature column.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If duplicate columns exist without suffix.
        If merging features changes the number of PSMs. In that case the PSM
        data is left as it was before the merge and the source is not
        registered.

    Notes
    -----
    The method follows these steps:
    1. Validates input and prepares merge keys
    2. Checks for column name conflicts
    3. Manages feature source: if the source already exists, it will be overwritten
    4. Performs left join merge
    5. Verifies data integrity before storing the merged data

    Suffix Usage
    -----------
    The suffix parameter is used to handle column name conflicts:
    - When adding features from different sources that might have the same column names
    - When you want to keep both the original and new features with the same name
    - When you need to track the source of features in the column names

    If suffix is not provided and there are duplicate column names:
    - The method will raise a ValueError
    - You must either provide a suffix or rename the columns before adding

    Examples
    --------
    >>> container = PsmContainer(...)
    >>> # Adding features without suffix (no conflicts)
    >>> features_df1 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'feature1': [0.1, 0.2, 0.3],
    ...     'feature2': [0.4, 0.5, 0.6]
    ... })
    >>> container.add_features(
    ...     features_df1,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source1'
    ... )
    >>> # Adding features with suffix (handling conflicts)
    >>> features_df2 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
    ...     'feature3': [0.7, 0.8, 0.9]
    ... })
    >>> container.add_features(
    ...     features_df2,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source2',
    ...     suffix='_new'  # 'score' becomes 'score_new'
    ... )
    """
    if isinstance(psms_key, str):
        psms_key = [psms_key]

    if isinstance(feature_key, str):
        feature_key = [feature_key]

    # Everything that is not a merge key is a feature column.
    new_feature_cols = [col for col in features_df.columns if col not in feature_key]

    for col in new_feature_cols:
        if col in self._psms.columns:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                logger.warning("No suffix provided. Cannot resolve the column name conflict.")
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")
    logger.info(f"Adding {len(new_feature_cols)} new features from {source}.")

    if not new_feature_cols:
        logger.warning("No new features to add. Check the feature key and PSMs key.")
        logger.warning(f"Feature key: {feature_key}; PSMs key: {psms_key}")

    # Re-adding a known source replaces its previous feature columns.
    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    column_suffix = "" if suffix is None else suffix
    final_feature_cols = [col + column_suffix for col in new_feature_cols]
    # rename() returns a new frame, so the caller's DataFrame is untouched.
    features_df = features_df.rename(columns=dict(zip(new_feature_cols, final_feature_cols)))

    merged = self._psms.merge(features_df, left_on=psms_key, right_on=feature_key, how="left")

    # Validate before committing: a left join only grows when the feature
    # side has duplicate keys, and the container must not be left holding
    # duplicated PSM rows when that happens.
    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate keys."
        )

    # When the key names differ, the merge keeps the feature-side key
    # columns; remove them so only PSM columns and features remain.
    if feature_key != psms_key:
        cols_to_drop = [
            col for col in feature_key if col not in psms_key and col in merged.columns
        ]
        if cols_to_drop:
            logger.debug(f"Dropping columns from feature_key not in psms_key: {cols_to_drop}")
            merged = merged.drop(columns=cols_to_drop)

    self._psms = merged
    self.rescoring_features[source] = final_feature_cols

add_features_by_index(features_df, source, suffix=None)

Merge new features into the PSM DataFrame based on the DataFrame index.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
source str

Name of the source of the new features.

required
suffix str

Suffix to add to the new columns if there's a name conflict.

None
Source code in optimhc/psm_container.py
def add_features_by_index(
    self, features_df: pd.DataFrame, source: str, suffix: Optional[str] = None
) -> None:
    """
    Merge new features into the PSM DataFrame based on the DataFrame index.

    Every column of ``features_df`` is treated as a feature. The frame is
    left-joined onto the PSM data by index, so all PSMs are preserved. If
    ``source`` is already registered, its previous features are dropped
    first.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add. It is not modified.
    source : str
        Name of the source of the new features.
    suffix : str, optional
        Suffix to add to the new columns if there's a name conflict.
        When given, the suffix is appended to every new feature column.

    Raises
    ------
    ValueError
        If a feature column already exists in the PSM data and no suffix is
        provided.
        If ``features_df`` has no columns.
        If merging features changes the number of PSMs. In that case the PSM
        data is left as it was before the merge and the source is not
        registered.
    """
    new_feature_cols = list(features_df.columns)
    for col in new_feature_cols:
        if col in self._psms.columns:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                logger.warning("No suffix provided. Cannot resolve the column name conflict.")
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")

    logger.info(f"Adding {len(new_feature_cols)} new features from {source} by index.")

    if not new_feature_cols:
        logger.warning("No new features to add.")
        raise ValueError("No new features to add.")

    # Re-adding a known source replaces its previous feature columns.
    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    column_suffix = "" if suffix is None else suffix
    final_feature_cols = [col + column_suffix for col in new_feature_cols]
    # Rename on a new frame: an in-place rename would silently change the
    # caller's DataFrame.
    features_df = features_df.rename(columns=dict(zip(new_feature_cols, final_feature_cols)))

    merged = self._psms.merge(
        features_df,
        left_index=True,
        right_index=True,
        how="left",  # Perform a left join to preserve all original PSM data
    )

    # Validate before committing so a failed merge cannot leave the
    # container with duplicated PSM rows.
    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate indices."
        )

    self._psms = merged
    self.rescoring_features[source] = final_feature_cols

add_results(results_df, psms_key, result_key)

Add results of rescore engine to the PSM DataFrame based on specified columns.

Parameters:

Name Type Description Default
results_df DataFrame

DataFrame containing new results to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
result_key str or list of str

Column name(s) in the results data to merge on.

required
Source code in optimhc/psm_container.py
def add_results(
    self,
    results_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    result_key: Union[str, List[str]],
) -> None:
    """
    Add results of rescore engine to the PSM DataFrame based on specified columns.

    The results are left-joined onto the PSM data, so every existing PSM row is
    kept. After the join, the ``result_key`` column(s) are dropped again and
    ``rescore_result_column`` is set to ``result_key``.

    The container is only modified once the merge has succeeded: if the merge
    raises, both the PSM data and ``rescore_result_column`` keep their
    previous values.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing new results to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    result_key : str or list of str
        Column name(s) in the results data to merge on.

    Raises
    ------
    ValueError
        If the PSM data and ``results_df`` share any column name.
    pandas.errors.MergeError
        If the merge keys are not unique on both sides (the merge is validated
        as ``one_to_one``).
    """
    if self.rescore_result_column is not None:
        logger.warning("Rescore result column already exists. Overwriting.")

    if set(self._psms.columns) & set(results_df.columns):
        raise ValueError(
            "Duplicate columns exist. Please rename the columns in the results data."
        )

    merged = self._psms.merge(
        results_df,
        left_on=psms_key,
        right_on=result_key,
        how="left",
        validate="one_to_one",
    )

    # Commit state only after the validated merge succeeded, so a failed merge
    # cannot leave rescore_result_column pointing at results that were never added.
    self._psms = merged.drop(columns=result_key)
    self.rescore_result_column = result_key
    logger.info("Added rescore results to PSM data.")

write_pin(output_file, style='default', source=None)

Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats. The style parameter is actually used to output a unified pin format file to benchmark the performance of different rescoring methods.

Parameters:

Name Type Description Default
output_file str

Path to the output PIN file.

required
style str

If set to 'msbooster', outputs the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins), with any remaining selected features written between the score column and Peptide. If set to 'default', outputs all features specified in rescoring_features, plus required Percolator columns.

'default'
source list of str

List of feature sources to include. If None, includes all sources.

None

Returns:

Type Description
DataFrame

The DataFrame written to the PIN file.

Notes
  • The first three columns are always: SpecId, Label, ScanNr.
  • For 'msbooster' style, the columns are: SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins.
  • If hit_rank_column is not specified, rank is set to 1 for all rows.
  • Either 'hyperscore' or 'expect' must be present in features; for 'expect', the column is written as 'log10_evalue'.
  • The 'expect' values are copied unchanged into the 'log10_evalue' column, so the 'expect' feature should already contain the base-10 logarithm of the e-value.
  • The 'Peptide' column is formatted with underscores (e.g., _.PEPTIDE._).
  • For standard format, all features from rescoring_features are appended between ScanNr and Peptide columns.
  • The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
  • Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.

Example output (default style): SpecId Label ScanNr feature1 feature2 ... Peptide Proteins

Example output (msbooster style): SpecId Label ScanNr retentiontime rank hyperscore Peptide Proteins or SpecId Label ScanNr retentiontime rank log10_evalue Peptide Proteins

Raises:

Type Description
ValueError

If required columns are missing for the selected style.

Source code in optimhc/psm_container.py
def write_pin(
    self, output_file: str, style: str = "default", source: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats.
    The style parameter is used to output a unified PIN format file for benchmarking different rescoring methods.

    Parameters
    ----------
    output_file : str
        Path to the output PIN file.
    style : str, optional
        If set to 'msbooster', outputs the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime,
        rank, hyperscore or log10_evalue, Peptide, Proteins), followed by the remaining selected features
        placed before Peptide.
        If set to 'default', outputs all features specified in `rescoring_features`, plus required Percolator columns.
    source : list of str, optional
        List of feature sources to include. If None, includes all sources.

    Returns
    -------
    pd.DataFrame
        The DataFrame written to the PIN file.

    Notes
    -----
    - The first three columns are always: SpecId, Label, ScanNr.
    - If `hit_rank_column` is not specified, rank is set to 1 for all rows (msbooster style).
    - For msbooster style, either 'hyperscore' or 'expect' must be present in `self.feature_columns`;
      'expect' values are copied unchanged into 'log10_evalue', so they should already be log10-transformed.
    - For msbooster style, the 'Peptide' column is wrapped as `_.PEPTIDE._`.
    - For default style, all selected features are placed between ScanNr and Peptide.
    - The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
    - Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.
      Boolean labels and the textual labels "True"/"False" are mapped directly; any other
      label type is coerced with `astype(bool)` first.
    - The frame is passed through `self._convert_float_to_int` before it is written.

    Example output (default style):
        SpecId	Label	ScanNr	feature1	feature2	...	Peptide	Proteins

    Example output (msbooster style):
        SpecId	Label	ScanNr	retentiontime	rank	hyperscore	Peptide	Proteins
        or
        SpecId	Label	ScanNr	retentiontime	rank	log10_evalue	Peptide	Proteins

    Raises
    ------
    ValueError
        If required columns are missing for the selected style, if a requested
        source is unknown, or if `style` is not 'msbooster' or 'default'.
    """
    df = self._psms.copy()
    labels = df[self.label_column]
    if pd.api.types.is_bool_dtype(labels):
        df["PercolatorLabel"] = labels.map({True: 1, False: -1})
    elif labels.map(lambda value: isinstance(value, str)).all():
        # Textual labels are detected per element rather than via `dtype == "str"`:
        # string columns stored with object dtype never compare equal to "str", and
        # would otherwise reach astype(bool), where every non-empty string
        # (including "False") is truthy and gets written as a target.
        df["PercolatorLabel"] = labels.map({"True": 1, "False": -1})
    else:
        # try to convert to bool
        logger.warning("Label column is not str or bool. Converting to bool.")
        df["PercolatorLabel"] = labels.astype(bool).map({True: 1, False: -1})
    logger.info("Writing PIN file to %s", output_file)
    logger.info("Using style: %s", style)

    # Collect the feature columns of the requested sources, in source order.
    feature_cols = []
    if source is None:
        for cols in self.rescoring_features.values():
            feature_cols.extend(cols)
    else:
        for s in source:
            if s not in self.rescoring_features:
                raise ValueError(f"Source '{s}' not found in rescoring features.")
            feature_cols.extend(self.rescoring_features[s])

    pin_df = pd.DataFrame()
    pin_df["SpecId"] = df[self.spectrum_column]
    pin_df["Label"] = df["PercolatorLabel"]
    pin_df["ScanNr"] = df[self.scan_column]

    if style == "msbooster":
        if self.retention_time_column:
            pin_df["retentiontime"] = df[self.retention_time_column]
        else:
            raise ValueError("Retention time column is required for msbooster style.")

        pin_df["rank"] = df[self.hit_rank_column].astype(int) if self.hit_rank_column else 1
        if "hyperscore" in self.feature_columns:
            pin_df["hyperscore"] = df["hyperscore"]
        elif "expect" in self.feature_columns:
            pin_df["log10_evalue"] = df["expect"]
        else:
            raise ValueError(
                "Either 'hyperscore' or 'expect' column is required for msbooster style."
            )

        # Add the other features, skipping the columns already written above.
        already_written = (
            "hyperscore",
            "expect",
            self.hit_rank_column,
            self.retention_time_column,
        )
        for col in feature_cols:
            if col not in already_written:
                pin_df[col] = df[col]

        # PEPTIDE -> _.PEPTIDE._
        # Add _. at the front and ._ at the end of the peptide column
        pin_df["Peptide"] = df[self.peptide_column].apply(
            lambda x: f"_.{x}._" if isinstance(x, str) else x
        )

    elif style == "default":
        for col in feature_cols:
            pin_df[col] = df[col]
        pin_df["Peptide"] = df[self.peptide_column]
    else:
        raise ValueError(f"Unknown style: {style}. Use 'msbooster' or 'default'.")

    pin_df["Proteins"] = df[self.protein_column].apply(
        lambda x: ";".join(x) if isinstance(x, (list, tuple)) else x
    )
    pin_df = self._convert_float_to_int(pin_df)
    pin_df.to_csv(output_file, sep="\t", index=False)
    logger.info("PIN file written to %s", output_file)
    return pin_df

DeepLCFeatureGenerator(psms, calibration_criteria_column, lower_score_is_better=False, calibration_set_size=None, processes=1, model_path=None, remove_pre_nxt_aa=True, mod_dict=None, *args, **kwargs)

Bases: BaseFeatureGenerator

Generate DeepLC-based features for rescoring.

This generator uses DeepLC to predict retention times and calculates various features based on the differences between predicted and observed retention times.

Parameters:

Name Type Description Default
psms PsmContainer

PSMs to generate features for.

required
calibration_criteria_column str

Column name in the PSMs DataFrame to use for DeepLC calibration.

required
lower_score_is_better bool

Whether a lower PSM score denotes a better matching PSM. Default is False.

False
calibration_set_size int or float

Amount of best PSMs to use for DeepLC calibration. If this value is lower than the number of available PSMs, all PSMs will be used. The constructor default is None, in which case generate_features() skips calibration.

None
processes int

Number of processes to use in DeepLC. Default is 1.

1
model_path str

Path to the DeepLC model. If None, the default model will be used.

None
remove_pre_nxt_aa bool

Whether to remove the first and last amino acids from the peptide sequence. Default is True.

True
mod_dict dict

Dictionary of modifications to be used for DeepLC. If None, no modifications will be used.

None
Notes

DeepLC retraining is on by default. Add deeplc_retrain: False as a keyword argument to disable retraining.

The generated features include: - observed_retention_time: Original retention time from the data - predicted_retention_time: DeepLC predicted retention time - retention_time_diff: Difference between predicted and observed times - abs_retention_time_diff: Absolute difference between predicted and observed times - retention_time_ratio: Ratio of min(pred,obs) to max(pred,obs)

Generate DeepLC-based features for rescoring.

DeepLC retraining is on by default. Add deeplc_retrain: False as a keyword argument to disable retraining.

Parameters:

Name Type Description Default
psms PsmContainer

PSMs to generate features for.

required
calibration_criteria_column str

Column name in the PSMs DataFrame to use for DeepLC calibration.

required
lower_score_is_better bool

Whether a lower PSM score denotes a better matching PSM. Default: False.

False
calibration_set_size int or float

Amount of best PSMs to use for DeepLC calibration. If this value is lower than the number of available PSMs, all PSMs will be used. The constructor default is None, in which case generate_features() skips calibration.

None
processes int or None

Number of processes to use in DeepLC. Defaults to 1.

1
model_path str

Path to the DeepLC model. If None, the default model will be used.

None
remove_pre_nxt_aa bool

Whether to remove the first and last amino acids from the peptide sequence. Default: True.

True
mod_dict dict

Dictionary of modifications to be used for DeepLC. If None, no modifications will be used.

None
*args list

Additional positional arguments are passed to DeepLC.

()
**kwargs dict

Additional keyword arguments are passed to DeepLC.

{}
Source code in optimhc/feature/deeplc.py
def __init__(
    self,
    psms: PsmContainer,
    calibration_criteria_column: str,
    lower_score_is_better: bool = False,
    calibration_set_size: Union[int, float, None] = None,
    processes: int = 1,
    model_path: Optional[str] = None,
    remove_pre_nxt_aa: bool = True,
    mod_dict: Optional[Dict[str, str]] = None,
    *args,
    **kwargs,
):
    """
    Set up a DeepLC-based feature generator for rescoring.

    The constructor stores the configuration, builds the DeepLC input frame
    via ``_get_deeplc_df()`` and instantiates the DeepLC predictor.

    Parameters
    ----------
    psms : PsmContainer
        PSMs to generate features for.
    calibration_criteria_column : str
        Column name in the PSMs DataFrame to use for DeepLC calibration.
    lower_score_is_better : bool
        Whether a lower PSM score denotes a better matching PSM. Default: False.
    calibration_set_size : int or float or None
        Amount of best PSMs to use for DeepLC calibration. Default: None;
        ``generate_features()`` only calibrates when this value is truthy.
    processes : int
        Number of processes to use in DeepLC (passed as ``n_jobs``). Default: 1.
    model_path : str
        Path to the DeepLC model (passed as ``path_model``). If None, DeepLC is
        constructed without a model path.
    remove_pre_nxt_aa : bool
        Whether to remove the first and last amino acids from the peptide sequence.
        Default: True.
    mod_dict : dict
        Dictionary of modifications to be used for DeepLC. Default: None.
    *args : list
        Accepted, but not forwarded to DeepLC by this constructor.
    **kwargs : dict
        Accepted, but not forwarded to DeepLC by this constructor.
    """
    # Plain configuration first: _get_deeplc_df() below runs on this instance,
    # so everything it may read has to be set before it is called.
    self.psms = psms
    self.lower_score_is_better = lower_score_is_better
    self.calibration_criteria_column = calibration_criteria_column
    self.calibration_set_size = calibration_set_size
    self.processes = processes
    self.model_path = model_path
    self.remove_pre_nxt_aa = remove_pre_nxt_aa
    self.mod_dict = mod_dict
    self.deeplc_df = self._get_deeplc_df()
    self.DeepLC = DeepLC
    self._raw_predictions = None

    # Only pass path_model when a model was explicitly requested.
    predictor_options = {"n_jobs": self.processes}
    if model_path is not None:
        predictor_options["path_model"] = model_path
    self.deeplc_predictor = self.DeepLC(**predictor_options)

    summary_parts = [
        f"Initialized DeepLCFeatureGenerator with {len(self.psms)} PSMs.",
        f"Calibration criteria: {self.calibration_criteria_column}.",
        f"Lower score is better: {self.lower_score_is_better}.",
        f"Calibration set size: {self.calibration_set_size}.",
        f"Processes: {self.processes}.",
        f"Model path: {self.model_path}.",
    ]
    logger.info(" ".join(summary_parts))

feature_columns property

Return the list of generated feature column names.

Returns:

Type Description
List[str]

List of feature column names: - observed_retention_time - predicted_retention_time - retention_time_diff - abs_retention_time_diff - retention_time_ratio

id_column property

Return the list of input columns required for the feature generator.

Returns:

Type Description
List[str]

List of input columns required for feature generation. Currently returns an empty list as the required columns are handled internally by the PsmContainer.

raw_predictions property

Get the raw predictions DataFrame.

Returns:

Type Description
DataFrame

DataFrame containing the raw predictions: - peptide: Cleaned peptide sequence - predicted_rt: DeepLC predicted retention time - observed_rt: Original retention time - modifications: Unimod format modifications

Notes

If predictions haven't been generated yet, this will trigger feature generation automatically.

generate_features()

Generate DeepLC features for the provided PSMs.

Returns:

Type Description
DataFrame

DataFrame containing the PSMs with added DeepLC features: - original_seq: Original peptide sequence - observed_retention_time: Original retention time - predicted_retention_time: DeepLC predicted retention time - retention_time_diff: Difference between predicted and observed times - abs_retention_time_diff: Absolute difference between predicted and observed times - retention_time_ratio: Ratio of min(pred,obs) to max(pred,obs)

Notes

This method: 1. Prepares data in DeepLC format 2. Calibrates DeepLC if calibration set is specified 3. Predicts retention times 4. Calculates various retention time-based features 5. Handles missing values by imputing with median values

Source code in optimhc/feature/deeplc.py
def generate_features(self) -> pd.DataFrame:
    """
    Generate DeepLC features for the provided PSMs.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the PSMs with added DeepLC features:
        - original_seq: Original peptide sequence
        - observed_retention_time: Original retention time
        - predicted_retention_time: DeepLC predicted retention time
        - retention_time_diff: Difference between predicted and observed times
        - abs_retention_time_diff: Absolute difference between predicted and observed times
        - retention_time_ratio: Ratio of min(pred,obs) to max(pred,obs)

    Notes
    -----
    This method:
    1. Rebuilds the DeepLC input frame via ``_get_deeplc_df()``
    2. Calibrates DeepLC if ``calibration_set_size`` is truthy
    3. Predicts retention times and stores them in ``_raw_predictions``
    4. Calculates various retention time-based features
    5. Imputes missing values in every column of ``feature_columns`` with
       that column's median and casts the column to float
    """
    logger.info("Generating DeepLC features.")

    # Extract DeepLC input DataFrame
    self.deeplc_df = self._get_deeplc_df()

    # Calibrate DeepLC predictor
    if self.calibration_set_size:
        calibration_df = self._get_calibration_psms(self.deeplc_df)
        logger.debug(f"Calibrating DeepLC with {len(calibration_df)} PSMs.")
        self.deeplc_predictor.calibrate_preds(
            seq_df=calibration_df[["seq", "tr", "modifications"]]
        )

    # Predict retention times
    logger.info("Predicting retention times using DeepLC.")
    predictions = self.deeplc_predictor.make_preds(
        seq_df=self.deeplc_df[["seq", "tr", "modifications"]]
    )

    self._raw_predictions = pd.DataFrame(
        {
            "peptide": self.deeplc_df["seq"],
            "predicted_rt": predictions,
            "observed_rt": self.deeplc_df["tr"],
            "modifications": self.deeplc_df["modifications"],
        }
    )

    # Calculate retention time differences
    rt_diffs = predictions - self.deeplc_df["tr"]
    self.deeplc_df["predicted_retention_time"] = predictions
    self.deeplc_df["retention_time_diff"] = rt_diffs

    result_df = pd.DataFrame()
    result_df["original_seq"] = self.deeplc_df["original_seq"]
    result_df["observed_retention_time"] = self.deeplc_df["tr"]
    result_df["predicted_retention_time"] = self.deeplc_df["predicted_retention_time"]
    result_df["retention_time_diff"] = self.deeplc_df["retention_time_diff"]
    result_df["abs_retention_time_diff"] = self.deeplc_df["retention_time_diff"].abs()

    # Adopt from 'DeepRescore2': RTR = min(pred, obs) / max(pred, obs)
    result_df["retention_time_ratio"] = np.minimum(
        result_df["predicted_retention_time"], result_df["observed_retention_time"]
    ) / np.maximum(result_df["predicted_retention_time"], result_df["observed_retention_time"])

    for col in self.feature_columns:
        nan_rows = result_df[result_df[col].isna()]
        if not nan_rows.empty:
            logger.warning(
                f"Column {col} contains NaN values. Rows with NaN values:\n{nan_rows}"
            )
        median_value = result_df[col].median()
        # Assign the filled column back instead of calling fillna(inplace=True) on
        # result_df[col]: that chained in-place call operates on a temporary under
        # pandas Copy-on-Write and would leave the NaNs in result_df untouched.
        result_df[col] = result_df[col].fillna(median_value).astype(float)

    return result_df

get_full_data()

Get the full DeepLC DataFrame.

Returns:

Type Description
DataFrame

DataFrame containing the DeepLC input data with all columns: - original_seq: Original peptide sequence - label: Target/decoy label - seq: Cleaned peptide sequence - modifications: Unimod format modifications - tr: Retention time - score: Calibration criteria score - predicted_retention_time: DeepLC predicted retention time - retention_time_diff: Difference between predicted and observed times

Source code in optimhc/feature/deeplc.py
def get_full_data(self) -> pd.DataFrame:
    """
    Return the DeepLC working DataFrame held by this generator.

    The stored frame itself is returned, not a copy.

    Returns
    -------
    pd.DataFrame
        The current ``deeplc_df``. Columns used by this class include:
        - original_seq: Original peptide sequence
        - seq: Cleaned peptide sequence
        - modifications: Unimod format modifications
        - tr: Retention time
        - predicted_retention_time: DeepLC predicted retention time
          (added by ``generate_features()``)
        - retention_time_diff: Difference between predicted and observed times
          (added by ``generate_features()``)
        The frame is built by ``_get_deeplc_df()``; further columns such as
        label and score are presumably added there (verify against that helper).
    """
    full_frame = self.deeplc_df
    return full_frame

get_raw_predictions()

Get the raw predictions DataFrame.

Returns:

Type Description
DataFrame

DataFrame containing the raw predictions: - peptide: Cleaned peptide sequence - predicted_rt: DeepLC predicted retention time - observed_rt: Original retention time - modifications: Unimod format modifications

Notes

This is a convenience method that returns the same data as the raw_predictions property.

Source code in optimhc/feature/deeplc.py
def get_raw_predictions(self) -> pd.DataFrame:
    """
    Return the raw predictions DataFrame.

    Method-style accessor that simply reads the ``raw_predictions`` property
    and returns whatever it yields.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the raw predictions:
        - peptide: Cleaned peptide sequence
        - predicted_rt: DeepLC predicted retention time
        - observed_rt: Original retention time
        - modifications: Unimod format modifications
    """
    predictions = self.raw_predictions
    return predictions

save_raw_predictions(file_path, **kwargs)

Save the raw prediction results to a file.

Parameters:

Name Type Description Default
file_path str

Path to save the file.

required
**kwargs dict

Additional parameters passed to pandas.DataFrame.to_csv. If 'index' is not specified, it defaults to False.

{}
Notes

This method saves the raw predictions DataFrame to a CSV file. The DataFrame includes: - peptide: Cleaned peptide sequence - predicted_rt: DeepLC predicted retention time - observed_rt: Original retention time - modifications: Unimod format modifications

Source code in optimhc/feature/deeplc.py
def save_raw_predictions(self, file_path: str, **kwargs) -> None:
    """
    Write the raw prediction results to a CSV file.

    Parameters
    ----------
    file_path : str
        Destination path of the CSV file.
    **kwargs : dict
        Extra options forwarded to pandas.DataFrame.to_csv.
        ``index`` is set to False unless the caller supplies it.

    Notes
    -----
    The DataFrame read from ``raw_predictions`` is written as-is; it includes:
    - peptide: Cleaned peptide sequence
    - predicted_rt: DeepLC predicted retention time
    - observed_rt: Original retention time
    - modifications: Unimod format modifications

    If ``raw_predictions`` is None, nothing is written and a warning is logged.
    """
    kwargs.setdefault("index", False)

    # Nothing to save: warn and leave without touching the file system.
    if self.raw_predictions is None:
        logger.warning("Raw predictions have not been generated yet.")
        return

    self.raw_predictions.to_csv(file_path, **kwargs)
    logger.info(f"Raw predictions saved to {file_path}")

Overlapping Peptide

overlapping_peptide

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """Generate features and return them as a DataFrame.

    Abstract: every concrete feature generator must override this method.

    Returns
    -------
    pd.DataFrame
        The computed features. The default ``apply()`` passes this frame to
        ``PsmContainer.add_features()`` with ``feature_key=self.id_column``,
        so implementations relying on that default presumably need to include
        the ``id_column`` column(s) next to the feature columns.
    """
    ...

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Build a generator instance from the pipeline configuration.

    The base implementation always raises; subclasses are expected to
    override it.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Raises
    ------
    NotImplementedError
        Always, naming the class that failed to provide an override.
    """
    message = f"{cls.__name__} must implement from_config()"
    raise NotImplementedError(message)

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Compute this generator's features and merge them into ``psms``.

    The features are produced by ``generate_features()`` and handed to
    ``psms.add_features()``, joining the container's peptide column against
    this generator's ``id_column``.  Subclasses override this method when
    they need another merge strategy (index-based, composite key) or extra
    post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).
    """
    generated = self.generate_features()
    merge_options = {
        "psms_key": psms.peptide_column,
        "feature_key": self.id_column,
        "source": source,
    }
    psms.add_features(generated, **merge_options)

PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)

A container for managing peptide-spectrum matches (PSMs) in immunopeptidomics rescoring pipelines.

Parameters:

Name Type Description Default
psms DataFrame

DataFrame containing the PSM data.

required
label_column str

Column containing the label (True for target, False for decoy).

required
scan_column str

Column containing the scan number.

required
spectrum_column str

Column containing the spectrum identifier.

required
ms_data_file_column str

Column containing the MS data file that the PSM originated from.

required
peptide_column str

Column containing the peptide sequence.

required
protein_column str

Column containing the protein accessions.

required
rescoring_features dict of str to list of str

Dictionary of feature columns for rescoring.

required
hit_rank_column str

Column containing the hit rank.

None
charge_column str

Column containing the charge state.

None
retention_time_column str

Column containing the retention time.

None
calculated_mass_column str

Column containing the calculated mass.

None
metadata_column str

Column containing metadata.

None

Attributes:

Name Type Description
psms DataFrame

Copy of the DataFrame containing the PSM data.

target_psms DataFrame

DataFrame containing only target PSMs (label = True).

decoy_psms DataFrame

DataFrame containing only decoy PSMs (label = False).

peptides list of str

List containing all peptides from the PSM data.

columns list of str

List of column names in the PSM DataFrame.

rescoring_features dict of str to list of str

Dictionary of rescoring feature columns in the PSM DataFrame.

Source code in optimhc/psm_container.py
def __init__(
    self,
    psms: pd.DataFrame,
    label_column: str,
    scan_column: str,
    spectrum_column: str,
    ms_data_file_column: str,
    peptide_column: str,
    protein_column: str,
    rescoring_features: Dict[str, List[str]],
    hit_rank_column: Optional[str] = None,
    charge_column: Optional[str] = None,
    retention_time_column: Optional[str] = None,
    calculated_mass_column: Optional[str] = None,
    metadata_column: Optional[str] = None,
):
    """
    Store a re-indexed copy of the PSM table and validate the column mapping.

    Raises
    ------
    ValueError
        If a named column is missing from ``psms``, the label column is not
        boolean, only targets or only decoys are present, the metadata
        column holds non-dict values, or a rescoring feature column is
        missing.
    """
    # Keep a private copy with a fresh 0..n-1 index; the caller's frame is
    # never modified.
    self._psms = psms.copy()
    self._psms.reset_index(drop=True, inplace=True)

    # Column mapping.
    self.label_column = label_column
    self.scan_column = scan_column
    self.spectrum_column = spectrum_column
    self.ms_data_file_column = ms_data_file_column
    self.peptide_column = peptide_column
    self.protein_column = protein_column
    self.hit_rank_column = hit_rank_column
    self.charge_column = charge_column
    self.retention_time_column = retention_time_column
    self.calculated_mass_column = calculated_mass_column
    self.metadata_column = metadata_column
    self.rescoring_features = rescoring_features
    # Set later, once rescoring results are attached.
    self.rescore_result_column = None

    available = psms.columns

    # Every configured column must exist; unset (falsy) names are skipped.
    configured_columns = (
        label_column,
        scan_column,
        spectrum_column,
        ms_data_file_column,
        peptide_column,
        protein_column,
        hit_rank_column,
        retention_time_column,
        charge_column,
        calculated_mass_column,
    )
    for name in configured_columns:
        if name and name not in available:
            raise ValueError(f"Column '{name}' not found in PSM data.")

    # The label column must be boolean and contain both classes.
    labels = psms[label_column]
    if labels.dtype != "bool":
        raise ValueError(f"Column '{label_column}' must be boolean.")
    if labels.nunique() == 1:
        if labels.iloc[0]:
            raise ValueError("All PSMs are labeled as target. No decoy PSMs found.")
        raise ValueError("All PSMs are labeled as decoy. No target PSMs found.")

    # Metadata, when configured, must be one dict per PSM.
    if metadata_column:
        if metadata_column not in available:
            raise ValueError(f"Column '{metadata_column}' not found in PSM data.")
        for entry in self._psms[metadata_column]:
            if not isinstance(entry, dict):
                raise ValueError(f"Column '{metadata_column}' must contain dictionaries.")

    # Every registered rescoring feature must be an existing column.
    for feature_source, feature_cols in rescoring_features.items():
        for feature_col in feature_cols:
            if feature_col in available:
                continue
            raise ValueError(
                f"Column '{feature_col}' not found in PSM data for feature '{feature_source}'."
            )

    # Final guard: the container is unusable without decoys.
    if len(self.decoy_psms) == 0:
        logger.error("No decoy PSMs found. Please check the decoy prefix.")
        raise ValueError("No decoy PSMs found.")

    logger.info("PsmContainer initialized with %d PSM entries.", len(self._psms))
    if self.ms_data_file_column:
        file_count = len(self._psms[ms_data_file_column].unique())
        logger.info("PSMs originated from %d MS data file(s).", file_count)
    logger.info("target psms: %d", len(self.target_psms))
    logger.info("decoy psms: %d", len(self.decoy_psms))
    logger.info("unique peptides: %d", len(np.unique(self.peptides)))
    logger.info("rescoring features: %s", rescoring_features)

psms property

Get a copy of the PSM DataFrame to prevent external modification.

Returns:

Type Description
DataFrame

A copy of the PSM DataFrame.

target_psms property

Get a DataFrame containing only target PSMs.

Returns:

Type Description
DataFrame

DataFrame with only target PSMs (label = True).

decoy_psms property

Get a DataFrame containing only decoy PSMs.

Returns:

Type Description
DataFrame

DataFrame with only decoy PSMs (label = False).

columns property

Get the column names of the PSM DataFrame.

Returns:

Type Description
list of str

List of column names.

feature_columns property

Get a list of all feature columns in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature column names.

feature_sources property

Get a list of all feature sources in the PSM DataFrame.

Returns:

Type Description
list of str

List of feature source names.

peptides property

Get the peptide sequences from the PSM data.

Returns:

Type Description
list of str

List of peptide sequences.

ms_data_files property

Get the MS data files from the PSM data.

Returns:

Type Description
list of str

List of MS data file names.

scan_ids property

Get the scan numbers from the PSM data.

Returns:

Type Description
list of int

List of scan numbers.

charges property

Get the charge states from the PSM data.

Returns:

Type Description
list of int

List of charge states.

metadata property

Get the metadata from the PSM data.

Returns:

Type Description
Series

Series containing metadata for each PSM.

spectrum_ids property

Get the spectrum identifiers from the PSM data.

Returns:

Type Description
list of str

List of spectrum identifiers.

identifier_columns property

Get the columns that uniquely identify each PSM.

Returns:

Type Description
list of str

List of identifier column names.

__len__()

Get the number of PSMs in the container.

Returns:

Type Description
int

Number of PSMs.

Source code in optimhc/psm_container.py
def __len__(self) -> int:
    """
    Report the size of the container as its number of PSM rows.

    Returns
    -------
    int
        Row count of the internal PSM DataFrame.
    """
    psm_table = self._psms
    return len(psm_table)

copy()

Return a deep copy of the PsmContainer object.

Returns:

Type Description
PsmContainer

A deep copy of the current PsmContainer.

Source code in optimhc/psm_container.py
def copy(self) -> "PsmContainer":
    """
    Create an independent duplicate of this container.

    Returns
    -------
    PsmContainer
        A deep copy; changes to it do not affect the original.
    """
    # Imported locally because this method's own name is ``copy``.
    from copy import deepcopy

    return deepcopy(self)

__repr__()

Return a string representation of the PsmContainer.

Returns:

Type Description
str

String summary of the PsmContainer.

Source code in optimhc/psm_container.py
def __repr__(self) -> str:
    """
    Build a multi-line, human-readable summary of the container.

    Returns
    -------
    str
        Total, target and decoy PSM counts, the number of unique peptides
        and spectra, and the rescoring feature mapping.
    """
    total = len(self)
    target_count = len(self.target_psms)
    decoy_count = len(self.decoy_psms)
    peptide_count = len(np.unique(self.peptides))
    spectrum_count = len(self._psms[self.spectrum_column].unique())
    features = self.rescoring_features
    return (
        f"PsmContainer with {total} PSMs\n"
        f"\t - Target PSMs: {target_count}\n"
        f"\t - Decoy PSMs: {decoy_count}\n"
        f"\t - Unique Peptides: {peptide_count}\n"
        f"\t - Unique Spectra: {spectrum_count}\n"
        f"\t - Rescoring Features: {features}\n"
    )

drop_features(features)

Drop specified features from the PSM DataFrame.

Parameters:

Name Type Description Default
features list of str

List of feature column names to drop.

required

Raises:

Type Description
ValueError

If any of the features do not exist in the DataFrame.

Source code in optimhc/psm_container.py
def drop_features(self, features: List[str]) -> None:
    """
    Drop specified features from the PSM DataFrame.

    The columns are removed from the DataFrame and from every entry of
    ``rescoring_features``. Sources left without any feature are removed
    from ``rescoring_features`` as well.

    Parameters
    ----------
    features : list of str
        Feature column names to drop. A single column name given as a
        string is treated as a one-element list.

    Raises
    ------
    ValueError
        If any of the features do not exist in the DataFrame.
    """
    # A bare string would otherwise be iterated character by character.
    features = [features] if isinstance(features, str) else list(features)

    missing_features = [f for f in features if f not in self._psms.columns]
    if missing_features:
        raise ValueError(f"Features not found in PSM data: {missing_features}")

    self._psms.drop(columns=features, inplace=True)

    dropped = set(features)
    emptied_sources = []
    # Only values of existing keys are reassigned here, which is safe while
    # iterating; keys are deleted afterwards.
    for source, cols in self.rescoring_features.items():
        remaining = [col for col in cols if col not in dropped]
        self.rescoring_features[source] = remaining
        if not remaining:
            emptied_sources.append(source)

    # Report (and remove) sources only when some source actually lost all of
    # its features; otherwise the message would be misleading.
    if emptied_sources:
        logger.info(
            "Sources to be removed: %s. Because all the features are removed.",
            emptied_sources,
        )
        for source in emptied_sources:
            del self.rescoring_features[source]

drop_source(source)

Drop all features associated with a specific source from the PSM DataFrame.

Parameters:

Name Type Description Default
source str

Name of the source to drop.

required

Raises:

Type Description
ValueError

If the source does not exist in the rescoring features.

Source code in optimhc/psm_container.py
def drop_source(self, source: str) -> None:
    """
    Remove every feature registered under one source.

    Parameters
    ----------
    source : str
        Key in ``rescoring_features`` whose feature columns are dropped.

    Raises
    ------
    ValueError
        If ``source`` is not a key of ``rescoring_features``.
    """
    registry = self.rescoring_features
    if source in registry:
        # Delegates column removal and source bookkeeping to drop_features.
        self.drop_features(registry[source])
        return
    raise ValueError(f"Source '{source}' not found in rescoring features.")

add_metadata(metadata_df, psms_key, metadata_key, source)

Merge new metadata into the PSM DataFrame based on specified columns. Metadata from the specified source is stored as a nested dictionary inside the metadata column.

Parameters:

Name Type Description Default
metadata_df DataFrame

DataFrame containing new metadata to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
metadata_key str or list of str

Column name(s) in the metadata data to merge on.

required
source str

Name of the source of the new metadata.

required
Source code in optimhc/psm_container.py
def add_metadata(
    self,
    metadata_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    metadata_key: Union[str, List[str]],
    source,
) -> None:
    """
    Merge new metadata into the PSM DataFrame based on specified columns.
    Metadata from the specified source is stored as a nested dictionary inside the metadata column.

    Every non-key column of ``metadata_df`` is stored per PSM as
    ``metadata[source][column]``. Entries already stored under ``source``
    are kept unless a new column has the same name, in which case the new
    value replaces the old one. PSMs without a match receive the missing
    values produced by the left join.

    Parameters
    ----------
    metadata_df : pd.DataFrame
        DataFrame containing new metadata to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    metadata_key : str or list of str
        Column name(s) in the metadata data to merge on.
    source : str
        Name of the source of the new metadata.

    Raises
    ------
    ValueError
        If the merge changes the number of PSMs (duplicate keys in
        ``metadata_df``). Nothing is modified in that case.
    """
    # Normalise keys to lists; with a plain string, ``col not in key`` would
    # be a substring test and could silently discard metadata columns.
    psms_keys = [psms_key] if isinstance(psms_key, str) else list(psms_key)
    metadata_keys = [metadata_key] if isinstance(metadata_key, str) else list(metadata_key)
    value_cols = [col for col in metadata_df.columns if col not in metadata_keys]

    # Only the key columns of the PSM table take part in the join. A left
    # join keeps the PSM row order; the right-hand columns keep their names
    # even if one of them collides with a PSM key column.
    merged = self._psms[psms_keys].merge(
        metadata_df,
        left_on=psms_keys,
        right_on=metadata_keys,
        how="left",
        suffixes=("_psms", None),
    )
    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging metadata resulted in a change in the number of PSMs. Check for duplicate keys."
        )

    if self.metadata_column is None:
        logger.info("No existing metadata column. Creating new metadata column.")
        self.metadata_column = "metadata"
        self._psms["metadata"] = [{} for _ in range(len(self._psms))]
    # Use the configured column rather than a hard-coded "metadata".
    column = self.metadata_column

    if not value_cols:
        # Nothing besides the key columns: there is no metadata to record.
        return

    existing = list(self._psms[column])
    # Look inside the stored dictionaries; ``source in Series`` would test
    # the index labels instead.
    if any(source in entry for entry in existing):
        logger.warning("%s already exists in metadata. Overwriting.", source)

    new_values = merged[value_cols].to_dict("records")
    # Assign positionally (plain list) so the result does not depend on the
    # index labels of the PSM table.
    self._psms[column] = [
        {**entry, source: {**entry.get(source, {}), **values}}
        for entry, values in zip(existing, new_values)
    ]

get_top_hits(n=1)

Get the PSMs whose hit rank is at most n, based on the hit rank column. If the hit rank column is not specified, a copy of the container with all PSMs is returned.

Parameters:

Name Type Description Default
n int

The number of top hits to return. Default is 1.

1

Returns:

Type Description
PsmContainer

A new PsmContainer object containing the top n hits.

Source code in optimhc/psm_container.py
def get_top_hits(self, n: int = 1):
    """
    Build a container restricted to PSMs whose hit rank is at most ``n``.

    When no hit rank column is configured, a warning is logged and an
    unfiltered copy of the container is returned.

    Parameters
    ----------
    n : int, optional
        Largest hit rank to keep. Default is 1.

    Returns
    -------
    PsmContainer
        A copy of this container holding only the selected PSMs; the
        current container is left unchanged.
    """
    rank_column = self.hit_rank_column
    if rank_column is None:
        logger.warning("Rank column not specified. Return the original PSMs.")
        return self.copy()

    top_hits = self.copy()
    table = top_hits._psms
    # Boolean-mask selection; the surviving rows keep their index labels.
    top_hits._psms = table[table[rank_column] <= n]
    return top_hits

add_features(features_df, psms_key, feature_key, source, suffix=None)

Merge new features into the PSM DataFrame based on specified columns.

This method performs a left join between the PSM data and feature data, ensuring that all PSMs are preserved while adding new features. It handles column name conflicts through optional suffixing and maintains feature source tracking.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
feature_key str or list of str

Column name(s) in the features data to merge on.

required
source str

Name of the source of the new features (e.g., 'deeplc', 'netmhc').

required
suffix str

Suffix to add to the new columns if there's a name conflict. Required when new feature columns have the same names as existing columns. For example, if adding features from different sources (e.g., 'score' from DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.

None

Returns:

Type Description
None

Raises:

Type Description
ValueError

If duplicate columns exist without suffix. If merging features changes the number of PSMs.

Notes

The method follows these steps: 1. Validates input and prepares merge keys 2. Checks for column name conflicts 3. Manages feature source: if the source already exists, it will be overwritten 4. Performs left join merge 5. Verifies data integrity

Suffix Usage

The suffix parameter is used to handle column name conflicts: - When adding features from different sources that might have the same column names - When you want to keep both the original and new features with the same name - When you need to track the source of features in the column names

If suffix is not provided and there are duplicate column names: - The method will raise a ValueError - You must either provide a suffix or rename the columns before adding

Examples:

>>> container = PsmContainer(...)
>>> # Adding features without suffix (no conflicts)
>>> features_df1 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'feature1': [0.1, 0.2, 0.3],
...     'feature2': [0.4, 0.5, 0.6]
... })
>>> container.add_features(
...     features_df1,
...     psms_key='scan',
...     feature_key='scan',
...     source='source1'
... )
>>> # Adding features with suffix (handling conflicts)
>>> features_df2 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
...     'feature3': [0.7, 0.8, 0.9]
... })
>>> container.add_features(
...     features_df2,
...     psms_key='scan',
...     feature_key='scan',
...     source='source2',
...     suffix='_new'  # 'score' becomes 'score_new'
... )
Source code in optimhc/psm_container.py
def add_features(
    self,
    features_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    feature_key: Union[str, List[str]],
    source: str,
    suffix: Optional[str] = None,
) -> None:
    """Merge new features into the PSM DataFrame based on specified columns.

    This method performs a left join between the PSM data and feature data,
    ensuring that all PSMs are preserved while adding new features. It handles
    column name conflicts through optional suffixing and maintains feature source
    tracking.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add. It is not modified.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    feature_key : str or list of str
        Column name(s) in the features data to merge on.
    source : str
        Name of the source of the new features (e.g., 'deeplc', 'netmhc').
    suffix : str, optional
        Suffix appended to the name of every new feature column (not only
        the conflicting ones). Required when new feature columns have the
        same names as existing columns. For example, if adding features
        from different sources (e.g., 'score' from DeepLC and NetMHC), use
        suffixes like '_deeplc' or '_netmhc' to distinguish them.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a new column name (after applying the suffix) already exists in
        the PSM data and does not belong to the source being overwritten.
        If merging features changes the number of PSMs.

    Notes
    -----
    The method follows these steps:
    1. Validates input and prepares merge keys
    2. Checks for column name conflicts; columns of an already registered
       `source` do not count, because they are about to be replaced
    3. Manages feature source: if the source already exists, it will be overwritten
    4. Performs left join merge
    5. Verifies data integrity, then stores the merged table and registers
       the new columns under `source`

    Name conflicts are detected before anything is modified. If the merge
    changes the number of PSMs, the merged table is discarded, but a
    previously registered `source` has already been dropped at that point.

    Suffix Usage
    ------------
    The suffix parameter is used to handle column name conflicts:
    - When adding features from different sources that might have the same column names
    - When you want to keep both the original and new features with the same name
    - When you need to track the source of features in the column names

    If suffix is not provided and there are duplicate column names:
    - The method will raise a ValueError
    - You must either provide a suffix or rename the columns before adding

    Examples
    --------
    >>> container = PsmContainer(...)
    >>> # Adding features without suffix (no conflicts)
    >>> features_df1 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'feature1': [0.1, 0.2, 0.3],
    ...     'feature2': [0.4, 0.5, 0.6]
    ... })
    >>> container.add_features(
    ...     features_df1,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source1'
    ... )
    >>> # Adding features with suffix (handling conflicts)
    >>> features_df2 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
    ...     'feature3': [0.7, 0.8, 0.9]
    ... })
    >>> container.add_features(
    ...     features_df2,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source2',
    ...     suffix='_new'  # columns become 'score_new' and 'feature3_new'
    ... )
    """
    psms_key = [psms_key] if isinstance(psms_key, str) else list(psms_key)
    feature_key = [feature_key] if isinstance(feature_key, str) else list(feature_key)

    new_feature_cols = [col for col in features_df.columns if col not in feature_key]
    applied_suffix = "" if suffix is None else suffix
    renamed_cols = [col + applied_suffix for col in new_feature_cols]

    # Columns currently owned by `source` are dropped before the merge, so
    # re-adding them must not be reported as a conflict.
    replaced_cols = set(self.rescoring_features.get(source, ()))
    occupied = set(self._psms.columns) - replaced_cols
    conflicts = [name for name in renamed_cols if name in occupied]
    if conflicts:
        if suffix is None:
            logger.warning(
                "Columns %s already exist in PSM data and no suffix was provided.",
                conflicts,
            )
            raise ValueError("Duplicate columns exist. No suffix provided.")
        logger.warning(
            "Columns %s already exist in PSM data even with suffix '%s'.",
            conflicts,
            suffix,
        )
        raise ValueError(
            f"Duplicate columns exist even after applying suffix '{suffix}': {conflicts}"
        )

    if suffix is not None:
        for col in new_feature_cols:
            if col in self._psms.columns:
                logger.warning(
                    "Column '%s' already exists in PSM data. Using suffix '%s'.",
                    col,
                    suffix,
                )

    logger.info("Adding %d new features from %s.", len(new_feature_cols), source)

    if not new_feature_cols:
        logger.warning("No new features to add. Check the feature key and PSMs key.")
        logger.warning("Feature key: %s; PSMs key: %s", feature_key, psms_key)

    if source in self.rescoring_features:
        logger.warning("%s already exists in rescoring features. Overwriting.", source)
        self.drop_source(source)

    renamed_df = features_df.rename(columns=dict(zip(new_feature_cols, renamed_cols)))
    merged = self._psms.merge(
        renamed_df, left_on=psms_key, right_on=feature_key, how="left"
    )

    # When the key columns are named differently on both sides, the merge
    # keeps the feature-side key columns; they are not features, so drop them.
    extra_key_cols = [
        col for col in feature_key if col not in psms_key and col in merged.columns
    ]
    if extra_key_cols:
        logger.debug(
            "Dropping columns from feature_key not in psms_key: %s", extra_key_cols
        )
        merged = merged.drop(columns=extra_key_cols)

    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate keys."
        )

    # Commit only after the merge has been validated.
    self._psms = merged
    self.rescoring_features[source] = renamed_cols

add_features_by_index(features_df, source, suffix=None)

Merge new features into the PSM DataFrame based on the DataFrame index.

Parameters:

Name Type Description Default
features_df DataFrame

DataFrame containing new features to add.

required
source str

Name of the source of the new features.

required
suffix str

Suffix to add to the new columns if there's a name conflict.

None
Source code in optimhc/psm_container.py
def add_features_by_index(
    self, features_df: pd.DataFrame, source: str, suffix: Optional[str] = None
) -> None:
    """
    Merge new features into the PSM DataFrame based on the DataFrame index.

    All columns of ``features_df`` are treated as features and left-joined
    onto the PSM data by index. If ``source`` is already registered, its
    previous feature columns are dropped first.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add. It is not modified.
    source : str
        Name of the source of the new features.
    suffix : str, optional
        Suffix appended to the name of every new feature column. Required
        when a new column name already exists in the PSM data.

    Raises
    ------
    ValueError
        If ``features_df`` has no columns.
        If a new column name (after applying the suffix) already exists in
        the PSM data and does not belong to the source being overwritten.
        If merging changes the number of PSMs (duplicate index values).
    """
    new_feature_cols = list(features_df.columns)
    if not new_feature_cols:
        logger.warning("No new features to add.")
        raise ValueError("No new features to add.")

    applied_suffix = "" if suffix is None else suffix
    renamed_cols = [col + applied_suffix for col in new_feature_cols]

    # Columns currently owned by `source` are dropped before the merge, so
    # re-adding them must not be reported as a conflict.
    replaced_cols = set(self.rescoring_features.get(source, ()))
    occupied = set(self._psms.columns) - replaced_cols
    conflicts = [name for name in renamed_cols if name in occupied]
    if conflicts:
        if suffix is None:
            logger.warning(
                "Columns %s already exist in PSM data and no suffix was provided.",
                conflicts,
            )
            raise ValueError("Duplicate columns exist. No suffix provided.")
        logger.warning(
            "Columns %s already exist in PSM data even with suffix '%s'.",
            conflicts,
            suffix,
        )
        raise ValueError(
            f"Duplicate columns exist even after applying suffix '{suffix}': {conflicts}"
        )

    if suffix is not None:
        for col in new_feature_cols:
            if col in self._psms.columns:
                logger.warning(
                    "Column '%s' already exists in PSM data. Using suffix '%s'.",
                    col,
                    suffix,
                )

    logger.info(
        "Adding %d new features from %s by index.", len(new_feature_cols), source
    )

    if source in self.rescoring_features:
        logger.warning("%s already exists in rescoring features. Overwriting.", source)
        self.drop_source(source)

    # Rename on a copy: the caller's DataFrame must keep its column names.
    renamed_df = features_df.rename(columns=dict(zip(new_feature_cols, renamed_cols)))
    merged = self._psms.merge(
        renamed_df,
        left_index=True,
        right_index=True,
        how="left",  # Perform a left join to preserve all original PSM data
    )

    # Ensure that the merge did not change the number of rows in the PSM DataFrame
    if len(merged) != len(self._psms):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate indices."
        )

    # Commit only after the merge has been validated.
    self._psms = merged
    self.rescoring_features[source] = renamed_cols

add_results(results_df, psms_key, result_key)

Add results of rescore engine to the PSM DataFrame based on specified columns.

Parameters:

Name Type Description Default
results_df DataFrame

DataFrame containing new results to add.

required
psms_key str or list of str

Column name(s) in the PSM data to merge on.

required
result_key str or list of str

Column name(s) in the results data to merge on.

required
Source code in optimhc/psm_container.py
def add_results(
    self,
    results_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    result_key: Union[str, List[str]],
) -> None:
    """
    Add results of rescore engine to the PSM DataFrame based on specified columns.

    The results are left-joined onto the PSM data with a one-to-one check,
    and the ``result_key`` column(s) are removed from the merged table.
    The container is only updated once the merge has succeeded.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing new results to add. None of its column names,
        including the key column(s), may already exist in the PSM data.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    result_key : str or list of str
        Column name(s) in the results data to merge on.

    Raises
    ------
    ValueError
        If the PSM data and ``results_df`` share any column name, or if the
        merge keys are not unique on both sides (pandas raises
        ``MergeError``, a ``ValueError`` subclass, for ``validate="one_to_one"``).
    """
    if self.rescore_result_column is not None:
        logger.warning("Rescore result column already exists. Overwriting.")

    if set(self._psms.columns) & set(results_df.columns):
        raise ValueError(
            "Duplicate columns exist. Please rename the columns in the results data."
        )

    merged = self._psms.merge(
        results_df,
        left_on=psms_key,
        right_on=result_key,
        how="left",
        validate="one_to_one",
    )
    merged = merged.drop(columns=result_key)

    # Commit state only after the merge succeeded, so a failed merge does
    # not leave the container claiming results it does not hold.
    self._psms = merged
    # NOTE(review): this records the merge key, which has just been dropped
    # from the table, not the result columns themselves. Confirm intended.
    self.rescore_result_column = result_key
    logger.info("Added rescore results to PSM data.")

write_pin(output_file, style='default', source=None)

Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats. The style parameter selects a unified PIN layout, so that different rescoring methods can be benchmarked on the same input file.

Parameters:

Name Type Description Default
output_file str

Path to the output PIN file.

required
style str

If set to 'msbooster', outputs only the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins). If set to 'default', outputs all features specified in rescoring_features, plus required Percolator columns.

'default'
source list of str

List of feature sources to include. If None, includes all sources.

None

Returns:

Type Description
DataFrame

The DataFrame written to the PIN file.

Notes
  • The first three columns are always: SpecId, Label, ScanNr.
  • For 'msbooster' style, the columns are: SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins.
  • If hit_rank_column is not specified, rank is set to 1 for all rows.
  • Either 'hyperscore' or 'expect' must be present in features; for 'expect', the column is written as 'log10_evalue'.
  • The 'log10_evalue' column should contain the base-10 logarithm of the e-value.
  • The 'Peptide' column is formatted with underscores (e.g., _.PEPTIDE._).
  • For standard format, all features from rescoring_features are appended between ScanNr and Peptide columns.
  • The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
  • Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.

Example output (default style): SpecId Label ScanNr feature1 feature2 ... Peptide Proteins

Example output (msbooster style): SpecId Label ScanNr retentiontime rank hyperscore Peptide Proteins or SpecId Label ScanNr retentiontime rank log10_evalue Peptide Proteins

Raises:

Type Description
ValueError

If required columns are missing for the selected style.

Source code in optimhc/psm_container.py
def write_pin(
    self, output_file: str, style: str = "default", source: List[str] = None
) -> pd.DataFrame:
    """
    Write the PSM data to a Percolator PIN file and return the written table.

    Two layouts are supported, a generic Percolator layout and an
    MSBooster-compatible layout, so that a unified PIN file can be produced
    when benchmarking different rescoring methods.

    Parameters
    ----------
    output_file : str
        Path to the output PIN file. The file is written tab-separated and
        without the DataFrame index.
    style : str, optional
        'default' writes SpecId, Label, ScanNr, the selected feature columns,
        Peptide and Proteins.
        'msbooster' writes SpecId, Label, ScanNr, retentiontime, rank,
        hyperscore or log10_evalue, the remaining selected feature columns,
        Peptide and Proteins.
    source : list of str, optional
        Feature sources (keys of `rescoring_features`) to include.
        If None, the features of all sources are included.

    Returns
    -------
    pd.DataFrame
        The DataFrame written to the PIN file.

    Raises
    ------
    ValueError
        If an entry of `source` is not a key of `rescoring_features`, if
        `style` is neither 'default' nor 'msbooster', or if 'msbooster' is
        requested without a retention time column or without a
        'hyperscore'/'expect' feature.

    Notes
    -----
    - The first three columns are always: SpecId, Label, ScanNr.
    - Label is written as 1 (target) and -1 (decoy), as required by Percolator.
      Boolean labels are mapped directly; textual labels must be spelled
      exactly 'True' or 'False' (anything else becomes NaN); numeric labels
      are converted by truthiness.
    - For 'msbooster' style, rank is taken from `hit_rank_column` (cast to int)
      or set to 1 for all rows when no hit rank column is configured.
    - For 'msbooster' style, 'hyperscore' is preferred; otherwise the 'expect'
      values are copied unchanged into 'log10_evalue'.
      NOTE(review): no log10 transform is applied here, so 'expect' is
      presumably already log10-transformed upstream -- confirm.
    - For 'msbooster' style only, Peptide is wrapped as `_.PEPTIDE._`.
    - Proteins stored as a list or tuple are joined with semicolons.
    - The table is passed through `self._convert_float_to_int` before writing.

    Example output (default style):
        SpecId	Label	ScanNr	feature1	feature2	...	Peptide	Proteins

    Example output (msbooster style):
        SpecId	Label	ScanNr	retentiontime	rank	hyperscore	...	Peptide	Proteins
        or
        SpecId	Label	ScanNr	retentiontime	rank	log10_evalue	...	Peptide	Proteins
    """
    df = self._psms.copy()
    labels = df[self.label_column]
    percolator_codes = {True: 1, False: -1}

    def _text_or_truthy_label(value):
        # Textual labels must be matched by spelling: bool("False") is True, so
        # a truthiness cast would turn every string-labelled decoy into a target.
        if isinstance(value, str):
            return {"True": 1, "False": -1}.get(value, float("nan"))
        return 1 if value else -1

    if pd.api.types.is_bool_dtype(labels.dtype):
        df["PercolatorLabel"] = labels.map(percolator_codes)
    elif pd.api.types.is_numeric_dtype(labels.dtype):
        logger.warning("Label column is not str or bool. Converting to bool.")
        df["PercolatorLabel"] = labels.astype(bool).map(percolator_codes)
    else:
        # Object / string / categorical columns: decide element by element so
        # the result does not depend on how pandas names the string dtype.
        df["PercolatorLabel"] = labels.astype(object).map(_text_or_truthy_label)
        if df["PercolatorLabel"].isna().any():
            logger.warning(
                "Some labels are neither 'True' nor 'False'; they are written as missing."
            )
    logger.info("Writing PIN file to %s", output_file)
    logger.info("Using style: %s", style)

    # Collect the feature columns of the requested sources, in source order.
    feature_cols = []
    if source is None:
        for cols in self.rescoring_features.values():
            feature_cols.extend(cols)
    else:
        for s in source:
            if s not in self.rescoring_features:
                raise ValueError(f"Source '{s}' not found in rescoring features.")
            feature_cols.extend(self.rescoring_features[s])

    pin_df = pd.DataFrame()
    pin_df["SpecId"] = df[self.spectrum_column]
    pin_df["Label"] = df["PercolatorLabel"]
    pin_df["ScanNr"] = df[self.scan_column]

    if style == "msbooster":
        if self.retention_time_column:
            pin_df["retentiontime"] = df[self.retention_time_column]
        else:
            raise ValueError("Retention time column is required for msbooster style.")

        pin_df["rank"] = df[self.hit_rank_column].astype(int) if self.hit_rank_column else 1
        if "hyperscore" in self.feature_columns:
            pin_df["hyperscore"] = df["hyperscore"]
        elif "expect" in self.feature_columns:
            pin_df["log10_evalue"] = df["expect"]
        else:
            raise ValueError(
                "Either 'hyperscore' or 'expect' column is required for msbooster style."
            )

        # Append the remaining features; the score, rank and retention time
        # columns were already written above under their MSBooster names.
        already_written = [
            "hyperscore",
            "expect",
            self.hit_rank_column,
            self.retention_time_column,
        ]
        for col in feature_cols:
            if col not in already_written:
                pin_df[col] = df[col]

        # PEPTIDE -> _.PEPTIDE._ (non-string entries are passed through untouched)
        pin_df["Peptide"] = df[self.peptide_column].apply(
            lambda x: f"_.{x}._" if isinstance(x, str) else x
        )

    elif style == "default":
        for col in feature_cols:
            pin_df[col] = df[col]
        pin_df["Peptide"] = df[self.peptide_column]
    else:
        raise ValueError(f"Unknown style: {style}. Use 'msbooster' or 'default'.")

    pin_df["Proteins"] = df[self.protein_column].apply(
        lambda x: ";".join(x) if isinstance(x, (list, tuple)) else x
    )
    pin_df = self._convert_float_to_int(pin_df)
    pin_df.to_csv(output_file, sep="\t", index=False)
    logger.info("PIN file written to %s", output_file)
    return pin_df

OverlappingPeptideFeatureGenerator(peptides, min_overlap_length=6, min_length=7, max_length=60, min_entropy=0, fill_missing='median', remove_pre_nxt_aa=False, remove_modification=True, *args, **kwargs)

Bases: BaseFeatureGenerator

Generates features based on peptide sequence overlaps using the Overlap-Layout-Consensus (OLC) algorithm.

This generator constructs an overlap graph of peptides, removes transitive edges, simplifies the graph to contigs, and computes features such as the number of overlaps, log-transformed overlap counts, overlap ranks, and contig lengths. It also filters out peptides with low entropy or outlier lengths before processing. Additionally, it records detailed information about brother peptides and contigs, accessible via the get_all_data method.

Parameters:

Name Type Description Default
peptides list of str

List of peptide sequences.

required
min_overlap_length int

Minimum required overlap length for peptides to be considered overlapping. Default is 6.

6
min_length int

Minimum peptide length to include in processing. Default is 7.

7
max_length int

Maximum peptide length to include in processing. Default is 60.

60
min_entropy float

Minimum Shannon entropy for peptides to include in processing. Default is 0.

0
fill_missing str

Method to fill missing values for filtered peptides. Options are 'median' or 'zero'. Default is 'median'.

'median'
remove_pre_nxt_aa bool

Whether to remove the preceding and following amino acids from peptides. Default is False.

False
remove_modification bool

Whether to remove modifications from peptides. Default is True.

True

Attributes:

Name Type Description
original_peptides list of str

Original list of peptide sequences.

min_overlap_length int

Minimum required overlap length.

min_length int

Minimum peptide length.

max_length int

Maximum peptide length.

min_entropy float

Minimum Shannon entropy.

fill_missing str

Method to fill missing values.

remove_pre_nxt_aa bool

Whether to remove preceding and following amino acids.

remove_modification bool

Whether to remove modifications.

filtered_peptides list of str

List of peptides after filtering.

filtered_indices list of int

Indices of filtered peptides.

peptide_to_index dict of str to int

Mapping of peptides to their indices.

overlap_data DataFrame

DataFrame containing overlap data.

peptide_to_contig dict of str to int

Mapping of peptides to their contig indices.

assembled_contigs list of dict

List of assembled contigs.

full_data DataFrame

Full data including brother peptides and contig information.

_overlap_graph DiGraph

Overlap graph.

_simplified_graph DiGraph

Simplified graph with transitive edges removed.

Notes

Key Data Structures: 1. contigs: List[List[str]] - Represents non-branching paths in the overlap graph - Each inner list contains peptide sequences that form a continuous chain - Example: [['PEPTIDE1', 'PEPTIDE2'], ['PEPTIDE3']]

2. assembled_contigs: List[Dict]
   - Contains the assembled sequences and their constituent peptides
   - Each dictionary has two keys:
       'sequence': The merged/assembled sequence of overlapping peptides
       'peptides': List of peptides that were used to build this contig
   - Example: [
       {
           'sequence': 'LONGPEPTIDESEQUENCE',
           'peptides': ['LONGPEP', 'PEPTIDE', 'SEQUENCE']
       },
       {
           'sequence': 'SINGLEPEPTIDE',
           'peptides': ['SINGLEPEPTIDE']
       }
   ]

3. peptide_to_contig: Dict[str, int]
   - Maps each peptide to its contig index in assembled_contigs
   - Key: peptide sequence
   - Value: index of the contig containing this peptide
   - Example: {
       'LONGPEP': 0,
       'PEPTIDE': 0,
       'SEQUENCE': 0,
       'SINGLEPEPTIDE': 1
   }

4. overlap_graph (_overlap_graph): nx.DiGraph
   - Directed graph representing all possible overlaps between peptides
   - Nodes: peptide sequences
   - Edges: overlaps between peptides
   - Edge weights: length of overlap

5. simplified_graph (_simplified_graph): nx.DiGraph
   - Simplified version of overlap_graph with transitive edges removed
   - Used for final contig assembly
   - More efficient representation of essential overlaps
Source code in optimhc/feature/overlapping_peptide.py
def __init__(
    self,
    peptides: List[str],
    min_overlap_length: int = 6,
    min_length: int = 7,
    max_length: int = 60,
    min_entropy: float = 0,
    fill_missing: str = "median",  # 'median' or 'zero'
    remove_pre_nxt_aa: bool = False,
    remove_modification: bool = True,
    *args,
    **kwargs,
):
    """
    Store the configuration and reset all derived state.

    The arguments are kept on the instance as given, except `fill_missing`,
    which is lower-cased. Every computed attribute (filtered peptides, graphs,
    contigs, cached tables) starts out empty or None. Extra positional and
    keyword arguments are accepted and ignored.
    """
    # --- caller-supplied configuration ---
    self.original_peptides = peptides
    self.min_overlap_length = min_overlap_length
    self.min_length, self.max_length = min_length, max_length
    self.min_entropy = min_entropy
    self.fill_missing = fill_missing.lower()
    self.remove_pre_nxt_aa = remove_pre_nxt_aa
    self.remove_modification = remove_modification

    # --- derived state, populated later by the processing steps ---
    self.filtered_peptides = []
    self.filtered_indices = []
    self.peptide_to_index = {}
    self.peptide_to_contig = {}
    self.assembled_contigs = []
    self.overlap_data = None
    self.full_data = None
    self._overlap_graph = None
    self._simplified_graph = None

    # --- report the effective settings ---
    summary_messages = (
        f"Initialized OverlappingPeptideFeatureGenerator with {len(peptides)} peptides and minimum overlap length: {min_overlap_length}",
        f"remove_pre_nxt_aa: {remove_pre_nxt_aa}, remove_modification: {remove_modification}",
        f"Peptide filtering parameters - min_length: {min_length}, max_length: {max_length}, min_entropy: {min_entropy}",
    )
    for message in summary_messages:
        logger.info(message)

id_column property

Returns a list of input columns required for the feature generator.

Returns: List[str]: List of input columns.

feature_columns property

Returns the feature column names.

overlap_graph property

Returns the overlap graph.

simplified_graph property

Returns the simplified overlap graph (the overlap graph with transitive edges removed).

contigs property

Returns the assembled contigs.

generate_features()

Generates features for peptide overlaps, including the count of overlapping peptides, contig length, and log-transformed counts and ranks.

Returns: pd.DataFrame: DataFrame containing the features.

Source code in optimhc/feature/overlapping_peptide.py
def generate_features(self) -> pd.DataFrame:
    """
    Build the overlap feature table.

    Runs the overlap feature integration and keeps only the 'Peptide' column
    followed by the generator's feature columns.

    Returns:
        pd.DataFrame: 'Peptide' plus one column per entry of `feature_columns`.
    """
    integrated = self._integrate_overlap_features()
    wanted_columns = ["Peptide"] + self.feature_columns
    selected = integrated[wanted_columns]
    logger.info(f"Generated overlap features for {len(selected)} peptides.")
    return selected

get_full_data()

Returns the full data including brother peptides and contig information for each peptide. In the output, the lists of contig peptides and brother peptides include redundant peptides, so that their counts match the corresponding peptide and contig_member_count.

Returns: pd.DataFrame: DataFrame containing peptides and their brother peptides and contigs.

Source code in optimhc/feature/overlapping_peptide.py
def get_full_data(self) -> pd.DataFrame:
    """
    Returns the overlap data extended with brother-peptide and contig details.

    For every filtered peptide assigned to a contig, the contig sequence, the
    contig's peptide list and the brother peptides (that list without entries
    equal to the peptide itself) are collected and left-merged onto
    `overlap_data` by 'clean_peptide'. The contig's 'full_contig_peptides'
    list (which keeps redundant peptides) is used when present, otherwise its
    'peptides' list. The result is cached in `self.full_data`.

    Returns:
        pd.DataFrame: `overlap_data` with the additional columns
        'BrotherPeptides', 'ContigSequence' and 'ContigPeptides'. Peptides
        without a contig have missing values in these columns.
    """
    # NOTE(review): the integration step runs before the cache check, exactly as
    # before; it presumably populates overlap_data and the contigs -- confirm
    # whether it is cheap to call again when the cache is already filled.
    self._integrate_overlap_features()
    if self.full_data is not None:
        logger.info("Full data has already been computed. Returning cached data.")
        return self.full_data

    detail_columns = ["clean_peptide", "BrotherPeptides", "ContigSequence", "ContigPeptides"]
    data_list = []

    for peptide in tqdm(self.filtered_peptides):
        contig_idx = self.peptide_to_contig.get(peptide)
        if contig_idx is None:
            continue
        contig_info = self.assembled_contigs[contig_idx]
        # Use full contig peptides (including redundant ones) if available
        full_peptides = contig_info.get("full_contig_peptides", contig_info["peptides"])
        data_list.append(
            {
                "clean_peptide": peptide,
                "BrotherPeptides": [p for p in full_peptides if p != peptide],
                "ContigSequence": contig_info["sequence"],
                "ContigPeptides": full_peptides,
            }
        )

    # Explicit columns keep the merge key present even when no peptide belongs
    # to a contig; a column-less empty frame would make the merge raise KeyError.
    full_data_df = pd.DataFrame(data_list, columns=detail_columns)
    self.full_data = self.overlap_data.merge(full_data_df, on="clean_peptide", how="left")
    return self.full_data

assign_brother_aggregated_feature(psms, feature_columns, overlapping_source, source_name='OverlappingGroupFeatures')

Assign aggregated features based on brother peptides to the PSMs.

For PSMs with the same ContigSequence (brother peptides), compute the mean of specified features and assign these aggregated features back to each PSM in the group. Additionally, compute the sum as mean * contig_member_count. If a PSM does not have a ContigSequence (no brothers), its new features will be set to the original values.

Parameters: psms (PsmContainer): PSM container containing the peptides and features. feature_columns (Union[str, List[str]]): Name of the feature column(s) to aggregate. overlapping_source (str): Source name of the overlapping peptide features. source_name (str): Name of the new feature source.

Returns: None

Source code in optimhc/feature/overlapping_peptide.py
def assign_brother_aggregated_feature(
    psms: PsmContainer,
    feature_columns: Union[str, List[str]],
    overlapping_source: str,
    source_name: str = "OverlappingGroupFeatures",
) -> None:
    """
    Assign aggregated features based on brother peptides to the PSMs.

    PSMs sharing the same ContigSequence (brother peptides) are grouped. For each
    requested feature the group mean is stored as '<feature>_contig_avg' and
    mean * contig_member_count as '<feature>_contig_sum' (a missing
    contig_member_count counts as 0). PSMs without a ContigSequence keep their
    original feature value in both new columns. The new columns are added to
    `psms` through `add_features_by_index` under `source_name`.

    Parameters:
        psms (PsmContainer): PSM container containing the peptides and features.
            Must have a metadata column and a 'contig_member_count' column.
        feature_columns (Union[str, List[str]]): Name of the feature column(s) to aggregate.
        overlapping_source (str): Key of the overlapping peptide data inside each
            PSM's metadata dictionary.
        source_name (str): Name of the new feature source.

    Returns:
        None

    Raises:
        ValueError: If the container has no metadata column, if
            'contig_member_count' is missing, or if a requested feature column
            is missing.
    """
    if isinstance(feature_columns, str):
        feature_columns = [feature_columns]
    psms_df = psms.psms

    if psms.metadata_column is None:
        raise ValueError("The PSMs do not contain metadata.")
    metadata = psms_df[psms.metadata_column]

    def get_overlapping_data(x):
        if isinstance(x, dict):
            return x.get(overlapping_source, {})
        else:
            logger.warning(f"Invalid metadata entry: {x}")
            return {}

    overlapping_data = metadata.apply(get_overlapping_data)

    def get_contig_sequence(x):
        if isinstance(x, dict):
            return x.get("ContigSequence", None)
        else:
            logger.warning(f"Invalid overlapping data entry: {x}")
            return None

    contig_sequences = overlapping_data.apply(get_contig_sequence)

    psms_df["ContigSequence"] = contig_sequences

    if "contig_member_count" not in psms_df.columns:
        raise ValueError("'contig_member_count' column not found in PSMs.")

    missing_features = [feature for feature in feature_columns if feature not in psms_df.columns]
    if missing_features:
        raise ValueError(f"Feature columns not found in PSMs: {missing_features}")

    grouped_mean = psms_df.groupby("ContigSequence")[feature_columns].mean().reset_index()
    grouped_mean = grouped_mean.rename(
        columns={feature: f"{feature}_contig_avg" for feature in feature_columns}
    )

    psms_with_agg = psms_df.merge(grouped_mean, on="ContigSequence", how="left")
    # A column-on-column merge discards the left index. The group keys are unique,
    # so the left merge keeps row count and order; restoring the original index
    # keeps the index-based hand-off below aligned with the PSM rows.
    psms_with_agg.index = psms_df.index

    # Loop-invariant: a missing member count contributes a sum of 0.
    member_count = psms_with_agg["contig_member_count"].fillna(0)
    psms_with_agg["contig_member_count"] = member_count

    for feature in feature_columns:
        mean_feature = f"{feature}_contig_avg"
        sum_feature = f"{feature}_contig_sum"
        # The sum is derived from the not-yet-filled mean, so PSMs without a
        # contig get NaN here and fall back to their own feature value.
        contig_sum = psms_with_agg[mean_feature] * member_count
        # Assign the filled result back explicitly: an in-place fillna on a
        # selected column does not update the frame under pandas Copy-on-Write.
        psms_with_agg[sum_feature] = contig_sum.fillna(psms_with_agg[feature])
        psms_with_agg[mean_feature] = psms_with_agg[mean_feature].fillna(
            psms_with_agg[feature]
        )

    agg_feature_columns = [f"{feature}_contig_avg" for feature in feature_columns] + [
        f"{feature}_contig_sum" for feature in feature_columns
    ]

    new_features_df = psms_with_agg[agg_feature_columns]

    psms.add_features_by_index(features_df=new_features_df, source=source_name)

PWM

pwm

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

c_flank_pwm_data = {'A': [0.891194, 0.599125, 0.567353], 'C': [-3.952777, -4.34522, -4.612584], 'D': [0.291173, 0.550133, 0.32569], 'E': [0.687212, 0.662834, 0.834717], 'F': [-1.250652, -0.784627, -1.139232], 'G': [0.509354, -0.36837, 0.919885], 'H': [-0.808229, -0.836628, -0.591508], 'I': [-0.046196, -0.034123, -1.56436], 'K': [0.452471, 0.665617, 1.086677], 'L': [0.522681, 0.382607, 0.208291], 'M': [-2.131278, -2.144693, -2.008653], 'N': [-0.208044, -0.699085, 0.022668], 'P': [0.417673, 1.179052, -0.01885], 'Q': [-0.033656, -0.051822, 0.274208], 'R': [0.535173, 0.397917, 0.924583], 'S': [0.542669, 0.255087, 0.360228], 'T': [-0.359617, -0.322741, -0.028565], 'V': [0.507861, 0.582748, -0.584754], 'W': [-3.432776, -3.024952, -3.391619], 'Y': [-1.1447, -0.725718, -1.221789], 'X': [0.0, 0.0, 0.0]} module-attribute

c_flank_pwm = pd.DataFrame.from_dict(c_flank_pwm_data, orient='index', columns=['Pos1', 'Pos2', 'Pos3']) module-attribute

n_flank_pwm_data = {'A': [0.672938, 0.494511, 0.290216], 'C': [-5.464582, -5.140732, -5.112201], 'D': [0.85685, 0.732683, 0.398964], 'E': [0.692225, 0.660452, 0.746641], 'F': [-1.024461, -1.529751, -0.687119], 'G': [0.873872, 0.746332, 0.630604], 'H': [-1.386627, -1.212169, -1.011943], 'I': [0.138351, -0.461093, 0.095419], 'K': [0.801095, 0.639492, 0.847272], 'L': [0.56242, -0.162599, 0.201511], 'M': [-2.230132, -2.754557, -2.397489], 'N': [-0.198452, -0.099572, -0.214853], 'P': [-1.491966, 1.405721, 0.53271], 'Q': [-0.622442, -0.151155, -0.006518], 'R': [0.216375, 0.217991, 0.545768], 'S': [0.623057, 0.416164, 0.236042], 'T': [0.160517, 0.011151, -0.111494], 'V': [0.52378, 0.239183, 0.330202], 'W': [-3.27634, -4.050898, -2.959115], 'Y': [-1.06052, -1.689795, -0.674071], 'X': [0.0, 0.0, 0.0]} module-attribute

n_flank_pwm = pd.DataFrame.from_dict(n_flank_pwm_data, orient='index', columns=['Pos1', 'Pos2', 'Pos3']) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """Compute this generator's features and return them as a DataFrame.

    Abstract: concrete generators provide the implementation.
    """

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Build a generator instance from the pipeline configuration.

    This base implementation always raises; it only defines the signature
    that concrete generators provide.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Raises
    ------
    NotImplementedError
        Always, naming the class the method was called on.
    """
    message = f"{cls.__name__} must implement from_config()"
    raise NotImplementedError(message)

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Compute the features and attach them to the PsmContainer.

    This default joins the generated table onto the container through
    ``add_features()``, matching the container's peptide column against
    this generator's ``id_column``.  Generators needing another merge
    strategy (index-based, composite key) or post-processing override it.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).
    """
    generated = self.generate_features()
    merge_options = {
        "psms_key": psms.peptide_column,
        "feature_key": self.id_column,
        "source": source,
    }
    psms.add_features(generated, **merge_options)

PWMFeatureGenerator(peptides, alleles, anchors=2, mhc_class='I', pwm_path=None, remove_pre_nxt_aa=False, remove_modification=True, *args, **kwargs)

Bases: BaseFeatureGenerator

Generates PWM (Position Weight Matrix) features for peptides based on specified MHC alleles.

This generator calculates PWM scores for each peptide against the provided MHC class I or II allele PWMs.

Parameters:

Name Type Description Default
peptides list of str

Series of peptide sequences.

required
alleles list of str

List of MHC allele names (e.g., ['HLA-A01:01', 'HLA-B07:02']).

required
anchors int

Number of anchor positions to consider for MHC class I. Default is 2.

2
mhc_class str

MHC class, either 'I' or 'II'. Default is 'I'.

'I'
pwm_path str or PathLike

Custom path to PWM files. Defaults to '../../data/PWMs'.

None
remove_pre_nxt_aa bool

Whether to remove the preceding and following (flanking) amino acids from peptides before scoring. Default is False.

False
remove_modification bool

Whether to remove modifications from peptides before scoring. Default is True.

True

Attributes:

Name Type Description
peptides Series

Series of peptide sequences.

alleles list of str

List of MHC allele names.

mhc_class str

MHC class ('I' or 'II').

pwm_path str or PathLike

Path to PWM files.

pwms dict

Dictionary of PWMs for each allele and mer length.

anchors int

Number of anchor positions for MHC class I.

remove_pre_nxt_aa bool

Whether to remove pre/post neighboring amino acids.

remove_modification bool

Whether to remove modifications.

Notes

For MHC class I: - Generates 'PWM_Score_{allele}' and optionally 'Anchor_Score_{allele}' columns. For MHC class II: - Generates 'PWM_Score_{allele}' (core 9-mer), - 'N_Flank_PWM_Score_{allele}', - 'C_Flank_PWM_Score_{allele}' columns.

Initializes the PWMFeatureGenerator.

Parameters: peptides (List[str]): Series of peptide sequences. alleles (List[str]): List of MHC allele names (e.g., ['HLA-A01:01', 'HLA-B07:02']). mhc_class (str): MHC class, either 'I' or 'II'. Default is 'I'. pwm_path (Optional[Union[str, os.PathLike]]): Custom path to PWM files. Defaults to '../../data/PWMs'. remove_pre_nxt_aa (bool): Whether to include the previous and next amino acids in peptides. If True, remove them. Default is False. remove_modification (bool): Whether to include modifications in peptides. If True, remove them. Default is True.

Source code in optimhc/feature/pwm.py
def __init__(
    self,
    peptides: List[str],
    alleles: List[str],
    anchors: int = 2,
    mhc_class: str = "I",
    pwm_path: Optional[Union[str, os.PathLike]] = None,
    remove_pre_nxt_aa: bool = False,
    remove_modification: bool = True,
    *args,
    **kwargs,
):
    """
    Initializes the PWMFeatureGenerator and loads the PWMs.

    Parameters:
        peptides (List[str]): Peptide sequences; stored as a pandas Series.
        alleles (List[str]): List of MHC allele names (e.g., ['HLA-A01:01', 'HLA-B07:02']).
        anchors (int): Number of anchor positions to consider for MHC class I. Default is 2.
        mhc_class (str): MHC class, either 'I' or 'II' (case-insensitive). Default is 'I'.
        pwm_path (Optional[Union[str, os.PathLike]]): Custom path to PWM files.
            Falls back to `PWMFeatureGenerator.DEFAULT_PWM_PATH` when not given.
        remove_pre_nxt_aa (bool): Whether to remove the preceding and following
            amino acids from peptides. Default is False.
        remove_modification (bool): Whether to remove modifications from peptides.
            Default is True.

    Raises:
        ValueError: If `mhc_class` is neither 'I' nor 'II'.
    """
    self.peptides = pd.Series(peptides)
    self.alleles = alleles
    self.mhc_class = mhc_class.upper()
    if self.mhc_class not in {"I", "II"}:
        raise ValueError("MHC class must be 'I' or 'II'.")
    self.pwm_path = pwm_path if pwm_path else PWMFeatureGenerator.DEFAULT_PWM_PATH
    logger.info(f"PWM path: {self.pwm_path}")
    # The loader runs only after mhc_class and pwm_path are set on the instance.
    self.pwms: Dict[str, Dict[int, pd.DataFrame]] = (
        self._load_pwms()
    )  # Dict[allele, Dict[mer, pd.DataFrame]]

    for allele, pwms in self.pwms.items():
        for mer, pwm in pwms.items():
            logger.debug(
                f"Loaded PWM for allele {allele}, length {mer}: {pwm.shape[1]} positions"
            )
            logger.debug(pwm)
    self.anchors = anchors
    # Test the normalized class: the raw argument may be lower-case ('i'), which
    # passes validation above but would silently skip this message.
    if self.mhc_class == "I" and anchors > 0:
        logger.info("Number of anchors: {}".format(self.anchors))
    self.remove_pre_nxt_aa = remove_pre_nxt_aa
    logger.info("Remove pre and next amino acids: {}".format(self.remove_pre_nxt_aa))
    self.remove_modification = remove_modification
    logger.info("Remove modifications: {}".format(self.remove_modification))
    logger.info(
        f"Initialized PWMFeatureGenerator with {len(peptides)} peptides and alleles: {alleles}"
    )

id_column property

Get a list of input columns required for the feature generator.

Returns:

Type Description
list of str

List of column names required for feature generation.

feature_columns property

Returns a list of feature names generated by the feature generator.

set_pwms(pwms)

Set PWMs directly, allowing for custom PWMs to be provided.

Parameters:

Name Type Description Default
pwms dict of str to dict of int to pd.DataFrame

Dictionary of PWMs for each allele and mer length. Format: {allele: {mer_length: pwm_dataframe}}

required
Source code in optimhc/feature/pwm.py
def set_pwms(self, pwms: Dict[str, Dict[int, pd.DataFrame]]):
    """
    Replace the generator's PWMs with caller-supplied ones.

    Parameters
    ----------
    pwms : dict of str to dict of int to pd.DataFrame
        PWMs keyed by allele, then by mer length.
        Format: {allele: {mer_length: pwm_dataframe}}
    """
    self.pwms = pwms
    announcement = f"Set custom PWMs for alleles: {list(pwms.keys())}"
    logger.info(announcement)

generate_features()

Generate PWM features for all peptides across specified alleles.

Returns:

Type Description
DataFrame

DataFrame containing generated features: For MHC class I: - 'PWM_Score_{allele}' and optionally 'Anchor_Score_{allele}' columns. For MHC class II: - 'PWM_Score_{allele}' (core 9-mer), - 'N_Flank_PWM_Score_{allele}', - 'C_Flank_PWM_Score_{allele}' columns.

Notes

Missing values are imputed with the median value for each feature.

Source code in optimhc/feature/pwm.py
def generate_features(self) -> pd.DataFrame:
    """
    Generate PWM features for all peptides across specified alleles.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the original 'Peptide' column plus the generated
        features:
        For MHC class I:
            - 'PWM_Score_{allele}' and optionally 'Anchor_Score_{allele}' columns.
        For MHC class II:
            - 'PWM_Score_{allele}' (core 9-mer),
            - 'N_Flank_PWM_Score_{allele}',
            - 'C_Flank_PWM_Score_{allele}' columns.

    Notes
    -----
    Missing values are imputed with the median value for each feature.

    Anchor scores (class I, ``self.anchors != 0``) are computed from the most
    conserved positions of every PWM that exists for the allele. Only the mer
    lengths actually present in ``self.pwms[allele]`` are used, so PWM sets
    with gaps between the shortest and longest length are supported.
    """
    features_df = pd.DataFrame(self.peptides, columns=["Peptide"])
    # Scores are computed on a cleaned working copy of the sequence; the
    # original 'Peptide' column is kept untouched in the returned frame.
    features_df["clean_peptide"] = features_df["Peptide"]
    if self.remove_pre_nxt_aa:
        features_df["clean_peptide"] = features_df["Peptide"].apply(
            utils.strip_flanking_and_charge
        )
    if self.remove_modification:
        features_df["clean_peptide"] = features_df["clean_peptide"].apply(
            utils.remove_modifications
        )

    # Convert nonstandard amino acids: U -> C
    features_df["clean_peptide"] = features_df["clean_peptide"].apply(
        lambda x: x.replace("U", "C")
    )

    for allele in self.alleles:
        logger.info(
            f"Generating PWM scores for allele: {allele}, total peptides: {len(features_df)}"
        )

        if self.mhc_class == "I":
            # Class I returns a single score
            features_df[f"PWM_Score_{allele}"] = features_df["clean_peptide"].apply(
                lambda peptide: self._cal_PWM_score(peptide, allele)
            )
            na_count = features_df[f"PWM_Score_{allele}"].isna().sum()
            logger.info(
                f"Missing PWM scores for {na_count} peptides. Using median for imputation."
            )
            features_df.fillna(
                {f"PWM_Score_{allele}": features_df[f"PWM_Score_{allele}"].median()},
                inplace=True,
            )

            if self.anchors != 0:
                logger.info(
                    f"Generating anchor scores for allele: {allele}, total peptides: {len(features_df)}"
                )
                # Walk the mer lengths that actually have a PWM rather than a
                # dense min..max range: a gap in the available lengths (e.g.
                # custom PWMs for 8- and 10-mers only) would otherwise raise
                # KeyError. Sorting keeps the ascending order of the old loop.
                anchor_dict = {
                    mer_len: self._most_conserved_positions(
                        self.pwms[allele][mer_len], self.anchors
                    )
                    for mer_len in sorted(self.pwms[allele])
                }
                logger.info(f"Most conserved positions for allele {allele}: {anchor_dict}")
                features_df[f"Anchor_Score_{allele}"] = features_df["clean_peptide"].apply(
                    lambda peptide: self._cal_anchor_score(peptide, allele, anchor_dict)
                )
                na_count = features_df[f"Anchor_Score_{allele}"].isna().sum()
                logger.info(
                    f"Missing anchor scores for {na_count} peptides. Using median for imputation."
                )
                features_df.fillna(
                    {f"Anchor_Score_{allele}": features_df[f"Anchor_Score_{allele}"].median()},
                    inplace=True,
                )

        else:
            # Class II returns (core_score, n_flank_score, c_flank_score)
            features_df[
                [
                    f"PWM_Score_{allele}",
                    f"N_Flank_PWM_Score_{allele}",
                    f"C_Flank_PWM_Score_{allele}",
                ]
            ] = features_df["clean_peptide"].apply(
                lambda pep: pd.Series(self._cal_PWM_score(pep, allele))
            )

            # Impute missing (NaN) with medians for each new column
            for col in [
                f"PWM_Score_{allele}",
                f"N_Flank_PWM_Score_{allele}",
                f"C_Flank_PWM_Score_{allele}",
            ]:
                na_count = features_df[col].isna().sum()
                logger.info(
                    f"Missing {col} for {na_count} peptides. Using median for imputation."
                )
                median_val = features_df[col].median()
                features_df.loc[:, col] = features_df[col].fillna(median_val)
                features_df[col] = features_df[col].infer_objects(copy=False)

    # The cleaned sequence was only a scoring aid; it is not a feature.
    features_df.drop(columns=["clean_peptide"], inplace=True)

    return features_df

MHCflurry

mhcflurry

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """
    Compute this generator's features.

    Abstract hook: the base implementation has no body, so concrete
    generators must provide their own.

    Returns
    -------
    pd.DataFrame
        The generated features.
    """

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Build a generator instance from the pipeline configuration.

    This base implementation only raises; a concrete generator has to
    supply its own version.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Raises
    ------
    NotImplementedError
        Always, naming the class that lacks an implementation.
    """
    message = f"{cls.__name__} must implement from_config()"
    raise NotImplementedError(message)

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Run the generator and attach its output to a PsmContainer.

    Features come from ``generate_features()`` and are handed to
    ``psms.add_features()``, keyed on the container's peptide column and
    this generator's ``id_column``.  Subclasses can override this for
    other merge strategies (index-based, composite key) or for extra
    post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).
    """
    generated = self.generate_features()
    container_key = psms.peptide_column
    generator_key = self.id_column
    psms.add_features(
        generated,
        psms_key=container_key,
        feature_key=generator_key,
        source=source,
    )

MHCflurryFeatureGenerator(peptides, alleles, remove_pre_nxt_aa=False, remove_modification=True, *args, **kwargs)

Bases: BaseFeatureGenerator

Generate MHCflurry features for peptides based on specified MHC class I alleles.

This generator calculates MHCflurry presentation scores for each peptide against the provided MHC class I alleles.

Parameters:

Name Type Description Default
peptides List[str]

List of peptide sequences.

required
alleles List[str]

List of MHC allele names (e.g., ['HLA-A01:01', 'HLA-B07:02']).

required
remove_pre_nxt_aa bool

Whether to remove the flanking previous and next amino acids from peptides. If True, they are removed. Default is False.

False
remove_modification bool

Whether to remove modifications from peptides. If True, they are removed. Default is True.

True
Notes

The generated features include: - mhcflurry_affinity: Binding affinity score - mhcflurry_processing_score: Processing score - mhcflurry_presentation_score: Presentation score - mhcflurry_presentation_percentile: Presentation percentile

Source code in optimhc/feature/mhcflurry.py
def __init__(
    self,
    peptides: List[str],
    alleles: List[str],
    remove_pre_nxt_aa: bool = False,
    remove_modification: bool = True,
    *args,
    **kwargs,
):
    """
    Store the inputs and load the MHCflurry presentation predictor.

    Parameters
    ----------
    peptides : List[str]
        List of peptide sequences.
    alleles : List[str]
        List of MHC allele names.
    remove_pre_nxt_aa : bool, optional
        Stored on the instance as ``remove_pre_nxt_aa``. Default is False.
    remove_modification : bool, optional
        Stored on the instance as ``remove_modification``. Default is True.
    *args, **kwargs
        Accepted but not used by this constructor.
    """
    # Inputs and peptide-cleaning switches.
    self.peptides, self.alleles = peptides, alleles
    self.remove_pre_nxt_aa, self.remove_modification = (
        remove_pre_nxt_aa,
        remove_modification,
    )

    self.predictor = Class1PresentationPredictor.load()

    # No predictions exist until they are computed later on.
    self.predictions = self._raw_predictions = None

    n_peptides = len(peptides)
    logger.info(
        f"Initialized MHCflurryFeatureGenerator with {n_peptides} peptides and alleles: {alleles}"
    )

feature_columns property

Return the list of generated feature column names.

Returns:

Type Description
List[str]

List of feature column names: - mhcflurry_affinity - mhcflurry_processing_score - mhcflurry_presentation_score - mhcflurry_presentation_percentile

id_column property

Return the list of input columns required for the feature generator.

Returns:

Type Description
List[str]

List of input column names.

raw_predictions property

Return the raw predictions DataFrame.

Returns:

Type Description
DataFrame

DataFrame containing the raw predictions: - peptide: Cleaned peptide sequence - allele: MHC allele - affinity: Binding affinity - processing_score: Processing score - presentation_score: Presentation score - presentation_percentile: Presentation percentile

get_raw_predictions()

Get the raw prediction results DataFrame from MHCflurry.

Returns:

Type Description
DataFrame

Raw prediction results DataFrame containing: - peptide: Cleaned peptide sequence - allele: MHC allele - affinity: Binding affinity - processing_score: Processing score - presentation_score: Presentation score - presentation_percentile: Presentation percentile

Source code in optimhc/feature/mhcflurry.py
def get_raw_predictions(self) -> pd.DataFrame:
    """
    Return the raw MHCflurry prediction table.

    Method-style accessor for the ``raw_predictions`` attribute; the value
    is returned unchanged.

    Returns
    -------
    pd.DataFrame
        Raw prediction results, documented as containing:
        - peptide: Cleaned peptide sequence
        - allele: MHC allele
        - affinity: Binding affinity
        - processing_score: Processing score
        - presentation_score: Presentation score
        - presentation_percentile: Presentation percentile
    """
    raw = self.raw_predictions
    return raw

save_raw_predictions(file_path, **kwargs)

Save the raw prediction results to a file.

Parameters:

Name Type Description Default
file_path str

Path to save the file.

required
**kwargs dict

Additional parameters passed to pandas.DataFrame.to_csv. If 'index' is not specified, it defaults to False.

{}
Notes

This method saves the raw predictions DataFrame to a CSV file. The DataFrame includes: - peptide: Cleaned peptide sequence - allele: MHC allele - affinity: Binding affinity - processing_score: Processing score - presentation_score: Presentation score - presentation_percentile: Presentation percentile

Source code in optimhc/feature/mhcflurry.py
def save_raw_predictions(self, file_path: str, **kwargs) -> None:
    """
    Write the raw prediction table to a CSV file.

    Parameters
    ----------
    file_path : str
        Path to save the file.
    **kwargs : dict
        Extra keyword arguments forwarded to ``pandas.DataFrame.to_csv``.
        ``index`` defaults to False unless the caller sets it.

    Notes
    -----
    When ``raw_predictions`` is None nothing is written and a warning is
    logged instead.  The saved table is documented as containing:
    - peptide: Cleaned peptide sequence
    - allele: MHC allele
    - affinity: Binding affinity
    - processing_score: Processing score
    - presentation_score: Presentation score
    - presentation_percentile: Presentation percentile
    """
    kwargs.setdefault("index", False)

    raw = self.raw_predictions
    if raw is None:
        logger.warning("No raw prediction results available to save.")
        return

    raw.to_csv(file_path, **kwargs)
    logger.info(f"Raw prediction results saved to: {file_path}")

generate_features()

Generate MHCflurry features for the provided peptides and alleles.

Returns:

Type Description
DataFrame

DataFrame containing the peptides and their predicted MHCflurry features: - Peptide: Original peptide sequence - mhcflurry_affinity: Binding affinity - mhcflurry_processing_score: Processing score - mhcflurry_presentation_score: Presentation score - mhcflurry_presentation_percentile: Presentation percentile

Notes

This method: 1. Runs MHCflurry predictions 2. Renames columns to include 'mhcflurry_' prefix 3. Fills missing values with median values 4. Returns the final feature DataFrame

Source code in optimhc/feature/mhcflurry.py
def generate_features(self) -> pd.DataFrame:
    """
    Build the MHCflurry feature table for the stored peptides and alleles.

    Returns
    -------
    pd.DataFrame
        One 'Peptide' column followed by the four feature columns:
        - mhcflurry_affinity
        - mhcflurry_processing_score
        - mhcflurry_presentation_score
        - mhcflurry_presentation_percentile

    Notes
    -----
    Steps, in order:
    1. Run ``self._predict()`` and work on a copy of ``self.predictions``
    2. Rename the prediction columns to their 'mhcflurry_' names
    3. Fill missing values in each feature column with that column's median
    4. Keep only 'Peptide' and the feature columns, warning if NaN remains
    """
    self._predict()

    renamed = {
        "affinity": "mhcflurry_affinity",
        "processing_score": "mhcflurry_processing_score",
        "presentation_score": "mhcflurry_presentation_score",
        "presentation_percentile": "mhcflurry_presentation_percentile",
    }
    score_columns = list(renamed.values())

    features_df = self.predictions.copy().rename(columns=renamed)

    # Per-column medians are taken before any value is filled in.
    medians = {column: features_df[column].median() for column in score_columns}
    features_df = features_df.fillna(value=medians)
    logger.info(f"Generated MHCflurry features for {len(features_df)} peptides.")

    features_df = features_df[["Peptide"] + score_columns]
    if features_df.isna().values.any():
        logger.warning("NaN values found in the generated features.")
    return features_df

get_best_allele()

Get the best allele for each peptide.

Returns:

Type Description
DataFrame

DataFrame containing the best alleles for the peptides: - Peptide: Original peptide sequence - mhcflurry_best_allele: Best binding allele

Notes

The best allele is determined by the lowest presentation percentile rank.

Source code in optimhc/feature/mhcflurry.py
def get_best_allele(self) -> pd.DataFrame:
    """
    Get the best allele for each peptide.

    Returns
    -------
    pd.DataFrame
        New DataFrame (``self.predictions`` is left untouched) with:
        - Peptide: Original peptide sequence
        - mhcflurry_best_allele: Best binding allele

    Raises
    ------
    ValueError
        If no predictions are available yet.

    Notes
    -----
    The ``best_allele`` column is taken as-is from ``self.predictions``; it is
    documented as the allele with the lowest presentation percentile rank
    (that selection happens outside this method).
    """
    # Fail with the same explicit error as predictions_to_dataframe() instead
    # of an opaque "'NoneType' object is not subscriptable".
    if self.predictions is None:
        raise ValueError("No predictions available. Please run 'generate_features' first.")

    # rename() without inplace returns a fresh frame, so the column selection
    # is never mutated in place.
    best_allele_df = self.predictions[["Peptide", "best_allele"]].rename(
        columns={"best_allele": "mhcflurry_best_allele"}
    )

    logger.info(f"Generated best allele information for {len(best_allele_df)} peptides.")

    return best_allele_df

predictions_to_dataframe()

Convert the predictions to a DataFrame.

Returns:

Type Description
DataFrame

DataFrame containing the predictions.

Raises:

Type Description
ValueError

If no predictions are available.

Source code in optimhc/feature/mhcflurry.py
def predictions_to_dataframe(self) -> pd.DataFrame:
    """
    Return the stored predictions.

    Returns
    -------
    pd.DataFrame
        The object held in ``self.predictions``, returned without copying.

    Raises
    ------
    ValueError
        If ``self.predictions`` is None, i.e. no predictions are available.
    """
    current = self.predictions
    if current is not None:
        return current
    raise ValueError("No predictions available. Please run 'generate_features' first.")

NetMHCpan

netmhcpan

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """
    Compute this generator's features.

    Abstract hook: the base implementation has no body, so concrete
    generators must provide their own.

    Returns
    -------
    pd.DataFrame
        The generated features.
    """

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Build a generator instance from the pipeline configuration.

    This base implementation only raises; a concrete generator has to
    supply its own version.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Raises
    ------
    NotImplementedError
        Always, naming the class that lacks an implementation.
    """
    message = f"{cls.__name__} must implement from_config()"
    raise NotImplementedError(message)

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Run the generator and attach its output to a PsmContainer.

    Features come from ``generate_features()`` and are handed to
    ``psms.add_features()``, keyed on the container's peptide column and
    this generator's ``id_column``.  Subclasses can override this for
    other merge strategies (index-based, composite key) or for extra
    post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).
    """
    generated = self.generate_features()
    container_key = psms.peptide_column
    generator_key = self.id_column
    psms.add_features(
        generated,
        psms_key=container_key,
        feature_key=generator_key,
        source=source,
    )

NetMHCpanFeatureGenerator(peptides, alleles, mode='best', remove_pre_nxt_aa=False, remove_modification=True, n_processes=1, show_progress=False, *args, **kwargs)

Bases: BaseFeatureGenerator

Generate NetMHCpan features for peptides based on specified MHC class I alleles.

This generator calculates NetMHCpan binding predictions for each peptide against the provided MHC class I alleles.

Parameters:

Name Type Description Default
peptides List[str]

List of peptide sequences.

required
alleles List[str]

List of MHC allele names (e.g., ['HLA-A02:01', 'HLA-B07:02']).

required
mode str

Mode of feature generation. Options: - 'best': Return only the best allele information for each peptide. - 'all': Return predictions for all alleles with allele-specific suffixes plus best allele info. Default is 'best'.

'best'
remove_pre_nxt_aa bool

Whether to remove the flanking previous and next amino acids from peptides. If True, they are removed. Default is False.

False
remove_modification bool

Whether to remove modifications from peptides. If True, they are removed. Default is True.

True
n_processes int

Number of processes to use for multiprocessing. Default is 1 (no multiprocessing).

1
show_progress bool

Whether to display a progress bar. Default is False.

False
Notes

The generated features include: - netmhcpan_score: Raw binding score - netmhcpan_affinity: Binding affinity in nM - netmhcpan_percentile_rank: Percentile rank of the binding score

Source code in optimhc/feature/netmhcpan.py
def __init__(
    self,
    peptides: List[str],
    alleles: List[str],
    mode: str = "best",
    remove_pre_nxt_aa: bool = False,
    remove_modification: bool = True,
    n_processes: int = 1,
    show_progress: bool = False,
    *args,
    **kwargs,
):
    """
    Validate the mode, store the inputs and create the NetMHCpan predictor.

    Parameters
    ----------
    peptides : List[str]
        List of peptide sequences.
    alleles : List[str]
        List of MHC allele names.
    mode : str, optional
        Either 'best' or 'all'. Forced to 'best' when exactly one allele is
        given. Default is 'best'.
    remove_pre_nxt_aa : bool, optional
        Stored on the instance as ``remove_pre_nxt_aa``. Default is False.
    remove_modification : bool, optional
        Stored on the instance as ``remove_modification``. Default is True.
    n_processes : int, optional
        Requested number of processes; capped at ``cpu_count()``. Default is 1.
    show_progress : bool, optional
        Stored on the instance as ``show_progress``. Default is False.
    *args, **kwargs
        Accepted but not used by this constructor.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'best' nor 'all'.
    """
    if mode not in ["best", "all"]:
        raise ValueError("Mode must be one of 'best' or 'all'.")

    self.peptides = peptides
    self.alleles = alleles
    self.mode = mode
    if len(alleles) == 1:
        # With a single allele, per-allele and best-allele results coincide.
        self.mode = "best"
        logger.info("Only one allele provided. Switching to 'best' mode.")
    self.remove_pre_nxt_aa = remove_pre_nxt_aa
    self.remove_modification = remove_modification
    self.n_processes = min(n_processes, cpu_count())
    self.show_progress = show_progress
    self.predictor = NetMHCpan41(alleles=self.alleles)
    self.predictions = None
    self._raw_predictions = None
    # Report the effective mode (self.mode), not the requested one: the two
    # differ after the single-allele override above.
    logger.info(
        f"Initialized NetMHCpanFeatureGenerator with {len(peptides)} peptides, alleles: {alleles}, mode: {self.mode}, n_processes: {self.n_processes}, show_progress: {self.show_progress}"
    )

feature_columns property

Return the list of generated feature column names, determined by the mode. Only includes numerical features, excluding any string features like allele names.

Returns:

Type Description
List[str]

List of feature column names: - For 'all' mode: netmhcpan_score_{allele}, netmhcpan_affinity_{allele}, netmhcpan_percentile_rank_{allele} for each allele - For both modes: netmhcpan_best_score, netmhcpan_best_affinity, netmhcpan_best_percentile_rank

id_column property

Return the list of input columns required for the feature generator.

Returns:

Type Description
List[str]

List of input column names.

raw_predictions property

Return the raw prediction results from NetMHCpan.

Returns:

Type Description
DataFrame

Raw prediction results DataFrame containing: - peptide: Cleaned peptide sequence - allele: MHC allele - score: Raw binding score - affinity: Binding affinity in nM - percentile_rank: Percentile rank

get_raw_predictions()

Get the raw prediction results DataFrame from NetMHCpan.

Returns:

Type Description
DataFrame

Raw prediction results DataFrame containing: - peptide: Cleaned peptide sequence - allele: MHC allele - score: Raw binding score - affinity: Binding affinity in nM - percentile_rank: Percentile rank

Source code in optimhc/feature/netmhcpan.py
def get_raw_predictions(self) -> pd.DataFrame:
    """
    Return the raw NetMHCpan prediction table.

    Method-style accessor for the ``raw_predictions`` attribute; the value
    is returned unchanged.

    Returns
    -------
    pd.DataFrame
        Raw prediction results, documented as containing:
        - peptide: Cleaned peptide sequence
        - allele: MHC allele
        - score: Raw binding score
        - affinity: Binding affinity in nM
        - percentile_rank: Percentile rank
    """
    raw = self.raw_predictions
    return raw

save_raw_predictions(file_path, **kwargs)

Save the raw prediction results to a file.

Parameters:

Name Type Description Default
file_path str

Path to save the file.

required
**kwargs dict

Additional parameters passed to pandas.DataFrame.to_csv. If 'index' is not specified, it defaults to False.

{}
Notes

This method saves the raw predictions DataFrame to a CSV file. The DataFrame includes: - peptide: Cleaned peptide sequence - allele: MHC allele - score: Raw binding score - affinity: Binding affinity in nM - percentile_rank: Percentile rank

Source code in optimhc/feature/netmhcpan.py
def save_raw_predictions(self, file_path: str, **kwargs) -> None:
    """
    Write the raw prediction table to a CSV file.

    Parameters
    ----------
    file_path : str
        Path to save the file.
    **kwargs : dict
        Extra keyword arguments forwarded to ``pandas.DataFrame.to_csv``.
        ``index`` defaults to False unless the caller sets it.

    Notes
    -----
    When ``raw_predictions`` is None nothing is written and a warning is
    logged instead.  The saved table is documented as containing:
    - peptide: Cleaned peptide sequence
    - allele: MHC allele
    - score: Raw binding score
    - affinity: Binding affinity in nM
    - percentile_rank: Percentile rank
    """
    kwargs.setdefault("index", False)

    raw = self.raw_predictions
    if raw is None:
        logger.warning("No raw prediction results available to save.")
        return

    raw.to_csv(file_path, **kwargs)
    logger.info(f"Raw prediction results saved to: {file_path}")

generate_features()

Generate the final feature table with NetMHCpan features for each peptide.

Returns:

Type Description
DataFrame

DataFrame containing peptides and their predicted features: - Peptide: Original peptide sequence - For 'all' mode: netmhcpan_score_{allele}, netmhcpan_affinity_{allele}, netmhcpan_percentile_rank_{allele} for each allele - For both modes: netmhcpan_best_score, netmhcpan_best_affinity, netmhcpan_best_percentile_rank

Notes

The features generated depend on the mode: - 'best': Only the best allele information for each peptide - 'all': All allele predictions plus best allele information

Missing values are handled consistently by filling with median values for numeric columns.

Source code in optimhc/feature/netmhcpan.py
def generate_features(self) -> pd.DataFrame:
    """
    Assemble the NetMHCpan feature table, one row per input peptide.

    Returns
    -------
    pd.DataFrame
        'Peptide' followed by the columns listed in ``feature_columns``:
        - For 'all' mode: netmhcpan_score_{allele}, netmhcpan_affinity_{allele},
          netmhcpan_percentile_rank_{allele} for each allele
        - For both modes: netmhcpan_best_score, netmhcpan_best_affinity,
          netmhcpan_best_percentile_rank

    Notes
    -----
    Pipeline: run ``_predict()``, add per-allele features (mode 'all' only),
    add best-allele features (every mode), fill missing values via
    ``_fill_missing_values()``, then keep 'Peptide' plus ``feature_columns``.
    A warning is logged if any NaN is left after filling.
    """
    raw_predictions = self._predict()

    table = pd.DataFrame({"Peptide": self.peptides})

    # Per-allele columns are added only in 'all' mode; the best-allele
    # columns are added in every mode.
    if self.mode == "all":
        table = self._generate_all_allele_features(raw_predictions, table)
    table = self._generate_best_allele_features(raw_predictions, table)

    table = self._fill_missing_values(table)

    selected_columns = ["Peptide"] + self.feature_columns
    logger.info(f"Final selected feature columns: {selected_columns}")
    table = table[selected_columns]

    if table.isna().values.any():
        logger.warning(
            "NaN values still exist in the generated features after filling with median/mode values."
        )

    return table

predictions_to_dataframe()

Convert the predictions to a DataFrame.

Returns:

Type Description
DataFrame

DataFrame containing the predictions.

Raises:

Type Description
ValueError

If no predictions are available.

Source code in optimhc/feature/netmhcpan.py
def predictions_to_dataframe(self) -> pd.DataFrame:
    """
    Return the stored predictions.

    Returns
    -------
    pd.DataFrame
        The object held in ``self.predictions``, returned without copying.

    Raises
    ------
    ValueError
        If ``self.predictions`` is None, i.e. no predictions are available.
    """
    current = self.predictions
    if current is not None:
        return current
    raise ValueError("No predictions available. Please run 'generate_features' first.")

_predict_peptide_chunk(peptides_chunk, alleles)

Predict NetMHCpan scores for a chunk of peptides.

Parameters:

Name Type Description Default
peptides_chunk List[str]

List of peptide sequences.

required
alleles List[str]

List of MHC allele names.

required

Returns:

Type Description
DataFrame

DataFrame containing predictions: - peptide: Peptide sequence - allele: MHC allele - score: Raw binding score - affinity: Binding affinity in nM - percentile_rank: Percentile rank

Source code in optimhc/feature/netmhcpan.py
def _predict_peptide_chunk(peptides_chunk: List[str], alleles: List[str]) -> pd.DataFrame:
    """
    Run NetMHCpan on one chunk of peptides and return the results as a table.

    A new ``NetMHCpan41`` predictor is created on every call.

    Parameters
    ----------
    peptides_chunk : List[str]
        List of peptide sequences.
    alleles : List[str]
        List of MHC allele names.

    Returns
    -------
    pd.DataFrame
        The predictor output converted with ``to_dataframe()``, documented as
        containing:
        - peptide: Peptide sequence
        - allele: MHC allele
        - score: Raw binding score
        - affinity: Binding affinity in nM
        - percentile_rank: Percentile rank
    """
    return NetMHCpan41(alleles=alleles).predict_peptides(peptides_chunk).to_dataframe()

NetMHCIIpan

netmhciipan

feature_generator_factory = FeatureGeneratorFactory() module-attribute

logger = logging.getLogger(__name__) module-attribute

BaseFeatureGenerator

Bases: ABC

Abstract base class for all feature generators in the rescoring pipeline.

Subclasses must implement: - feature_columns -- names of generated feature columns - id_column -- merge key column(s) - generate_features() -- pure computation, returns a DataFrame - from_config() -- construct an instance from pipeline config

The default apply() merges features by peptide column. Override it for index-based merges, composite keys, or post-processing.

feature_columns abstractmethod property

Return a list of feature column names produced by this generator.

id_column abstractmethod property

Return the column(s) used as merge key(s) with the PsmContainer.

generate_features() abstractmethod

Generate features and return them as a DataFrame.

Source code in optimhc/feature/base_feature_generator.py
@abstractmethod
def generate_features(self) -> pd.DataFrame:
    """
    Compute this generator's features.

    Abstract hook: the base implementation has no body, so concrete
    generators must provide their own.

    Returns
    -------
    pd.DataFrame
        The generated features.
    """

from_config(psms, config, params) classmethod

Construct a generator instance from pipeline configuration.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container with all current data.

required
config dict

The full pipeline configuration.

required
params dict

Generator-specific parameters from config["featureGenerator"][i]["params"].

required
Source code in optimhc/feature/base_feature_generator.py
@classmethod
def from_config(
    cls,
    psms: PsmContainer,
    config: dict,
    params: dict,
) -> "BaseFeatureGenerator":
    """Build a generator instance from the pipeline configuration.

    This base implementation contains no construction logic: it always
    raises, naming the class it was invoked on, so a subclass that has not
    supplied its own ``from_config`` fails with an explicit message.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container with all current data.
    config : dict
        The full pipeline configuration.
    params : dict
        Generator-specific parameters from
        ``config["featureGenerator"][i]["params"]``.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    message = f"{cls.__name__} must implement from_config()"
    raise NotImplementedError(message)

apply(psms, source)

Generate features and merge them into the PsmContainer.

The default implementation merges by peptide column using add_features(). Override for different merge strategies (index-based, composite key) or additional post-processing.

Parameters:

Name Type Description Default
psms PsmContainer

The PSM container to add features to (modified in-place).

required
source str

Name of this feature source (e.g. "Basic", "PWM").

required
Source code in optimhc/feature/base_feature_generator.py
def apply(self, psms: PsmContainer, source: str) -> None:
    """Compute the features and attach them to the given PsmContainer.

    This default strategy joins on the container's peptide column via
    ``psms.add_features()``, using ``self.id_column`` as the key on the
    feature side.  Subclasses override it when they need another merge
    strategy (index-based, composite key) or extra post-processing.

    Parameters
    ----------
    psms : PsmContainer
        The PSM container to add features to (modified in-place).
    source : str
        Name of this feature source (e.g. ``"Basic"``, ``"PWM"``).
    """
    feature_table = self.generate_features()
    merge_keys = {
        "psms_key": psms.peptide_column,
        "feature_key": self.id_column,
    }
    psms.add_features(feature_table, source=source, **merge_keys)

NetMHCIIpanFeatureGenerator(peptides, alleles, mode='best', remove_pre_nxt_aa=True, remove_modification=True, n_processes=1, show_progress=False, *args, **kwargs)

Bases: BaseFeatureGenerator

Generate NetMHCIIpan features for given peptides based on specified MHC Class II alleles.

This feature generator uses the NetMHCIIpan43_BA interface to predict MHC Class II binding for each peptide and returns scores and features based on the specified parameters.

Parameters:

Name Type Description Default
peptides List[str]

List of peptide sequences.

required
alleles List[str]

List of MHC Class II alleles, e.g., ['DRB1_0101', 'DRB1_0102'].

required
mode str

Feature generation mode. Options: - 'best': Return only the best result for each peptide across all alleles. - 'all': Return prediction results for each peptide across all alleles (with allele-specific column suffixes). Default is 'best'.

'best'
remove_pre_nxt_aa bool

Whether to remove the amino acids flanking the peptide (e.g., removing X-AA/AA-X forms). Default is True.

True
remove_modification bool

Whether to remove modification information from peptides, e.g., (Phospho). Default is True.

True
n_processes int

Number of processes to use. Default is 1 (no multiprocessing).

1
show_progress bool

Whether to display a progress bar. Default is False.

False
Notes

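The generated features cover three quantities: the raw binding score, the binding affinity in nM, and the percentile rank of the binding score. The exact column names are listed under feature_columns: netmhciipan_best_score, netmhciipan_best_affinity and netmhciipan_best_percentile_rank in both modes, plus netmhciipan_score_{allele}, netmhciipan_affinity_{allele} and netmhciipan_percentile_rank_{allele} for each allele in 'all' mode.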

Source code in optimhc/feature/netmhciipan.py
def __init__(
    self,
    peptides: List[str],
    alleles: List[str],
    mode: str = "best",
    remove_pre_nxt_aa: bool = True,
    remove_modification: bool = True,
    n_processes: int = 1,
    show_progress: bool = False,
    *args,
    **kwargs,
):
    """
    Configure a NetMHCIIpan feature generator.

    Parameters
    ----------
    peptides : List[str]
        List of peptide sequences.
    alleles : List[str]
        List of MHC Class II alleles, e.g., ['DRB1_0101', 'DRB1_0102'].
    mode : str, optional
        Either 'best' or 'all'. When exactly one allele is given the mode
        is forced to 'best'. Default is 'best'.
    remove_pre_nxt_aa : bool, optional
        Whether to remove the amino acids flanking the peptide.
        Default is True.
    remove_modification : bool, optional
        Whether to remove modification information from peptides.
        Default is True.
    n_processes : int, optional
        Requested number of processes. The stored value is clamped to the
        range ``[1, cpu_count()]``. Default is 1.
    show_progress : bool, optional
        Whether to display a progress bar. Default is False.
    *args, **kwargs
        Accepted but not used by this constructor.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'best' nor 'all'.
    """
    if mode not in ("best", "all"):
        raise ValueError("Mode must be one of 'best' or 'all'.")

    self.peptides = peptides
    self.alleles = alleles
    self.mode = mode
    # A single allele is always handled in 'best' mode. Only announce the
    # switch when the caller actually asked for a different mode, so the
    # log is not misleading for the default arguments.
    if len(alleles) == 1 and self.mode != "best":
        self.mode = "best"
        logger.info("Only one allele provided. Switching to 'best' mode.")

    self.remove_pre_nxt_aa = remove_pre_nxt_aa
    self.remove_modification = remove_modification
    # Cap at the machine's CPU count and never go below one worker, so a
    # zero or negative request cannot yield an unusable process count.
    self.n_processes = max(1, min(n_processes, cpu_count()))
    self.show_progress = show_progress
    self.predictor = NetMHCIIpan43_BA(alleles=self.alleles)
    # Both result slots start empty; nothing in this constructor fills them.
    self.predictions = None
    self._raw_predictions = None

    logger.info(
        f"Initialized NetMHCIIpanFeatureGenerator with {len(peptides)} peptides, "
        f"alleles={alleles}, mode={self.mode}, "
        f"n_processes={self.n_processes}, show_progress={self.show_progress}"
    )

feature_columns property

Return the list of generated feature column names, determined by the mode. Only includes numerical features, excluding any string features like allele names.

Returns:

Type Description
List[str]

List of feature column names: - For 'all' mode: netmhciipan_score_{allele}, netmhciipan_affinity_{allele}, netmhciipan_percentile_rank_{allele} for each allele - For both modes: netmhciipan_best_score, netmhciipan_best_affinity, netmhciipan_best_percentile_rank

id_column property

Return the list of input columns required for the feature generator.

Returns:

Type Description
List[str]

List of input column names.

raw_predictions property

Return the raw prediction results from NetMHCIIpan.

Returns:

Type Description
DataFrame

Raw prediction results DataFrame containing: - peptide: Cleaned peptide sequence - allele: MHC allele - score: Raw binding score - affinity: Binding affinity in nM - percentile_rank: Percentile rank

generate_features()

Generate the final feature table with NetMHCIIpan features for each peptide.

Returns:

Type Description
DataFrame

DataFrame containing peptides and their predicted features: - Peptide: Original peptide sequence - For 'all' mode: netmhciipan_score_{allele}, netmhciipan_affinity_{allele}, netmhciipan_percentile_rank_{allele} for each allele - For both modes: netmhciipan_best_score, netmhciipan_best_affinity, netmhciipan_best_percentile_rank

Notes

The features generated depend on the mode: - 'best': Only the best allele information for each peptide - 'all': All allele predictions plus best allele information

Missing values are handled consistently by filling with median values for numeric columns.

Source code in optimhc/feature/netmhciipan.py
def generate_features(self) -> pd.DataFrame:
    """
    Build the final per-peptide NetMHCIIpan feature table.

    Returns
    -------
    pd.DataFrame
        A table restricted to the ``Peptide`` column followed by
        ``self.feature_columns``:
        - Peptide: Original peptide sequence
        - For 'all' mode: netmhciipan_score_{allele}, netmhciipan_affinity_{allele},
          netmhciipan_percentile_rank_{allele} for each allele
        - For both modes: netmhciipan_best_score, netmhciipan_best_affinity,
          netmhciipan_best_percentile_rank

    Notes
    -----
    The features generated depend on the mode:
    - 'best': Only the best allele information for each peptide
    - 'all': All allele predictions plus best allele information

    Missing values are handled by ``_fill_missing_values`` (documented as
    filling numeric columns with median values). A warning is logged if any
    NaN is still present in the selected columns afterwards.
    """
    raw_table = self._predict()

    # Start from one row per input peptide; each helper call below returns
    # the updated table.
    table = pd.DataFrame({"Peptide": self.peptides})

    # Per-allele features are added only in 'all' mode; best-allele
    # features are added in every mode.
    if self.mode == "all":
        table = self._generate_all_allele_features(raw_table, table)
    table = self._generate_best_allele_features(raw_table, table)

    table = self._fill_missing_values(table)

    output_columns = ["Peptide", *self.feature_columns]
    logger.info(f"Final selected feature columns: {output_columns}")
    table = table[output_columns]

    has_missing = table.isna().to_numpy().any()
    if has_missing:
        logger.warning(
            "NaN values still exist in the generated features after filling with median/mode values."
        )

    return table

predictions_to_dataframe()

Convert the predictions to a DataFrame.

Returns:

Type Description
DataFrame

DataFrame containing the predictions.

Raises:

Type Description
ValueError

If no predictions are available.

Source code in optimhc/feature/netmhciipan.py
def predictions_to_dataframe(self) -> pd.DataFrame:
    """
    Return the stored predictions.

    The value held in ``self.predictions`` is handed back as-is; no copy or
    conversion is performed here.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the predictions.

    Raises
    ------
    ValueError
        If ``self.predictions`` is still ``None``.
    """
    stored = self.predictions
    if stored is not None:
        return stored
    raise ValueError("No predictions available. Please run 'generate_features' first.")

get_raw_predictions()

Get the raw prediction results DataFrame from NetMHCIIpan.

Returns:

Type Description
DataFrame

Raw prediction results DataFrame containing: - peptide: Cleaned peptide sequence - allele: MHC allele - score: Raw binding score - affinity: Binding affinity in nM - percentile_rank: Percentile rank

Source code in optimhc/feature/netmhciipan.py
def get_raw_predictions(self) -> pd.DataFrame:
    """
    Method-style accessor for the ``raw_predictions`` property.

    Returns
    -------
    pd.DataFrame
        The value of ``self.raw_predictions``. Documented columns:
        - peptide: Cleaned peptide sequence
        - allele: MHC allele
        - score: Raw binding score
        - affinity: Binding affinity in nM
        - percentile_rank: Percentile rank
    """
    raw_table = self.raw_predictions
    return raw_table

save_raw_predictions(file_path, **kwargs)

Save the raw prediction results to a file.

Parameters:

Name Type Description Default
file_path str

Path to save the file.

required
**kwargs dict

Additional parameters passed to pandas.DataFrame.to_csv. If 'index' is not specified, it defaults to False.

{}
Notes

This method saves the raw predictions DataFrame to a CSV file. The DataFrame includes: - peptide: Cleaned peptide sequence - allele: MHC allele - score: Raw binding score - affinity: Binding affinity in nM - percentile_rank: Percentile rank

Source code in optimhc/feature/netmhciipan.py
def save_raw_predictions(self, file_path: str, **kwargs) -> None:
    """
    Write the raw prediction results to a CSV file.

    If ``self.raw_predictions`` is ``None`` nothing is written and a warning
    is logged instead.

    Parameters
    ----------
    file_path : str
        Path to save the file.
    **kwargs : dict
        Additional parameters passed to pandas.DataFrame.to_csv.
        If 'index' is not specified, it defaults to False.

    Notes
    -----
    The saved DataFrame is documented as containing:
    - peptide: Cleaned peptide sequence
    - allele: MHC allele
    - score: Raw binding score
    - affinity: Binding affinity in nM
    - percentile_rank: Percentile rank
    """
    # Leave the row index out of the file unless the caller asked for it.
    kwargs.setdefault("index", False)

    if self.raw_predictions is None:
        logger.warning("No raw prediction results available to save.")
        return

    self.raw_predictions.to_csv(file_path, **kwargs)
    logger.info(f"Raw prediction results saved to: {file_path}")

_predict_peptide_chunk_class2(peptides_chunk, alleles)

Use NetMHCIIpan43_BA to predict a batch of peptides (MHC Class II).

Parameters: peptides_chunk (List[str]): A batch of peptide sequences to predict. alleles (List[str]): List of MHC Class II alleles, e.g., ['DRB1_0101', 'DRB1_0102'].

Returns: pd.DataFrame: A DataFrame containing prediction results.

Source code in optimhc/feature/netmhciipan.py
def _predict_peptide_chunk_class2(peptides_chunk: List[str], alleles: List[str]) -> pd.DataFrame:
    """
    Predict MHC Class II binding for one batch of peptides with NetMHCIIpan43_BA.

    A new predictor is constructed for the supplied alleles on every call.

    Parameters
    ----------
    peptides_chunk : List[str]
        A batch of peptide sequences to predict.
    alleles : List[str]
        List of MHC Class II alleles, e.g., ['DRB1_0101', 'DRB1_0102'].

    Returns
    -------
    pd.DataFrame
        A DataFrame containing prediction results.
    """
    class2_predictor = NetMHCIIpan43_BA(alleles=alleles)
    binding_predictions = class2_predictor.predict_peptides(peptides_chunk)
    return binding_predictions.to_dataframe()