I/O¶

Parser Package¶

`parser` ¶

`__all__ = ['read_pin', 'read_pepxml', 'extract_mzml_data']` `module-attribute` ¶

`extract_mzml_data(mzml_filename, scan_ids=None)` ¶

Extract scan data from an mzML file.

Parameters:

Name	Type	Description	Default
`mzml_filename`	`str`	The path to the mzML file.	required
`scan_ids`	`list[int] or None`	A list of scan IDs to extract. If None, extracts all scans.	`None`

Returns:

Type	Description
`DataFrame`	A DataFrame containing the extracted scan data with columns: - source: The source file name - scan: The scan ID - mz: The m/z values array - intensity: The intensity values array - charge: The charge state - retention_time: The retention time

Notes

This function: 1. Reads the mzML file using pyteomics 2. Extracts scan data including retention time, charge state, m/z values, and intensities 3. Filters scans based on provided scan IDs if specified 4. Returns a DataFrame with the extracted data

Source code in optimhc/parser/mzml.py

def extract_mzml_data(mzml_filename, scan_ids=None):
    """
    Extract scan data from an mzML file.

    Parameters
    ----------
    mzml_filename : str
        The path to the mzML file.
    scan_ids : list[int] or None, optional
        A list of scan IDs to extract. If None, extracts all scans.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the extracted scan data with columns:
        - source: The file name with everything up to the last "/" and the
          ".mzML" text removed
        - scan: The scan ID (integer parsed from the text after "scan=" in
          the spectrum id)
        - mz: The m/z values array
        - intensity: The intensity values array
        - charge: The charge state, or None when it is not present
        - retention_time: The retention time, or None when it is not present

        Rows are de-duplicated on (source, scan).

    Raises
    ------
    RuntimeError
        If the mzML file cannot be opened or iterated. The underlying
        exception is attached as ``__cause__``.

    Notes
    -----
    This function:
    1. Reads the mzML file using pyteomics
    2. Extracts scan data including retention time, charge state, m/z values, and intensities
    3. Filters scans based on provided scan IDs if specified
    4. Returns a DataFrame with the extracted data

    Spectra whose scan number cannot be parsed, or whose data extraction
    fails, are skipped with a warning instead of aborting the whole file.
    """
    filename = mzml_filename.split("/")[-1].replace(".mzML", "")
    logger.info(f"Extracting scans from {mzml_filename}")

    # A set makes the per-spectrum membership test O(1).
    scan_ids = set(scan_ids) if scan_ids is not None else None

    (
        extracted_scan_ids,
        mzml_filenames,
        intensities,
        mz_values,
        charges,
        retention_times,
    ) = ([], [], [], [], [], [])

    try:
        with mzml.read(mzml_filename) as reader:
            for spectrum in reader:
                # Parse the scan number on its own so that a failure here is
                # reported with the raw spectrum id. Previously the handler
                # referenced ``scan_id``, which was unbound (first spectrum)
                # or left over from the previous spectrum at that point.
                spectrum_id = spectrum.get("id")
                try:
                    scan_id = int(str(spectrum_id).split("scan=")[-1])
                except ValueError as e:
                    logger.warning(
                        f"Skipping spectrum {spectrum_id!r}: cannot parse scan number ({e})"
                    )
                    continue

                if scan_ids is not None and scan_id not in scan_ids:
                    continue

                try:
                    mz_array = np.array(spectrum.get("m/z array", []))
                    intensity_array = np.array(spectrum.get("intensity array", []))

                    # Charge and retention time are optional: leave them as
                    # None when the nested entry is missing or malformed.
                    charge = None
                    try:
                        charge = int(
                            spectrum["precursorList"]["precursor"][0]["selectedIonList"][
                                "selectedIon"
                            ][0]["charge state"]
                        )
                    except (KeyError, ValueError, IndexError):
                        pass

                    retention_time = None
                    try:
                        retention_time = float(spectrum["scanList"]["scan"][0]["scan start time"])
                    except (KeyError, ValueError, IndexError):
                        pass
                except Exception as e:
                    # Best effort per scan: one bad spectrum must not abort the file.
                    logger.warning(f"Skipping scan {scan_id} due to error: {e}")
                    continue

                extracted_scan_ids.append(scan_id)
                mzml_filenames.append(filename)
                intensities.append(intensity_array)
                mz_values.append(mz_array)
                charges.append(charge)
                retention_times.append(retention_time)

    except Exception as e:
        logger.error(f"Failed to parse mzML file {mzml_filename}: {e}")
        raise RuntimeError(f"Error processing mzML file {mzml_filename}: {e}") from e

    data_dict = {
        "source": mzml_filenames,
        "scan": extracted_scan_ids,
        "mz": mz_values,
        "intensity": intensities,
        "charge": charges,
        "retention_time": retention_times,
    }

    scans_df = pd.DataFrame(data_dict)
    scans_df = scans_df.drop_duplicates(subset=["source", "scan"])

    logger.info(f"Successfully extracted {len(scans_df)} scans from {mzml_filename}")

    return scans_df

`read_pepxml(pepxml_files, decoy_prefix='DECOY_')` ¶

Read PSMs from a list of PepXML files.

Parameters:

Name	Type	Description	Default
`pepxml_files`	`Union[str, List[str]]`	The file path to the PepXML file or a list of file paths.	required
`decoy_prefix`	`str`	The prefix used to indicate a decoy protein in the description lines of the FASTA file. Default is "DECOY_".	`'DECOY_'`

Returns:

Type	Description
`PsmContainer`	A PsmContainer object containing the PSM data.

Raises:

Type	Description
`ValueError`	If the PepXML files were generated by Percolator or PeptideProphet.

Notes

This function: 1. Reads and parses PepXML files 2. Calculates mass difference features 3. Processes matched ions and complementary ions 4. Creates charge columns 5. Log-transforms p-values 6. Returns a PsmContainer with the processed data

Source code in optimhc/parser/pepxml.py

def read_pepxml(pepxml_files, decoy_prefix="DECOY_"):
    """
    Load PSMs from one or more PepXML files into a PsmContainer.

    Parameters
    ----------
    pepxml_files : Union[str, List[str]]
        A single PepXML file path or a list of file paths.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the description lines
        of the FASTA file. Default is "DECOY_".

    Returns
    -------
    PsmContainer
        A PsmContainer object containing the PSM data.

    Raises
    ------
    ValueError
        If the parsed data contains Percolator result columns.

    Notes
    -----
    Processing steps, in order:
    1. Parse every file and concatenate the results
    2. Add the mass difference and absolute m/z difference features
    3. Add the matched-ion ratio when both ion-count columns are present and
       no total ion count is zero
    4. Replace the number of matched peptides by its log10, when present
    5. Append one indicator column per charge state
    6. Apply the feature log-transform helper to the feature columns
    7. Wrap the result in a PsmContainer
    """
    proton_mass = 1.00727646677

    file_list = [pepxml_files] if isinstance(pepxml_files, str) else pepxml_files
    psms = pd.concat([_parse_pepxml(path, decoy_prefix) for path in file_list])

    # Columns that only appear in files already post-processed by Percolator.
    percolator_columns = {
        "Percolator q-Value",
        "Percolator PEP",
        "Percolator SVMScore",
    }
    if percolator_columns & set(psms.columns):
        raise ValueError(
            "The PepXML files appear to have generated by Percolator or "
            "PeptideProphet; hence, they should not be analyzed with mokapot."
        )

    # Mass difference, useful for open modification searches.
    psms["mass_diff"] = psms["exp_mass"] - psms["calc_mass"]

    # Absolute difference between observed and theoretical m/z.
    observed_mz = psms["exp_mass"] / psms["charge"] + proton_mass
    theoretical_mz = psms["calc_mass"] / psms["charge"] + proton_mass
    psms["abs_mz_diff"] = (observed_mz - theoretical_mz).abs()

    # Fraction of matched ions; skipped if any total ion count is zero.
    has_ion_counts = "num_matched_ions" in psms.columns and "tot_num_ions" in psms.columns
    if has_ion_counts and (psms["tot_num_ions"] != 0).all():
        psms["matched_ions_ratio"] = psms["num_matched_ions"] / psms["tot_num_ions"]

    # Number of candidate peptides on a log10 scale.
    if "num_matched_peptides" in psms.columns:
        psms["num_matched_peptides"] = np.log10(psms["num_matched_peptides"])

    # One indicator column per observed charge state ("charge_<n>").
    charge_indicators = pd.get_dummies(psms["charge"], prefix="charge")
    psms = pd.concat([psms, charge_indicators], axis=1)

    # Everything that is not bookkeeping is treated as a rescoring feature.
    bookkeeping_columns = frozenset(
        (
            "ms_data_file",
            "scan",
            "spectrum",
            "label",
            "calc_mass",  # Retain calc_mass for FlashLFQ compatibility
            "peptide",
            "proteins",
            "charge",
            "retention_time",
        )
    )
    feat_cols = [name for name in psms.columns if name not in bookkeeping_columns]

    psms = psms.apply(_log_features, features=feat_cols)

    return PsmContainer(
        psms=psms,
        label_column="label",
        scan_column="scan",
        spectrum_column="spectrum",
        ms_data_file_column="ms_data_file",
        peptide_column="peptide",
        protein_column="proteins",
        charge_column="charge",
        rescoring_features={"Original": feat_cols},
        hit_rank_column="rank",
        retention_time_column="retention_time",
        calculated_mass_column="calc_mass",
    )

`read_pin(pin_files, retention_time_column=None, remove_pre_nxt_aa=False)` ¶

Read PSMs from a Percolator INput (PIN) file.

Parameters:

Name	Type	Description	Default
`pin_files`	`Union[str, List[str]]`	The file path to the PIN file or a list of file paths.	required
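`retention_time_column`	`Optional[str]`	The column containing the retention time. If None, no retention time will be included.	`None`
`remove_pre_nxt_aa`	`bool`	Accepted as part of the signature; the implementation shown below does not use it.	`False`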

Returns:

Type	Description
`PsmContainer`	A PsmContainer object containing the PSM data.

Notes

This function: 1. Reads PIN file(s) into a DataFrame 2. Identifies required columns (case-insensitive) 3. Processes scan IDs and hit ranks (only FragPipe PIN files are supported) 4. Converts data types appropriately 5. Creates a PsmContainer with the processed data

Source code in optimhc/parser/pin.py

def read_pin(
    pin_files: Union[str, List[str]],
    retention_time_column: Optional[str] = None,
    remove_pre_nxt_aa: bool = False,
) -> PsmContainer:
    """
    Read PSMs from a Percolator INput (PIN) file.

    Parameters
    ----------
    pin_files : Union[str, List[str]]
        The file path to the PIN file or a list of file paths.
    retention_time_column : Optional[str], optional
        The column containing the retention time (matched case-insensitively).
        If None, no retention time will be included.
    remove_pre_nxt_aa : bool, optional
        Currently not used by this function; kept in the signature so that
        existing callers keep working.

    Returns
    -------
    PsmContainer
        A PsmContainer object containing the PSM data.

    Raises
    ------
    ValueError
        If a required column (Label, ScanNr, SpecId, Peptide, Proteins or the
        requested retention time column) is missing, if no charge indicator
        column is present, or if a PSM has no charge indicator set to 1.

    Notes
    -----
    This function:
    1. Reads PIN file(s) into a DataFrame
    2. Identifies required columns (case-insensitive)
    3. Processes scan IDs and hit ranks (only FragPipe PIN files are supported)
    4. Converts data types appropriately
    5. Creates a PsmContainer with the processed data
    """
    logger.info("Reading PIN file(s) into PsmContainer.")
    if isinstance(pin_files, str):
        pin_files = [pin_files]

    pin_df = pd.concat([_read_single_pin_as_df(pin_file) for pin_file in pin_files])
    logger.info(f"Read {len(pin_df)} PSMs from {len(pin_files)} PIN files.")
    logger.debug(pin_df.head())
    logger.debug(pin_df.columns)
    # iloc[0] is evaluated even when debug logging is off, so guard the empty case.
    if not pin_df.empty:
        logger.debug(pin_df.iloc[0])

    def find_required_columns(col: str, columns: List[str]) -> str:
        """
        Case-insensitive search for a column in the DataFrame.
        Returns the matching column name with original casing.
        """
        col_lower = col.lower()
        column_map = {c.lower(): c for c in columns}
        if col_lower not in column_map:
            raise ValueError(f"Column '{col}' not found in PSM data (case-insensitive).")
        return column_map[col_lower]

    # non-feature columns (case-insensitive search)
    label = find_required_columns("Label", pin_df.columns)
    scan = find_required_columns("ScanNr", pin_df.columns)
    specid = find_required_columns("SpecId", pin_df.columns)
    peptide = find_required_columns("Peptide", pin_df.columns)
    protein = find_required_columns("Proteins", pin_df.columns)

    # Comet: P2PI20160713_pilling_C1RA2_BB72_P1_31_3_1
    # Fragpipe: P2PI20160713_pilling_C1RA2_BB72_P1.3104.3104.2_1

    def parse_specid(spec_id: str) -> int:
        """Return the hit rank encoded after the last "_" of a SpecId, or 1."""
        _, separator, suffix = spec_id.rpartition("_")
        if not separator:
            return 1
        try:
            return int(suffix)
        except ValueError:
            logger.warning(f"Could not parse rank from SpecId: {spec_id}, using default rank 1")
            return 1

    # Use an existing rank column under its actual casing. The previous code
    # detected it case-insensitively but then always indexed "rank", which
    # raised KeyError for e.g. "Rank".
    hit_rank = {c.lower(): c for c in pin_df.columns}.get("rank")
    if hit_rank is None:
        hit_rank = "rank"
        pin_df[hit_rank] = pin_df[specid].apply(parse_specid)

    retention_time_column = (
        find_required_columns(retention_time_column, pin_df.columns)
        if retention_time_column
        else None
    )

    # col: charge_[1,2,3,...] = 0, 1
    # The charge number is the digit run matched right after "charge", not
    # the first digit run anywhere in the column name.
    charge_pattern = re.compile(r"charge[_]?(\d+)", re.IGNORECASE)
    charge_map = {}
    for col in pin_df.columns:
        charge_match = charge_pattern.search(col)
        if charge_match:
            charge_map[col] = int(charge_match.group(1))
    if not charge_map:
        raise ValueError("No charge indicator columns (e.g. 'charge_2') found in PIN data.")

    def extract_charge(row):
        """Return the charge whose indicator column equals 1 for this row."""
        for col, num in charge_map.items():
            if int(float(row[col])) == 1:
                return num
        return None

    pin_df["Charge"] = pin_df.apply(extract_charge, axis=1)
    if pin_df["Charge"].isna().any():
        # Fail with a clear message instead of the NaN-to-int cast error below.
        raise ValueError("Some PSMs have no charge indicator column set to 1.")

    # feature columns: columns that are not non-feature columns
    # (the retention time column, when given, is not excluded here and is
    # therefore also part of the feature columns)
    non_feature_columns = [label, scan, specid, peptide, protein, hit_rank, "Charge"]
    feature_columns = [col for col in pin_df.columns if col not in non_feature_columns]

    logger.info(
        f"Columns: label={label}, scan={scan}, specid={specid}, peptide={peptide}, "
        f"protein={protein}, hit_rank={hit_rank}, retention_time={retention_time_column}, "
        f"features={feature_columns}"
    )

    pin_df[scan] = pin_df[scan].astype(int)
    pin_df[specid] = pin_df[specid].astype(str)
    pin_df[peptide] = pin_df[peptide].astype(str)
    pin_df[protein] = pin_df[protein].astype(str)
    pin_df[hit_rank] = pin_df[hit_rank].astype(float).astype(int)
    pin_df["Charge"] = pin_df["Charge"].astype(float).astype(int)
    if retention_time_column:
        pin_df[retention_time_column] = pin_df[retention_time_column].astype(float)
    for col in feature_columns:
        pin_df[col] = pin_df[col].astype(float)

    # label = 1 for target, -1 for decoy. Convert to Boolean.
    # The comparison is against the string "1".
    pin_df[label] = pin_df[label] == "1"
    rescoring_features = {"Original": feature_columns}

    return PsmContainer(
        psms=pin_df,
        label_column=label,
        scan_column=scan,
        spectrum_column=specid,
        ms_data_file_column=None,
        peptide_column=peptide,
        protein_column=protein,
        charge_column="Charge",
        rescoring_features=rescoring_features,
        hit_rank_column=hit_rank,
        retention_time_column=retention_time_column,
    )

PepXML Parser¶

`pepxml` ¶

`logger = logging.getLogger(__name__)` `module-attribute` ¶

`PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)` ¶

A container for managing peptide-spectrum matches (PSMs) in immunopeptidomics rescoring pipelines.

Parameters:

Name	Type	Description	Default
`psms`	`DataFrame`	DataFrame containing the PSM data.	required
`label_column`	`str`	Column containing the label (True for target, False for decoy).	required
`scan_column`	`str`	Column containing the scan number.	required
`spectrum_column`	`str`	Column containing the spectrum identifier.	required
`ms_data_file_column`	`str`	Column containing the MS data file that the PSM originated from.	required
`peptide_column`	`str`	Column containing the peptide sequence.	required
`protein_column`	`str`	Column containing the protein accessions.	required
`rescoring_features`	`dict of str to list of str`	Dictionary of feature columns for rescoring.	required
`hit_rank_column`	`str`	Column containing the hit rank.	`None`
`charge_column`	`str`	Column containing the charge state.	`None`
`retention_time_column`	`str`	Column containing the retention time.	`None`
`calculated_mass_column`	`str`	Column containing the calculated mass.	`None`
`metadata_column`	`str`	Column containing metadata.	`None`

Attributes:

Name	Type	Description
`psms`	`DataFrame`	Copy of the DataFrame containing the PSM data.
`target_psms`	`DataFrame`	DataFrame containing only target PSMs (label = True).
`decoy_psms`	`DataFrame`	DataFrame containing only decoy PSMs (label = False).
`peptides`	`list of str`	List containing all peptides from the PSM data.
`columns`	`list of str`	List of column names in the PSM DataFrame.
`rescoring_features`	`dict of str to list of str`	Dictionary of rescoring feature columns in the PSM DataFrame.

Source code in optimhc/psm_container.py

def __init__(
    self,
    psms: pd.DataFrame,
    label_column: str,
    scan_column: str,
    spectrum_column: str,
    ms_data_file_column: Optional[str],
    peptide_column: str,
    protein_column: str,
    rescoring_features: Dict[str, List[str]],
    hit_rank_column: Optional[str] = None,
    charge_column: Optional[str] = None,
    retention_time_column: Optional[str] = None,
    calculated_mass_column: Optional[str] = None,
    metadata_column: Optional[str] = None,
):
    """
    Store a copy of the PSM table and validate the column configuration.

    Parameters
    ----------
    psms : pd.DataFrame
        DataFrame containing the PSM data. It is copied and the copy's index
        is reset to 0..n-1.
    label_column : str
        Boolean column (True for target, False for decoy).
    scan_column : str
        Column containing the scan number.
    spectrum_column : str
        Column containing the spectrum identifier.
    ms_data_file_column : str or None
        Column containing the MS data file of each PSM. None is accepted
        (it skips the column check and the per-file log line).
    peptide_column : str
        Column containing the peptide sequence.
    protein_column : str
        Column containing the protein accessions.
    rescoring_features : dict of str to list of str
        Feature columns grouped by source name.
    hit_rank_column, charge_column, retention_time_column,
    calculated_mass_column, metadata_column : str, optional
        Optional column names; each is validated only when given.

    Raises
    ------
    ValueError
        If a configured column or feature is missing, the label column is not
        boolean, the metadata column holds non-dictionary values, or the data
        contains only targets or only decoys.
    """
    self._psms = psms.copy()
    self._psms.reset_index(drop=True, inplace=True)
    self.label_column = label_column
    self.scan_column = scan_column
    self.spectrum_column = spectrum_column
    self.ms_data_file_column = ms_data_file_column
    self.peptide_column = peptide_column
    self.protein_column = protein_column
    self.hit_rank_column = hit_rank_column
    self.retention_time_column = retention_time_column
    self.metadata_column = metadata_column
    self.rescoring_features = rescoring_features
    self.charge_column = charge_column
    self.calculated_mass_column = calculated_mass_column
    # rescore result column
    self.rescore_result_column = None

    # Every configured (truthy) column name must exist in the input frame.
    for column in (
        label_column,
        scan_column,
        spectrum_column,
        ms_data_file_column,
        peptide_column,
        protein_column,
        hit_rank_column,
        retention_time_column,
        charge_column,
        calculated_mass_column,
    ):
        if column and column not in psms.columns:
            raise ValueError(f"Column '{column}' not found in PSM data.")

    # ensure the label column is boolean
    labels = psms[label_column]
    if labels.dtype != "bool":
        raise ValueError(f"Column '{label_column}' must be boolean.")

    # A single distinct label means only targets or only decoys are present.
    if labels.nunique() == 1:
        if labels.iloc[0]:
            raise ValueError("All PSMs are labeled as target. No decoy PSMs found.")
        raise ValueError("All PSMs are labeled as decoy. No target PSMs found.")

    # The metadata column, when given, must exist and hold one dict per PSM.
    if metadata_column:
        if metadata_column not in psms.columns:
            raise ValueError(f"Column '{metadata_column}' not found in PSM data.")
        if not all(isinstance(x, dict) for x in self._psms[metadata_column]):
            raise ValueError(f"Column '{metadata_column}' must contain dictionaries.")

    # Every feature listed for every source must be a column of the frame.
    for key, cols in rescoring_features.items():
        for col in cols:
            if col not in psms.columns:
                raise ValueError(f"Column '{col}' not found in PSM data for feature '{key}'.")

    # check if the number of decoy psms is not 0
    if len(self.decoy_psms) == 0:
        logger.error("No decoy PSMs found. Please check the decoy prefix.")
        raise ValueError("No decoy PSMs found.")

    logger.info("PsmContainer initialized with %d PSM entries.", len(self._psms))
    if self.ms_data_file_column:
        logger.info(
            "PSMs originated from %d MS data file(s).",
            len(self._psms[ms_data_file_column].unique()),
        )
    logger.info("target psms: %d", len(self.target_psms))
    logger.info("decoy psms: %d", len(self.decoy_psms))
    logger.info("unique peptides: %d", len(np.unique(self.peptides)))
    logger.info("rescoring features: %s", rescoring_features)

`psms` `property` ¶

Get a copy of the PSM DataFrame to prevent external modification.

Returns:

Type	Description
`DataFrame`	A copy of the PSM DataFrame.

`target_psms` `property` ¶

Get a DataFrame containing only target PSMs.

Returns:

Type	Description
`DataFrame`	DataFrame with only target PSMs (label = True).

`decoy_psms` `property` ¶

Get a DataFrame containing only decoy PSMs.

Returns:

Type	Description
`DataFrame`	DataFrame with only decoy PSMs (label = False).

`columns` `property` ¶

Get the column names of the PSM DataFrame.

Returns:

Type	Description
`list of str`	List of column names.

`feature_columns` `property` ¶

Get a list of all feature columns in the PSM DataFrame.

Returns:

Type	Description
`list of str`	List of feature column names.

`feature_sources` `property` ¶

Get a list of all feature sources in the PSM DataFrame.

Returns:

Type	Description
`list of str`	List of feature source names.

`peptides` `property` ¶

Get the peptide sequences from the PSM data.

Returns:

Type	Description
`list of str`	List of peptide sequences.

`ms_data_files` `property` ¶

Get the MS data files from the PSM data.

Returns:

Type	Description
`list of str`	List of MS data file names.

`scan_ids` `property` ¶

Get the scan numbers from the PSM data.

Returns:

Type	Description
`list of int`	List of scan numbers.

`charges` `property` ¶

Get the charge states from the PSM data.

Returns:

Type	Description
`list of int`	List of charge states.

`metadata` `property` ¶

Get the metadata from the PSM data.

Returns:

Type	Description
`Series`	Series containing metadata for each PSM.

`spectrum_ids` `property` ¶

Get the spectrum identifiers from the PSM data.

Returns:

Type	Description
`list of str`	List of spectrum identifiers.

`identifier_columns` `property` ¶

Get the columns that uniquely identify each PSM.

Returns:

Type	Description
`list of str`	List of identifier column names.

`__len__()` ¶

Get the number of PSMs in the container.

Returns:

Type	Description
`int`	Number of PSMs.

Source code in optimhc/psm_container.py

def __len__(self) -> int:
    """
    Report how many PSM rows the container currently holds.

    Returns
    -------
    int
        Number of PSMs.
    """
    row_count = len(self._psms.index)
    return row_count

`copy()` ¶

Return a deep copy of the PsmContainer object.

Returns:

Type	Description
`PsmContainer`	A deep copy of the current PsmContainer.

Source code in optimhc/psm_container.py

def copy(self) -> "PsmContainer":
    """
    Create an independent duplicate of this container.

    Returns
    -------
    PsmContainer
        A deep copy of the current PsmContainer.
    """
    from copy import deepcopy

    duplicate = deepcopy(self)
    return duplicate

`__repr__()` ¶

Return a string representation of the PsmContainer.

Returns:

Type	Description
`str`	String summary of the PsmContainer.

Source code in optimhc/psm_container.py

def __repr__(self) -> str:
    """
    Build a multi-line, human-readable summary of the container.

    Returns
    -------
    str
        String summary of the PsmContainer.
    """
    summary_lines = [
        f"PsmContainer with {len(self)} PSMs",
        f"\t - Target PSMs: {len(self.target_psms)}",
        f"\t - Decoy PSMs: {len(self.decoy_psms)}",
        f"\t - Unique Peptides: {len(np.unique(self.peptides))}",
        f"\t - Unique Spectra: {len(self._psms[self.spectrum_column].unique())}",
        f"\t - Rescoring Features: {self.rescoring_features}",
    ]
    # Every line, including the last one, ends with a newline.
    return "\n".join(summary_lines) + "\n"

`drop_features(features)` ¶

Drop specified features from the PSM DataFrame.

Parameters:

Name	Type	Description	Default
`features`	`list of str`	List of feature column names to drop.	required

Raises:

Type	Description
`ValueError`	If any of the features do not exist in the DataFrame.

Source code in optimhc/psm_container.py

def drop_features(self, features: List[str]) -> None:
    """
    Remove the given feature columns from the PSM DataFrame.

    The columns are dropped in place, removed from every source's feature
    list, and any source left without features is removed as well.

    Parameters
    ----------
    features : list of str
        List of feature column names to drop.

    Raises
    ------
    ValueError
        If any of the features do not exist in the DataFrame.
    """
    known_columns = self._psms.columns
    missing_features = [name for name in features if name not in known_columns]
    if missing_features:
        raise ValueError(f"Features not found in PSM data: {missing_features}")

    self._psms.drop(columns=features, inplace=True)

    # Prune each source's feature list and remember the sources that end up empty.
    emptied_sources = []
    for source in list(self.rescoring_features):
        remaining = [col for col in self.rescoring_features[source] if col not in features]
        self.rescoring_features[source] = remaining
        if not remaining:
            emptied_sources.append(source)

    logger.info(
        f"Sources to be removed: {emptied_sources}. Because all the features are removed."
    )
    # Forget sources that no longer contribute any feature.
    for source in emptied_sources:
        del self.rescoring_features[source]

`drop_source(source)` ¶

Drop all features associated with a specific source from the PSM DataFrame.

Parameters:

Name	Type	Description	Default
`source`	`str`	Name of the source to drop.	required

Raises:

Type	Description
`ValueError`	If the source does not exist in the rescoring features.

Source code in optimhc/psm_container.py

def drop_source(self, source: str) -> None:
    """
    Remove every feature that belongs to one source from the PSM DataFrame.

    Parameters
    ----------
    source : str
        Name of the source to drop.

    Raises
    ------
    ValueError
        If the source does not exist in the rescoring features.
    """
    features_by_source = self.rescoring_features
    if source in features_by_source:
        # Dropping all of a source's features also removes the source itself.
        self.drop_features(features_by_source[source])
        return
    raise ValueError(f"Source '{source}' not found in rescoring features.")

`add_metadata(metadata_df, psms_key, metadata_key, source)` ¶

Merge new metadata into the PSM DataFrame based on specified columns. Metadata from the specified source is stored as a nested dictionary inside the metadata column.

Parameters:

Name	Type	Description	Default
`metadata_df`	`DataFrame`	DataFrame containing new metadata to add.	required
`psms_key`	`str or list of str`	Column name(s) in the PSM data to merge on.	required
`metadata_key`	`str or list of str`	Column name(s) in the metadata data to merge on.	required
`source`	`str`	Name of the source of the new metadata.	required

Source code in optimhc/psm_container.py

def add_metadata(
    self,
    metadata_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    metadata_key: Union[str, List[str]],
    source,
) -> None:
    """
    Merge new metadata into the PSM DataFrame based on specified columns.
    Metadata from the specified source is stored as a nested dictionary inside the metadata column.

    Parameters
    ----------
    metadata_df : pd.DataFrame
        DataFrame containing new metadata to add. Every column that is not a
        merge key becomes one entry of the per-PSM ``{source: {...}}`` dict.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    metadata_key : str or list of str
        Column name(s) in the metadata data to merge on.
    source : str
        Name of the source of the new metadata. If the source already exists
        for a PSM, the new entries are merged into it and same-named entries
        are overwritten.

    Raises
    ------
    ValueError
        If the left join changes the number of PSMs (duplicate keys in
        ``metadata_df``), or if a metadata column name clashes with an
        existing PSM column.
    """
    if self.metadata_column is None:
        logger.info("No existing metadata column. Creating new metadata column.")
        self.metadata_column = "metadata"
        self._psms[self.metadata_column] = [{} for _ in range(len(self._psms))]
    # Use the configured column name instead of assuming it is "metadata".
    meta_col = self.metadata_column

    # Normalise the key to a list: with a plain string, ``col not in key``
    # is a substring test and wrongly drops columns such as "seq" when the
    # key is "seq_id".
    key_cols = [metadata_key] if isinstance(metadata_key, str) else list(metadata_key)
    value_cols = [col for col in metadata_df.columns if col not in key_cols]

    existing = self._psms[meta_col].tolist()
    # Inspect the dictionaries themselves; ``in`` on a Series tests its index.
    if any(source in entry for entry in existing):
        logger.warning(f"{source} already exists in metadata. Overwriting.")

    if not value_cols:
        # Only key columns were supplied: nothing to store.
        return

    merged_df = self._psms.merge(
        metadata_df, left_on=psms_key, right_on=metadata_key, how="left"
    )
    if len(merged_df) != len(self._psms):
        raise ValueError(
            "Merging metadata changed the number of PSMs; "
            "metadata_df contains duplicate merge keys."
        )
    clashing = [col for col in value_cols if col not in merged_df.columns]
    if clashing:
        raise ValueError(
            f"Metadata columns clash with existing PSM columns: {clashing}. "
            "Rename them before adding."
        )

    # A left join with unchanged row count keeps the PSM order, so the
    # merged rows can be paired with the existing dictionaries by position.
    # This also avoids index alignment, which breaks if the frame's index is
    # not 0..n-1.
    records = merged_df[value_cols].to_dict(orient="records")
    self._psms[meta_col] = [
        {**entry, source: {**entry.get(source, {}), **record}}
        for entry, record in zip(existing, records)
    ]

`get_top_hits(n=1)` ¶

Get the top n hits based on the hit rank column. If the hit rank column is not specified, returns a copy of the container with all PSMs.

Parameters:

Name	Type	Description	Default
`n`	`int`	The number of top hits to return. Default is 1.	`1`

Returns:

Type	Description
`PsmContainer`	A new PsmContainer object containing the top n hits.

Source code in optimhc/psm_container.py

def get_top_hits(self, n: int = 1) -> "PsmContainer":
    """
    Get the top n hits based on the hit rank column.
    If the hit rank column is not specified, a copy containing all PSMs is
    returned and a warning is logged.

    Parameters
    ----------
    n : int, optional
        The number of top hits to return. Default is 1. PSMs whose rank is
        less than or equal to ``n`` are kept.

    Returns
    -------
    PsmContainer
        A new PsmContainer object containing the top n hits. The current
        container is left unchanged.
    """
    if self.hit_rank_column is None:
        logger.warning("Rank column not specified. Return the original PSMs.")
        return self.copy()

    top_hits = self.copy()
    keep = top_hits._psms[self.hit_rank_column] <= n
    # Restore the 0..n-1 index that the constructor sets up, so the filtered
    # frame does not carry a gapped index into later index-aligned operations.
    top_hits._psms = top_hits._psms[keep].reset_index(drop=True)
    return top_hits

`add_features(features_df, psms_key, feature_key, source, suffix=None)` ¶

Merge new features into the PSM DataFrame based on specified columns.

This method performs a left join between the PSM data and feature data, ensuring that all PSMs are preserved while adding new features. It handles column name conflicts through optional suffixing and maintains feature source tracking.

Parameters:

Name	Type	Description	Default
`features_df`	`DataFrame`	DataFrame containing new features to add.	required
`psms_key`	`str or list of str`	Column name(s) in the PSM data to merge on.	required
`feature_key`	`str or list of str`	Column name(s) in the features data to merge on.	required
`source`	`str`	Name of the source of the new features (e.g., 'deeplc', 'netmhc').	required
`suffix`	`str`	Suffix to add to the new columns if there's a name conflict. Required when new feature columns have the same names as existing columns. For example, if adding features from different sources (e.g., 'score' from DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.	`None`

Returns:

Type	Description
`None`

Raises:

Type	Description
`ValueError`	If duplicate columns exist without suffix. If merging features changes the number of PSMs.

Notes

The method follows these steps: 1. Validates input and prepares merge keys 2. Checks for column name conflicts 3. Manages feature source: if the source already exists, it will be overwritten 4. Performs left join merge 5. Verifies data integrity

Suffix Usage

The suffix parameter is used to handle column name conflicts: - When adding features from different sources that might have the same column names - When you want to keep both the original and new features with the same name - When you need to track the source of features in the column names

If suffix is not provided and there are duplicate column names: - The method will raise a ValueError - You must either provide a suffix or rename the columns before adding

Examples:

>>> container = PsmContainer(...)
>>> # Adding features without suffix (no conflicts)
>>> features_df1 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'feature1': [0.1, 0.2, 0.3],
...     'feature2': [0.4, 0.5, 0.6]
... })
>>> container.add_features(
...     features_df1,
...     psms_key='scan',
...     feature_key='scan',
...     source='source1'
... )
>>> # Adding features with suffix (handling conflicts)
>>> features_df2 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
...     'feature3': [0.7, 0.8, 0.9]
... })
>>> container.add_features(
...     features_df2,
...     psms_key='scan',
...     feature_key='scan',
...     source='source2',
...     suffix='_new'  # 'score' becomes 'score_new'
... )

Source code in optimhc/psm_container.py

def add_features(
    self,
    features_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    feature_key: Union[str, List[str]],
    source: str,
    suffix: Optional[str] = None,
) -> None:
    """Merge new features into the PSM DataFrame based on specified columns.

    A left join is performed between the PSM data and ``features_df`` so that
    every PSM row is kept. The container is only updated once the merge has
    been validated: if the merge changes the number of rows, ``self._psms``
    keeps its pre-merge content and ``source`` is not registered in
    ``self.rescoring_features``.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add. It is not modified.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    feature_key : str or list of str
        Column name(s) in the features data to merge on.
    source : str
        Name of the source of the new features (e.g., 'deeplc', 'netmhc').
    suffix : str, optional
        Suffix appended to the new feature columns. It is required when at
        least one new feature column has the same name as an existing PSM
        column. When given, it is appended to *every* new feature column,
        not only to the conflicting ones.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If duplicate columns exist without suffix.
        If merging features changes the number of PSMs.

    Notes
    -----
    The method follows these steps:
    1. Normalizes the merge keys to lists
    2. Checks for column name conflicts
    3. Drops a previously registered source with the same name
    4. Performs the left join on a temporary frame
    5. Verifies the row count, then commits the result

    Columns of ``feature_key`` that are not part of ``psms_key`` are removed
    from the merged frame so the right-hand key is not duplicated in the PSMs.

    Examples
    --------
    >>> container = PsmContainer(...)
    >>> # Adding features without suffix (no conflicts)
    >>> features_df1 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'feature1': [0.1, 0.2, 0.3],
    ...     'feature2': [0.4, 0.5, 0.6]
    ... })
    >>> container.add_features(
    ...     features_df1,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source1'
    ... )
    >>> # Adding features with suffix (handling conflicts)
    >>> features_df2 = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
    ...     'feature3': [0.7, 0.8, 0.9]
    ... })
    >>> container.add_features(
    ...     features_df2,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source2',
    ...     suffix='_new'  # columns become 'score_new' and 'feature3_new'
    ... )
    """
    if isinstance(psms_key, str):
        psms_key = [psms_key]

    if isinstance(feature_key, str):
        feature_key = [feature_key]

    new_feature_cols = [col for col in features_df.columns if col not in feature_key]

    # NOTE(review): this check runs before an already-registered `source` is
    # dropped below, so re-adding a source under its existing column names
    # appears to raise unless a suffix is supplied - confirm this is intended.
    for col in new_feature_cols:
        if col in self._psms.columns:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                logger.warning("No suffix provided. Cannot resolve the column name conflict.")
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")
    logger.info(f"Adding {len(new_feature_cols)} new features from {source}.")

    if not new_feature_cols:
        logger.warning("No new features to add. Check the feature key and PSMs key.")
        logger.warning(f"Feature key: {feature_key}; PSMs key: {psms_key}")

    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    applied_suffix = "" if suffix is None else suffix
    final_feature_cols = [col + applied_suffix for col in new_feature_cols]
    # `rename` returns a new frame, so the caller's DataFrame stays untouched.
    renamed_features = features_df.rename(
        columns=dict(zip(new_feature_cols, final_feature_cols))
    )

    original_len = len(self._psms)
    merged = self._psms.merge(
        renamed_features, left_on=psms_key, right_on=feature_key, how="left"
    )

    # Validate before touching any state so a failed merge cannot leave the
    # container with extra rows or a dangling feature source.
    if len(merged) != original_len:
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate keys."
        )

    # Avoid keeping the right-hand merge key(s) in the PSM data.
    cols_to_drop = [
        col for col in feature_key if col not in psms_key and col in merged.columns
    ]
    if cols_to_drop:
        logger.debug(f"Dropping columns from feature_key not in psms_key: {cols_to_drop}")
        merged = merged.drop(columns=cols_to_drop)

    self._psms = merged
    self.rescoring_features[source] = final_feature_cols

`add_features_by_index(features_df, source, suffix=None)` ¶

Merge new features into the PSM DataFrame based on the DataFrame index.

Parameters:

Name	Type	Description	Default
`features_df`	`DataFrame`	DataFrame containing new features to add.	required
`source`	`str`	Name of the source of the new features.	required
`suffix`	`str`	Suffix to add to the new columns if there's a name conflict.	`None`

Source code in optimhc/psm_container.py

def add_features_by_index(
    self, features_df: pd.DataFrame, source: str, suffix: Optional[str] = None
) -> None:
    """
    Merge new features into the PSM DataFrame based on the DataFrame index.

    A left join on the index keeps every PSM row. The container is only
    updated after the merge has been validated, and ``features_df`` itself is
    never modified (the suffixed columns live on a renamed copy).

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add. All of its columns are
        treated as features.
    source : str
        Name of the source of the new features.
    suffix : str, optional
        Suffix appended to every new column. Required when at least one new
        column has the same name as an existing PSM column.

    Raises
    ------
    ValueError
        If duplicate columns exist without suffix.
        If ``features_df`` has no columns.
        If merging features changes the number of PSMs.
    """
    new_feature_cols = list(features_df.columns)
    for col in new_feature_cols:
        if col in self._psms.columns:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                logger.warning("No suffix provided. Cannot resolve the column name conflict.")
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")

    logger.info(f"Adding {len(new_feature_cols)} new features from {source} by index.")

    if not new_feature_cols:
        logger.warning("No new features to add.")
        raise ValueError("No new features to add.")

    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    applied_suffix = "" if suffix is None else suffix
    final_feature_cols = [col + applied_suffix for col in new_feature_cols]
    # Non-inplace rename: the caller's DataFrame must keep its column names.
    renamed_features = features_df.rename(
        columns=dict(zip(new_feature_cols, final_feature_cols))
    )

    original_len = len(self._psms)
    merged = self._psms.merge(
        renamed_features,
        left_index=True,
        right_index=True,
        how="left",  # Perform a left join to preserve all original PSM data
    )

    # Ensure that the merge did not change the number of rows before any
    # state is committed.
    if len(merged) != original_len:
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate indices."
        )

    self._psms = merged
    self.rescoring_features[source] = final_feature_cols

`add_results(results_df, psms_key, result_key)` ¶

Add results of rescore engine to the PSM DataFrame based on specified columns.

Parameters:

Name	Type	Description	Default
`results_df`	`DataFrame`	DataFrame containing new results to add.	required
`psms_key`	`str or list of str`	Column name(s) in the PSM data to merge on.	required
`result_key`	`str or list of str`	Column name(s) in the results data to merge on.	required

Source code in optimhc/psm_container.py

def add_results(
    self,
    results_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    result_key: Union[str, List[str]],
) -> None:
    """
    Add results of rescore engine to the PSM DataFrame based on specified columns.

    The results are left-joined onto the PSM data with a one-to-one
    validation. Container state (``self._psms`` and
    ``self.rescore_result_column``) is only updated after the merge succeeded.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing new results to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    result_key : str or list of str
        Column name(s) in the results data to merge on.

    Raises
    ------
    ValueError
        If the results data shares a non-key column name with the PSM data,
        or if the merge keys are not unique on both sides (raised by pandas
        through ``validate="one_to_one"``).

    Notes
    -----
    Key columns that carry the same name on both sides (paired at the same
    position of ``psms_key`` / ``result_key``) are merged into a single column
    by pandas; they are neither reported as duplicates nor dropped afterwards.
    Result key columns with a different name are removed after the merge.
    """
    if self.rescore_result_column is not None:
        logger.warning("Rescore result column already exists. Overwriting.")

    psms_keys = [psms_key] if isinstance(psms_key, str) else list(psms_key)
    result_keys = [result_key] if isinstance(result_key, str) else list(result_key)

    # Keys paired under the same name collapse into one column in the merge,
    # so they are not a real naming conflict.
    shared_keys = {left for left, right in zip(psms_keys, result_keys) if left == right}
    conflicts = (set(self._psms.columns) & set(results_df.columns)) - shared_keys
    if conflicts:
        raise ValueError(
            "Duplicate columns exist. Please rename the columns in the results data."
        )

    merged = self._psms.merge(
        results_df,
        left_on=psms_keys,
        right_on=result_keys,
        how="left",
        validate="one_to_one",
    )
    # Only the right-hand keys that were added as extra columns are removed.
    merged = merged.drop(columns=[key for key in result_keys if key not in shared_keys])

    # NOTE(review): the attribute stores the merge key, not the name of the
    # score column(s) - kept as in the original; confirm against its readers.
    self.rescore_result_column = result_key
    self._psms = merged
    logger.info("Added rescore results to PSM data.")

`write_pin(output_file, style='default', source=None)` ¶

Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats. The `style` parameter exists so that a unified PIN file can be produced for benchmarking different rescoring methods.

Parameters:

Name	Type	Description	Default
`output_file`	`str`	Path to the output PIN file.	required
`style`	`str`	If set to 'msbooster', outputs only the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins). If set to 'default', outputs all features specified in `rescoring_features`, plus required Percolator columns.	`'default'`
`source`	`list of str`	List of feature sources to include. If None, includes all sources.	`None`

Returns:

Type	Description
`DataFrame`	The DataFrame written to the PIN file.

Notes

The first three columns are always: SpecId, Label, ScanNr.
For 'msbooster' style, the columns are: SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins.
If hit_rank_column is not specified, rank is set to 1 for all rows.
Either 'hyperscore' or 'expect' must be present in features; for 'expect', the column is written as 'log10_evalue'.
The 'log10_evalue' column should contain the base-10 logarithm of the e-value.
The 'Peptide' column is formatted with underscores (e.g., _.PEPTIDE._).
For standard format, all features from rescoring_features are appended between ScanNr and Peptide columns.
The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.

Example output (default style): SpecId Label ScanNr feature1 feature2 ... Peptide Proteins

Example output (msbooster style): SpecId Label ScanNr retentiontime rank hyperscore Peptide Proteins or SpecId Label ScanNr retentiontime rank log10_evalue Peptide Proteins

Raises:

Type	Description
`ValueError`	If required columns are missing for the selected style.

Source code in optimhc/psm_container.py

def write_pin(
    self, output_file: str, style: str = "default", source: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats.
    The style parameter exists so that a unified PIN file can be produced for benchmarking different rescoring methods.

    Parameters
    ----------
    output_file : str
        Path to the output PIN file.
    style : str, optional
        If set to 'msbooster', the MSBooster columns (SpecId, Label, ScanNr, retentiontime, rank,
        hyperscore or log10_evalue) are written first, followed by the remaining selected features,
        Peptide and Proteins.
        If set to 'default', outputs all features specified in `rescoring_features`, plus required Percolator columns.
    source : list of str, optional
        List of feature sources to include. If None, includes all sources.

    Returns
    -------
    pd.DataFrame
        The DataFrame written to the PIN file.

    Notes
    -----
    - The first three columns are always: SpecId, Label, ScanNr.
    - If `hit_rank_column` is not specified, rank is set to 1 for all rows ('msbooster' style).
    - For 'msbooster' style either 'hyperscore' or 'expect' must be present in `feature_columns`;
      the 'expect' column is copied as-is into 'log10_evalue'.
    - In 'msbooster' style the 'Peptide' column is wrapped as `_.PEPTIDE._`.
    - For 'default' style, all selected features are placed between ScanNr and Peptide.
    - The 'Proteins' column is joined with ';' if stored as a list or tuple.
    - Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.
      Boolean labels and the strings "True"/"False" are mapped directly; anything else is
      cast to bool first.

    Example output (default style):
        SpecId	Label	ScanNr	feature1	feature2	...	Peptide	Proteins

    Example output (msbooster style):
        SpecId	Label	ScanNr	retentiontime	rank	hyperscore	Peptide	Proteins
        or
        SpecId	Label	ScanNr	retentiontime	rank	log10_evalue	Peptide	Proteins

    Raises
    ------
    ValueError
        If required columns are missing for the selected style, if `style` is unknown,
        or if a requested source is not registered in `rescoring_features`.
    """
    df = self._psms.copy()
    labels = df[self.label_column]
    # Case1: boolean labels (numpy bool or pandas nullable boolean)
    if pd.api.types.is_bool_dtype(labels):
        df["PercolatorLabel"] = labels.map({True: 1, False: -1})
    # Case2: string labels. Strings are usually stored with object dtype, which a
    # `dtype == "str"` comparison misses; casting them to bool would turn "False"
    # into a target because every non-empty string is truthy.
    elif labels.map(lambda value: isinstance(value, str)).all():
        df["PercolatorLabel"] = labels.map({"True": 1, "False": -1})
    else:
        # try to convert to bool
        logger.warning("Label column is not str or bool. Converting to bool.")
        df["PercolatorLabel"] = labels.astype(bool).map({True: 1, False: -1})
    logger.info("Writing PIN file to %s", output_file)
    logger.info("Using style: %s", style)

    feature_cols = []
    if source is None:
        for cols in self.rescoring_features.values():
            feature_cols.extend(cols)
    else:
        for s in source:
            if s not in self.rescoring_features:
                raise ValueError(f"Source '{s}' not found in rescoring features.")
            feature_cols.extend(self.rescoring_features[s])

    pin_df = pd.DataFrame()
    pin_df["SpecId"] = df[self.spectrum_column]
    pin_df["Label"] = df["PercolatorLabel"]
    pin_df["ScanNr"] = df[self.scan_column]

    if style == "msbooster":
        if self.retention_time_column:
            pin_df["retentiontime"] = df[self.retention_time_column]
        else:
            raise ValueError("Retention time column is required for msbooster style.")

        pin_df["rank"] = df[self.hit_rank_column].astype(int) if self.hit_rank_column else 1
        if "hyperscore" in self.feature_columns:
            pin_df["hyperscore"] = df["hyperscore"]
        elif "expect" in self.feature_columns:
            pin_df["log10_evalue"] = df["expect"]
        else:
            raise ValueError(
                "Either 'hyperscore' or 'expect' column is required for msbooster style."
            )

        # Add the other features, skipping the columns already written above.
        already_written = (
            "hyperscore",
            "expect",
            self.hit_rank_column,
            self.retention_time_column,
        )
        for col in feature_cols:
            if col not in already_written:
                pin_df[col] = df[col]

        # PEPTIDE -> _.PEPTIDE._
        # Add _. at the front and ._ at the end of the peptide column
        pin_df["Peptide"] = df[self.peptide_column].apply(
            lambda x: f"_.{x}._" if isinstance(x, str) else x
        )

    elif style == "default":
        for col in feature_cols:
            pin_df[col] = df[col]
        pin_df["Peptide"] = df[self.peptide_column]
    else:
        raise ValueError(f"Unknown style: {style}. Use 'msbooster' or 'default'.")

    pin_df["Proteins"] = df[self.protein_column].apply(
        lambda x: ";".join(x) if isinstance(x, (list, tuple)) else x
    )
    pin_df = self._convert_float_to_int(pin_df)
    pin_df.to_csv(output_file, sep="\t", index=False)
    logger.info("PIN file written to %s", output_file)
    return pin_df

`read_pepxml(pepxml_files, decoy_prefix='DECOY_')` ¶

Read PSMs from a list of PepXML files.

Parameters:

Name	Type	Description	Default
`pepxml_files`	`Union[str, List[str]]`	The file path to the PepXML file or a list of file paths.	required
`decoy_prefix`	`str`	The prefix used to indicate a decoy protein in the description lines of the FASTA file. Default is "DECOY_".	`'DECOY_'`

Returns:

Type	Description
`PsmContainer`	A PsmContainer object containing the PSM data.

Raises:

Type	Description
`ValueError`	If the PepXML files were generated by Percolator or PeptideProphet.

Notes

This function: 1. Reads and parses PepXML files 2. Calculates mass difference features 3. Processes matched ions and complementary ions 4. Creates charge columns 5. Log-transforms p-values 6. Returns a PsmContainer with the processed data

Source code in optimhc/parser/pepxml.py

def read_pepxml(pepxml_files, decoy_prefix="DECOY_"):
    """
    Load the PSMs of one or more PepXML files into a PsmContainer.

    Parameters
    ----------
    pepxml_files : Union[str, List[str]]
        A single PepXML path or a list of PepXML paths.
    decoy_prefix : str, optional
        Prefix marking decoy proteins in the FASTA description lines.
        Default is "DECOY_".

    Returns
    -------
    PsmContainer
        Container built from the parsed PSM table; every column that is not
        one of the fixed identifier/metadata columns is registered as a
        feature under the source name "Original".

    Raises
    ------
    ValueError
        If any of the Percolator result columns is present in the parsed data.

    Notes
    -----
    Processing order:
    1. Parse every file and concatenate the tables
    2. Add ``mass_diff`` and ``abs_mz_diff``
    3. Add ``matched_ions_ratio`` when both ion counts exist and no total is zero
    4. Replace ``num_matched_peptides`` by its base-10 logarithm
    5. Append one-hot charge columns
    6. Run ``_log_features`` over every column
    """
    proton_mass = 1.00727646677
    paths = [pepxml_files] if isinstance(pepxml_files, str) else pepxml_files
    psms = pd.concat([_parse_pepxml(path, decoy_prefix) for path in paths])

    # Refuse input that already went through Percolator or PeptideProphet.
    percolator_cols = {
        "Percolator q-Value",
        "Percolator PEP",
        "Percolator SVMScore",
    }
    if not percolator_cols.isdisjoint(psms.columns):
        raise ValueError(
            "The PepXML files appear to have generated by Percolator or "
            "PeptideProphet; hence, they should not be analyzed with mokapot."
        )

    charge = psms["charge"]

    # Mass difference, useful for open modification searches.
    psms["mass_diff"] = psms["exp_mass"] - psms["calc_mass"]

    # Absolute difference between observed and theoretical m/z.
    observed_mz = psms["exp_mass"] / charge + proton_mass
    theoretical_mz = psms["calc_mass"] / charge + proton_mass
    psms["abs_mz_diff"] = (observed_mz - theoretical_mz).abs()

    # Fraction of matched ions, skipped when any total would divide by zero.
    has_ion_counts = {"num_matched_ions", "tot_num_ions"}.issubset(psms.columns)
    if has_ion_counts and (psms["tot_num_ions"] != 0).all():
        psms["matched_ions_ratio"] = psms["num_matched_ions"] / psms["tot_num_ions"]

    # Number of candidate peptides on a log scale.
    if "num_matched_peptides" in psms.columns:
        psms["num_matched_peptides"] = np.log10(psms["num_matched_peptides"])

    # One-hot encode the charge state; the original charge column is kept.
    charge_dummies = pd.get_dummies(psms["charge"], prefix="charge")
    psms = pd.concat([psms, charge_dummies], axis=1)

    # Columns that identify or describe a PSM and are never used as features.
    metadata_cols = (
        "ms_data_file",
        "scan",
        "spectrum",
        "label",
        "calc_mass",  # Retain calc_mass for FlashLFQ compatibility
        "peptide",
        "proteins",
        "charge",
        "retention_time",
    )
    feat_cols = [name for name in psms.columns if name not in metadata_cols]
    psms = psms.apply(_log_features, features=feat_cols)

    return PsmContainer(
        psms=psms,
        label_column="label",
        scan_column="scan",
        spectrum_column="spectrum",
        ms_data_file_column="ms_data_file",
        peptide_column="peptide",
        protein_column="proteins",
        charge_column="charge",
        rescoring_features={"Original": feat_cols},
        hit_rank_column="rank",
        retention_time_column="retention_time",
        calculated_mass_column="calc_mass",
    )

`_parse_pepxml(pepxml_file, decoy_prefix)` ¶

Parse the PSMs of a PepXML into a DataFrame.

Parameters:

Name	Type	Description	Default
`pepxml_file`	`str`	The PepXML file to parse.	required
`decoy_prefix`	`str`	The prefix used to indicate a decoy protein in the description lines of the FASTA file.	required

Returns:

Type	Description
`DataFrame`	A DataFrame containing the information about each PSM.

Raises:

Type	Description
`ValueError`	If the file is not a PepXML file or is malformed.

Source code in optimhc/parser/pepxml.py

def _parse_pepxml(pepxml_file, decoy_prefix):
    """
    Build a DataFrame with one row per PSM found in a PepXML file.

    Parameters
    ----------
    pepxml_file : str
        The PepXML file to parse.
    decoy_prefix : str
        Prefix marking decoy proteins in the FASTA description lines.

    Returns
    -------
    pandas.DataFrame
        One record per PSM; ``ms_data_file`` is stored as a categorical column.

    Raises
    ------
    ValueError
        If an XML syntax error is hit while the file is being consumed.
    """
    logger.info("Reading %s...", pepxml_file)
    run_events = etree.iterparse(str(pepxml_file), tag="{*}msms_run_summary")
    # Lazy pipeline: runs -> spectra -> PSM dicts. Nothing is read from the
    # file until the DataFrame constructor pulls the records.
    per_run = (
        _parse_msms_run(run_event, decoy_prefix=decoy_prefix) for run_event in run_events
    )
    flatten = itertools.chain.from_iterable
    try:
        per_spectrum = flatten(per_run)
        df = pd.DataFrame.from_records(flatten(per_spectrum))
        df["ms_data_file"] = df["ms_data_file"].astype("category")
    except etree.XMLSyntaxError:
        raise ValueError(f"{pepxml_file} is not a PepXML file or is malformed.")
    return df

`_parse_msms_run(msms_run, decoy_prefix)` ¶

Parse a single MS/MS run.

Parameters:

Name	Type	Description	Default
`msms_run`	`tuple of anything, lxml.etree.Element`	The second element of the tuple should be the XML element for a single msms_run. The first is not used, but is necessary for compatibility with `map()`.	required
`decoy_prefix`	`str`	The prefix used to indicate a decoy protein in the description lines of the FASTA file.	required

Yields:

Type	Description
`dict`	A dictionary describing all of the PSMs in a run.

Source code in optimhc/parser/pepxml.py

def _parse_msms_run(msms_run, decoy_prefix):
    """
    Iterate over the spectra of one MS/MS run.

    Parameters
    ----------
    msms_run : tuple of anything, lxml.etree.Element
        Pair whose second item is the XML element of a single msms_run.
        The first item is ignored; the pair shape matches what the XML
        event iterator hands over.
    decoy_prefix : str
        Prefix marking decoy proteins in the FASTA description lines.

    Yields
    ------
    generator
        For every ``spectrum_query`` element, the result of
        ``_parse_spectrum`` for that spectrum.
    """
    run_element = msms_run[1]
    base_name = run_element.get("base_name")
    extension = run_element.get("raw_data")
    # Append the raw-data extension unless the base name already carries it.
    data_file = base_name if base_name.endswith(extension) else base_name + extension

    shared_info = {"ms_data_file": data_file}
    yield from (
        _parse_spectrum(query, shared_info, decoy_prefix)
        for query in run_element.iter("{*}spectrum_query")
    )

`_parse_spectrum(spectrum, run_info, decoy_prefix)` ¶

Parse the PSMs for a single mass spectrum.

Parameters:

Name	Type	Description	Default
`spectrum`	`Element`	The XML element for a single spectrum.	required
`run_info`	`dict`	The parsed run data.	required
`decoy_prefix`	`str`	The prefix used to indicate a decoy protein in the description lines of the FASTA file.	required

Yields:

Type	Description
`dict`	A dictionary describing all of the PSMs for a spectrum.

Source code in optimhc/parser/pepxml.py

def _parse_spectrum(spectrum, run_info, decoy_prefix):
    """
    Iterate over the PSMs reported for one spectrum.

    Parameters
    ----------
    spectrum : lxml.etree.Element
        The XML element for a single spectrum.
    run_info : dict
        The parsed run data; it is copied, not modified.
    decoy_prefix : str
        Prefix marking decoy proteins in the FASTA description lines.

    Yields
    ------
    dict
        The result of ``_parse_psm`` for every ``search_hit`` found below the
        ``search_result`` elements of the spectrum.
    """
    spec_info = run_info.copy()
    # Spectrum-level attributes shared by every hit of this spectrum.
    spec_info.update(
        spectrum=str(spectrum.get("spectrum")),
        scan=int(spectrum.get("end_scan")),
        charge=int(spectrum.get("assumed_charge")),
        retention_time=float(spectrum.get("retention_time_sec")),
        exp_mass=float(spectrum.get("precursor_neutral_mass")),
    )
    yield from (
        _parse_psm(hit, spec_info, decoy_prefix=decoy_prefix)
        for result in spectrum.iter("{*}search_result")
        for hit in result.iter("{*}search_hit")
    )

`_parse_psm(psm_info, spec_info, decoy_prefix)` ¶

Parse a single PSM.

Parameters:

Name	Type	Description	Default
`psm_info`	`Element`	The XML element containing information about the PSM.	required
`spec_info`	`dict`	The parsed spectrum data.	required
`decoy_prefix`	`str`	The prefix used to indicate a decoy protein in the description lines of the FASTA file.	required

Returns:

Type	Description
`dict`	A dictionary containing parsed data about the PSM.

Source code in optimhc/parser/pepxml.py

def _parse_psm(psm_info, spec_info, decoy_prefix):
    """
    Parse a single PSM.

    Parameters
    ----------
    psm_info : lxml.etree.Element
        The XML element containing information about the PSM.
    spec_info : dict
        The parsed spectrum data.
    decoy_prefix : str
        The prefix used to indicate a decoy protein in the description lines
        of the FASTA file.

    Returns
    -------
    dict
        A dictionary containing parsed data about the PSM.
    """
    psm = spec_info.copy()
    psm["calc_mass"] = float(psm_info.get("calc_neutral_pep_mass"))
    psm["peptide"] = psm_info.get("peptide")
    psm["proteins"] = [psm_info.get("protein").split(" ")[0]]
    psm["label"] = not psm["proteins"][0].startswith(decoy_prefix)
    psm["rank"] = int(psm_info.get("hit_rank"))

    # Begin features:
    try:
        psm["missed_cleavages"] = int(psm_info.get("num_missed_cleavages"))
    except TypeError:
        pass

    try:
        psm["ntt"] = int(psm_info.get("num_tol_term"))
    except TypeError:
        pass

    try:
        psm["num_matched_peptides"] = int(psm_info.get("num_matched_peptides"))
    except TypeError:
        pass

    try:
        psm["num_matched_ions"] = int(psm_info.get("num_matched_ions"))
    except TypeError:
        pass

    try:
        psm["tot_num_ions"] = int(psm_info.get("tot_num_ions"))
    except TypeError:
        pass

    queries = [
        "{*}modification_info",
        "{*}search_score",
        "{*}alternative_protein",
    ]

    # TODO: Directly read modification_info
    # TODO: The current code can not parse the n-terminal modification
    for element in psm_info.iter(*queries):
        if "modification_info" in element.tag:
            offset = 0
            mod_pep = psm["peptide"]
            for mod in element.iter("{*}mod_aminoacid_mass"):
                idx = offset + int(mod.get("position"))
                mass = mod.get("mass")
                mod_pep = mod_pep[:idx] + "[" + mass + "]" + mod_pep[idx:]
                offset += 2 + len(mass)

            psm["peptide"] = mod_pep

        elif "alternative_protein" in element.tag:
            psm["proteins"].append(element.get("protein").split(" ")[0])
            if not psm["label"]:
                psm["label"] = not psm["proteins"][-1].startswith(decoy_prefix)

        else:
            psm[element.get("name")] = element.get("value")

    psm["proteins"] = "\t".join(psm["proteins"])
    return psm

`_log_features(col, features)` ¶

Log-transform columns that are p-values or E-values.

Parameters:

Name	Type	Description	Default
`col`	`Series`	A column of the dataset.	required
`features`	`list of str`	The features of the dataset. Only feature columns will be considered for transformation.	required

Returns:

Type	Description
`Series`	The log-transformed values of the column if the feature was determined to be a p-value.

Notes

This function: 1. Detects columns written in scientific notation and log-transforms them 2. Uses a simple heuristic to find p-value / E-value features 3. Only transforms if values span >4 orders of magnitude 4. Preserves precision for scientific notation values

Source code in optimhc/parser/pepxml.py

def _log_features(col, features):
    """
    Log-transform columns that are p-values or E-values.

    Parameters
    ----------
    col : pandas.Series
        A column of the dataset.
    features : list of str
        The features of the dataset. Only feature columns will be considered
        for transformation.

    Returns
    -------
    pandas.Series
        The log-transformed values of the column if the feature was determined
        to be a p-value.

    Notes
    -----
    This function:
    1. Detects columns written in scientific notation and log them
    2. Uses a simple heuristic to find p-value / E-value features
    3. Only transforms if values span >4 orders of magnitude
    4. Preserves precision for scientific notation values
    """
    if col.name not in features:
        return col
    elif col.dtype == "bool":
        return col.astype(float)

    col = col.astype(str).str.lower()

    # Detect columns written in scientific notation and log them:
    # This is specifically needed to preserve precision.
    if col.str.contains("e").any() and (col.astype(float) > 0).all():
        split = col.str.split("e", expand=True)
        root = split.loc[:, 0]
        root = root.astype(float)
        power = split.loc[:, 1]
        power[pd.isna(power)] = "0"
        power = power.astype(int)

        zero_idx = root == 0
        root[zero_idx] = 1
        power[zero_idx] = power[~zero_idx].min()
        diff = power.max() - power.min()
        if abs(diff) >= 4:
            logger.info("  - log-transformed the '%s' feature.", col.name)
            return np.log10(root) + power
        else:
            return col.astype(float)

    col = col.astype(float)

    # A simple heuristic to find p-value / E-value features:
    # Non-negative:
    if col.min() >= 0:
        # Make sure this isn't a binary column:
        if not np.array_equal(col.values, col.values.astype(bool)):
            # Only log if values span >4 orders of magnitude,
            # excluding values that are exactly zero:
            zero_idx = col == 0
            col_min = col[~zero_idx].min()
            if col.max() / col_min >= 10000:
                col[~zero_idx] = np.log10(col[~zero_idx])
                col[zero_idx] = col[~zero_idx].min() - 1
                logger.info("  - log-transformed the '%s' feature.", col.name)

    return col

PIN Parser¶

`pin` ¶

`logger = logging.getLogger(name)` `module-attribute` ¶

`PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)` ¶

A container for managing peptide-spectrum matches (PSMs) in immunopeptidomics rescoring pipelines.

Parameters:

Name	Type	Description	Default
`psms`	`DataFrame`	DataFrame containing the PSM data.	required
`label_column`	`str`	Column containing the label (True for target, False for decoy).	required
`scan_column`	`str`	Column containing the scan number.	required
`spectrum_column`	`str`	Column containing the spectrum identifier.	required
`ms_data_file_column`	`str`	Column containing the MS data file that the PSM originated from.	required
`peptide_column`	`str`	Column containing the peptide sequence.	required
`protein_column`	`str`	Column containing the protein accessions.	required
`rescoring_features`	`dict of str to list of str`	Dictionary of feature columns for rescoring.	required
`hit_rank_column`	`str`	Column containing the hit rank.	`None`
`charge_column`	`str`	Column containing the charge state.	`None`
`retention_time_column`	`str`	Column containing the retention time.	`None`
`calculated_mass_column`	`str`	Column containing the calculated mass.	`None`
`metadata_column`	`str`	Column containing metadata.	`None`

Attributes:

Name	Type	Description
`psms`	`DataFrame`	Copy of the DataFrame containing the PSM data.
`target_psms`	`DataFrame`	DataFrame containing only target PSMs (label = True).
`decoy_psms`	`DataFrame`	DataFrame containing only decoy PSMs (label = False).
`peptides`	`list of str`	List containing all peptides from the PSM data.
`columns`	`list of str`	List of column names in the PSM DataFrame.
`rescoring_features`	`dict of str to list of str`	Dictionary of rescoring feature columns in the PSM DataFrame.

Source code in optimhc/psm_container.py

def __init__(
    self,
    psms: pd.DataFrame,
    label_column: str,
    scan_column: str,
    spectrum_column: str,
    ms_data_file_column: str,
    peptide_column: str,
    protein_column: str,
    rescoring_features: Dict[str, List[str]],
    hit_rank_column: Optional[str] = None,
    charge_column: Optional[str] = None,
    retention_time_column: Optional[str] = None,
    calculated_mass_column: Optional[str] = None,
    metadata_column: Optional[str] = None,
):
    """
    Build a container around a private copy of the PSM table and validate it.

    Parameters
    ----------
    psms : pd.DataFrame
        DataFrame containing the PSM data. It is copied and re-indexed from 0.
    label_column : str
        Boolean column (True for target, False for decoy).
    scan_column : str
        Column containing the scan number.
    spectrum_column : str
        Column containing the spectrum identifier.
    ms_data_file_column : str
        Column containing the MS data file the PSM originated from.
    peptide_column : str
        Column containing the peptide sequence.
    protein_column : str
        Column containing the protein accessions.
    rescoring_features : dict of str to list of str
        Feature columns grouped by source. The mapping and its lists are
        copied, so later edits made by the container do not leak back into
        the caller's dictionary.
    hit_rank_column, charge_column, retention_time_column, calculated_mass_column : str, optional
        Optional column names; validated only when given.
    metadata_column : str, optional
        Column whose cells must all be dictionaries.

    Raises
    ------
    ValueError
        If a named column is missing, the label column is not boolean, the
        data contain only targets or only decoys, the metadata column holds
        non-dict values, or a rescoring feature column is missing.
    """
    # Private, zero-based copy: the caller's frame is never modified.
    self._psms = psms.copy()
    self._psms.reset_index(drop=True, inplace=True)
    self.label_column = label_column
    self.scan_column = scan_column
    self.spectrum_column = spectrum_column
    self.ms_data_file_column = ms_data_file_column
    self.peptide_column = peptide_column
    self.protein_column = protein_column
    self.hit_rank_column = hit_rank_column
    self.retention_time_column = retention_time_column
    self.metadata_column = metadata_column
    # Copy the mapping AND its lists: this attribute is mutated in place when
    # features are dropped or added, which must not edit the caller's dict.
    self.rescoring_features = {
        feature_source: list(cols) for feature_source, cols in rescoring_features.items()
    }
    self.charge_column = charge_column
    self.calculated_mass_column = calculated_mass_column
    # Set once rescoring results have been merged in.
    self.rescore_result_column = None

    def require_column(col):
        # Optional columns arrive as None and are skipped.
        if col and col not in psms.columns:
            raise ValueError(f"Column '{col}' not found in PSM data.")

    for column_name in (
        label_column,
        scan_column,
        spectrum_column,
        ms_data_file_column,
        peptide_column,
        protein_column,
        hit_rank_column,
        retention_time_column,
        charge_column,
        calculated_mass_column,
    ):
        require_column(column_name)

    # The label column must be boolean and contain both classes.
    labels = psms[label_column]
    if labels.dtype != "bool":
        raise ValueError(f"Column '{label_column}' must be boolean.")
    if labels.nunique() == 1:
        if labels.iloc[0]:
            raise ValueError("All PSMs are labeled as target. No decoy PSMs found.")
        raise ValueError("All PSMs are labeled as decoy. No target PSMs found.")

    if metadata_column:
        require_column(metadata_column)
        if not all(isinstance(cell, dict) for cell in self._psms[metadata_column]):
            raise ValueError(f"Column '{metadata_column}' must contain dictionaries.")

    for feature_source, cols in self.rescoring_features.items():
        for col in cols:
            if col not in psms.columns:
                raise ValueError(
                    f"Column '{col}' not found in PSM data for feature '{feature_source}'."
                )

    # Still reachable for an empty table, where nunique() is 0.
    if len(self.decoy_psms) == 0:
        logger.error("No decoy PSMs found. Please check the decoy prefix.")
        raise ValueError("No decoy PSMs found.")

    logger.info("PsmContainer initialized with %d PSM entries.", len(self._psms))
    if self.ms_data_file_column:
        logger.info(
            "PSMs originated from %d MS data file(s).",
            len(self._psms[ms_data_file_column].unique()),
        )
    logger.info("target psms: %d", len(self.target_psms))
    logger.info("decoy psms: %d", len(self.decoy_psms))
    logger.info("unique peptides: %d", len(np.unique(self.peptides)))
    logger.info("rescoring features: %s", rescoring_features)

`psms` `property` ¶

Get a copy of the PSM DataFrame to prevent external modification.

Returns:

Type	Description
`DataFrame`	A copy of the PSM DataFrame.

`target_psms` `property` ¶

Get a DataFrame containing only target PSMs.

Returns:

Type	Description
`DataFrame`	DataFrame with only target PSMs (label = True).

`decoy_psms` `property` ¶

Get a DataFrame containing only decoy PSMs.

Returns:

Type	Description
`DataFrame`	DataFrame with only decoy PSMs (label = False).

`columns` `property` ¶

Get the column names of the PSM DataFrame.

Returns:

Type	Description
`list of str`	List of column names.

`feature_columns` `property` ¶

Get a list of all feature columns in the PSM DataFrame.

Returns:

Type	Description
`list of str`	List of feature column names.

`feature_sources` `property` ¶

Get a list of all feature sources in the PSM DataFrame.

Returns:

Type	Description
`list of str`	List of feature source names.

`peptides` `property` ¶

Get the peptide sequences from the PSM data.

Returns:

Type	Description
`list of str`	List of peptide sequences.

`ms_data_files` `property` ¶

Get the MS data files from the PSM data.

Returns:

Type	Description
`list of str`	List of MS data file names.

`scan_ids` `property` ¶

Get the scan numbers from the PSM data.

Returns:

Type	Description
`list of int`	List of scan numbers.

`charges` `property` ¶

Get the charge states from the PSM data.

Returns:

Type	Description
`list of int`	List of charge states.

`metadata` `property` ¶

Get the metadata from the PSM data.

Returns:

Type	Description
`Series`	Series containing metadata for each PSM.

`spectrum_ids` `property` ¶

Get the spectrum identifiers from the PSM data.

Returns:

Type	Description
`list of str`	List of spectrum identifiers.

`identifier_columns` `property` ¶

Get the columns that uniquely identify each PSM.

Returns:

Type	Description
`list of str`	List of identifier column names.

`__len__()` ¶

Get the number of PSMs in the container.

Returns:

Type	Description
`int`	Number of PSMs.

Source code in optimhc/psm_container.py

def __len__(self) -> int:
    """
    Report how many PSMs the container currently holds.

    Returns
    -------
    int
        Length of the internal PSM table.
    """
    psm_count = len(self._psms)
    return psm_count

`copy()` ¶

Return a deep copy of the PsmContainer object.

Returns:

Type	Description
`PsmContainer`	A deep copy of the current PsmContainer.

Source code in optimhc/psm_container.py

def copy(self) -> "PsmContainer":
    """
    Create an independent duplicate of this container.

    Returns
    -------
    PsmContainer
        A deep copy; nothing is shared with the original object.
    """
    from copy import deepcopy

    duplicate = deepcopy(self)
    return duplicate

`__repr__()` ¶

Return a string representation of the PsmContainer.

Returns:

Type	Description
`str`	String summary of the PsmContainer.

Source code in optimhc/psm_container.py

def __repr__(self) -> str:
    """
    Summarise the container as a short multi-line string.

    Returns
    -------
    str
        Total, target and decoy PSM counts, the number of distinct peptides
        and spectra, and the rescoring feature mapping.
    """
    total = len(self)
    n_targets = len(self.target_psms)
    n_decoys = len(self.decoy_psms)
    n_peptides = len(np.unique(self.peptides))
    n_spectra = len(self._psms[self.spectrum_column].unique())

    summary = f"PsmContainer with {total} PSMs\n"
    summary += f"\t - Target PSMs: {n_targets}\n"
    summary += f"\t - Decoy PSMs: {n_decoys}\n"
    summary += f"\t - Unique Peptides: {n_peptides}\n"
    summary += f"\t - Unique Spectra: {n_spectra}\n"
    summary += f"\t - Rescoring Features: {self.rescoring_features}\n"
    return summary

`drop_features(features)` ¶

Drop specified features from the PSM DataFrame.

Parameters:

Name	Type	Description	Default
`features`	`list of str`	List of feature column names to drop.	required

Raises:

Type	Description
`ValueError`	If any of the features do not exist in the DataFrame.

Source code in optimhc/psm_container.py

def drop_features(self, features: List[str]) -> None:
    """
    Drop specified features from the PSM DataFrame.

    The columns are removed from the PSM table and from every source listed
    in ``rescoring_features``. Sources left without any feature are removed
    from the mapping.

    Parameters
    ----------
    features : list of str
        Feature column names to drop. A single name given as a plain string
        is treated as a one-element list.

    Raises
    ------
    ValueError
        If any of the features do not exist in the DataFrame.
    """
    if isinstance(features, str):
        # Without this a bare string would be iterated character by character.
        features = [features]
    # Snapshot the names: the caller may pass one of the lists stored in
    # rescoring_features, which is replaced below.
    to_drop = list(features)

    missing_features = [f for f in to_drop if f not in self._psms.columns]
    if missing_features:
        raise ValueError(f"Features not found in PSM data: {missing_features}")

    self._psms.drop(columns=to_drop, inplace=True)

    dropped = set(to_drop)  # O(1) membership while filtering each source
    emptied_sources = []
    for source, cols in list(self.rescoring_features.items()):
        remaining = [col for col in cols if col not in dropped]
        self.rescoring_features[source] = remaining
        if not remaining:
            emptied_sources.append(source)

    # Only report when something is actually removed.
    if emptied_sources:
        logger.info(
            f"Sources to be removed: {emptied_sources}. Because all the features are removed."
        )
    for source in emptied_sources:
        del self.rescoring_features[source]

`drop_source(source)` ¶

Drop all features associated with a specific source from the PSM DataFrame.

Parameters:

Name	Type	Description	Default
`source`	`str`	Name of the source to drop.	required

Raises:

Type	Description
`ValueError`	If the source does not exist in the rescoring features.

Source code in optimhc/psm_container.py

def drop_source(self, source: str) -> None:
    """
    Remove every feature column registered under one source.

    Parameters
    ----------
    source : str
        Key in ``rescoring_features`` whose columns should be dropped.

    Raises
    ------
    ValueError
        If ``source`` is not a key of ``rescoring_features``.
    """
    registered = self.rescoring_features
    if source in registered:
        self.drop_features(registered[source])
        return
    raise ValueError(f"Source '{source}' not found in rescoring features.")

`add_metadata(metadata_df, psms_key, metadata_key, source)` ¶

Merge new metadata into the PSM DataFrame based on specified columns. Metadata from the specified source is stored as a nested dictionary inside the metadata column.

Parameters:

Name	Type	Description	Default
`metadata_df`	`DataFrame`	DataFrame containing new metadata to add.	required
`psms_key`	`str or list of str`	Column name(s) in the PSM data to merge on.	required
`metadata_key`	`str or list of str`	Column name(s) in the metadata data to merge on.	required
`source`	`str`	Name of the source of the new metadata.	required

Source code in optimhc/psm_container.py

def add_metadata(
    self,
    metadata_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    metadata_key: Union[str, List[str]],
    source: str,
) -> None:
    """
    Merge new metadata into the PSM DataFrame based on specified columns.

    Every non-key column of ``metadata_df`` is stored, per PSM, inside the
    metadata column as ``{source: {column: value, ...}}``. If the source is
    already present for a PSM, the new values are merged over the existing
    ones. PSMs without a match in ``metadata_df`` receive missing values.

    Parameters
    ----------
    metadata_df : pd.DataFrame
        DataFrame containing new metadata to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    metadata_key : str or list of str
        Column name(s) in the metadata data to merge on.
    source : str
        Name of the source of the new metadata.

    Raises
    ------
    ValueError
        If the merge changes the number of PSMs (duplicate keys in
        ``metadata_df``).
    """
    if self.metadata_column is None:
        logger.info("No existing metadata column. Creating new metadata column.")
        self.metadata_column = "metadata"
        self._psms["metadata"] = [{} for _ in range(len(self._psms))]
    # Use the configured column; it is not necessarily named "metadata".
    meta_col = self.metadata_column

    # Normalise keys to lists so `in` is a membership test, not a substring test.
    left_keys = [psms_key] if isinstance(psms_key, str) else list(psms_key)
    right_keys = [metadata_key] if isinstance(metadata_key, str) else list(metadata_key)
    metadata_cols = [col for col in metadata_df.columns if col not in right_keys]

    # Only the key columns of the PSM table are needed for the lookup; the
    # suffix setting leaves the metadata column names untouched on a clash.
    merged_df = self._psms[left_keys].merge(
        metadata_df,
        left_on=left_keys,
        right_on=right_keys,
        how="left",
        suffixes=("_psm", None),
    )
    if len(merged_df) != len(self._psms):
        raise ValueError(
            "Merging metadata resulted in a change in the number of PSMs. "
            "Check for duplicate keys."
        )

    # Look inside the per-PSM dictionaries (a Series `in` test checks the index).
    if any(source in entry for entry in self._psms[meta_col]):
        logger.warning(f"{source} already exists in metadata. Overwriting.")

    if not metadata_cols:
        logger.warning("No metadata columns to add. Check the metadata key.")
        return

    new_records = merged_df[metadata_cols].to_dict(orient="records")
    updated = [
        {**existing, source: {**existing.get(source, {}), **fresh}}
        for existing, fresh in zip(self._psms[meta_col], new_records)
    ]
    # A plain list is written back by position, so a non-contiguous index on
    # the PSM table cannot misalign the values.
    self._psms[meta_col] = updated

`get_top_hits(n=1)` ¶

Get the top n hits based on the hit rank column. If the hit rank column is not specified, a copy of the container with all PSMs is returned.

Parameters:

Name	Type	Description	Default
`n`	`int`	The number of top hits to return. Default is 1.	`1`

Returns:

Type	Description
`PsmContainer`	A new PsmContainer object containing the top n hits.

Source code in optimhc/psm_container.py

def get_top_hits(self, n: int = 1):
    """
    Return a copy of the container restricted to the best-ranked hits.

    Parameters
    ----------
    n : int, optional
        Highest hit rank to keep (rows whose rank is ``<= n``). Default is 1.

    Returns
    -------
    PsmContainer
        A copied container holding only the rows with rank ``<= n``. When no
        hit rank column is configured, an unfiltered copy is returned and a
        warning is logged.
    """
    rank_column = self.hit_rank_column
    if rank_column is not None:
        top_hits = self.copy()
        frame = top_hits._psms
        top_hits._psms = frame[frame[rank_column] <= n]
        return top_hits

    logger.warning("Rank column not specified. Return the original PSMs.")
    return self.copy()

`add_features(features_df, psms_key, feature_key, source, suffix=None)` ¶

Merge new features into the PSM DataFrame based on specified columns.

This method performs a left join between the PSM data and feature data, ensuring that all PSMs are preserved while adding new features. It handles column name conflicts through optional suffixing and maintains feature source tracking.

Parameters:

Name	Type	Description	Default
`features_df`	`DataFrame`	DataFrame containing new features to add.	required
`psms_key`	`str or list of str`	Column name(s) in the PSM data to merge on.	required
`feature_key`	`str or list of str`	Column name(s) in the features data to merge on.	required
`source`	`str`	Name of the source of the new features (e.g., 'deeplc', 'netmhc').	required
`suffix`	`str`	Suffix to add to the new columns if there's a name conflict. Required when new feature columns have the same names as existing columns. For example, if adding features from different sources (e.g., 'score' from DeepLC and NetMHC), use suffixes like '_deeplc' or '_netmhc' to distinguish them.	`None`

Returns:

Type	Description
`None`

Raises:

Type	Description
`ValueError`	If duplicate columns exist without suffix. If merging features changes the number of PSMs.

Notes

The method follows these steps: 1. Validates input and prepares merge keys 2. Checks for column name conflicts 3. Manages feature source: if the source already exists, it will be overwritten 4. Performs left join merge 5. Verifies data integrity

Suffix Usage

The suffix parameter is used to handle column name conflicts: - When adding features from different sources that might have the same column names - When you want to keep both the original and new features with the same name - When you need to track the source of features in the column names

If suffix is not provided and there are duplicate column names: - The method will raise a ValueError - You must either provide a suffix or rename the columns before adding

Examples:

>>> container = PsmContainer(...)
>>> # Adding features without suffix (no conflicts)
>>> features_df1 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'feature1': [0.1, 0.2, 0.3],
...     'feature2': [0.4, 0.5, 0.6]
... })
>>> container.add_features(
...     features_df1,
...     psms_key='scan',
...     feature_key='scan',
...     source='source1'
... )
>>> # Adding features with suffix (handling conflicts)
>>> features_df2 = pd.DataFrame({
...     'scan': [1, 2, 3],
...     'score': [0.8, 0.9, 0.7],  # This would conflict with existing 'score'
...     'feature3': [0.7, 0.8, 0.9]
... })
>>> container.add_features(
...     features_df2,
...     psms_key='scan',
...     feature_key='scan',
...     source='source2',
...     suffix='_new'  # 'score' becomes 'score_new'
... )

Source code in optimhc/psm_container.py

def add_features(
    self,
    features_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    feature_key: Union[str, List[str]],
    source: str,
    suffix: Optional[str] = None,
) -> None:
    """Merge new features into the PSM DataFrame based on specified columns.

    A left join between the PSM data and ``features_df`` keeps every PSM and
    appends the new feature columns. The new columns are registered under
    ``source`` in ``rescoring_features``.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add. It is not modified.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    feature_key : str or list of str
        Column name(s) in the features data to merge on.
    source : str
        Name of the source of the new features (e.g., 'deeplc', 'netmhc').
        If the source is already registered, its columns are replaced.
    suffix : str, optional
        Suffix appended to every new feature column. Required when a new
        feature column has the same name as an existing PSM column that does
        not belong to the source being replaced.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a new column duplicates an existing one and no suffix is given.
        If a column still duplicates an existing one after suffixing.
        If merging features changes the number of PSMs.

    Notes
    -----
    The method follows these steps:
    1. Normalises the merge keys and determines the new feature columns
    2. Checks for column name conflicts, ignoring columns of a source that
       is about to be replaced
    3. Performs the left join on a working copy and verifies the row count
    4. Only then drops the old source (if any) and commits the result, so a
       failed call leaves the container unchanged

    Examples
    --------
    >>> container = PsmContainer(...)
    >>> features_df = pd.DataFrame({
    ...     'scan': [1, 2, 3],
    ...     'score': [0.8, 0.9, 0.7],  # conflicts with an existing 'score'
    ...     'feature3': [0.7, 0.8, 0.9]
    ... })
    >>> container.add_features(
    ...     features_df,
    ...     psms_key='scan',
    ...     feature_key='scan',
    ...     source='source2',
    ...     suffix='_new'  # columns become 'score_new' and 'feature3_new'
    ... )
    """
    psms_key = [psms_key] if isinstance(psms_key, str) else list(psms_key)
    feature_key = [feature_key] if isinstance(feature_key, str) else list(feature_key)

    new_feature_cols = [col for col in features_df.columns if col not in feature_key]
    applied_suffix = "" if suffix is None else suffix
    renamed = {col: col + applied_suffix for col in new_feature_cols}
    final_cols = [renamed[col] for col in new_feature_cols]

    # Columns owned by the source being replaced will be dropped before the
    # merge, so they must not count as conflicts.
    replaced_cols = list(self.rescoring_features.get(source, []))
    occupied = set(self._psms.columns).difference(replaced_cols)

    for col in new_feature_cols:
        if col in occupied:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")

    still_clashing = [name for name in final_cols if name in occupied]
    if still_clashing:
        raise ValueError(
            f"Columns {still_clashing} already exist in PSM data after applying "
            f"suffix '{suffix}'. Choose a different suffix."
        )

    logger.info(f"Adding {len(new_feature_cols)} new features from {source}.")

    if not new_feature_cols:
        logger.warning("No new features to add. Check the feature key and PSMs key.")
        logger.warning(f"Feature key: {feature_key}; PSMs key: {psms_key}")

    # Merge on a working frame first; nothing on `self` changes until the
    # result has been validated.
    base = (
        self._psms.drop(columns=replaced_cols, errors="ignore")
        if replaced_cols
        else self._psms
    )
    merged = base.merge(
        features_df.rename(columns=renamed),
        left_on=psms_key,
        right_on=feature_key,
        how="left",
    )
    if len(merged) != len(base):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate keys."
        )

    # Key columns that exist only in features_df are join helpers, not features.
    right_only_keys = [
        col for col in feature_key if col not in psms_key and col in merged.columns
    ]
    if right_only_keys:
        logger.debug(f"Dropping columns from feature_key not in psms_key: {right_only_keys}")
        merged = merged.drop(columns=right_only_keys)

    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    self._psms = merged
    self.rescoring_features[source] = final_cols

`add_features_by_index(features_df, source, suffix=None)` ¶

Merge new features into the PSM DataFrame based on the DataFrame index.

Parameters:

Name	Type	Description	Default
`features_df`	`DataFrame`	DataFrame containing new features to add.	required
`source`	`str`	Name of the source of the new features.	required
`suffix`	`str`	Suffix to add to the new columns if there's a name conflict.	`None`

Source code in optimhc/psm_container.py

def add_features_by_index(
    self, features_df: pd.DataFrame, source: str, suffix: Optional[str] = None
) -> None:
    """
    Merge new features into the PSM DataFrame based on the DataFrame index.

    Every column of ``features_df`` is treated as a feature and left-joined
    onto the PSM table by index. The new columns are registered under
    ``source`` in ``rescoring_features``; an already registered source is
    replaced. ``features_df`` itself is not modified.

    Parameters
    ----------
    features_df : pd.DataFrame
        DataFrame containing new features to add.
    source : str
        Name of the source of the new features.
    suffix : str, optional
        Suffix appended to every new column. Required when a new column has
        the same name as an existing PSM column that does not belong to the
        source being replaced.

    Raises
    ------
    ValueError
        If a new column duplicates an existing one and no suffix is given,
        if a column still duplicates an existing one after suffixing, if
        ``features_df`` has no columns, or if the merge changes the number
        of PSMs.
    """
    new_feature_cols = list(features_df.columns)
    applied_suffix = "" if suffix is None else suffix
    renamed = {col: col + applied_suffix for col in new_feature_cols}
    final_cols = [renamed[col] for col in new_feature_cols]

    # Columns owned by the source being replaced are dropped before the
    # merge, so they must not count as conflicts.
    replaced_cols = list(self.rescoring_features.get(source, []))
    occupied = set(self._psms.columns).difference(replaced_cols)

    for col in new_feature_cols:
        if col in occupied:
            logger.warning(f"Column '{col}' already exists in PSM data.")
            if suffix is None:
                raise ValueError("Duplicate columns exist. No suffix provided.")
            logger.warning(f"Suffix '{suffix}' provided. Using suffix '{suffix}'.")

    still_clashing = [name for name in final_cols if name in occupied]
    if still_clashing:
        raise ValueError(
            f"Columns {still_clashing} already exist in PSM data after applying "
            f"suffix '{suffix}'. Choose a different suffix."
        )

    logger.info(f"Adding {len(new_feature_cols)} new features from {source} by index.")

    if not new_feature_cols:
        logger.warning("No new features to add.")
        raise ValueError("No new features to add.")

    # Merge on a working frame first; nothing on `self` changes until the
    # result has been validated.
    base = (
        self._psms.drop(columns=replaced_cols, errors="ignore")
        if replaced_cols
        else self._psms
    )
    merged = base.merge(
        # Non-inplace rename: the caller's DataFrame keeps its column names.
        features_df.rename(columns=renamed),
        left_index=True,
        right_index=True,
        how="left",  # Perform a left join to preserve all original PSM data
    )

    # Ensure that the merge did not change the number of rows in the PSM DataFrame
    if len(merged) != len(base):
        raise ValueError(
            "Merging features resulted in a change in the number of PSMs. Check for duplicate indices."
        )

    if source in self.rescoring_features:
        logger.warning(f"{source} already exists in rescoring features. Overwriting.")
        self.drop_source(source)

    self._psms = merged
    self.rescoring_features[source] = final_cols

`add_results(results_df, psms_key, result_key)` ¶

Add results of rescore engine to the PSM DataFrame based on specified columns.

Parameters:

Name	Type	Description	Default
`results_df`	`DataFrame`	DataFrame containing new results to add.	required
`psms_key`	`str or list of str`	Column name(s) in the PSM data to merge on.	required
`result_key`	`str or list of str`	Column name(s) in the results data to merge on.	required

Source code in optimhc/psm_container.py

def add_results(
    self,
    results_df: pd.DataFrame,
    psms_key: Union[str, List[str]],
    result_key: Union[str, List[str]],
) -> None:
    """
    Add results of rescore engine to the PSM DataFrame based on specified columns.

    The results are left-joined one-to-one onto the PSM table. Key columns
    that exist only in ``results_df`` are removed after the merge; a key
    column that has the same name on both sides is kept once.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing new results to add.
    psms_key : str or list of str
        Column name(s) in the PSM data to merge on.
    result_key : str or list of str
        Column name(s) in the results data to merge on.

    Raises
    ------
    ValueError
        If a non-key column of ``results_df`` already exists in the PSM data.
        pandas additionally raises a merge error when the keys are not
        one-to-one.
    """
    if self.rescore_result_column is not None:
        logger.warning("Rescore result column already exists. Overwriting.")

    left_keys = [psms_key] if isinstance(psms_key, str) else list(psms_key)
    right_keys = [result_key] if isinstance(result_key, str) else list(result_key)
    # A key pair with the same name on both sides collapses into one column
    # during the merge, so it is not a duplicate.
    shared_keys = {rk for lk, rk in zip(left_keys, right_keys) if lk == rk}

    clashes = (set(self._psms.columns) & set(results_df.columns)) - shared_keys
    if clashes:
        raise ValueError(
            "Duplicate columns exist. Please rename the columns in the results data."
        )

    # Merge into a local first so a failed validation leaves `self` untouched.
    merged = self._psms.merge(
        results_df,
        left_on=psms_key,
        right_on=result_key,
        how="left",
        validate="one_to_one",
    )
    right_only_keys = [key for key in right_keys if key not in shared_keys]
    if right_only_keys:
        merged = merged.drop(columns=right_only_keys)

    self._psms = merged
    # NOTE(review): this stores the merge key name(s), not the names of the
    # added result columns — kept as-is; confirm against the attribute's users.
    self.rescore_result_column = result_key
    logger.info("Added rescore results to PSM data.")

`write_pin(output_file, style='default', source=None)` ¶

Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats. The `style` parameter exists so that a unified PIN layout can be written when benchmarking different rescoring methods.

Parameters:

Name	Type	Description	Default
`output_file`	`str`	Path to the output PIN file.	required
`style`	`str`	If set to 'msbooster', outputs only the columns required by MSBooster (SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins). If set to 'default', outputs all features specified in `rescoring_features`, plus required Percolator columns.	`'default'`
`source`	`list of str`	List of feature sources to include. If None, includes all sources.	`None`

Returns:

Type	Description
`DataFrame`	The DataFrame written to the PIN file.

Notes

The first three columns are always: SpecId, Label, ScanNr.
For 'msbooster' style, the columns are: SpecId, Label, ScanNr, retentiontime, rank, hyperscore or log10_evalue, Peptide, Proteins.
If hit_rank_column is not specified, rank is set to 1 for all rows.
Either 'hyperscore' or 'expect' must be present in features; for 'expect', the column is written as 'log10_evalue'.
The 'log10_evalue' column should contain the base-10 logarithm of the e-value.
The 'Peptide' column is formatted with underscores (e.g., _.PEPTIDE._).
For standard format, all features from rescoring_features are appended between ScanNr and Peptide columns.
The 'Proteins' column is a semicolon-separated list if stored as a list or tuple.
Label column is converted to 1 (target) and -1 (decoy), as required by Percolator.

Example output (default style): SpecId Label ScanNr feature1 feature2 ... Peptide Proteins

Example output (msbooster style): SpecId Label ScanNr retentiontime rank hyperscore Peptide Proteins or SpecId Label ScanNr retentiontime rank log10_evalue Peptide Proteins

Raises:

Type	Description
`ValueError`	If required columns are missing for the selected style.

Source code in optimhc/psm_container.py

def write_pin(
    self, output_file: str, style: str = "default", source: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Write the PSM data to a Percolator PIN file, supporting both generic Percolator and MSBooster-compatible formats.
    The style parameter exists so that a unified PIN layout can be written when benchmarking different rescoring methods.

    Parameters
    ----------
    output_file : str
        Path to the output PIN file (written tab-separated, without index).
    style : str, optional
        'default' writes SpecId, Label, ScanNr, the selected feature columns,
        Peptide and Proteins.
        'msbooster' writes SpecId, Label, ScanNr, retentiontime, rank,
        hyperscore or log10_evalue, the remaining selected feature columns,
        Peptide (wrapped as `_.PEPTIDE._`) and Proteins.
    source : list of str, optional
        List of feature sources to include. If None, includes all sources.

    Returns
    -------
    pd.DataFrame
        The DataFrame written to the PIN file.

    Notes
    -----
    - The label column is converted to 1 (target) and -1 (decoy). Boolean
      labels and the strings "True"/"False" are mapped directly; any other
      dtype is coerced with ``astype(bool)`` after a warning.
    - For 'msbooster', rank is 1 for all rows when `hit_rank_column` is not set.
    - For 'msbooster', 'hyperscore' is preferred; otherwise the 'expect'
      column is written under the name 'log10_evalue'.
    - The 'Proteins' column is joined with ';' when stored as a list or tuple.

    Raises
    ------
    ValueError
        If a requested source is unknown, the style is unknown, or a column
        required by the 'msbooster' style is missing.
    """
    df = self._psms.copy()
    labels = df[self.label_column]
    if labels.dtype == "bool":
        df["PercolatorLabel"] = labels.map({True: 1, False: -1})
    elif len(labels) > 0 and all(isinstance(value, str) for value in labels):
        # String labels normally arrive as object dtype, so inspect the values:
        # astype(bool) would turn the non-empty string "False" into True.
        df["PercolatorLabel"] = labels.map({"True": 1, "False": -1})
    else:
        # try to convert to bool
        logger.warning("Label column is not str or bool. Converting to bool.")
        df["PercolatorLabel"] = labels.astype(bool).map({True: 1, False: -1})
    logger.info("Writing PIN file to %s", output_file)
    logger.info("Using style: %s", style)

    feature_cols = []
    if source is None:
        for cols in self.rescoring_features.values():
            feature_cols.extend(cols)
    else:
        for s in source:
            if s not in self.rescoring_features:
                raise ValueError(f"Source '{s}' not found in rescoring features.")
            feature_cols.extend(self.rescoring_features[s])

    pin_df = pd.DataFrame()
    pin_df["SpecId"] = df[self.spectrum_column]
    pin_df["Label"] = df["PercolatorLabel"]
    pin_df["ScanNr"] = df[self.scan_column]

    if style == "msbooster":
        if self.retention_time_column:
            pin_df["retentiontime"] = df[self.retention_time_column]
        else:
            raise ValueError("Retention time column is required for msbooster style.")

        pin_df["rank"] = df[self.hit_rank_column].astype(int) if self.hit_rank_column else 1
        if "hyperscore" in self.feature_columns:
            pin_df["hyperscore"] = df["hyperscore"]
        elif "expect" in self.feature_columns:
            # NOTE(review): the values are copied unchanged; confirm that
            # 'expect' is already on a log10 scale as the column name implies.
            pin_df["log10_evalue"] = df["expect"]
        else:
            raise ValueError(
                "Either 'hyperscore' or 'expect' column is required for msbooster style."
            )

        # Score, rank and retention time were written above; skip them here.
        already_written = [
            "hyperscore",
            "expect",
            self.hit_rank_column,
            self.retention_time_column,
        ]
        for col in feature_cols:
            if col not in already_written:
                pin_df[col] = df[col]

        # PEPTIDE -> _.PEPTIDE._
        pin_df["Peptide"] = df[self.peptide_column].apply(
            lambda x: f"_.{x}._" if isinstance(x, str) else x
        )

    elif style == "default":
        for col in feature_cols:
            pin_df[col] = df[col]
        pin_df["Peptide"] = df[self.peptide_column]
    else:
        raise ValueError(f"Unknown style: {style}. Use 'msbooster' or 'default'.")

    pin_df["Proteins"] = df[self.protein_column].apply(
        lambda x: ";".join(x) if isinstance(x, (list, tuple)) else x
    )
    pin_df = self._convert_float_to_int(pin_df)
    pin_df.to_csv(output_file, sep="\t", index=False)
    logger.info("PIN file written to %s", output_file)
    return pin_df

`read_pin(pin_files, retention_time_column=None, remove_pre_nxt_aa=False)` ¶

Read PSMs from a Percolator INput (PIN) file.

Parameters:

Name	Type	Description	Default
`pin_files`	`Union[str, List[str]]`	The file path to the PIN file or a list of file paths.	required
`retention_time_column`	`Optional[str]`	The column containing the retention time. If None, no retention time will be included.	`None`

Returns:

Type	Description
`PsmContainer`	A PsmContainer object containing the PSM data.

Notes

This function: 1. Reads PIN file(s) into a DataFrame 2. Identifies required columns (case-insensitive) 3. Processes scan IDs and hit ranks (Only support FragPipe PIN) 4. Converts data types appropriately 5. Creates a PsmContainer with the processed data

Source code in optimhc/parser/pin.py

def read_pin(
    pin_files: Union[str, List[str]],
    retention_time_column: Optional[str] = None,
    remove_pre_nxt_aa: bool = False,
) -> PsmContainer:
    """
    Read PSMs from one or more Percolator INput (PIN) files.

    Parameters
    ----------
    pin_files : Union[str, List[str]]
        The file path to the PIN file or a list of file paths.
    retention_time_column : Optional[str], optional
        The column containing the retention time (matched
        case-insensitively). If None, no retention time will be included.
    remove_pre_nxt_aa : bool, optional
        Currently not used by this function.

    Returns
    -------
    PsmContainer
        A PsmContainer object containing the PSM data.

    Raises
    ------
    ValueError
        If a required column is missing, or if the charge state cannot be
        determined for at least one PSM.

    Notes
    -----
    This function:
    1. Reads PIN file(s) into a single DataFrame
    2. Identifies required columns (case-insensitive)
    3. Determines the hit rank, parsing it from the SpecId suffix when no
       rank column is present (documented as supporting FragPipe PIN only)
    4. Derives a ``Charge`` column from the one-hot ``charge_N`` columns
    5. Converts data types appropriately
    6. Creates a PsmContainer with the processed data
    """
    logger.info("Reading PIN file(s) into PsmContainer.")
    if isinstance(pin_files, str):
        pin_files = [pin_files]

    # ignore_index gives every PSM a unique row label; without it each file
    # restarts at 0 and row labels repeat across files.
    pin_df = pd.concat(
        [_read_single_pin_as_df(pin_file) for pin_file in pin_files],
        ignore_index=True,
    )
    logger.info(f"Read {len(pin_df)} PSMs from {len(pin_files)} PIN files.")
    logger.debug(pin_df.head())
    logger.debug(pin_df.columns)
    if not pin_df.empty:
        # iloc[0] raises IndexError on an empty frame.
        logger.debug(pin_df.iloc[0])

    def find_required_columns(col: str, columns: List[str]) -> str:
        """
        Case-insensitive search for a column in the DataFrame.
        Returns the matching column name with original casing.
        """
        column_map = {c.lower(): c for c in columns}
        try:
            return column_map[col.lower()]
        except KeyError:
            raise ValueError(
                f"Column '{col}' not found in PSM data (case-insensitive)."
            ) from None

    # non-feature columns (case-insensitive search)
    label = find_required_columns("Label", pin_df.columns)
    scan = find_required_columns("ScanNr", pin_df.columns)
    specid = find_required_columns("SpecId", pin_df.columns)
    peptide = find_required_columns("Peptide", pin_df.columns)
    protein = find_required_columns("Proteins", pin_df.columns)

    # Comet: P2PI20160713_pilling_C1RA2_BB72_P1_31_3_1
    # Fragpipe: P2PI20160713_pilling_C1RA2_BB72_P1.3104.3104.2_1

    def parse_specid(spec_id_value: str) -> int:
        """Return the rank encoded after the last '_' of a SpecId, or 1."""
        _, separator, suffix = spec_id_value.rpartition("_")
        if not separator:
            return 1
        try:
            return int(suffix)
        except ValueError:
            logger.warning(
                f"Could not parse rank from SpecId: {spec_id_value}, using default rank 1"
            )
            return 1

    # Reuse an existing rank column under its real casing (e.g. "Rank");
    # otherwise derive the rank from the SpecId suffix.
    try:
        hit_rank = find_required_columns("rank", pin_df.columns)
    except ValueError:
        hit_rank = "rank"
        pin_df[hit_rank] = pin_df[specid].apply(parse_specid)

    retention_time_column = (
        find_required_columns(retention_time_column, pin_df.columns)
        if retention_time_column
        else None
    )

    # col: charge_[1,2,3,...] = 0, 1
    # The charge number is taken from the same match that identified the
    # column, so unrelated digits earlier in the name cannot be picked up.
    charge_pattern = re.compile(r"charge[_]?(\d+)", re.IGNORECASE)
    charge_map = {}
    for col in pin_df.columns:
        charge_match = charge_pattern.search(col)
        if charge_match:
            charge_map[col] = int(charge_match.group(1))

    if not charge_map:
        raise ValueError("No charge columns (e.g. 'charge_2') found in PSM data.")

    charge_columns = list(charge_map)
    charge_numbers = [charge_map[col] for col in charge_columns]

    def extract_charge(flags) -> Optional[int]:
        """Return the charge of the first one-hot flag equal to 1, else None."""
        for num, flag in zip(charge_numbers, flags):
            if int(float(flag)) == 1:
                return num
        return None

    # Iterate only the charge columns, row by row, in column order.
    pin_df["Charge"] = [
        extract_charge(flags)
        for flags in zip(*(pin_df[col] for col in charge_columns))
    ]

    missing_charge = pin_df["Charge"].isna()
    if missing_charge.any():
        # Fail with a clear message instead of an opaque NaN-to-int cast error.
        raise ValueError(
            f"Could not determine the charge state for {int(missing_charge.sum())} PSM(s): "
            "no charge column is set to 1."
        )

    # feature columns: columns that are not non-feature columns
    non_feature_columns = [label, scan, specid, peptide, protein, hit_rank, "Charge"]
    feature_columns = [col for col in pin_df.columns if col not in non_feature_columns]

    logger.info(
        f"Columns: label={label}, scan={scan}, specid={specid}, peptide={peptide}, "
        f"protein={protein}, hit_rank={hit_rank}, retention_time={retention_time_column}, "
        f"features={feature_columns}"
    )

    pin_df[scan] = pin_df[scan].astype(int)
    pin_df[specid] = pin_df[specid].astype(str)
    pin_df[peptide] = pin_df[peptide].astype(str)
    pin_df[protein] = pin_df[protein].astype(str)
    pin_df[hit_rank] = pin_df[hit_rank].astype(float).astype(int)
    pin_df["Charge"] = pin_df["Charge"].astype(float).astype(int)
    if retention_time_column:
        pin_df[retention_time_column] = pin_df[retention_time_column].astype(float)
    for col in feature_columns:
        pin_df[col] = pin_df[col].astype(float)

    # label = 1 for target, -1 for decoy. Convert to Boolean.
    pin_df[label] = pin_df[label] == "1"
    rescoring_features = {"Original": feature_columns}

    return PsmContainer(
        psms=pin_df,
        label_column=label,
        scan_column=scan,
        spectrum_column=specid,
        ms_data_file_column=None,
        peptide_column=peptide,
        protein_column=protein,
        charge_column="Charge",
        rescoring_features=rescoring_features,
        hit_rank_column=hit_rank,
        retention_time_column=retention_time_column,
    )

`_read_single_pin_as_df(pin_file)` ¶

Read a single PIN file into a DataFrame.

Parameters:

Name	Type	Description	Default
`pin_file`	`str`	The file path to the PIN file.	required

Returns:

Type	Description
`DataFrame`	A DataFrame containing the PSM data.

Notes

This function: 1. Reads the PIN file header 2. Processes the proteins column as a tab-separated list 3. Creates a DataFrame with the processed data

Source code in optimhc/parser/pin.py

def _read_single_pin_as_df(pin_file: str) -> pd.DataFrame:
    """
    Read a single PIN file into a DataFrame.

    Parameters
    ----------
    pin_file : str
        The file path to the PIN file.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the PSM data. All values are strings; the
        last column holds the tab-joined remainder of each row.

    Raises
    ------
    ValueError
        If a non-blank data row has fewer fields than the header.

    Notes
    -----
    This function:
    1. Reads the PIN file header
    2. Processes the proteins column as a tab-separated list
    3. Creates a DataFrame with the processed data

    Blank lines are skipped.
    """
    logger.info(f"Reading PIN file: {pin_file}")
    with open(pin_file, "r") as f:
        header = f.readline().strip().split("\t")
        # Every header column except the last maps to exactly one field;
        # the last one absorbs all remaining tab-separated fields.
        leading_count = len(header) - 1
        data = []
        for line_number, line in enumerate(f, start=2):
            line = line.strip()
            if not line:
                # A blank line (e.g. at end of file) carries no PSM.
                continue
            parts = line.split("\t")
            if len(parts) <= leading_count:
                # Too few fields would misalign values with the header.
                raise ValueError(
                    f"Malformed PIN row at line {line_number} of {pin_file}: "
                    f"expected at least {len(header)} fields, found {len(parts)}."
                )
            proteins = "\t".join(parts[leading_count:])
            data.append(parts[:leading_count] + [proteins])
    df = pd.DataFrame(data, columns=header)
    logger.debug(f"Header: {header}")
    return df

mzML Parser¶

`mzml` ¶

`logger = logging.getLogger(__name__)` `module-attribute` ¶

`extract_mzml_data(mzml_filename, scan_ids=None)` ¶

Extract scan data from an mzML file.

Parameters:

Name	Type	Description	Default
`mzml_filename`	`str`	The path to the mzML file.	required
`scan_ids`	`list[int] or None`	A list of scan IDs to extract. If None, extracts all scans.	`None`

Returns:

Type	Description
`DataFrame`	A DataFrame containing the extracted scan data with columns: - source: The source file name - scan: The scan ID - mz: The m/z values array - intensity: The intensity values array - charge: The charge state - retention_time: The retention time

Notes

This function: 1. Reads the mzML file using pyteomics 2. Extracts scan data including retention time, charge state, m/z values, and intensities 3. Filters scans based on provided scan IDs if specified 4. Returns a DataFrame with the extracted data

Source code in optimhc/parser/mzml.py

def extract_mzml_data(mzml_filename, scan_ids=None):
    """
    Extract scan data from an mzML file.

    Parameters
    ----------
    mzml_filename : str
        The path to the mzML file.
    scan_ids : list[int] or None, optional
        A list of scan IDs to extract. If None, extracts all scans.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the extracted scan data with columns:
        - source: The source file name
        - scan: The scan ID
        - mz: The m/z values array
        - intensity: The intensity values array
        - charge: The charge state
        - retention_time: The retention time

    Raises
    ------
    RuntimeError
        If the mzML file cannot be read or iterated.

    Notes
    -----
    This function:
    1. Reads the mzML file using pyteomics
    2. Extracts scan data including retention time, charge state, m/z values, and intensities
    3. Filters scans based on provided scan IDs if specified
    4. Returns a DataFrame with the extracted data

    A spectrum that cannot be processed is skipped with a warning. Missing
    charge state or retention time is stored as None.
    """
    # Local import so this block does not rely on a module-level ``import os``.
    import os

    # basename handles the platform's path separators, not only "/".
    filename = os.path.basename(mzml_filename).replace(".mzML", "")
    logger.info(f"Extracting scans from {mzml_filename}")

    scan_ids = set(scan_ids) if scan_ids is not None else None

    extracted_scan_ids = []
    mzml_filenames = []
    intensities = []
    mz_values = []
    charges = []
    retention_times = []

    try:
        with mzml.read(mzml_filename) as reader:
            for spectrum in reader:
                # Tracked separately so the warning below never references an
                # unbound or stale scan id when the id itself fails to parse.
                spectrum_id = None
                try:
                    spectrum_id = spectrum["id"]
                    scan_id = int(spectrum_id.split("scan=")[-1])

                    if scan_ids is not None and scan_id not in scan_ids:
                        continue

                    mz_array = np.array(spectrum.get("m/z array", []))
                    intensity_array = np.array(spectrum.get("intensity array", []))

                    # Charge and retention time are optional: leave None when absent.
                    charge = None
                    try:
                        charge = int(
                            spectrum["precursorList"]["precursor"][0]["selectedIonList"][
                                "selectedIon"
                            ][0]["charge state"]
                        )
                    except (KeyError, ValueError, IndexError):
                        pass

                    retention_time = None
                    try:
                        retention_time = float(spectrum["scanList"]["scan"][0]["scan start time"])
                    except (KeyError, ValueError, IndexError):
                        pass

                    extracted_scan_ids.append(scan_id)
                    mzml_filenames.append(filename)
                    intensities.append(intensity_array)
                    mz_values.append(mz_array)
                    charges.append(charge)
                    retention_times.append(retention_time)

                except Exception as e:
                    # Best effort: one bad spectrum must not abort the whole file.
                    logger.warning(f"Skipping spectrum {spectrum_id!r} due to error: {e}")

    except Exception as e:
        logger.error(f"Failed to parse mzML file {mzml_filename}: {e}")
        raise RuntimeError(f"Error processing mzML file {mzml_filename}: {e}") from e

    data_dict = {
        "source": mzml_filenames,
        "scan": extracted_scan_ids,
        "mz": mz_values,
        "intensity": intensities,
        "charge": charges,
        "retention_time": retention_times,
    }

    scans_df = pd.DataFrame(data_dict)
    # Keep only the first occurrence of each (source, scan) pair.
    scans_df = scans_df.drop_duplicates(subset=["source", "scan"])

    logger.info(f"Successfully extracted {len(scans_df)} scans from {mzml_filename}")

    return scans_df

I/O¶

Parser Package¶

parser ¶

__all__ = ['read_pin', 'read_pepxml', 'extract_mzml_data'] module-attribute ¶

extract_mzml_data(mzml_filename, scan_ids=None) ¶

read_pepxml(pepxml_files, decoy_prefix='DECOY_') ¶

read_pin(pin_files, retention_time_column=None, remove_pre_nxt_aa=False) ¶

PepXML Parser¶

pepxml ¶

logger = logging.getLogger(__name__) module-attribute ¶

PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None) ¶

psms property ¶

target_psms property ¶

decoy_psms property ¶

columns property ¶

feature_columns property ¶

feature_sources property ¶

peptides property ¶

ms_data_files property ¶

scan_ids property ¶

charges property ¶

metadata property ¶

spectrum_ids property ¶

identifier_columns property ¶

__len__() ¶

copy() ¶

__repr__() ¶

drop_features(features) ¶

drop_source(source) ¶

add_metadata(metadata_df, psms_key, metadata_key, source) ¶

get_top_hits(n=1) ¶

add_features(features_df, psms_key, feature_key, source, suffix=None) ¶

add_features_by_index(features_df, source, suffix=None) ¶

add_results(results_df, psms_key, result_key) ¶

write_pin(output_file, style='default', source=None) ¶

read_pepxml(pepxml_files, decoy_prefix='DECOY_') ¶

_parse_pepxml(pepxml_file, decoy_prefix) ¶

_parse_msms_run(msms_run, decoy_prefix) ¶

_parse_spectrum(spectrum, run_info, decoy_prefix) ¶

_parse_psm(psm_info, spec_info, decoy_prefix) ¶

_log_features(col, features) ¶

PIN Parser¶

pin ¶

logger = logging.getLogger(__name__) module-attribute ¶

PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None) ¶

psms property ¶

target_psms property ¶

decoy_psms property ¶

columns property ¶

feature_columns property ¶

feature_sources property ¶

peptides property ¶

ms_data_files property ¶

scan_ids property ¶

charges property ¶

metadata property ¶

spectrum_ids property ¶

identifier_columns property ¶

__len__() ¶

copy() ¶

__repr__() ¶

drop_features(features) ¶

drop_source(source) ¶

add_metadata(metadata_df, psms_key, metadata_key, source) ¶

get_top_hits(n=1) ¶

add_features(features_df, psms_key, feature_key, source, suffix=None) ¶

add_features_by_index(features_df, source, suffix=None) ¶

add_results(results_df, psms_key, result_key) ¶

write_pin(output_file, style='default', source=None) ¶

read_pin(pin_files, retention_time_column=None, remove_pre_nxt_aa=False) ¶

_read_single_pin_as_df(pin_file) ¶

mzML Parser¶

mzml ¶

logger = logging.getLogger(__name__) module-attribute ¶

extract_mzml_data(mzml_filename, scan_ids=None) ¶

`parser` ¶

`__all__ = ['read_pin', 'read_pepxml', 'extract_mzml_data']` `module-attribute` ¶

`extract_mzml_data(mzml_filename, scan_ids=None)` ¶

`read_pepxml(pepxml_files, decoy_prefix='DECOY_')` ¶

`read_pin(pin_files, retention_time_column=None, remove_pre_nxt_aa=False)` ¶

`pepxml` ¶

`logger = logging.getLogger(__name__)` `module-attribute` ¶

`PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)` ¶

`psms` `property` ¶

`target_psms` `property` ¶

`decoy_psms` `property` ¶

`columns` `property` ¶

`feature_columns` `property` ¶

`feature_sources` `property` ¶

`peptides` `property` ¶

`ms_data_files` `property` ¶

`scan_ids` `property` ¶

`charges` `property` ¶

`metadata` `property` ¶

`spectrum_ids` `property` ¶

`identifier_columns` `property` ¶

`__len__()` ¶

`copy()` ¶

`__repr__()` ¶

`drop_features(features)` ¶

`drop_source(source)` ¶

`add_metadata(metadata_df, psms_key, metadata_key, source)` ¶

`get_top_hits(n=1)` ¶

`add_features(features_df, psms_key, feature_key, source, suffix=None)` ¶

`add_features_by_index(features_df, source, suffix=None)` ¶

`add_results(results_df, psms_key, result_key)` ¶

`write_pin(output_file, style='default', source=None)` ¶

`read_pepxml(pepxml_files, decoy_prefix='DECOY_')` ¶

`_parse_pepxml(pepxml_file, decoy_prefix)` ¶

`_parse_msms_run(msms_run, decoy_prefix)` ¶

`_parse_spectrum(spectrum, run_info, decoy_prefix)` ¶

`_parse_psm(psm_info, spec_info, decoy_prefix)` ¶

`_log_features(col, features)` ¶

`pin` ¶

`logger = logging.getLogger(__name__)` `module-attribute` ¶

`PsmContainer(psms, label_column, scan_column, spectrum_column, ms_data_file_column, peptide_column, protein_column, rescoring_features, hit_rank_column=None, charge_column=None, retention_time_column=None, calculated_mass_column=None, metadata_column=None)` ¶

`psms` `property` ¶

`target_psms` `property` ¶

`decoy_psms` `property` ¶

`columns` `property` ¶

`feature_columns` `property` ¶

`feature_sources` `property` ¶

`peptides` `property` ¶

`ms_data_files` `property` ¶

`scan_ids` `property` ¶

`charges` `property` ¶

`metadata` `property` ¶

`spectrum_ids` `property` ¶

`identifier_columns` `property` ¶

`__len__()` ¶

`copy()` ¶

`__repr__()` ¶

`drop_features(features)` ¶

`drop_source(source)` ¶

`add_metadata(metadata_df, psms_key, metadata_key, source)` ¶

`get_top_hits(n=1)` ¶

`add_features(features_df, psms_key, feature_key, source, suffix=None)` ¶

`add_features_by_index(features_df, source, suffix=None)` ¶

`add_results(results_df, psms_key, result_key)` ¶

`write_pin(output_file, style='default', source=None)` ¶

`read_pin(pin_files, retention_time_column=None, remove_pre_nxt_aa=False)` ¶

`_read_single_pin_as_df(pin_file)` ¶

`mzml` ¶

`logger = logging.getLogger(__name__)` `module-attribute` ¶

`extract_mzml_data(mzml_filename, scan_ids=None)` ¶