Source code for alleleTools.format.vcf

"""
VCF File Handling Module.

This module provides a VCF (Variant Call Format) class for reading, parsing,
and manipulating VCF files containing genetic variant data, particularly
optimized for HLA and KIR allele data.
"""

import pandas as pd


[docs] class VCF: """ A class for handling VCF (Variant Call Format) files. This class provides methods to read, parse, and manipulate VCF files, with specific functionality for handling allele data from polymorphic genes like HLA and KIR. Attributes: metadata (str): VCF header metadata lines dataframe (pd.DataFrame): Main VCF data with ID as index Args: path (str): Path to the VCF file to read """ def __init__(self, path): self.metadata = str() self.__read_file(path) self.__static_columns = [ "CHROM", "POS", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", ] def __read_file(self, path): """ Read and parse a VCF file into metadata and dataframe components. Separates the VCF header (metadata) from the data section and loads the data into a pandas DataFrame with appropriate data types. Args: path (str): Path to the VCF file to read Raises: FileNotFoundError: If the specified file doesn't exist pd.errors.ParserError: If the file format is invalid """ last_pos = 0 with open(path, "r") as file: # Store header in metadata while True: line = file.readline() if line.startswith("#CHROM"): break self.metadata += line last_pos = file.tell() # Read the rest of the file as df file.seek(last_pos) self.dataframe = pd.read_csv( file, sep="\t", on_bad_lines="warn", dtype={ "#CHROM": str, "POS": int, "ID": str, "REF": str, "ALT": str, "QUAL": str, "FILTER": str, "INFO": str, "FORMAT": str, }, ) self.dataframe.rename(columns={"#CHROM": "CHROM"}, inplace=True) self.dataframe.set_index("ID", inplace=True)
[docs] def remove_id_prefix(self, prefix: str): """ Remove a prefix from allele IDs in the dataframe. This is commonly used to remove gene prefixes like `'HLA_' or 'KIR_'` from allele identifiers to standardize naming. Args: prefix (str): The prefix string to remove from allele IDs Example: >>> vcf.remove_id_prefix("HLA_") # "HLA_A*01:01" becomes "A*01:01" """ self.dataframe.index = self.dataframe.index.str.replace( prefix, "", regex=False)
[docs] def get_format(self): """ Extract the format field specification from the VCF. Parses the FORMAT column to determine the structure of genotype information fields (e.g., GT:DS:AA:AB:BB). Returns: List[str]: List of format field names in order Example: >>> vcf.get_format() ['GT', 'DS', 'AA', 'AB', 'BB'] """ formats = self.dataframe["FORMAT"].str.split(":", expand=True) return formats.iloc[0].tolist()
[docs] def samples(self): """ Get the list of sample column names from the VCF. Returns all column names that are not part of the standard VCF format (i.e., sample-specific genotype columns). Returns: set: Set of sample column names """ columns = set(self.dataframe.columns) sample_columns = columns.difference(self.__static_columns) return sample_columns
[docs] def samples_dataframe(self): """ Get a dataframe containing only the sample genotype columns. Returns: pd.DataFrame: DataFrame with only sample columns, indexed by variant ID """ return self.dataframe.loc[:, self.samples()]
[docs] def save(self, path: str): """ Save the VCF data to a file. Writes the metadata header followed by the dataframe in standard VCF format. Args: path (str): Output file path Note: This method modifies the internal dataframe structure during saving. """ with open(path, "w") as file: file.write(self.metadata) # Prepare dataframe for output output_df = self.dataframe.reset_index() output_df.rename(columns={"CHROM": "#CHROM"}, inplace=True) output_df.to_csv(path, mode="a", sep="\t", index=False)