Source code for alleleTools.format.alleleTable

"""
Allele Table Data Structure Module.

This module defines the AlleleTable class, which provides a standardized
data structure for storing and manipulating allele data along with associated
phenotype and covariate information.
"""

from typing import Optional

import numpy as np
import pandas as pd


class AlleleTable:
    """
    A standardized data structure for storing allele data with metadata.

    This class provides a unified interface for handling allele data from
    polymorphic genes, along with associated phenotype and covariate
    information. It serves as the core data structure for allele analysis
    workflows.

    Attributes:
        alleles (pd.DataFrame): Main allele data with samples as rows and
            genes/alleles as columns
        phenotype (pd.Series): Phenotype information indexed by sample ID
        covariates (pd.DataFrame): Covariate data with samples as rows and
            covariates as columns

    Example:
        >>> table = AlleleTable()
        >>> # Load allele data
        >>> table.alleles = pd.DataFrame(...)
        >>> # Add phenotype information
        >>> table.phenotype = pd.Series(...)
    """

    def __init__(
        self,
        alleles: Optional[pd.DataFrame] = None,
        phenotype: Optional[pd.Series] = None,
        covariates: Optional[pd.DataFrame] = None,
    ):
        """
        Initialize an AlleleTable.

        Any argument left as ``None`` is replaced by a new empty pandas
        object, so every instance owns its own containers.

        Args:
            alleles (pd.DataFrame): Allele data. When not empty, its index
                must be named "sample" and it must have an even number of
                columns.
            phenotype (pd.Series): Phenotype values indexed by sample ID.
            covariates (pd.DataFrame): Covariates with samples as rows.

        Raises:
            ValueError: If a non-empty ``alleles`` frame has an index that is
                not named "sample" or an odd number of columns.
        """
        # Build defaults here rather than in the signature: objects created
        # in the signature are shared by every instance of the class.
        if alleles is None:
            alleles = pd.DataFrame()
        if phenotype is None:
            phenotype = pd.Series(dtype=float)
        if covariates is None:
            covariates = pd.DataFrame()

        # Some safety checks
        if not alleles.empty:
            if alleles.index.name != "sample":
                raise ValueError("Alleles DataFrame index must be named 'sample'")
            if len(alleles.columns) % 2 != 0:
                raise ValueError("Alleles DataFrame must have an even number of columns")

        self.alleles = alleles
        self.phenotype = phenotype
        self.covariates = covariates

    def load_phenotype(self, phenotype_file: str) -> None:
        """
        Load phenotype information from a file into the AlleleTable.

        When ``phenotype_file`` is empty, every sample present in
        ``self.alleles`` is assigned the phenotype ``0.0``.

        Args:
            phenotype_file (str): The path to the phenotype file. It should
                be a whitespace-separated values file with a header. It
                should contain columns "IID" and "phenotype". Text after a
                "#" is treated as a comment.

        Raises:
            SystemExit: With code 1 if the required columns are missing, or
                if the phenotype does not cover all allele samples.
        """
        if not phenotype_file:
            self.set_phenotype(
                pd.Series(index=self.alleles.index, data=float(0))
            )
            return

        # r"\s+" accepts tabs and runs of spaces, matching the documented
        # "whitespace-separated" format. IID is read as text so identifiers
        # such as "007" keep their leading zeros.
        phe = pd.read_csv(
            phenotype_file, sep=r"\s+", comment="#", dtype={"IID": str}
        )

        # Check that the file was formatted correctly
        if "IID" not in phe.columns or "phenotype" not in phe.columns:
            print(
                f"ERROR: Phenotype file ({phenotype_file}) must contain 'IID' and 'phenotype' columns.")
            print("Check if the file is well formatted and whitespace-separated")
            raise SystemExit(1)

        phe = phe.set_index(phe["IID"].astype(str))
        self.set_phenotype(phe["phenotype"])

    def remove_phenotype_zero(self) -> None:
        """
        Remove samples with phenotype value equal to zero from the AlleleTable.

        Phenotype values are compared numerically, so the strings "0" and
        "0.0" count as zero. Values that cannot be read as numbers are kept.
        Phenotype entries for samples that are not present in the alleles
        (or covariates) table are tolerated.
        """
        if self.phenotype.empty:
            print("WARNING: Phenotype is empty, cannot remove samples with phenotype zero.")
            return

        # Coerce to numbers so text phenotypes (e.g. loaded by `open`) are
        # handled; unparsable or missing values become NaN and are kept.
        numeric = pd.to_numeric(self.phenotype, errors="coerce")
        keep_mask = (numeric != 0).fillna(True).astype(bool)
        kept_samples = self.phenotype.index[keep_mask.to_numpy()]

        # The phenotype may list more samples than the allele table, so only
        # select labels that actually exist there.
        self.alleles = self.alleles.loc[
            kept_samples[kept_samples.isin(self.alleles.index)]
        ]
        self.phenotype = self.phenotype[keep_mask]
        if not self.covariates.empty:
            self.covariates = self.covariates.loc[
                kept_samples[kept_samples.isin(self.covariates.index)]
            ]

    def set_phenotype(self, phenotype: pd.Series) -> None:
        """
        Stores the phenotype series in the AlleleTable

        The stored series is named "phenotype" and its index is named
        "sample". The series passed in is not modified.

        Args:
            phenotype (pd.Series): A series with the phenotypes for the
                AlleleTable. It should have the samples' IDs as index and
                the phenotype values.

        Raises:
            SystemExit: With code 1 if some allele samples have no phenotype.
        """
        # rename/rename_axis return new objects, so neither the caller's
        # series nor an index shared with another table is renamed in place.
        self.phenotype = phenotype.rename("phenotype").rename_axis("sample")
        self.__verify_phe_samples__()

    def __verify_phe_samples__(self):
        """
        Check that the phenotype series contains all the samples that the
        allele table has. Extra samples in the phenotype are allowed.

        Raises:
            SystemExit: With code 1 if any allele sample is missing.
        """
        missing_samples = self.alleles.index.difference(self.phenotype.index)
        if len(missing_samples) > 0:
            print("ERROR: Phenotype list does not contain all the samples from alleles.")
            print("%d samples are missing:" % len(missing_samples))
            print(missing_samples.to_list())
            # Same effect as exit(1) without relying on the `site` helper.
            raise SystemExit(1)

    @classmethod
    def open(cls, filename: str, sep: str = "\t") -> "AlleleTable":
        """
        Load allele data from a CSV file into the AlleleTable.

        The first column is used as the sample index and the second column
        must be "phenotype"; the remaining columns are the alleles. All
        values, including the phenotype, are read as strings and "NA" is
        read as missing.

        Args:
            filename (str): The path to the input CSV file.
            sep (str): Field separator of the file. Defaults to a tab.

        Returns:
            AlleleTable: A new table holding the file's content.

        Raises:
            ValueError: If the second column of the file is not "phenotype".
        """
        df = pd.read_csv(
            filename, sep=sep, index_col=0, header=0, dtype=str, na_values=["NA"]
        )

        # A file with only the index column is reported the same way as one
        # with a wrong second column.
        if len(df.columns) == 0 or df.columns[0] != "phenotype":
            raise ValueError("The second column of %s must be 'phenotype'" % filename)

        ins = cls()
        # pop the second column as phenotype
        ins.phenotype = df.pop(df.columns[0])
        ins.alleles = df
        return ins

    def _alleles_as_str_(self) -> pd.DataFrame:
        """
        Convert alleles DataFrame to string representation.

        Returns:
            pd.DataFrame: A copy of the alleles with every value as a string.
            Missing values, empty strings and the literal "nan" are NaN.
        """
        # Record the missing cells before converting: astype(str) would turn
        # None into the literal "None", which can no longer be told apart
        # from real data.
        missing = self.alleles.isna()
        df = self.alleles.astype(str).mask(missing)

        # Replace empty strings and textual "nan" with NaN
        df = df.replace({"": np.nan, "nan": np.nan})
        return df

    def to_csv(
        self, filename: str, header: bool = True, population: str = ""
    ):
        """
        Export the allele table to a CSV file.

        The file is tab-separated, missing values are written as "NA" and
        the sample ID is written as the first column, named "sample". When
        a phenotype is present, only samples that have one are written.

        Args:
            filename (str): The name of the output CSV file.
            header (bool): Flag to store the file with column names or not
            population (str): Adds an extra column in the position left to
                phenotype with a population name. Currently, only one
                population per allele table is supported.
        """
        df = self._alleles_as_str_()

        if not self.phenotype.empty:
            # Add phenotype to df, checking that the index matches.
            # DataFrame.join needs a named Series, and a phenotype assigned
            # directly to the attribute may be unnamed.
            df = df.join(self.phenotype.rename("phenotype"), how="inner")

            # Move phenotype to the first column
            col = df.pop("phenotype")
            df.insert(0, "phenotype", col)

        # add population column at the beginning
        if population:
            df.insert(0, "population", population)

        # move sample id to column
        df.index.name = "sample"
        df = df.reset_index()

        df.to_csv(
            filename, sep="\t", index=False, na_rep="NA", header=header
        )