"""
Allele Table Data Structure Module.
This module defines the AlleleTable class, which provides a standardized
data structure for storing and manipulating allele data along with associated
phenotype and covariate information.
"""
import numpy as np
import pandas as pd
class AlleleTable:
    """
    A standardized data structure for storing allele data with metadata.

    This class provides a unified interface for handling allele data from
    polymorphic genes, along with associated phenotype and covariate
    information. It serves as the core data structure for allele analysis
    workflows.

    Attributes:
        alleles (pd.DataFrame): Main allele data with samples as rows and
            genes/alleles as columns.
        phenotype (pd.Series): Phenotype information indexed by sample ID.
        covariates (pd.DataFrame): Covariate data with samples as rows and
            covariates as columns.

    Example:
        >>> table = AlleleTable()
        >>> # Load allele data
        >>> table.alleles = pd.DataFrame(...)
        >>> # Add phenotype information
        >>> table.phenotype = pd.Series(...)
    """

    def __init__(
        self,
        alleles: "pd.DataFrame | None" = None,
        phenotype: "pd.Series | None" = None,
        covariates: "pd.DataFrame | None" = None,
    ):
        """
        Initialize an AlleleTable, empty unless data is supplied.

        Args:
            alleles: Allele data. When non-empty, its index must be named
                "sample" and it must have an even number of columns.
            phenotype: Phenotype values indexed by sample ID.
            covariates: Covariate data with samples as rows.

        Raises:
            ValueError: If a non-empty ``alleles`` frame has an index that
                is not named "sample" or an odd number of columns.
        """
        # Build fresh containers per instance: pandas objects used as default
        # arguments would be shared by every table created without arguments.
        if alleles is None:
            alleles = pd.DataFrame()
        if phenotype is None:
            phenotype = pd.Series(dtype=object)
        if covariates is None:
            covariates = pd.DataFrame()

        # Some safety checks (skipped for empty frames)
        if not alleles.empty:
            if alleles.index.name != "sample":
                raise ValueError("Alleles DataFrame index must be named 'sample'")
            if len(alleles.columns) % 2 != 0:
                raise ValueError("Alleles DataFrame must have an even number of columns")

        self.alleles = alleles
        self.phenotype = phenotype
        self.covariates = covariates

    def load_phenotype(self, phenotype_file: str) -> None:
        """
        Load phenotype information from a file into the AlleleTable.

        Args:
            phenotype_file (str): The path to the phenotype file. It should be
                a whitespace-separated values file with a header. It should
                contain columns "IID" and "phenotype". Lines starting with
                "#" are treated as comments. When empty, every sample in
                ``alleles`` is assigned the phenotype 0.0 instead.

        Raises:
            SystemExit: With status 1 if the required columns are missing or
                the phenotypes do not cover every sample in ``alleles``.
        """
        if not phenotype_file:
            self.set_phenotype(
                pd.Series(index=self.alleles.index, data=float(0))
            )
            return

        # r"\s+" accepts tabs and runs of spaces, matching the documented
        # "whitespace-separated" format; a literal " " only splits on single
        # spaces.
        phe = pd.read_csv(phenotype_file, sep=r"\s+", comment="#")

        # Check that the file was formatted correctly
        if "IID" not in phe.columns or "phenotype" not in phe.columns:
            print(
                f"ERROR: Phenotype file ({phenotype_file}) must contain 'IID' and 'phenotype' columns.")
            print("Check if the file is well formatted and whitespace-separated")
            # Same exception as exit(1), without relying on the site-injected
            # helper.
            raise SystemExit(1)

        # Sample IDs are compared as strings.
        phe = phe.set_index(phe["IID"].astype(str))
        self.set_phenotype(phe["phenotype"])

    def remove_phenotype_zero(self) -> None:
        """
        Remove samples with phenotype value equal to zero from the AlleleTable.

        Phenotype values that parse as numbers (for example the strings
        produced by ``open``) are compared numerically; values that do not
        parse are kept. Samples that are present in the phenotype but absent
        from ``alleles`` or ``covariates`` are ignored for those tables.
        Prints a warning and does nothing when the phenotype is empty.
        """
        if self.phenotype.empty:
            print("WARNING: Phenotype is empty, cannot remove samples with phenotype zero.")
            return

        # Unparseable values become NaN, and NaN != 0 keeps them.
        numeric = pd.to_numeric(self.phenotype, errors="coerce")
        keep_mask = (numeric != 0).to_numpy()
        kept = self.phenotype.index[keep_mask]

        # The phenotype may list more samples than the other tables, so only
        # request labels that actually exist there (avoids KeyError in .loc).
        self.alleles = self.alleles.loc[kept[kept.isin(self.alleles.index)]]
        self.phenotype = self.phenotype[keep_mask]
        if not self.covariates.empty:
            self.covariates = self.covariates.loc[
                kept[kept.isin(self.covariates.index)]
            ]

    def set_phenotype(self, phenotype: pd.Series) -> None:
        """
        Stores the phenotype series in the AlleleTable.

        The stored series is named "phenotype" and its index "sample"; the
        series passed in is not renamed.

        Args:
            phenotype (pd.Series): A series with the phenotypes for the
                AlleleTable. It should have the samples' IDs as index and the
                phenotype values.

        Raises:
            SystemExit: With status 1 if any sample in ``alleles`` is missing
                from the phenotype index.
        """
        # rename/rename_axis return new objects, leaving the caller's series
        # and its index untouched.
        self.phenotype = phenotype.rename("phenotype").rename_axis("sample")
        self.__verify_phe_samples__()

    def __verify_phe_samples__(self):
        """
        Check that the phenotype series contains all the samples that the
        allele table has.

        Extra samples in the phenotype are allowed. On failure the missing
        sample IDs are printed and SystemExit(1) is raised.
        """
        missing_samples = self.alleles.index.difference(self.phenotype.index)
        if len(missing_samples) > 0:
            print("ERROR: Phenotype list does not contain all the samples from alleles.")
            print("%d samples are missing:" % len(missing_samples))
            print(missing_samples.to_list())
            raise SystemExit(1)

    @classmethod
    def open(cls, filename: str, sep: str = "\t") -> "AlleleTable":
        """
        Load allele data from a delimited text file into a new AlleleTable.

        The first column is used as the sample index, the second column must
        be "phenotype" and the remaining columns are the alleles. All values
        are read as strings and "NA" is read as missing.

        Args:
            filename (str): The path to the input file.
            sep (str): Column separator. Defaults to a tab.

        Returns:
            AlleleTable: A table holding the file's phenotype and alleles.

        Raises:
            ValueError: If the second column of the file is not "phenotype".
        """
        df = pd.read_csv(
            filename,
            sep=sep,
            index_col=0,
            header=0,
            dtype=str,
            na_values=["NA"],
        )
        # Guard the no-data-column case so it reports the same ValueError
        # rather than an IndexError.
        if df.shape[1] == 0 or df.columns[0] != "phenotype":
            raise ValueError("The second column of %s must be 'phenotype'" % filename)

        ins = cls()
        # pop the second column as phenotype
        # NOTE(review): because of dtype=str the phenotype values are strings.
        ins.phenotype = df.pop(df.columns[0])
        ins.alleles = df
        return ins

    def _alleles_as_str_(self) -> pd.DataFrame:
        """
        Return a copy of the alleles with every value as a string and
        missing values as NaN.

        Empty strings and the literal string "nan" are also treated as
        missing.
        """
        # Record missing cells before stringifying: astype(str) would turn
        # None/NaN into ordinary text such as "None".
        missing = self.alleles.isna()
        df = self.alleles.astype(str).mask(missing)
        return df.replace({"": np.nan, "nan": np.nan})

    def to_csv(
        self, filename: str, header: bool = True, population: str = ""
    ) -> None:
        """
        Export the allele table to a tab-separated file.

        Columns are written as: sample, population (optional), phenotype
        (when a phenotype is set), then the alleles. Missing values are
        written as "NA". When a phenotype is set, only samples present in
        both the alleles and the phenotype are written.

        Args:
            filename (str): The name of the output file.
            header (bool): Flag to store the file with column names or not
            population (str): Adds an extra column in the position left to
                phenotype with a population name. Currently, only one
                population per allele table is supported.
        """
        df = self._alleles_as_str_()

        if not self.phenotype.empty:
            # Add phenotype to df, checking that the index matches. The
            # series is renamed so a phenotype assigned directly (without
            # set_phenotype) can still be joined and found by name.
            df = df.join(self.phenotype.rename("phenotype"), how="inner")
            # Move phenotype to the first column
            col = df.pop("phenotype")
            df.insert(0, "phenotype", col)

        # add population column at the beginning
        if population:
            df.insert(0, "population", population)

        # move sample id to column
        df.index.name = "sample"
        df = df.reset_index()
        df.to_csv(
            filename,
            sep="\t",
            index=False,
            na_rep="NA",
            header=header,
        )