# Source code for alleleTools.format.alleleTable

"""
Allele Table Data Structure Module.

This module defines the AlleleTable class, which provides a standardized
data structure for storing and manipulating allele data along with associated
phenotype and covariate information.
"""

from typing import Optional

import numpy as np
import pandas as pd



# [docs]
class AlleleTable:
    """
    A standardized data structure for storing allele data with metadata.

    This class provides a unified interface for handling allele data from
    polymorphic genes, along with associated phenotype and covariate information.
    It serves as the core data structure for allele analysis workflows.

    Attributes:
        alleles (pd.DataFrame): Main allele data with samples as rows and
           genes/alleles as columns
        phenotype (pd.Series): Phenotype information indexed by sample ID
        covariates (pd.DataFrame): Covariate data with samples as rows and
          covariates as columns

    Example:
        >>> table = AlleleTable()
        >>> # Load allele data
        >>> table.alleles = pd.DataFrame(...)
        >>> # Add phenotype information
        >>> table.phenotype = pd.Series(...)
    """

    def __init__(
            self,
            alleles: Optional[pd.DataFrame] = None,
            phenotype: Optional[pd.Series] = None,
            covariates: Optional[pd.DataFrame] = None,
            ):
        """
        Initialize an AlleleTable, optionally with pre-built data.

        Any argument left as ``None`` is replaced by a new empty pandas
        object, so instances never share their containers.

        Args:
            alleles (pd.DataFrame, optional): Allele data. When not empty,
                its index must be named "sample" and it must have an even
                number of columns.
            phenotype (pd.Series, optional): Phenotype values. Stored as
                given, without validation.
            covariates (pd.DataFrame, optional): Covariate data. Stored as
                given, without validation.

        Raises:
            ValueError: If a non-empty ``alleles`` has an index that is not
                named "sample" or has an odd number of columns.
        """
        # Defaults are created here rather than in the signature: default
        # argument values are evaluated once, so pandas objects placed there
        # would be shared (and mutated) across every instance.
        if alleles is None:
            alleles = pd.DataFrame()
        if phenotype is None:
            phenotype = pd.Series(dtype=object)
        if covariates is None:
            covariates = pd.DataFrame()

        # Some safety checks
        if not alleles.empty:
            if alleles.index.name != "sample":
                raise ValueError("Alleles DataFrame index must be named 'sample'")
            if len(alleles.columns) % 2 != 0:
                raise ValueError("Alleles DataFrame must have an even number of columns")

        self.alleles = alleles
        self.phenotype = phenotype
        self.covariates = covariates

    def load_phenotype(self, phenotype_file: str) -> None:
        """
        Load phenotype information from a file into the AlleleTable.

        If ``phenotype_file`` is empty, every sample in ``alleles`` is given
        the phenotype ``0.0`` instead of reading a file.

        Args:
            phenotype_file (str): The path to the phenotype file. It should be
                a whitespace-separated values file with a header.  It should
                contain columns "IID" and "phenotype". Text after a "#" is
                ignored.

        Raises:
            SystemExit: With code 1 if the file lacks the "IID" or
                "phenotype" column, or if the loaded phenotypes do not cover
                every sample in ``alleles``.
        """
        if not phenotype_file:
            self.set_phenotype(
                pd.Series(index=self.alleles.index, data=float(0))
            )
            return

        # r"\s+" accepts tabs and runs of spaces, matching the
        # "whitespace-separated" contract; a literal " " separator does not.
        phe = pd.read_csv(phenotype_file, sep=r"\s+", comment="#")

        # Check that the file was formatted correctly
        if "IID" not in phe.columns or "phenotype" not in phe.columns:
            print(
                f"ERROR: Phenotype file ({phenotype_file}) must contain 'IID' and 'phenotype' columns.")
            print("Check if the file is well formatted and whitespace-separated")
            raise SystemExit(1)

        # Sample IDs are compared as strings, so numeric-looking IDs are cast.
        phe = phe.set_index(phe["IID"].astype(str))

        self.set_phenotype(phe["phenotype"])

    def remove_phenotype_zero(self) -> None:
        """
        Remove samples with phenotype value equal to zero from the AlleleTable.

        Phenotype values are compared numerically, so the strings "0" and
        "0.0" (as produced by ``open``, which reads every column as text)
        count as zero. Values that cannot be parsed as numbers are kept.
        Rows are dropped from ``alleles``, ``phenotype`` and, when present,
        ``covariates``. Prints a warning and does nothing when the phenotype
        is empty.
        """
        if self.phenotype.empty:
            print("WARNING: Phenotype is empty, cannot remove samples with phenotype zero.")
            return

        # errors="coerce" turns unparseable values into NaN, and NaN != 0 is
        # True, so non-numeric phenotypes are retained.
        numeric = pd.to_numeric(self.phenotype, errors="coerce")
        non_zero = self.phenotype[numeric != 0]
        keep = non_zero.index

        # The phenotype may hold samples that are absent from the other
        # tables; select only labels that exist to avoid a KeyError.
        self.alleles = self.alleles.loc[keep[keep.isin(self.alleles.index)]]
        self.phenotype = non_zero
        if not self.covariates.empty:
            self.covariates = self.covariates.loc[
                keep[keep.isin(self.covariates.index)]
            ]

    def set_phenotype(self, phenotype: pd.Series) -> None:
        """
        Store the phenotype series in the AlleleTable.

        A renamed copy is stored (name "phenotype", index name "sample"); the
        series passed by the caller is left untouched.

        Args:
            phenotype (pd.Series): A series with the phenotypes for the
                AlleleTable. It should have the samples' IDs as index and the
                phenotype values.

        Raises:
            SystemExit: With code 1 if some sample in ``alleles`` has no
                entry in ``phenotype``.
        """
        self.phenotype = phenotype.rename("phenotype").rename_axis("sample")
        self.__verify_phe_samples__()

    def __verify_phe_samples__(self):
        """
        Check that the phenotype series contains all the samples that the
        allele table has.

        Raises:
            SystemExit: With code 1, after printing the missing sample IDs,
                if any sample in ``alleles`` is absent from ``phenotype``.
        """
        missing_samples = self.alleles.index.difference(self.phenotype.index)

        if len(missing_samples) > 0:
            print("ERROR: Phenotype list does not contain all the samples from alleles.")
            print("%d samples are missing:" % len(missing_samples))
            print(missing_samples.to_list())
            raise SystemExit(1)

    @classmethod
    def open(cls, filename: str, sep: str = "\t") -> "AlleleTable":
        """
        Load allele data from a delimited text file into a new AlleleTable.

        The first column of the file is used as the index, the second column
        must be named "phenotype", and the remaining columns become the
        allele data. All values are read as strings and "NA" is treated as
        missing.

        Args:
            filename (str): The path to the input file.
            sep (str): Column separator used in the file. Defaults to a tab.

        Returns:
            AlleleTable: A new table with ``phenotype`` and ``alleles`` set.

        Raises:
            ValueError: If the second column of the file is not "phenotype".
        """
        df = pd.read_csv(
            filename,
            sep=sep,
            index_col=0,
            header=0,
            dtype=str,
            na_values=["NA"]
        )

        # An index-only file has no data columns at all; report it the same
        # way as a misplaced phenotype column.
        if df.columns.empty or df.columns[0] != "phenotype":
            raise ValueError("The second column of %s must be 'phenotype'" % filename)

        ins = cls()
        # pop the second column as phenotype
        ins.phenotype = df.pop("phenotype")
        ins.alleles = df

        return ins

    def _alleles_as_str_(self) -> pd.DataFrame:
        """
        Return a string copy of the alleles with missing values as NaN.

        Cells that are missing in ``alleles`` (None/NaN), empty strings and
        the literal text "nan" all become NaN in the result.

        Returns:
            pd.DataFrame: A new frame; ``alleles`` itself is not modified.
        """
        alleles = self.alleles
        # astype(str) would turn None into the text "None", so missing cells
        # are masked from the original frame. A plain ndarray mask is used so
        # that no label alignment takes place.
        df = alleles.astype(str).where(alleles.notna().to_numpy())
        # Replace empty strings and textual "nan" with NaN
        df = df.replace({"": np.nan, "nan": np.nan})

        return df

    def to_csv(
            self, filename: str, header: bool = True, population: str = ""
    ) -> None:
        """
        Export the allele table to a tab-separated file.

        The output columns are: sample, population (optional), phenotype
        (when a phenotype is set), then the allele columns. Missing values
        are written as "NA". When a phenotype is set, only samples present in
        both ``alleles`` and ``phenotype`` are written.

        Args:
            filename (str): The name of the output file.
            header (bool): Flag to store the file with column names or not
            population (str): Adds an extra column in the position left to
                phenotype with a population name. Currently, only one
                population per allele table is supported.
        """
        df = self._alleles_as_str_()

        if not self.phenotype.empty:
            # Add phenotype to df, checking that the index matches. The
            # series is renamed because a phenotype assigned directly to the
            # attribute may carry another name, or none at all.
            df = df.join(self.phenotype.rename("phenotype"), how="inner")
            # Move phenotype to the first column
            df.insert(0, "phenotype", df.pop("phenotype"))

        # add population column at the beginning
        if population:
            df.insert(0, "population", population)

        # move sample id to column
        df.index.name = "sample"
        df = df.reset_index()

        df.to_csv(
            filename,
            sep="\t",
            index=False,
            na_rep="NA",
            header=header
        )