Source code for alleleTools.format.from_ikmb_hla

"""
Consensus HLA Genotyping Module.

This module generates consensus HLA genotypes from multiple genotyping
algorithm results. It handles the integration of results from different HLA
typing tools, resolves conflicts, and produces high-confidence consensus calls.
It is initially designed to work with the JSON reports generated by the IKMB
HLA genotyping pipeline.

Input formats:
- JSON reports from IKMB HLA genotyping pipeline

Author: Nicolás Mendoza Mejía (2023)
"""

from typing import List, Tuple

import numpy as np
import pandas as pd

from ..allele import AlleleParser, FieldTree
from ..argtypes import file_path, output_path
from .alleleTable import AlleleTable
from .gene_report import Gene, Report, read_json



[docs]
def setup_parser(subparsers):
    """
    Set up the argument parser for the consensus command.

    Args:
        subparsers: The subparsers object to add this command to.

    Returns:
        argparse.ArgumentParser: The configured parser for consensus.
    """
    parser = subparsers.add_parser(
        name="from_ikmb",
        help="Find a consensus between multiple HLA genotyping reports",
        description="Find a consensus between multiple HLA genotyping reports, which were" \
        " generated by the IKMB HLA genotyping pipeline (https://github.com/ikmb/hla).",
        epilog="Author: Nicolás Mendoza Mejía (2023)",
    )
    parser.add_argument(
        "input",
        metavar="path",
        type=str,
        nargs="+",
        help="JSON files with HLA genotyping reports from the IKMB pipeline",
    )
    parser.add_argument(
        "--min_coverage",
        type=float,
        help="Minimum coverage threshold",
        default=100.0,
    )
    parser.add_argument(
        "--min_support",
        type=float,
        help="Proportion of algorithms supporting the consensus call",
        default=0.6,
    )
    parser.add_argument(
        "--phe",
        type=file_path,
        help="input phe file name (to add phenotype column)",
        default="",
    )
    parser.add_argument(
        "--output",
        metavar="path",
        type=output_path,
        help="Path to output file",
        default="output.alt",
    )
    parser.add_argument(
        "--gene_family",
        type=str,
        help="Specify the gene family e.i. 'hla', 'kir'",
        default="hla",
    )
    parser.add_argument(
        "--config_file",
        type=file_path,
        help="Path to a custom allele parsing configuration file",
        default="",
    )

    parser.set_defaults(func=call_function)

    return parser




[docs]
def call_function(args):
    """
    Main function to execute consensus genotyping.

    Processes multiple JSON genotyping reports and generates consensus
    HLA calls using evidence-based algorithms. Outputs results in
    allele table format.

    Args:
        args: Parsed command line arguments containing:
            - input: List of JSON file handles with genotyping reports
            - output: Path to output consensus file
    """
    parser = AlleleParser(gene_family=args.gene_family,
                          config_file=args.config_file)

    reports = list()
    for file in args.input:
        j = read_json(file)
        report = ConsensusReport(j, allele_parser=parser)
        reports.extend(report.consensus(args.min_support))

    # Filter under covered sample's genes
    df = pd.DataFrame(reports)
    df = df[df["coverage"] >= args.min_coverage]

    # Convert and save file
    alt = reports_as_allele_table(df, args.phe)
    alt.to_csv(args.output)




[docs]
def reports_as_allele_table(
        reports: pd.DataFrame, phe_file: str
) -> AlleleTable:
    # reports["alleles"].fillna("").apply(list)
    # pivot table
    df_pivot = reports.pivot_table(
        values="alleles",
        index=["sample"],
        columns=["gene"],
        aggfunc="sum",
    )

    # Gene columns have a list of one or two alleles
    # extract those alleles into [gene_name]_1 and [gene_name]_2
    genes = df_pivot.columns
    genes = [gene for gene in genes if gene != "sample"]
    for gene in genes:
        # Fill non genotyped alleles or filtered alleles
        pairs = df_pivot.pop(gene).fillna("").apply(list).tolist()
        # Expand the pair to two columns
        df_pivot[[f"{gene}_1", f"{gene}_2"]] = pd.DataFrame(
            pairs, index=df_pivot.index)

    # Replace "" with nan
    df_pivot.replace("", np.nan, inplace=True)

    alt = AlleleTable()
    alt.alleles = df_pivot
    alt.load_phenotype(phe_file)

    return alt




[docs]
class ConsensusGene(Gene):
    def __init__(self,
                 name: str,
                 calls: dict,
                 allele_parser: AlleleParser,
                 coverage: List[dict] = [],
                 ) -> None:
        super().__init__(
            name=name,
            calls=calls,
            allele_parser=allele_parser,
            coverage=coverage
        )
        # Set default settings for consensus
        self.consensus_normalize_weight = True
        self.max_support = 0


[docs]
    def set_consensus_settings(
        self,
        normalize_weight: bool,
        max_support: float
    ):
        self.consensus_normalize_weight = normalize_weight
        self.max_support = max_support



[docs]
    def get_consensus_call(
            self, min_support: float
    ) -> Tuple[List[str], List[float]]:
        """
        Determine the consensus alleles and returns two values

        Args:
            min_support (float): Minimum support threshold for consensus calls.

        Returns:
            alleles: list of consensus alleles
            support: percentage of algorithms supported the consensus alleles
        """
        tree = FieldTree(self.name)
        for tool, alleles in self.calls.items():
            if len(alleles) == 0:
                continue

            # Create a new tree for each tool
            tool_tree = FieldTree(self.name)
            # alleles = set(alleles)  # Remove duplicates
            alleles = [
                self.allele_parser.parse(self.name + '*' + allele).get_fields()
                for allele in alleles
            ]
            tool_tree.add_batch(alleles)

            if self.consensus_normalize_weight:
                # All children count only as one vote (one tool)
                tool_tree.set_support(new_weight=1, recursive=True)

            # Now we submit the vote from this tool
            tree.merge_tree(tool_tree)

        # Get delimiters
        gene_delimiter, field_delimiter = self.allele_parser.get_delimiters()

        # Get the consensus from the main tree
        alleles, support = tree.get_consensus(
            min_support=min_support,
            gene_delimiter=gene_delimiter,
            field_delimiter=field_delimiter,
            max_support=self.max_support,
        )
        return alleles, support



[docs]
    def consensus_dict(self, min_support: float) -> dict:
        """
        Convert this object into a dictionary

        Args:
            min_support (float): Minimum support threshold for consensus calls.

        Returns:
            dict: Dictionary representation of the consensus gene.
        """
        alleles, support = self.get_consensus_call(min_support)
        return {
            "gene": self.name,
            "coverage": self.mean_coverage(),
            "alleles": alleles,
            "support": support,
        }





[docs]
class ConsensusReport(Report):
    def __parse_gene__(
        self, name: str, coverage: List[dict], calls: dict
    ) -> ConsensusGene:
        return ConsensusGene(
            name=name,
            calls=calls,
            allele_parser=self.allele_parser,
            coverage=coverage,
            )


[docs]
    def consensus(self, min_support: float) -> List[dict]:
        """
        Generate a consensus report for the given sample.

        Args:
            min_support (float): Minimum support threshold for consensus calls.

        Returns:
            List[dict]: Consensus report for the given sample.
        """
        result = list()
        for gene in self.genes:
            d = gene.consensus_dict(min_support)
            d["sample"] = self.sample
            result.append(d)
        return result