"""
Consensus HLA Genotyping Module.
This module generates consensus HLA genotypes from multiple genotyping
algorithm results. It handles the integration of results from different HLA
typing tools, resolves conflicts, and produces high-confidence consensus calls.
It is initially designed to work with the JSON reports generated by the IKMB
HLA genotyping pipeline.
Input formats:
- JSON reports from IKMB HLA genotyping pipeline
Author: Nicolás Mendoza Mejía (2023)
"""
from typing import List, Tuple
import numpy as np
import pandas as pd
from ..allele import AlleleParser, FieldTree
from ..argtypes import file_path, output_path
from .alleleTable import AlleleTable
from .gene_report import Gene, Report, read_json
[docs]
def setup_parser(subparsers):
"""
Set up the argument parser for the consensus command.
Args:
subparsers: The subparsers object to add this command to.
Returns:
argparse.ArgumentParser: The configured parser for consensus.
"""
parser = subparsers.add_parser(
name="from_ikmb",
help="Find a consensus between multiple HLA genotyping reports",
description="Find a consensus between multiple HLA genotyping reports, which were" \
" generated by the IKMB HLA genotyping pipeline (https://github.com/ikmb/hla).",
epilog="Author: Nicolás Mendoza Mejía (2023)",
)
parser.add_argument(
"input",
metavar="path",
type=str,
nargs="+",
help="JSON files with HLA genotyping reports from the IKMB pipeline",
)
parser.add_argument(
"--min_coverage",
type=float,
help="Minimum coverage threshold",
default=100.0,
)
parser.add_argument(
"--min_support",
type=float,
help="Proportion of algorithms supporting the consensus call",
default=0.6,
)
parser.add_argument(
"--phe",
type=file_path,
help="input phe file name (to add phenotype column)",
default="",
)
parser.add_argument(
"--output",
metavar="path",
type=output_path,
help="Path to output file",
default="output.alt",
)
parser.add_argument(
"--gene_family",
type=str,
help="Specify the gene family e.i. 'hla', 'kir'",
default="hla",
)
parser.add_argument(
"--config_file",
type=file_path,
help="Path to a custom allele parsing configuration file",
default="",
)
parser.set_defaults(func=call_function)
return parser
[docs]
def call_function(args):
"""
Main function to execute consensus genotyping.
Processes multiple JSON genotyping reports and generates consensus
HLA calls using evidence-based algorithms. Outputs results in
allele table format.
Args:
args: Parsed command line arguments containing:
- input: List of JSON file handles with genotyping reports
- output: Path to output consensus file
"""
parser = AlleleParser(gene_family=args.gene_family,
config_file=args.config_file)
reports = list()
for file in args.input:
j = read_json(file)
report = ConsensusReport(j, allele_parser=parser)
reports.extend(report.consensus(args.min_support))
# Filter under covered sample's genes
df = pd.DataFrame(reports)
df = df[df["coverage"] >= args.min_coverage]
# Convert and save file
alt = reports_as_allele_table(df, args.phe)
alt.to_csv(args.output)
[docs]
def reports_as_allele_table(
reports: pd.DataFrame, phe_file: str
) -> AlleleTable:
# reports["alleles"].fillna("").apply(list)
# pivot table
df_pivot = reports.pivot_table(
values="alleles",
index=["sample"],
columns=["gene"],
aggfunc="sum",
)
# Gene columns have a list of one or two alleles
# extract those alleles into [gene_name]_1 and [gene_name]_2
genes = df_pivot.columns
genes = [gene for gene in genes if gene != "sample"]
for gene in genes:
# Fill non genotyped alleles or filtered alleles
pairs = df_pivot.pop(gene).fillna("").apply(list).tolist()
# Expand the pair to two columns
df_pivot[[f"{gene}_1", f"{gene}_2"]] = pd.DataFrame(
pairs, index=df_pivot.index)
# Replace "" with nan
df_pivot.replace("", np.nan, inplace=True)
alt = AlleleTable()
alt.alleles = df_pivot
alt.load_phenotype(phe_file)
return alt
[docs]
class ConsensusGene(Gene):
def __init__(self,
name: str,
calls: dict,
allele_parser: AlleleParser,
coverage: List[dict] = [],
) -> None:
super().__init__(
name=name,
calls=calls,
allele_parser=allele_parser,
coverage=coverage
)
# Set default settings for consensus
self.consensus_normalize_weight = True
self.max_support = 0
[docs]
def set_consensus_settings(
self,
normalize_weight: bool,
max_support: float
):
self.consensus_normalize_weight = normalize_weight
self.max_support = max_support
[docs]
def get_consensus_call(
self, min_support: float
) -> Tuple[List[str], List[float]]:
"""
Determine the consensus alleles and returns two values
Args:
min_support (float): Minimum support threshold for consensus calls.
Returns:
alleles: list of consensus alleles
support: percentage of algorithms supported the consensus alleles
"""
tree = FieldTree(self.name)
for tool, alleles in self.calls.items():
if len(alleles) == 0:
continue
# Create a new tree for each tool
tool_tree = FieldTree(self.name)
# alleles = set(alleles) # Remove duplicates
alleles = [
self.allele_parser.parse(self.name + '*' + allele).get_fields()
for allele in alleles
]
tool_tree.add_batch(alleles)
if self.consensus_normalize_weight:
# All children count only as one vote (one tool)
tool_tree.set_support(new_weight=1, recursive=True)
# Now we submit the vote from this tool
tree.merge_tree(tool_tree)
# Get delimiters
gene_delimiter, field_delimiter = self.allele_parser.get_delimiters()
# Get the consensus from the main tree
alleles, support = tree.get_consensus(
min_support=min_support,
gene_delimiter=gene_delimiter,
field_delimiter=field_delimiter,
max_support=self.max_support,
)
return alleles, support
[docs]
def consensus_dict(self, min_support: float) -> dict:
"""
Convert this object into a dictionary
Args:
min_support (float): Minimum support threshold for consensus calls.
Returns:
dict: Dictionary representation of the consensus gene.
"""
alleles, support = self.get_consensus_call(min_support)
return {
"gene": self.name,
"coverage": self.mean_coverage(),
"alleles": alleles,
"support": support,
}
[docs]
class ConsensusReport(Report):
def __parse_gene__(
self, name: str, coverage: List[dict], calls: dict
) -> ConsensusGene:
return ConsensusGene(
name=name,
calls=calls,
allele_parser=self.allele_parser,
coverage=coverage,
)
[docs]
def consensus(self, min_support: float) -> List[dict]:
"""
Generate a consensus report for the given sample.
Args:
min_support (float): Minimum support threshold for consensus calls.
Returns:
List[dict]: Consensus report for the given sample.
"""
result = list()
for gene in self.genes:
d = gene.consensus_dict(min_support)
d["sample"] = self.sample
result.append(d)
return result