Source code for alleleTools.plot.graph_pathogens

"""
Pathogen Recognition Visualization Module.

This module queries the IEDB (Immune Epitope Database) and visualizes the
results of epitope assays related to a specific HLA allele.
It helps researchers understand which pathogens the provided allele might bind to,
potentially conferring resistance or susceptibility to infection.

Data source: IEDB (Immune Epitope Database) - https://www.iedb.org/

Author: Nicolás Mendoza Mejía (2025)
"""

import warnings

import matplotlib.pyplot as plt
import pandas as pd

from ..argtypes import path
from .iedb.epitope_query import organism_iris, query_mhc
from .iedb.taxon_query import query_taxon_ids



[docs]
def setup_parser(subparsers):
    """
    Set up the argument parser for the graph_pathogens command.

    Registers the ``graph_pathogens`` sub-command, declares its positional
    and optional arguments, and binds ``call_function`` as the handler via
    ``set_defaults(func=...)``.

    Args:
        subparsers: The subparsers object to add this command to.

    Returns:
        argparse.ArgumentParser: The configured parser for pathogen visualization.
    """
    # Render the supported organisms as a plain comma-separated list.
    # Interpolating the keys view directly shows up as "dict_keys([...])" in
    # the --help output. "%" is escaped because argparse applies %-formatting
    # to help strings.
    supported_organisms = ", ".join(
        str(key) for key in organism_iris.keys()
    ).replace("%", "%%")
    organism_help = (
        "Epitopes' {role} organism. "
        f"Supported organisms are: [{supported_organisms}]. "
        "If your organism of interest is not supported, please submit a "
        "request in issues. It is a quick change in the code."
    )

    parser = subparsers.add_parser(
        name='graph_pathogens',
        help='Graph epitope assay results for HLA alleles',
        description='Consult IEDB for assay results on the desired HLA allele. This command produces graphs with the assay results.',
        epilog="Author: Nicolás Mendoza Mejía (2025)",
    )
    parser.add_argument(
        'email',
        type=str,
        help='email required to get the tax ids from NCBI'
    )
    parser.add_argument(
        'allele',
        type=str,
        help='HLA allele that you want to get the data from'
    )
    parser.add_argument(
        "--min_epitope_len",
        type=int,
        help="Minimum epitope length to query from the database",
        default=0
    )
    parser.add_argument(
        "--max_epitope_len",
        type=int,
        help="Maximum epitope length to query from the database",
        default=0
    )
    # NOTE(review): --source and --host are parsed as int, while their help
    # text lists the keys of organism_iris as the accepted values. Confirm
    # which representation is intended.
    parser.add_argument(
        "--source",
        type=int,
        help=organism_help.format(role="source"),
        default=0
    )
    parser.add_argument(
        "--host",
        type=int,
        help=organism_help.format(role="host"),
        default=0
    )
    parser.add_argument(
        "--output_basename",
        type=path,
        help="Base name of the output files, since this command outputs multiple graphs.",
        default="output_graphs"
    )

    # The top-level dispatcher is expected to call args.func(args)
    parser.set_defaults(func=call_function)

    return parser




[docs]
def call_function(args):
    """
    Run the pathogen-recognition workflow for a single HLA allele.

    Steps performed, in order:
    1. Query IEDB (``query_mhc``) for the allele's epitope assay records,
       bounded by the requested epitope lengths.
    2. Derive a ``TaxId`` column from ``source_organism_iri`` and collect
       the unique NCBI taxon ids.
    3. Look the taxon ids up with ``query_taxon_ids`` and join the result
       back onto the assay records on ``TaxId``.
    4. Draw one genus-level chart for bacteria and one for viruses.

    Args:
        args: Parsed command line arguments. The attributes read here are:
            - email: Email passed to the taxonomy lookup
            - allele: HLA allele to analyze
            - min_epitope_len: Minimum epitope length filter
            - max_epitope_len: Maximum epitope length filter
            - output_basename: Base name for output files

    Note:
        ``args.source`` and ``args.host`` are defined by the parser but are
        not read by this function.
    """
    # Fetch the assay records for the requested allele
    assays = query_mhc(
        args.allele,
        min_len=args.min_epitope_len,
        max_len=args.max_epitope_len,
    )

    organism_iri = assays["source_organism_iri"]

    # Flag the rows whose source organism is given as an NCBI taxon
    has_ncbi_taxon = organism_iri.str.contains(
        "NCBITaxon", case=False, na=False
    )

    # Drop the IRI prefix so that only the taxon id is left
    assays["TaxId"] = organism_iri.replace("NCBITaxon:", "", regex=True)

    # Unique ids, round-tripped through int before being sent as strings
    unique_ids = assays.loc[has_ncbi_taxon, "TaxId"].dropna().unique()
    taxon_ids = unique_ids.astype(int).astype(str).tolist()

    # Resolve the taxonomy of every id and attach it to the assay records
    taxon_ranks = query_taxon_ids(taxon_ids, args.email)
    annotated = assays.merge(taxon_ranks, left_on="TaxId", right_on="TaxId")

    # One chart per division, bacteria first
    charts = {
        "Bacteria": f"{args.output_basename}_bacteria_genus.svg",
        "Viruses": f"{args.output_basename}_virus_genus.svg",
    }
    for division, svg_path in charts.items():
        graph_by_genus(annotated, division, svg_path)


# %% Stacked plot by percentage



[docs]
def adjustFigAspect(fig, aspect: float = 1):
    """
    Centre the subplot area in the figure with a given width/height ratio.

    The subplot area is centred on the middle of the figure. Its base size
    is 80% of the figure's shorter side in both directions. For
    ``aspect < 1`` the width is then multiplied by ``aspect``; otherwise
    the height is divided by ``aspect``.

    Args:
        fig (matplotlib.figure.Figure): Figure object to adjust
        aspect (float): Desired aspect ratio (default: 1)
    """
    width, height = fig.get_size_inches()
    shorter_side = min(width, height)

    # Half-extent of the subplot area per direction, as a figure fraction
    half_extent = {
        "x": 0.4 * shorter_side / width,
        "y": 0.4 * shorter_side / height,
    }
    if aspect < 1:
        # Narrower than tall: shrink the horizontal extent
        half_extent["x"] *= aspect
    else:
        # Wider than tall (or square): shrink the vertical extent
        half_extent["y"] /= aspect

    fig.subplots_adjust(
        left=0.5 - half_extent["x"],
        right=0.5 + half_extent["x"],
        bottom=0.5 - half_extent["y"],
        top=0.5 + half_extent["y"],
    )




[docs]
def graph_by_genus(data: pd.DataFrame, division: str, output_file: str) -> None:
    """
    Generate a stacked bar chart showing assay results by pathogen genus.

    Rows of ``data`` belonging to ``division`` are counted per genus and
    qualitative measure. Genera with 10 or fewer assays are dropped, the
    remaining ones are sorted by assay count, and each row is normalised to
    fractions of that genus' total. "Negative" and "Positive-Low" fractions
    are mirrored to the negative side of the x-axis so that the chart
    diverges around zero. The figure is saved to ``output_file`` and closed.

    Args:
        data (pd.DataFrame): Merged dataset with IEDB and taxonomy
            information. The columns read are "Division", "genus",
            "qualitative_measure" and "assay_iris".
        division (str): Taxonomic division to filter by (e.g., "Bacteria", "Viruses")
        output_file (str): Path for the output SVG file

    Note:
        When there is nothing to draw (no rows for the division, no genus
        above the assay threshold, or no recognised qualitative measure) a
        ``UserWarning`` is issued and no file is written. One empty
        division therefore does not abort the remaining charts.
    """
    subset = data[data["Division"] == division]
    if subset.empty:
        warnings.warn(
            f"No assay results for division '{division}'; "
            f"'{output_file}' was not written.",
            stacklevel=2,
        )
        return

    # Count assays per (genus, outcome); only the assay column is needed
    counts = (
        subset.groupby(["genus", "qualitative_measure"])["assay_iris"]
        .count()
        .unstack()
    )

    # Label every genus with its total number of assays
    n_samples = counts.sum(axis=1)
    counts.index = pd.Index([
        "{} (n={})".format(genus.replace("_", " ").title(), int(total))
        for genus, total in n_samples.items()
    ])

    # Keep only genera backed by more than 10 assays
    counts = counts[counts.sum(axis=1) > 10]

    colors = {
        "Negative": "#8b0000",
        "Positive-Low": "#f6655f",
        "Positive": "#c1f57d",
        "Positive-Intermediate": "#97e692",
        "Positive-High": "#069d59"
    }
    # Keep only the qualitative values present in the data; the dict order
    # also defines the stacking order of the bars
    colors = {key: val for key, val in colors.items() if key in counts.columns}

    if counts.empty or not colors:
        warnings.warn(
            f"No genus in division '{division}' has more than 10 assays with "
            f"a recognised qualitative measure; '{output_file}' was not "
            "written.",
            stacklevel=2,
        )
        return

    # Sort by number of assays, largest first
    counts = counts.loc[counts.sum(axis=1).sort_values(ascending=False).index]

    # Normalise rows to fractions of all assays of the genus
    shares = counts.div(counts.sum(axis=1), axis=0)

    # Select and order the plotted columns; copy so that the sign flip below
    # operates on an independent frame
    shares = shares.loc[:, list(colors)].copy()

    # Mirror the unfavourable outcomes to the negative side of the axis
    mirrored = [
        col for col in shares.columns if col in ("Negative", "Positive-Low")
    ]
    if mirrored:
        shares[mirrored] = -shares[mirrored]

    fig, ax = plt.subplots()
    try:
        adjustFigAspect(fig, aspect=0.7)
        shares.plot(kind="barh", stacked=True, width=0.7, color=colors, ax=ax)
        ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
        # Fractions are bounded by 1 on either side of zero
        ax.set_xlim(-1, 1)
        ax.autoscale(enable=True, axis="y", tight=True)

        fig.savefig(output_file, bbox_inches="tight")
    finally:
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)