Source code for alleleTools.plot.graph_pathogens

"""
Pathogen Recognition Visualization Module.

This module provides functionality to query the IEDB (Immune Epitope Database)
for visualizing the results of epitope assays related to a specific HLA allele.
It helps researchers understand which pathogens the provided allele might bind to,
potentially conferring resistance or susceptibility to infections.

Data source: IEDB (Immune Epitope Database) - https://www.iedb.org/

Author: Nicolás Mendoza Mejía (2025)
"""

import warnings

import matplotlib.pyplot as plt
import pandas as pd

from ..argtypes import path
from .iedb.epitope_query import organism_iris, query_mhc
from .iedb.taxon_query import query_taxon_ids


def setup_parser(subparsers):
    """
    Set up the argument parser for the graph_pathogens command.

    Registers the ``graph_pathogens`` sub-command, declares its positional
    and optional arguments, and binds ``call_function`` as the handler via
    ``set_defaults(func=...)``.

    Args:
        subparsers: The subparsers object to add this command to.

    Returns:
        argparse.ArgumentParser: The configured parser for pathogen
        visualization.
    """
    # Render the supported organisms as a plain comma-separated list.
    # Interpolating ``organism_iris.keys()`` directly would show the
    # ``dict_keys([...])`` repr in the ``--help`` output.
    supported = ", ".join(map(str, organism_iris))
    unsupported_hint = (
        "If your organism of interest is not supported, please submit a "
        "request in issues. It is a quick change in the code."
    )

    parser = subparsers.add_parser(
        name='graph_pathogens',
        help='Graph epitope assay results for HLA alleles',
        description=(
            'Consult IEDB for assay results on the desired HLA allele. '
            'This command produces graphs with the assay results.'
        ),
        epilog="Author: Nicolás Mendoza Mejía (2025)",
    )
    parser.add_argument(
        'email',
        type=str,
        help='email required to get the tax ids from NCBI',
    )
    parser.add_argument(
        'allele',
        type=str,
        help='HLA allele that you want to get the data from',
    )
    parser.add_argument(
        "--min_epitope_len",
        type=int,
        help="Minimum epitope length to query from the database",
        default=0,
    )
    parser.add_argument(
        "--max_epitope_len",
        type=int,
        help="Maximum epitope length to query from the database",
        default=0,
    )
    # NOTE(review): --source and --host are parsed as integers although the
    # help text lists organism names; confirm the intended type with the
    # code that consumes these options.
    parser.add_argument(
        "--source",
        type=int,
        help=(
            "Epitopes' source organism. Supported organisms are: "
            f"[{supported}]. {unsupported_hint}"
        ),
        default=0,
    )
    parser.add_argument(
        "--host",
        type=int,
        help=(
            "Epitopes' host organism. Supported organisms are: "
            f"[{supported}]. {unsupported_hint}"
        ),
        default=0,
    )
    parser.add_argument(
        "--output_basename",
        type=path,
        help="Base name of the output files, since this command outputs multiple graphs.",
        default="output_graphs",
    )

    parser.set_defaults(func=call_function)
    return parser
def call_function(args):
    """
    Run the pathogen-recognition workflow for a single HLA allele.

    Steps performed:
        1. Fetch assay records for the allele through ``query_mhc``.
        2. Derive a ``TaxId`` column by stripping the ``NCBITaxon:`` prefix
           from ``source_organism_iri``.
        3. Resolve the unique taxon IDs through ``query_taxon_ids``.
        4. Merge both tables on ``TaxId`` and draw one chart for
           "Bacteria" and one for "Viruses".

    Args:
        args: Parsed command line arguments. The attributes read here are:
            - allele: HLA allele to analyze
            - min_epitope_len: Minimum epitope length filter
            - max_epitope_len: Maximum epitope length filter
            - email: Email handed to ``query_taxon_ids``
            - output_basename: Prefix of the two output file names

    Note:
        ``args.source`` and ``args.host`` are not read by this function.
    """
    assays = query_mhc(
        args.allele,
        min_len=args.min_epitope_len,
        max_len=args.max_epitope_len,
    )

    # Flag the rows whose organism IRI actually carries an NCBI taxon ID.
    organism_iri = assays["source_organism_iri"]
    has_ncbi_id = organism_iri.str.contains("NCBITaxon", case=False, na=False)

    # Strip the prefix so that only the bare ID remains in the new column.
    prefix = "NCBITaxon:"
    assays["TaxId"] = organism_iri.replace(prefix, "", regex=True)

    # Unique IDs of the flagged rows, round-tripped through int and handed
    # over as a list of strings.
    unique_ids = assays.loc[has_ncbi_id, "TaxId"].dropna().unique()
    taxon_ids = unique_ids.astype(int).astype(str).tolist()

    # Attach the taxonomy returned for each ID to the assay records.
    taxon_ranks = query_taxon_ids(taxon_ids, args.email)
    merged = assays.merge(taxon_ranks, left_on="TaxId", right_on="TaxId")

    charts = (
        ("Bacteria", f"{args.output_basename}_bacteria_genus.svg"),
        ("Viruses", f"{args.output_basename}_virus_genus.svg"),
    )
    for division, output_file in charts:
        graph_by_genus(merged, division, output_file)
# %% Stacked plot by percentage
def adjustFigAspect(fig, aspect: float = 1):
    """
    Centre the subplot area of ``fig`` and shape it to the given aspect.

    The subplot area is placed symmetrically around the figure centre
    (0.5, 0.5). Its half-extent along each axis starts at 0.4 scaled by the
    ratio of the shorter figure side to that axis' size; an ``aspect``
    below 1 then shrinks the horizontal half-extent, any other value
    divides the vertical half-extent by ``aspect``.

    Args:
        fig (matplotlib.figure.Figure): Figure object to adjust
        aspect (float): Desired aspect ratio (default: 1)
    """
    width, height = fig.get_size_inches()
    shortest_side = min(width, height)

    base_half_width = 0.4 * shortest_side / width
    base_half_height = 0.4 * shortest_side / height

    if aspect < 1:
        half_width = base_half_width * aspect
        half_height = base_half_height
    else:
        half_width = base_half_width
        half_height = base_half_height / aspect

    margins = {
        "left": 0.5 - half_width,
        "right": 0.5 + half_width,
        "bottom": 0.5 - half_height,
        "top": 0.5 + half_height,
    }
    fig.subplots_adjust(**margins)
def graph_by_genus(data: pd.DataFrame, division: str, output_file: str) -> None:
    """
    Generate a stacked bar chart showing assay results by pathogen genus.

    Rows of ``data`` belonging to ``division`` are counted per genus and
    qualitative measure. Genera with 10 or fewer counted assays are dropped,
    the remaining ones are ordered by descending assay count, and every row
    is normalized to fractions of that genus' total. "Negative" and
    "Positive-Low" fractions are drawn towards the negative side of the
    x-axis, the other known measures towards the positive side.

    Args:
        data (pd.DataFrame): Merged dataset with IEDB and taxonomy
            information. The columns read here are ``Division``, ``genus``,
            ``qualitative_measure`` and ``assay_iris``.
        division (str): Taxonomic division to filter by
            (e.g., "Bacteria", "Viruses")
        output_file (str): Path of the image file to write

    Note:
        When nothing is left to plot (no rows for the division, no genus
        above the threshold, or no recognized qualitative measure) a
        warning is issued and no file is written. The figure is closed
        after saving so repeated calls do not accumulate open figures.
    """
    subset = data[data["Division"] == division]
    if subset.empty:
        warnings.warn(
            f"No assay results for division '{division}'; "
            f"skipping '{output_file}'.",
            stacklevel=2,
        )
        return

    # Rows: genus, columns: qualitative measure, values: number of assays.
    grouped = (
        subset
        .groupby(["genus", "qualitative_measure"])["assay_iris"]
        .count()
        .unstack()
    )

    # Put the per-genus sample size into the row label shown on the chart.
    n_samples = grouped.sum(axis=1)
    grouped.index = pd.Index([
        f"{genus.replace('_', ' ').title()} (n={int(total)})"
        for genus, total in n_samples.items()
    ])

    grouped = grouped[grouped.sum(axis=1) > 10]
    if grouped.empty:
        warnings.warn(
            f"No genus in division '{division}' has more than 10 assays; "
            f"skipping '{output_file}'.",
            stacklevel=2,
        )
        return

    # sort by the sum of the columns
    row_order = grouped.sum(axis=1).sort_values(ascending=False).index
    grouped = grouped.loc[row_order]

    # normalize rows (the total includes measures that are dropped below)
    grouped = grouped.div(grouped.sum(axis=1), axis=0)

    colors = {
        "Negative": "#8b0000",
        "Positive-Low": "#f6655f",
        "Positive": "#c1f57d",
        "Positive-Intermediate": "#97e692",
        "Positive-High": "#069d59",
    }
    # Keep only the measures present in the data; the dict order also fixes
    # the stacking order of the columns.
    colors = {key: val for key, val in colors.items() if key in grouped.columns}
    if not colors:
        warnings.warn(
            f"No recognized qualitative measure for division '{division}'; "
            f"skipping '{output_file}'.",
            stacklevel=2,
        )
        return
    grouped = grouped.loc[:, list(colors)].copy()

    # Invert negative values so they extend to the left of zero
    all_negatives = ["Negative", "Positive-Low"]
    negative_values = [val for val in grouped.columns if val in all_negatives]
    if negative_values:
        grouped[negative_values] = grouped[negative_values] * -1

    fig, ax = plt.subplots()
    try:
        adjustFigAspect(fig, aspect=0.7)
        grouped.plot(kind="barh", stacked=True, width=0.7, color=colors, ax=ax)
        ax.legend(loc="upper left", bbox_to_anchor=(1, 1))

        # set the x-axis limits to -1, 1
        ax.set_xlim(-1, 1)
        ax.autoscale(enable=True, axis="y", tight=True)

        fig.savefig(output_file, bbox_inches="tight")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)