Source code for alleleTools.plot.graph_phewas

"""
PheWAS Visualization Module.

This module provides functionality to create PheWAS (Phenome-Wide Association Study)
Manhattan plots for HLA alleles. It processes data from the PheWAS catalog and
generates publication-quality visualizations showing disease associations across
different phenotype categories.

Data source: https://phewascatalog.org/phewas/#hla
The input file should be downloaded from the PheWAS catalog.

Author: Nicolás Mendoza Mejía (2025)
"""

from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ..argtypes import file_path, output_path


def setup_parser(subparsers):
    """
    Set up the argument parser for the pleiotropy command.

    Registers a ``pleiotropy`` sub-command with one positional argument
    (``input``) and two options (``--output`` and ``--allele_name``), and
    binds :func:`call_function` as the handler through the ``func`` default.

    Args:
        subparsers: The subparsers object to add this command to.

    Returns:
        argparse.ArgumentParser: The configured parser for pleiotropy
        visualization.
    """
    parser = subparsers.add_parser(
        name="pleiotropy",
        help="Create pleiotropy report",
        description="Create pleiotropy report (Manhattan graph of phewas)",
        epilog="Author: Nicolás Mendoza Mejía (2025)",
    )

    parser.add_argument(
        "input",
        type=file_path,
        help="path to the CSV downloaded from https://phewascatalog.org/phewas/#hla",
    )
    parser.add_argument(
        "--output",
        type=output_path,
        help="name of the output image file generated",
        default="pleiotropy.svg",
    )
    parser.add_argument(
        "--allele_name",
        type=str,
        help="Name of the allele to include in the title",
        default="",
    )

    # The top-level CLI dispatches through ``args.func(args)``.
    parser.set_defaults(func=call_function)

    return parser
def call_function(args):
    """
    Run the pleiotropy command: build the PheWAS plot from parsed arguments.

    Args:
        args: Parsed command line arguments. The attributes read are:
            - input: Path to the PheWAS catalog CSV file
            - output: Path of the image file to write
            - allele_name: Allele name forwarded to the plot function
    """
    create_phewas_plot(
        input_file=args.input,
        output_file=args.output,
        allele_name=args.allele_name,
    )
def create_phewas_plot(input_file: str,
                       output_file: str,
                       figsize: Tuple[float, float] = (15, 6),
                       significance_threshold: float = 10**-5,
                       allele_name: str = '',
                       ):
    """
    Create a PheWAS Manhattan plot from PheWAS catalog data.

    Rows are sorted by category and placed along the x-axis in that order;
    the y-axis shows ``-log10`` of the p-value. Points are colored by
    category, points below the significance threshold are annotated with
    their phenotype, and a dashed horizontal line marks the threshold. The
    figure is written to ``output_file`` in SVG format and then closed.

    Args:
        input_file (str): Path to CSV file from PheWAS catalog
        output_file (str): Path for output image file (always written as SVG)
        figsize (Tuple[float, float]): Figure size in inches
            (default: (15, 6))
        significance_threshold (float): P-value threshold used both to pick
            the annotated points and to position the significance line
            (default: 1e-5). The line is only drawn for values > 0.
        allele_name (str): Allele name. When non-empty, rows are restricted
            to those whose 'HLA' value contains it (literal,
            case-insensitive match) and it is appended to the plot title
            (default: '')

    Raises:
        Exception: If the input file cannot be read.
        ValueError: If there are no rows to plot, either because the file is
            empty or because no row matches ``allele_name``.

    Note:
        The columns read from the CSV are 'Category', 'P-Value' and
        'Phenotype', plus 'HLA' when ``allele_name`` is given, as provided
        by the PheWAS catalog (https://phewascatalog.org/phewas/#hla)
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        # Same exception type as before, now chained so the original
        # traceback is preserved.
        raise Exception(
            f"An error occurred while opening the file {input_file}: {e}"
        ) from e

    # filter by allele name
    if allele_name:
        # regex=False: allele names such as "A*01:01" contain regex
        # metacharacters and must be matched literally.
        matches = df['HLA'].str.contains(
            allele_name, case=False, na=False, regex=False)
        df = df[matches]
        if df.empty:
            raise ValueError(
                f"No data found for allele name '{allele_name}' in the input file. \n"
                "Check that the argument --allele_name matches with the provided input file.")
    elif df.empty:
        raise ValueError(
            f"The input file {input_file} contains no rows to plot.")

    # Non in-place sort/reset returns a fresh frame, so adding columns below
    # never writes into a filtered view of the original data.
    df = df.sort_values(by='Category').reset_index(drop=True)
    df["position"] = df.index

    # Calculate -log10(P-value)
    df['-log10(P)'] = -np.log10(df['P-Value'])

    # Per-category layout: first position and number of rows give the
    # x coordinate at the middle of each category's run of points.
    by_category = df.groupby('Category')
    first_position = by_category['position'].min()
    group_size = by_category['position'].size()
    tick_positions = ((group_size / 2) + first_position).sort_values()

    # One distinct color per category. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap no longer exists in recent Matplotlib releases.
    categories = list(first_position.index)
    palette = plt.get_cmap('tab20', len(categories))
    category_colors = {cat: palette(i) for i, cat in enumerate(categories)}

    fig, ax = plt.subplots(figsize=figsize)
    try:
        # Plot each category's points in its own color (alpha 0.5)
        for category, subset in by_category:
            ax.scatter(subset["position"], subset['-log10(P)'],
                       color=category_colors[category], label=category,
                       s=20, alpha=0.5)

        # Category names as x-ticks, centered on each category and colored
        # like its points. Labels are paired with their category key
        # directly rather than looked up by label text, so non-string
        # category values work too.
        ax.set_xticks(tick_positions.to_list())
        tick_labels = ax.set_xticklabels(
            list(tick_positions.index), rotation=45, ha='right', fontsize=10)
        for tick_label, category in zip(tick_labels, tick_positions.index):
            tick_label.set_color(category_colors[category])

        ax.set_xlim(0, len(df.index))
        # Add some space above the highest point
        ax.set_ylim(0, df['-log10(P)'].max() + 1)

        ax.set_ylabel('-log10(P-value)')
        ax.set_title(f"PheWAS Manhattan Plot {allele_name}")

        # Label every point that passes the significance threshold
        significant = df[df['P-Value'] < significance_threshold]
        for phenotype, x, y in zip(significant['Phenotype'],
                                   significant['position'],
                                   significant['-log10(P)']):
            ax.annotate(phenotype, (x, y),
                        textcoords="offset points", xytext=(0, 5),
                        ha='center', fontsize=8)

        # Horizontal line at the threshold; skipped for non-positive values,
        # whose -log10 is undefined or infinite.
        if significance_threshold > 0:
            ax.axhline(y=-np.log10(significance_threshold),
                       color='r', linestyle='--',
                       label=f'Significance ({significance_threshold:g})')

        ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

        # Adjust layout to prevent labels from overlapping
        fig.tight_layout()

        # save as svg
        fig.savefig(output_file, format='svg', bbox_inches='tight')
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)