From 1e024e392e5289e78ca110f6143ed5d6272c8950 Mon Sep 17 00:00:00 2001 From: Brian Clarke <9725212+bfclarke@users.noreply.github.com> Date: Mon, 20 Apr 2026 10:57:22 +0200 Subject: [PATCH 1/8] update pretrained model training docs --- docs/index.rst | 2 +- docs/pretrained_models.md | 100 +++++++++++++++++--------------------- 2 files changed, 46 insertions(+), 56 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index c9d1700f..87cc8f0d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ Welcome to DeepRVAT's documentation! Rare variant association testing using deep learning and data-driven gene impairment scores. -_Coming soon:_ Overview of the DeepRVAT methodaster +_Coming soon:_ Overview of the DeepRVAT method How to use this documentation diff --git a/docs/pretrained_models.md b/docs/pretrained_models.md index ae48d946..c80947c2 100644 --- a/docs/pretrained_models.md +++ b/docs/pretrained_models.md @@ -2,10 +2,23 @@ For using the pretrained DeepRVAT model provided as part of the package, or a custom pretrained model, we have setup pipelines for running only the association testing stage. This includes creating the association dataset files, computing gene impairment scores, regression, and evaluation. +_**Important note:**_ DeepRVAT currently supports association testing using REGENIE or SEAK. Because REGENIE is actively maintained and gives better control for ancestry effects, as well as other aspects of genetic background, association testing with SEAK is _deprecated_. Note especially that for binary phenotypes, REGENIE _must_ be used. -## Configuration and input files -Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml`, see [example file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml). For details on the meanings of the parameters and the format of input files, see [here](input_data). +## Association testing with REGENIE + +For specifics on input files and options for REGENIE steps 1 and 2, please refer to the [REGENIE documentation](https://rgcgithub.github.io/regenie/). + +### Configuration and input files + +Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml` (rename the example file). For details on the meanings of the parameters and the format of input files, see [here](input_data). + +For an example, refer to [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_config_regenie.yaml) (rename or link it to `deeprvat_input_pretrained_models_config.yaml` in your working directory), which includes REGENIE-specific parameters for steps 1 and 2. Importantly, specify +``` +regenie_options: + regenie_exp: True + gtf_file: path/to/annotation_file.gtf.gz +``` To use pretrained models, you must specify `use_pretrained_models: True` in your `deeprvat_input_pretrained_models_config.yaml` configuration file. Additionally, provide the path to pretrained models (an output of the training pipeline) in the parameter `pretrained_model_path`. Within the `pretrained_model_path` directory, there must be a `config.yaml` file in that directory with the following set of specified keys that were used for training the pretrained models; `rare_variant_annotations`, `training_data_thresholds`, and `model` . See [example file](https://github.com/PMBio/deeprvat/blob/main/pretrained_models/model_config.yaml). @@ -35,44 +48,8 @@ cv_options (optional) Note that the file specified by `annotation_filename` must contain a column corresponding to each annotation in the list `rare_variant_annotations` from `deeprvat/pretrained_models/model_config.yaml`. - -## Executing the pipeline - -``` -snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile -``` - -Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository. - - -## Running the association testing pipeline with REGENIE - -_Coming soon_ - - -Samples to be considered when testing sub-cohorts can be provided via `keep_samples.txt` which look like + -``` -12345 12345 -56789 56789 -```` -for keeping two samples with ids `12345` and `56789` + -### Running the association testing pipeline with SEAK + + + + + -```shell -cd experiment -ln -s [path_to_deeprvat]/pretrained_models +## Association testing with SEAK + +### Configuration file + +Follow the instructions as above, but specify +``` +regenie_options: + regenie_exp: False +``` + +For an example, see [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml) + +### Executing the pipeline + +``` snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile ``` ----> +Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository. + + From e319ff1e6ee43da4b4295c0f7ed2dccb5cfad2b6 Mon Sep 17 00:00:00 2001 From: Brian Clarke <9725212+bfclarke@users.noreply.github.com> Date: Mon, 20 Apr 2026 10:59:31 +0200 Subject: [PATCH 2/8] SEAK deprecation notice --- docs/pretrained_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pretrained_models.md b/docs/pretrained_models.md index c80947c2..00b8b9ca 100644 --- a/docs/pretrained_models.md +++ b/docs/pretrained_models.md @@ -2,7 +2,7 @@ For using the pretrained DeepRVAT model provided as part of the package, or a custom pretrained model, we have setup pipelines for running only the association testing stage. This includes creating the association dataset files, computing gene impairment scores, regression, and evaluation. -_**Important note:**_ DeepRVAT currently supports association testing using REGENIE or SEAK. Because REGENIE is actively maintained and gives better control for ancestry effects, as well as other aspects of genetic background, association testing with SEAK is _deprecated_. Note especially that for binary phenotypes, REGENIE _must_ be used. +_**Important note:**_ DeepRVAT currently supports association testing using REGENIE or SEAK. Because REGENIE is actively maintained and gives better control for ancestry effects, as well as other aspects of genetic background, association testing with SEAK is _deprecated_ and will be removed in a future version. Note especially that for binary phenotypes, REGENIE _must_ be used. ## Association testing with REGENIE From 535e0675c32ee4309265921bd8b2ca86232c9a17 Mon Sep 17 00:00:00 2001 From: PMBio Date: Mon, 20 Apr 2026 09:02:04 +0000 Subject: [PATCH 3/8] fixup! Format Python code with psf/black pull_request --- deeprvat/annotations/annotations.py | 54 +++++++------------ .../common_variant_condition_utils.py | 1 - deeprvat/seed_gene_discovery/evaluate.py | 6 +-- .../seed_gene_discovery.py | 6 +-- deeprvat/utils.py | 1 - pipelines/resources/spliceai.py | 1 - 6 files changed, 22 insertions(+), 47 deletions(-) diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 1bede950..a6e8772b 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -2215,16 +2215,14 @@ def merge_absplice_scores( if verbose: logger.info(f"Loading and transforming {genes}...") - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE genes_mod AS SELECT id AS gene_id, split_part(gene, '.', 1) AS gene, split_part(gene, '.', 2) AS exon FROM read_parquet('{genes}'); - """ - ) + """) if verbose: logger.info(f"Loading {annotations}...") @@ -2237,60 +2235,49 @@ def merge_absplice_scores( exclude_columns = {"chrom", "pos", "ref", "alt", "AbSplice_DNA"} selected_columns = [col for col in all_columns if col not in exclude_columns] select_clause = ", ".join(selected_columns) - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE annotations AS SELECT {select_clause} FROM read_parquet('{annotations}'); - """ - ) + """) if verbose: logger.info("Merging genes_mod with annotations...") - con.execute( - """ + con.execute(""" CREATE TEMP TABLE merged1 AS SELECT a.*, g.gene, g.exon FROM annotations a LEFT JOIN genes_mod g ON a.gene_id = g.gene_id; - """ - ) + """) if verbose: logger.info(f"Loading {variants}...") - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE variants AS SELECT * FROM read_parquet('{variants}'); - """ - ) + """) if verbose: logger.info("Merging merged1 with variants...") - con.execute( - """ + con.execute(""" CREATE TEMP TABLE merged2 AS SELECT m1.*, v.chrom, v.pos, v.ref, v.alt FROM merged1 m1 LEFT JOIN variants v ON m1.id = v.id; - """ - ) + """) if verbose: logger.info(f"Loading {ab_splice_agg_score_file}...") - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE splice AS SELECT * FROM read_parquet('{ab_splice_agg_score_file}'); - """ - ) + """) if verbose: logger.info("Merging merged2 with absplice scores...") - con.execute( - """ + con.execute(""" CREATE TEMP TABLE final_merge AS SELECT m2.*, s.AbSplice_DNA FROM merged2 m2 @@ -2300,26 +2287,21 @@ def merge_absplice_scores( AND m2.ref = s.ref AND m2.alt = s.alt AND m2.gene = s.gene_id; - """ - ) + """) if fill_nan: if verbose: print(f"Filling NULL values in splice column with 0...") - con.execute( - f""" + con.execute(f""" UPDATE final_merge SET AbSplice_DNA = 0 WHERE AbSplice_DNA IS NULL; - """ - ) + """) if verbose: logger.info(f"Saving final merged table to {output}...") - con.execute( - f""" + con.execute(f""" COPY final_merge TO '{output}' (FORMAT 'parquet'); - """ - ) + """) logger.info(f"Successfully saved final merged table to {output}") diff --git a/deeprvat/deeprvat/common_variant_condition_utils.py b/deeprvat/deeprvat/common_variant_condition_utils.py index 566f30a6..01aca89d 100644 --- a/deeprvat/deeprvat/common_variant_condition_utils.py +++ b/deeprvat/deeprvat/common_variant_condition_utils.py @@ -15,7 +15,6 @@ import shutil import os - compression_level = 1 logging.basicConfig( diff --git a/deeprvat/seed_gene_discovery/evaluate.py b/deeprvat/seed_gene_discovery/evaluate.py index 05ebb087..c1d9483e 100644 --- a/deeprvat/seed_gene_discovery/evaluate.py +++ b/deeprvat/seed_gene_discovery/evaluate.py @@ -44,10 +44,8 @@ def evaluate_(associations: Dict[str, pd.DataFrame], alpha: float): logger.info("Only using genes with EAC > 50") result = result.query("EAC > 50") if result["pval"].isna().sum() > 0: - logger.info( - "Attention: still NA pvals after EAC filtering\ - Removing remaining NA pvals" - ) + logger.info("Attention: still NA pvals after EAC filtering\ + Removing remaining NA pvals") result = result.dropna(subset=["pval"]) corrected_result = pval_correction( result, alpha, correction_type=correction_type diff --git a/deeprvat/seed_gene_discovery/seed_gene_discovery.py b/deeprvat/seed_gene_discovery/seed_gene_discovery.py index 208152cc..80164d73 100644 --- a/deeprvat/seed_gene_discovery/seed_gene_discovery.py +++ b/deeprvat/seed_gene_discovery/seed_gene_discovery.py @@ -557,10 +557,8 @@ def make_dataset_( batch_size = len(dataset) if "batch_size" in data_config["dataloader_config"].keys(): - raise ValueError( - """You can't specify the batch_size in the - dataloader config for the seed gene discovery""" - ) + raise ValueError("""You can't specify the batch_size in the + dataloader config for the seed gene discovery""") logger.info(f"Read dataset, batch size {batch_size}") dataloader = DataLoader( diff --git a/deeprvat/utils.py b/deeprvat/utils.py index e515a7ec..d926b736 100644 --- a/deeprvat/utils.py +++ b/deeprvat/utils.py @@ -14,7 +14,6 @@ from sklearn.preprocessing import quantile_transform from statsmodels.stats.multitest import fdrcorrection - logging.basicConfig( format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s", level=logging.INFO, diff --git a/pipelines/resources/spliceai.py b/pipelines/resources/spliceai.py index d4b4b7c5..6d242968 100644 --- a/pipelines/resources/spliceai.py +++ b/pipelines/resources/spliceai.py @@ -1,7 +1,6 @@ from spliceai_rocksdb.spliceAI import SpliceAI import snakemake - if snakemake.params["lookup_only"]: model = SpliceAI(db_path=snakemake.params["db_path"]) else: From d77c3ac2b317455f95ea180f417f52f5ff925d2b Mon Sep 17 00:00:00 2001 From: Eva Holtkamp Date: Fri, 24 Apr 2026 17:34:03 +0200 Subject: [PATCH 4/8] update config: remove unneccesary keys, add documentation, comment in regnie options --- ...eprvat_input_pretrained_models_config.yaml | 35 ++++++++----------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/example/config/deeprvat_input_pretrained_models_config.yaml b/example/config/deeprvat_input_pretrained_models_config.yaml index 66d4c8c6..525200d3 100644 --- a/example/config/deeprvat_input_pretrained_models_config.yaml +++ b/example/config/deeprvat_input_pretrained_models_config.yaml @@ -40,13 +40,14 @@ covariates: #x_phenotypes - genetic_PC_19 - genetic_PC_20 +# Filters for variants to be included in association testing +# The column names used here must be in annotation.parquet association_testing_data_thresholds: MAF: "< 1e-3" CADD_PHRED: "> 5" #DeepRVAT model settings n_repeats: 30 -y_transformation: quantile_transform # Results evaluation settings evaluation: @@ -57,24 +58,18 @@ evaluation: #sample_files: # association_testing: association_testing_samples.pkl -#Additional settings if using the CV pipeline -cv_options: - cv_exp: False - #cv_path: sample_files - #n_folds: 5 - #Additional settings if using the REGENIE integration regenie_options: - regenie_exp: False - # gtf_file: gencode.v38.basic.annotation.gtf.gz - # step_1: - # bgen: imputation.bgen - # snplist: imputation.snplist - # bsize: 1000 - # options: - # - "--sample imputation.sample" - # - "--qt" - # step_2: - # bsize: 400 - # options: - # - "--qt" + regenie_exp: True + gtf_file: gencode.v38.basic.annotation.gtf.gz + step_1: + bgen: imputation.bgen + snplist: imputation.snplist + bsize: 1000 + options: + - "--sample imputation.sample" + - "--qt" + step_2: + bsize: 400 + options: + - "--qt" From 2e86ff8f1420e0ed175c0bb2fc98d70b8dba24a2 Mon Sep 17 00:00:00 2001 From: Brian Clarke <9725212+bfclarke@users.noreply.github.com> Date: Mon, 27 Apr 2026 11:21:30 +0200 Subject: [PATCH 5/8] fix github action --- .github/workflows/docs-tests.yml | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/docs-tests.yml b/.github/workflows/docs-tests.yml index a87f84fa..117475f9 100644 --- a/.github/workflows/docs-tests.yml +++ b/.github/workflows/docs-tests.yml @@ -8,16 +8,23 @@ jobs: docs-build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: ammaraskar/sphinx-action@0.4 + - uses: actions/checkout@v6 + - name: Set up Python 3.8 + uses: actions/setup-python@v6 with: - docs-folder: "docs/" + python-version: '3.8' + - name: Install dependencies + run: | + pip install sphinx + pip install -r docs/requirements.txt + - name: Build documentation + run: sphinx-build -M html docs docs-html - docs-link-check: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ammaraskar/sphinx-action@0.4 - with: - docs-folder: "docs/" - build-command: "make linkcheck" + # docs-link-check: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v4 + # - uses: ammaraskar/sphinx-action@0.4 + # with: + # docs-folder: "docs/" + # build-command: "make linkcheck" From bc485713743e7e7d3140990545c227111f2b59e6 Mon Sep 17 00:00:00 2001 From: Brian Clarke <9725212+bfclarke@users.noreply.github.com> Date: Mon, 27 Apr 2026 11:25:08 +0200 Subject: [PATCH 6/8] fix docs-build action --- docs/requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 04217f4b..a8228214 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ -sphinx==7.2.6 -myst-parser==2.0.0 -sphinx-autodoc2==0.4.2 -astroid==2.15.8 +sphinx==8.2.1 +myst-parser==5.0.0 +sphinx-autodoc2==0.5.0 +astroid==3.3.8 sphinx-copybutton==0.5.2 -sphinx-rtd-theme==1.3.0 -PyYAML==6.0.1 +sphinx-rtd-theme==3.0.1 +PyYAML==6.0.2 From ca41302c3a00b85c924db0e64b4921be0ab0493b Mon Sep 17 00:00:00 2001 From: Brian Clarke <9725212+bfclarke@users.noreply.github.com> Date: Mon, 27 Apr 2026 11:27:45 +0200 Subject: [PATCH 7/8] fix docs-build action --- .github/workflows/docs-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-tests.yml b/.github/workflows/docs-tests.yml index 117475f9..ba16c20b 100644 --- a/.github/workflows/docs-tests.yml +++ b/.github/workflows/docs-tests.yml @@ -9,10 +9,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - - name: Set up Python 3.8 + - name: Set up Python 3.12 uses: actions/setup-python@v6 with: - python-version: '3.8' + python-version: '3.12' - name: Install dependencies run: | pip install sphinx From dd9c32b07f20bcb201aed8f5ad280ab8387e3b68 Mon Sep 17 00:00:00 2001 From: Brian Clarke <9725212+bfclarke@users.noreply.github.com> Date: Fri, 8 May 2026 20:53:14 +0200 Subject: [PATCH 8/8] make REGENIE config default, create SEAK config file --- docs/pretrained_models.md | 8 +-- ...t_input_pretrained_models_config_seak.yaml | 63 +++++++++++++++++++ 2 files changed, 67 insertions(+), 4 deletions(-) create mode 100644 example/config/deeprvat_input_pretrained_models_config_seak.yaml diff --git a/docs/pretrained_models.md b/docs/pretrained_models.md index 00b8b9ca..4e0c1fb5 100644 --- a/docs/pretrained_models.md +++ b/docs/pretrained_models.md @@ -11,9 +11,9 @@ For specifics on input files and options for REGENIE steps 1 and 2, please refer ### Configuration and input files -Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml` (rename the example file). For details on the meanings of the parameters and the format of input files, see [here](input_data). +Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml`. For details on the meanings of the parameters and the format of input files, see [here](input_data). -For an example, refer to [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_config_regenie.yaml) (rename or link it to `deeprvat_input_pretrained_models_config.yaml` in your working directory), which includes REGENIE-specific parameters for steps 1 and 2. Importantly, specify +For an example, refer to [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml), which includes REGENIE-specific parameters for steps 1 and 2. Importantly, specify ``` regenie_options: regenie_exp: True @@ -64,7 +64,7 @@ step_2: - "--qt" ``` -### Run REGENIE +### Run DeepRVAT gene impairment scoring followed by REGENIE ``` @@ -96,7 +96,7 @@ regenie_options: regenie_exp: False ``` -For an example, see [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml) +For an example, see [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config_seak.yaml) (rename or link it to `deeprvat_input_pretrained_models_config.yaml` in your working directory). ### Executing the pipeline diff --git a/example/config/deeprvat_input_pretrained_models_config_seak.yaml b/example/config/deeprvat_input_pretrained_models_config_seak.yaml new file mode 100644 index 00000000..7988b39d --- /dev/null +++ b/example/config/deeprvat_input_pretrained_models_config_seak.yaml @@ -0,0 +1,63 @@ +use_pretrained_models: True +pretrained_model_path : pretrained_models + +#Phenotypes to be used only for Association Testing +phenotypes_for_association_testing: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium + +#File paths of necessary input files to DeepRVAT +gt_filename: genotypes.h5 +variant_filename: variants.parquet +phenotype_filename: phenotypes.parquet +annotation_filename: annotations.parquet +gene_filename: protein_coding_genes.parquet + +covariates: #x_phenotypes + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + +# Filters for variants to be included in association testing +# The column names used here must be in annotation.parquet +association_testing_data_thresholds: + MAF: "< 1e-3" + CADD_PHRED: "> 5" + +#DeepRVAT model settings +n_repeats: 30 + +# Results evaluation settings +evaluation: + correction_method: Bonferroni + alpha: 0.05 + +# Subsetting samples for association testing +#sample_files: +# association_testing: association_testing_samples.pkl + +#Additional settings if using the REGENIE integration +regenie_options: + regenie_exp: False