diff --git a/.editorconfig b/.editorconfig index 168cbd3..f630cf7 100644 --- a/.editorconfig +++ b/.editorconfig @@ -10,6 +10,7 @@ indent_style = space [*.{md,yml,yaml,html,css,scss,js,r,rmd,R,Rmd}] indent_size = 2 +trim_trailing_whitespace = false # These files are edited and tested upstream in nf-core/modules [/modules/nf-core/**] diff --git a/.nf-core.yml b/.nf-core.yml index fd11031..be29fb5 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,16 +1,14 @@ lint: - files_exist: - - conf/igenomes.config - - conf/igenomes_ignored.config - - assets/multiqc_config.yml - - conf/igenomes.config - - conf/igenomes_ignored.config - - assets/multiqc_config.yml + nextflow_config: + - manifest.homePage files_unchanged: - .github/CONTRIBUTING.md + - assets/email_template.html - assets/sendmail_template.txt - .github/CONTRIBUTING.md - assets/sendmail_template.txt + template_strings: + - assets/assesspool_report.Rmd multiqc_config: false nf_core_version: 3.3.1 repository_type: pipeline diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a0cd82..acb497e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ -# nf-core/assesspool: Changelog +# assessPool: Changelog The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## v1.0.0dev - [date] -Initial release of nf-core/assesspool, created with the [nf-core](https://nf-co.re/) template. +Initial release of assessPool, created with the [nf-core](https://nf-co.re/) template. ### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index 237ecb4..564eba3 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,38 @@ ## Pipeline tools +- [PoPoolation2](https://sourceforge.net/p/popoolation2/wiki/Tutorial/) + + > Kofler, R., Pandey, R. V., & Schlötterer, C. (2011). PoPoolation2: identifying differentiation between populations using sequencing of pooled DNA samples (Pool-Seq). Bioinformatics, 27(24), 3435-3436. 
+ +- [poolfstat](https://doi.org/10.1111/1755-0998.13557) + + > Gautier, M., Vitalis, R., Flori, L., & Estoup, A. (2022). f‐Statistics estimation and admixture graph construction with Pool‐Seq or allele count data using the R package poolfstat. Molecular Ecology Resources, 22(4), 1394-1416. + +- [grenedalf](https://github.com/lczech/grenedalf) + + > Czech, L., Spence, J. P., & Expósito-Alonso, M. (2024). grenedalf: population genetic statistics for the next generation of pool sequencing. Bioinformatics, 40(8), btae508. + +- [bcftools](https://samtools.github.io/bcftools/bcftools.html) + + > Danecek, P., Bonfield, J. K., Liddle, J., Marshall, J., Ohan, V., Pollard, M. O., ... & Li, H. (2021). Twelve years of SAMtools and BCFtools. Gigascience, 10(2), giab008. + +- [vcftools](https://vcftools.github.io/) + + > Danecek, P., Auton, A., Abecasis, G., Albers, C. A., Banks, E., DePristo, M. A., ... & 1000 Genomes Project Analysis Group. (2011). The variant call format and VCFtools. Bioinformatics, 27(15), 2156-2158. + +- [samtools](https://samtools.github.io/) + + > Danecek, P., Bonfield, J. K., Liddle, J., Marshall, J., Ohan, V., Pollard, M. O., ... & Li, H. (2021). Twelve years of SAMtools and BCFtools. Gigascience, 10(2), giab008. + +- [Rsamtools](https://www.bioconductor.org/packages/release/bioc/html/Rsamtools.html) + + > Morgan M, Pagès H, Obenchain V, Hayden N (2024). _Rsamtools: Binary alignment (BAM), FASTA, variant call (BCF), and tabix + file import_. doi:10.18129/B9.bioc.Rsamtools , R package version 2.22.0, + . 
+ + + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/LICENSE b/LICENSE index eff90ad..af4672c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) The nf-core/assesspool team +Copyright (c) The assessPool team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 814bba1..e27f8b6 100644 --- a/README.md +++ b/README.md @@ -1,64 +1,93 @@

- - nf-core/assesspool + + assessPool

-[![GitHub Actions CI Status](https://github.com/nf-core/assesspool/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/assesspool/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/assesspool/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/assesspool/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/assesspool/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) - -[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) -[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.1) -[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) -[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) -[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/assesspool) - -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23assesspool-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/assesspool)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.24.04.2-brightgreen.svg)](https://www.nextflow.io/) ## Introduction -**nf-core/assesspool** is a bioinformatics pipeline that ... - - +**assessPool** is a population genetics analysis pipeline designed for pooled sequencing runs (pool-seq). Starting from raw genomic variants (VCF or sync format), assessPool performs the following operations: + + * Filters SNPs based on adjustable criteria with suggestions for pooled data + * Calculates population genetic statistics using [PoPoolation2](https://sourceforge.net/p/popoolation2/wiki/Main/), [poolfstat](https://doi.org/10.1111/1755-0998.13557), and/or [grenedalf](https://github.com/lczech/grenedalf). + * Generates an HTML report including visualizations of population genetic statistics + * Outputs results in tabular format for downstream analyses + +Required inputs are a variant description file (sync or VCF) and a reference assembly (FASTA). These can be output from any number of reduced representation data processing pipelines (e.g., [grenepipe](https://github.com/moiexpositoalonsolab/grenepipe), [dDocent](https://ddocent.com/), etc.). + +Major pipeline operations: + +1. Import, index, and/or compress variant description and reference +1. Perform stepwise filtering to determine effects of individual filter options (count lost loci): + 1. Min/max read depth + 1. Minor allele count + 1. Hardy-Weinberg equilibrium cutoff + 1. Missing data + 1. Allele length + 1. Quality:depth ratio + 1. Minimum read quality + 1. Variant type + 1. 
Mispaired read likelihood + 1. Alternate observations + 1. Mapping quality + 1. Mapping ratio + 1. Overall depth + 1. Number of pools with data + 1. Read balance + 1. Variant thinning +1. Perform cumulative filtering (for VCF input) +1. Generate sync files + 1. Unified (all pools) + 1. Split pairwise +1. Generate allele frequency table +1. Calculate Fst + 1. PoPoolation2 + 1. {poolfstat} + 1. grenedalf +1. Calculate Fisher's exact test for individual SNPs + 1. PoPoolation2 + 1. assessPool native +1. Join frequency data to Fst results +1. Extract contigs containing (user-configurable) strongly-differentiated loci +1. Create HTML report +1. Save all output data in tabular format for downstream analysis - ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - +`project` (required): A brief unique identifier for this pool-seq project +`input` (required): Variant description file in sync or VCF format (optionally compressed using `bgzip`) +`vcf_index` (optional): TABIX-format index of the input VCF file (generated if not supplied) +`reference` (required): Reference assembly (FASTA) +`pools` (optional): Comma-separated list of pool names (will replace any names in the input file) +`pool_sizes` (required): Number of individuals in each pool. Either a single number for uniform pool sizes or a comma-separated list of sizes for each pool. -Now, you can run the pipeline using: - +Required columns are `project`, `input`, `reference`, and `pool_sizes`. 
+ +Now, you can run the pipeline using: ```bash -nextflow run nf-core/assesspool \ +nextflow run tobodev/assesspool \ -profile \ --input samplesheet.csv \ --outdir @@ -67,41 +96,47 @@ nextflow run nf-core/assesspool \ > [!WARNING] > Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files). -For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/assesspool/usage) and the [parameter documentation](https://nf-co.re/assesspool/parameters). +For more details and further functionality, please refer to the [usage documentation](docs/usage.md) and the [parameter documentation](parameters.md). + +## Testing the pipeline +assessPool comes with two built-in profiles that allow the user to test the pipeline with a fully-functional input dataset. These profiles (whose descriptions can be found in [conf/test.confg](conf/test.config) and [conf/test_full.config](conf/test_full.config)) will run assessPool with either a full or reduced dataset of SNPs sequenced from wild populations of the coral *Montipora capitata*. Pipeline tests can be run by passing either `test` or `test_full` to the `-profile` option, along with a software/container management subsystem. For example, using singularity: + +``` +nextflow run tobodev/assesspool \ + -profile test,singularity +``` +or +``` +nextflow run tobodev/assesspool \ + -profile test_full,singularity +``` ## Pipeline output -To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/assesspool/results) tab on the nf-core website pipeline page. +Results of an example test run with a full size dataset can be found [here](https://tobodev.github.io/assesspool/). 
For more details about the output files and reports, please refer to the -[output documentation](https://nf-co.re/assesspool/output). +[output documentation](docs/output.md). ## Credits -nf-core/assesspool was originally written by Evan B Freel, Emily E Conklin, Mykle L Hoban, Derek W Kraft, Jonathan L Whitney, Ingrid SS Knapp, Zac H Forsman, Robert J Toonen. +assessPool was originally written by Evan B Freel, Emily E Conklin, Mykle L Hoban, Derek W Kraft, Jonathan L Whitney, Ingrid SS Knapp, Zac H Forsman, Robert J Toonen. We thank the following people for their extensive assistance in the development of this pipeline: - +Richard Coleman, ʻAleʻalani Dudoit, and Cataixa López, who used assessPool during development and helped identify issues and suggest key feature improvements. We would also like to thank Iliana Baums, Tanya Beirne, Dave Carlon, Greg Conception, Matt Craig, Jeff Eble, Scott Godwin, Matt Iacchei, Frederique Kandel, Steve Karl, Jim Maragos, Bob Moffitt, Joe O'Malley, Lawrie Provost, Jennifer Salerno, Derek Skillings, Michael Stat, Ben Wainwright, and Kim Weersing, for their efforts in sample collection. ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). -For further information or help, don't hesitate to get in touch on the [Slack `#assesspool` channel](https://nfcore.slack.com/channels/assesspool) (you can join with [this invite](https://nf-co.re/join/slack)). - ## Citations - - - - - -An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. +A list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
-You can cite the `nf-core` publication as follows: +This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). -> **The nf-core framework for community-curated bioinformatics pipelines.** +> The nf-core framework for community-curated bioinformatics pipelines. > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > -> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). +> Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x. diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json index b5fc0c0..c0b10f0 100644 --- a/assets/adaptivecard.json +++ b/assets/adaptivecard.json @@ -17,7 +17,7 @@ "size": "Large", "weight": "Bolder", "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", - "text": "nf-core/assesspool v${version} - ${runName}", + "text": "assessPool v${version} - ${runName}", "wrap": true }, { diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index e1d7549..979812a 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -11,8 +11,8 @@ output: df_print: paged # tables are printed as an html table with support for pagination over rows and columns highlight: pygments pdf_document: true - pdf_document: - toc: yes + self_contained: false + lib_dir: artifacts date: "`r Sys.Date()`" params: nf: null @@ -24,6 +24,86 @@ params: fisher: null tz: null --- +```{css, echo=FALSE} +.sliderbar { + position: fixed; + bottom: 40%; + margin-left: 0%; + padding: 10px; +} +.ui-slider-wrapper { + width: 95% +} +``` + + + + + + + + + + + ```{r setup, echo=FALSE, include=FALSE} # when run in a container, there are # weird issues with the timezone and lubridate @@ -48,16 +130,13 @@ if (!is.null(params$tz)) { 
# load libraries library(tidyr) +library(data.table) library(knitr) -library(yaml) library(plotly) library(DT) -library(readr) -library(dplyr) library(stringr) library(purrr) library(paletteer) -library(fs) library(htmltools) if (params$debug) { @@ -66,7 +145,6 @@ if (params$debug) { # set some global options knitr::opts_chunk$set(echo = FALSE, warning = FALSE) -options(dplyr.summarise.inform = FALSE) ``` ```{r init, include=FALSE} @@ -78,19 +156,6 @@ datatable( # load the plotly js plot_ly() -# plotly horizontal line -hline <- function(y = 0, color = "black") { - list( - type = "line", - x0 = 0, - x1 = 1, - xref = "paper", - y0 = y, - y1 = y, - line = list(color = color) - ) -} - # geometric mean function gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ if(any(x < 0, na.rm = TRUE)){ @@ -107,13 +172,19 @@ gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ } # get a sequence of coverage cutoffs from input parameters -cutoffs <- seq(params$nf$report_min_coverage,params$nf$report_max_coverage,params$nf$report_coverage_step) +cutoffs <- seq(params$nf$min_coverage_cutoff,params$nf$max_coverage_cutoff,params$nf$coverage_cutoff_step) # load pool data # start with fst show_fst <- !is.null(params$fst) if (show_fst) { - pool_data <- read_tsv(params$fst) + pool_data <- fread(params$fst) + pool_data[,pair := paste0(pop1," - ",pop2)] + pool_data[,`:=`( + pop1 = factor(pop1), + pop2 = factor(pop2) + )] + setorder(pool_data,method,chrom,pos) } else { pool_data <- FALSE } @@ -123,125 +194,116 @@ fisher_data <- NULL # load fisher data if applicable show_fisher <- !is.null(params$fisher) if (show_fisher) { - fisher_data <- read_tsv(params$fisher) %>% - select(chrom,pos,pop1,pop2,avg_min_cov,log_fisher=fisher,method) + fisher_data <- fread(params$fisher) + fisher_data[,c(names(fisher_data)[!names(fisher_data) %in% c('chrom','pos','pop1','pop2','avg_min_cov','log_fisher','method')]) := NULL] + # setnames(fisher_data,'fisher','log_fisher') + 
setcolorder(fisher_data,c('chrom','pos','pop1','pop2','avg_min_cov','log_fisher','method')) + setorder(fisher_data,method,chrom,pos) } # whether to show VCF filtering show_filter <- params$nf$visualize_filters & !is.null(params$filter) if (show_filter) { - filters <- read_tsv(params$filter) + filters <- fread(params$filter,col.names = c('filter','count')) } else { filters <- NULL } +# whether to show cumulative filtering show_final_filter <- params$nf$visualize_filters & !is.null(params$final_filter) -if (show_final_filter) { - final_filters <- read_tsv(params$final_filter) +if (show_final_filter) { + final_filters <- fread(params$final_filter,col.names = c('filter','count')) } else { final_filters <- NULL } +# whether to show any filtering at all any_filter <- show_filter | show_final_filter # used for tab selection -all_pools <- c('All SNPs'=FALSE,'Shared loci (all pools)'=TRUE) - -# place where stuff gets saved -data_dir <- "artifacts" - -# create artifacts dir -# if it already exists nobody will care -dir_create(data_dir) - -# move fst file into artifacts dir -# file_move(params$fst,data_dir) -``` +pools_shared <- c( + 'All SNPs'=FALSE, + 'SNPs shared by all pools'=TRUE +) - -```{r, results='asis', eval=params$debug} -cat("# Debug info\n") ``` -```{r, eval=params$debug} -if (params$debug) { - # print(params$meta$pools) - cat("params\n\n") - str(params) -} -``` - - # Abstract This report summarizes the output of population genetic analyses of a pool-seq RAD library. -```{r, results='asis', eval=any_filter} + +```{r filtering, results='asis', eval=any_filter} # TODO: show filter option values on plot/table cat("# Filtering {.tabset}\n") +# filtering tabs filter_headers <- c( "## Stepwise filtering reuslts", "## Cumulative filtering results" ) +# notes to display in filter tabs filter_notes <- c( 'Note: filtering results displayed are independent, not cumulative.', - 'Cumulative filter results' + 'Cumulative filter results
(Note: filtered SNP count may differ from SNP counts associated with Fst calculations).' ) +# list of filter data to use in the loop below report_data <- list(filters,final_filters) +# walk through different filter tabs c(show_filter,show_final_filter) %>% iwalk(\(show_report,i) { if (show_report) { # show tab header cat(filter_headers[i],"\n\n") - + # get relevant filtering data - filter_data <- report_data[[i]] - + filter_data <- report_data[[i]][order(-count)] + + # show some header info and notes print( tagList( h5('SNPs remaining after each filtering step.'), h5(tags$small(HTML(filter_notes[i]))) ) ) - filter_data <- filter_data %>% - arrange(desc(count)) %>% - mutate( - filter = replace(filter,which(filter == "before"),"Unfiltered"), - filter = replace(filter,which(filter == "cumulative"),"All filters"), - filter = forcats::fct_reorder(filter,-count), - filter = forcats::fct_relevel(filter,"Unfiltered"), - filtered = count[filter == "Unfiltered"] - count, - filtered = case_when( - filter == "Unfiltered" ~ count, - .default = filtered - ), - changed = filtered > 0 - ) #%>% - # filter(filtered > 0) - changed <- filter_data %>% - filter(changed) - unchanged <- filter_data %>% - filter(!changed) %>% - pull(filter) %>% - as.character() - unchanged <- str_glue("{unchanged}") - unchanged <- str_c(unchanged,collapse=", ") + + # rearrange filter data a bit + # make sure 'Unfiltered' is in bold and comes first + filter_data[,filter := replace(filter,which(filter == "before"),"Unfiltered")] + filter_data[,filter := replace(filter,which(filter == "cumulative"),"All filters")] + filter_data[,filter := forcats::fct_reorder(filter,-count)] + filter_data[,filter := forcats::fct_relevel(filter,"Unfiltered")] + filter_data[,filtered := count[filter == "Unfiltered"] - count] + filter_data[,filtered := ifelse(filter == "Unfiltered",count,filtered)] + filter_data[,changed := filtered > 0] + + # figure out which options did nothing + changed <- filter_data[changed == TRUE] + 
unchanged <- as.character(filter_data[changed == FALSE]$filter) + unchanged <- str_flatten_comma(unchanged) + # show options that didn't do anything if (unchanged != "") { print(h5(tags$small(HTML(str_glue( "Filter options with no effect: {unchanged}." ))))) } - + + # fixed-width font for tooltip font <- list( family = "Courier New", size = 14 ) - label <- list( font = font ) + + # quick formatting function num <- scales::comma + # tooltip template t_fmt <- 'Before filtering: {num(count[filter == "Unfiltered"])} SNPs
After filtering: {num(count)} SNPs
' + + # make the bar chart + # we use ggplot because it's better at doing continuous color scales + # for bar plots, then convert it to a plotly plot after the fact fig <- ggplot(changed) + geom_bar(aes(x=filter,y=count,fill=count,text=str_glue(t_fmt)),stat="identity") + scale_fill_paletteer_c("grDevices::Turku",direction = -1,name="SNPs remaining",labels=scales::comma) + @@ -251,6 +313,7 @@ c(show_filter,show_final_filter) %>% axis.title.x = element_text(margin = margin(t = -20)), plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.2,0.2,0.2,0.2), "cm")) + # plotly-ize it and do a little configuration fig <- fig %>% ggplotly(tooltip="text") %>% config(displayModeBar = FALSE) %>% @@ -258,484 +321,454 @@ c(show_filter,show_final_filter) %>% xaxis = list(title = "Filter option", fixedrange = TRUE), yaxis = list(title = "SNPs remaining", fixedrange = TRUE) ) %>% - style(hoverlabel = label) + style(hoverlabel = list(font = font)) print(tagList(fig)) - } + } }) ``` -```{r, summaries, results='asis', eval=show_fst} -cat("# High-level summaries {.tabset}\n\n") - -summary_datasets <- pool_data %>% - group_by(method) %>% - group_walk(\(fst_data,method) { - # output tab header - cat(str_glue("## {method$method} {{.tabset}}\n\n")) - - # walk through all_shared elements - # (whether or not we're looking only at snps shared across all pools) - all_pools %>% - iwalk(\(all_shared,hdr) { - # output tab header - cat(str_glue("### {hdr} {{.tabset}}\n\n")) - - # filter data for all pools if necessary - if (all_shared) { - fst_data <- fst_data %>% - mutate(pop1=factor(pop1),pop2=factor(pop2)) %>% - group_by(chrom,pos) %>% - filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% - ungroup() %>% - arrange(chrom,pos,pop1,pop2) - } - - # repeatedly filter the dataset for different coverage levels - # and bind those together into a new dataset - fst_cov <- cutoffs %>% - map(\(cov) { - fst_data %>% - # filter by average minimum coverage - filter(avg_min_cov 
>= cov) %>% - # retain the cutoff level in the data - mutate(cutoff=cov) - }) %>% - # and bind resulting subsets together into a superset - list_rbind() - # summarize the data by comparison and cutoff level - fst_grp <- fst_cov %>% - group_by(cutoff) %>% - summarise( - fst_sd = sd(fst), # std. dev. of fst - fst = mean(fst), # mean fst - fst_se = fst_sd / sqrt(n()), # std. err. of fst - cov = mean(avg_min_cov), # mean coverage - snps = n_distinct(chrom,pos,na.rm = TRUE), # snp count - contigs = n_distinct(chrom,na.rm=TRUE), # contig count - snps_per_contig = snps / contigs, # mean snps per contig - ci_lower= fst - qt(0.975, n() -1) * fst_se, - ci_upper= fst + qt(0.975, n() -1) * fst_se - ) %>% - arrange(cutoff) %>% - ungroup() %>% - select( - `Coverage cutoff`=cutoff, - `Mean Fst`=fst, - `Fst (SD)`=fst_sd, - `Fst (SEM)`=fst_se, - `Mean coverage`=cov, - SNPs=snps, - Contigs=contigs, - `SNPs per contig`=snps_per_contig, - `CI (lower)`=ci_lower, - `CI (upper)`=ci_upper - ) - - shared <- ifelse(all_shared,"all","shared") - filename <- str_glue("{method$method}_{shared}_summary") - - # a javascript callback that rounds numbers to the - # nearest thousand. 
otherwise the table is full of stuff like 0.3412341234123412341234 - js <- DT::JS( - "function ( data, type, row, meta ) {", - "return type === 'display' ?", - "''+(Math.round((data+Number.EPSILON)*1000)/1000).toLocaleString()+'' :", - "data;", - "}" - ) +```{r summaries, results='asis', eval=show_fst} +cat("# High-level summaries {.tabset}\n\n") - # show plot table - cat(str_glue("#### Plot\n\n")) +nothing <- pool_data[,{ + fst_data <- .SD + cat(str_glue("## {method} {{.tabset}}\n\n")) - print(h5("Global summary statistics by coverage cutoff")) + pools_shared %>% + iwalk(\(shared,hdr) { + cat(str_glue("### {hdr} {{.tabset}}\n\n")) - # TODO: use error_y for SD of fst - # plot_ly( - # x = ~cov, - # y = ~fst, - # type='scatter', - # mode='markers+lines', - # error_y = ~list( - # type='data', - # array = fst_sd - # ) - # ) - - # create interactive plot and add traces - fig <- fst_grp %>% - plot_ly( - y = ~ `Mean Fst`, - x = ~`Coverage cutoff`, - name = "Mean Fst", - visible = T, - mode = "lines+markers", - type = "scatter" - ) %>% - add_trace( - y = ~ `Fst (SD)`, - x = ~`Coverage cutoff`, - name = "Fst (SD)", - visible = F, - mode = "lines+markers", - type = "scatter" - ) %>% - add_trace( - y = ~SNPs, - x = ~`Coverage cutoff`, - name = "SNPs", - mode = "lines+markers", - type = "scatter", - visible=F - ) %>% - add_trace( - y = ~`SNPs per contig`, - x = ~`Coverage cutoff`, - name = "Mean SNPs per contig", - visible = F, - mode = "lines+markers", - type = "scatter" - ) %>% - add_trace( - y = ~Contigs, - x = ~`Coverage cutoff`, - name = "Contigs", - visible = F, - mode = "lines+markers", - type = "scatter" - ) + # filter data for all pools if necessary + if (shared) { + fst_data <- fst_data[ fst_data[, .I[all( unique(c(levels(fst_data$pop1),levels(pop2))) %in% c(pop1,pop2) )], by=c('chrom','pos')]$V1 ] + setorder(fst_data,chrom,pos,pop1,pop2) + } - # configure button layout and behavior - button_layout <- list( - type = "buttons", - direction = "right", - xanchor = 
'center', - yanchor = "top", - pad = list('r' = 0, 't' = 10, 'b' = 10), - x = 0.5, - y = 1.27, - buttons = list( - list( - method = "update", - args = list( - list( - visible = list( - T, F, F, F, F - ) - ), - list( - yaxis = list(title = "Mean Fst") + fst_cov <- cutoffs %>% + map(\(cutoff) { + fst_data[avg_min_cov >= cutoff][,cutoff := cutoff][,{ + fst_sd <- sd(fst,na.rm=TRUE) + fst_se <- fst_sd / sqrt(.N) + n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) + n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) + fst_mean <- mean(fst,na.rm=TRUE) + .( + fst = fst_mean, + fst_sd = fst_sd, + snps = n_snps, # snp count + contigs = n_contigs, + snps_per_contig = n_snps / n_contigs + ) + },by=cutoff] + }) %>% + rbindlist() + setorder(fst_cov,cutoff) + setcolorder(fst_cov,c('cutoff','fst','fst_sd','snps','contigs','snps_per_contig')) + setnames( fst_cov, + c('cutoff','fst','fst_sd','snps','contigs','snps_per_contig'), + c('Coverage cutoff', 'Mean Fst', 'Fst (SD)', 'SNPs', 'Contigs', 'SNPs per contig') + ) + + cat(str_glue("#### Plot\n\n")) + print(h5("Global summary statistics by coverage cutoff")) + + fig <- fst_cov %>% + plot_ly() %>% + add_trace( + name = 'Mean Fst', + type = 'scatter', + mode = 'lines+markers', + visible=TRUE, + x = ~`Coverage cutoff`, + y = ~`Mean Fst`, + error_y = ~list( + type='data', + array = `Fst (SD)` + ) + ) %>% + add_trace( + name = 'SNPs', + type = 'scatter', + mode = 'lines+markers', + visible=FALSE, + x = ~`Coverage cutoff`, + y = ~`SNPs` + ) %>% + add_trace( + name = 'SNPs per contig', + type = 'scatter', + mode = 'lines+markers', + visible=FALSE, + x = ~`Coverage cutoff`, + y = ~`SNPs per contig` + ) %>% + add_trace( + name = 'Contigs', + type = 'scatter', + mode = 'lines+markers', + visible=FALSE, + x = ~`Coverage cutoff`, + y = ~`Contigs` + ) + + button_layout <- list( + type = "buttons", + direction = "right", + xanchor = 'center', + yanchor = "top", + pad = list('r' = 0, 't' = 10, 'b' = 10), + x = 0.5, + y = 1.27, + buttons = list( + list( + 
method = "update", + args = list( + list( + visible = list( + T, F, F, F ) ), - label = "Fst" + list( + yaxis = list(title = "Mean Fst") + ) ), - list( - method = "update", - args = list( - list( - visible = list( - F, T, F, F, F - ) - ), - list(yaxis = list(title = "Fst (standard deviation)") + label = "Fst" + ), + list( + method = "update", + args = list( + list( + visible = list( + F, T, F, F ) ), - label = "Fst (SD)" + list(yaxis = list(title = "SNPs") + ) ), - list( - method = "update", - args = list( - list( - visible = list( - F, F, T, F, F - ) - ), - list(yaxis = list(title = "SNPs") + label = "SNPs" + ), + list( + method = "update", + args = list( + list( + visible = list( + F, F, T, F ) ), - label = "SNPs" + list( + yaxis = list(title = "Mean SNPs per contig") + ) ), - list( - method = "update", - args = list( - list( - visible = list( - F, F, F, T, F - ) - ), - list( - yaxis = list(title = "Mean SNPs per contig") + label = "SNPs per contig" + ), + list( + method = "update", + args = list( + list( + visible = list( + F, F, F, T ) ), - label = "SNPs per contig" + list( + yaxis = list(title = "Number of contigs") + ) ), - list( - method = "update", - args = list( - list( - visible = list( - F, F, F, F, T - ) - ), - list( - yaxis = list(title = "Number of contigs") - ) - ), - label = "Contigs" - ) + label = "Contigs" ) ) - - # configure annotation properties - annotation <- list( - list( - text = "Statistic", - x = .15, - y = 1.22, - xref = 'paper', - yref = 'paper', - showarrow = FALSE - ) + ) + + # configure annotation properties + annotation <- list( + list( + text = "Statistic", + x = .15, + y = 1.22, + xref = 'paper', + yref = 'paper', + showarrow = FALSE ) - - # add layout and annotation to figure - fig <- fig %>% layout( + ) + + # add layout and annotation to figure + fig <- fig %>% + layout( yaxis = list(title = "Mean Fst",fixedrange=TRUE), xaxis = list(title = "Coverage cutoff",fixedrange=TRUE), - title = list(text=str_glue("Summary statistics by 
coverage cutoff - {method$method}"), yref= 'paper', y=1), + title = list(text=str_glue("Summary statistics by coverage cutoff - {method}"), yref= 'paper', y=1, font=list(size=12)), showlegend = F, updatemenus = list(button_layout), annotations = annotation ) %>% - config(displayModeBar = FALSE) - - # display figure - print(tagList(fig)) - - # show data table tab - cat(str_glue("#### Table\n\n")) - - print(h5("Global summary statistics by coverage cutoff")) + config(displayModeBar = FALSE) - # output the datatable - print( - tagList( - datatable( - fst_grp, - escape = FALSE, - rownames = FALSE, - filter = "none", - autoHideNavigation = TRUE, - extensions = 'Buttons', - options = list( - dom = 'tB', - buttons = list( - list( - extend='csv', - title=filename, - exportOptions = list( modifier = list(page = "all"), orthogonal = "export" - ) - ) - ), - scrollX = TRUE, - columnDefs = list( - list( - targets = c(2:5,8:10)-1, - render = js + # display figure + print(tagList(fig)) + + # show data table tab + cat(str_glue("#### Table\n\n")) + + # construct csv download filename + shared <- ifelse(shared,"all","shared") + filename <- str_glue("{method}_{shared}_summary") + + # a javascript callback that rounds numbers to the + # nearest thousand. 
otherwise the table is full of stuff like 0.3412341234123412341234 + js <- DT::JS( + "function ( data, type, row, meta ) {", + "return type === 'display' ?", + "''+(Math.round((data+Number.EPSILON)*1000)/1000).toLocaleString()+'' :", + "data;", + "}" + ) + + print(h5("Global summary statistics by coverage cutoff")) + + # output the datatable + print( + tagList( + datatable( + fst_cov, + escape = FALSE, + rownames = FALSE, + filter = "none", + autoHideNavigation = TRUE, + extensions = 'Buttons', + options = list( + dom = 'tB', + buttons = list( + list( + extend='csv', + title=filename, + exportOptions = list( modifier = list(page = "all"), orthogonal = "export" ) ) + ), + scrollX = TRUE, + columnDefs = list( + list( + targets = c(2:3)-1, + render = js + ) ) - ) %>% - formatRound(c(1,6,7),digits=0) - ) + ) + ) %>% + formatRound(c(1,4:6),digits=0) ) - }) - }) + ) + + + }) + +},by=method] ``` - ```{r fst-cov-all, results='asis', eval=show_fst} cat("# Pairwise Fst by minimum coverage {.tabset}\n\n") -# TODO: tie slider position in this plot to heatmap coverage slider -# (corresponding to method, shared snps, etc.) 
- # walk through calculation methods -pool_data %>% - group_by(method) %>% - group_walk(\(fst_data,method) { - # output tab header - cat(str_glue("## {method$method} {{.tabset}}\n\n")) - - # walk through all_shared elements - # (whether or not we're looking only at snps shared across all pools) - all_pools %>% - iwalk(\(all_shared,hdr) { - # output tab header - cat(str_glue("### {hdr} {{.tabset}}\n\n")) - - # filter data for all pools if necessary - if (all_shared) { - # get all unique pool names - all_pools <- sort( - union( - unique(fst_data$pop1), - unique(fst_data$pop2) +nothing <- pool_data[,{ + # output tab header + cat(str_glue("## {method} {{.tabset}}\n\n")) + + fst_data <- .SD + + # walk through whether or not we're looking only at snps shared across all pools + pools_shared %>% + iwalk(\(shared,hdr) { + # output tab header + cat(str_glue("### {hdr} {{.tabset}}\n\n")) + shr <- ifelse(shared,"shared","all") + + # filter data for all pools if necessary + if (shared) { + fst_data <- fst_data[ fst_data[, .I[all( unique(c(levels(fst_data$pop1),levels(pop2))) %in% c(pop1,pop2) )], by=c('chrom','pos')]$V1 ] + setorder(fst_data,chrom,pos,pop1,pop2) + } + + # calculate plot x-axis range + # basically, get the min and max mean fst across all coverage cutoffs + # we're doing this so all the different versions of the plot have the + # same x-axis. 
(it might be a bit data intensive but worthwhile) + xrange <- reduce(cutoffs,\(range,cutoff) { + r <- range(fst_data[avg_min_cov >= cutoff][,.(fst = mean(fst,na.rm=TRUE)),by=.(pop1,pop2)]$fst) + c(min(range[1],r[1]),max(range[2],r[2])) + },.init = c(0,0) ) + xrange <- c(min(xrange[1]-abs(0.1*xrange[1]),0),min(xrange[2]+abs(0.1*xrange[2]),1)) + + # get initial figures and output json script tags + fst_figs <- cutoffs %>% + reduce(\(figs,cutoff) { + # summarize the data by comparison and cutoff level + # precompute box plot stats because the datasets are too big + # and pandoc runs out of memory + fst_grp <- fst_data[avg_min_cov >= cutoff][,{ + fst_sd <- sd(fst,na.rm=TRUE) + n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) + n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) + .( + fst_sd = fst_sd, # std. dev. of fst + q1 = as.numeric(quantile(fst,na.rm=TRUE)[2]), + q3 = as.numeric(quantile(fst,na.rm=TRUE)[4]), + lwr = boxplot.stats(fst)$stats[1], + upr = boxplot.stats(fst)$stats[5], + med = median(fst,na.rm=TRUE), + min = min(fst,na.rm=TRUE), + max = max(fst,na.rm=TRUE), + fst_se = fst_sd / sqrt(.N), # std. err. of fst + fst = mean(fst,na.rm=TRUE), # mean fst + cov = mean(avg_min_cov,na.rm=TRUE), # mean coverage + snps = n_snps, # snp count + contigs = n_contigs, # contig count + snps_per_contig = n_snps / n_contigs # mean snps per contig ) + },by=.(pop1,pop2,pair)] + setorder(fst_grp,pair) + + # plot the scatterplot + fig <- fst_grp %>% + plot_ly( + x = ~ fst, + y = ~ pair, + color = ~snps, + # frame = ~str_glue("≥{cutoff}"), + # hover text templates + text = ~str_c(snps," SNPs
",contigs," contigs"), + hovertemplate = "Fst (%{y}): %{x}
%{text}", + type = 'scatter', + mode = 'markers', + marker = list(size = 12) + ) %>% + colorbar(title = "SNPs") %>% + layout( + margin=list(l = 100, r = 20, b = 10, t = 30), + yaxis = list(title="",tickangle=-35,tickfont=list(size=12)), + xaxis = list(title="Fst",tickfont=list(size=12), range=as.list(xrange)), + title = list(text=str_glue("Fst - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) + ) %>% + config(toImageButtonOptions = list( + format = 'svg', + width = 900, + height = 600 + )) + + # output the figure to html + if (is.null(figs$scatter_fig)) { + # assign div id to figure + fig_id <- str_glue("fst-fig-{method}-{shr}") + fig$elementId <- fig_id + + figs$scatter_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("scatter-json-{method}-{shr}"), + class="fig-container", + tagList(fig) + ) + ) + } + + # get and print the json for the figure + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + json_tag <- tags$script( + id = str_glue("scatter-json-{method}-{shr}-{cutoff}"), + type = 'application/json', + HTML(fig_json) ) - - # group data by snp (chrom/pos) and retain - # snps where all populations are represented in comparisons - # (either side) - fst_data <- fst_data %>% - group_by(chrom,pos) %>% - filter(all(all_pools %in% unique(c(unique(pop1),unique(pop2))))) %>% - ungroup() - } - - - - # repeatedly filter the dataset for different coverage levels - # and bind those together into a new dataset - fst_cov <- cutoffs %>% - map(\(cov) { - fst_data %>% - # filter by average minimum coverage - filter(avg_min_cov >= cov) %>% - # retain the cutoff level in the data - mutate(cutoff=cov) - }) %>% - # and bind resulting subsets together into a superset - list_rbind() %>% - # create a text representation of the comparison - mutate(pair = str_glue("{pop1} - {pop2}")) - - # summarize the data by comparison and cutoff level - # precompute box plot stats because the datasets are too big - # and pandoc runs out 
of memory - fst_grp <- fst_cov %>% - group_by(pop1,pop2,pair,cutoff) %>% - summarise( - fst_sd = sd(fst), # std. dev. of fst - q1 = as.numeric(quantile(fst)[2]), - q3 = as.numeric(quantile(fst)[4]), - lwr = boxplot.stats(fst)$stats[1], - upr = boxplot.stats(fst)$stats[5], - med = median(fst), - min = min(fst), - max = max(fst), - fst = mean(fst), # mean fst - fst_se = fst_sd / sqrt(n()), # std. err. of fst - cov = mean(avg_min_cov), # mean coverage - snps = n_distinct(chrom,pos,na.rm=TRUE), # snp count - contigs = n_distinct(chrom,na.rm=TRUE), # contig count - snps_per_contig = snps / contigs # mean snps per contig - ) %>% - # sort by cutoff and comparison - arrange(cutoff,pair) %>% - ungroup() - - # plot scatter plot header - cat("#### Scatter plot (means)\n\n") - - # plot the scatterplot - fig <- fst_grp %>% - plot_ly( - x = ~ fst, - y = ~ pair, - color = ~snps, - frame = ~str_glue("≥{cutoff}"), - # hover text templates - text = ~str_c(snps," SNPs
",contigs," contigs"), - hovertemplate = "Fst (%{y}): %{x}
%{text}", - type = 'scatter', - mode = 'markers', - marker = list(size = 12) - ) %>% - colorbar(title = "SNPs") %>% - layout( - margin=list(l = 100, r = 20, b = 10, t = 30), - yaxis = list(title="",tickangle=-35,tickfont=list(size=12)), - xaxis = list(title="Fst",tickfont=list(size=12)), - title = list(text=str_glue("Fst - {hdr}"), y=0.95) - ) %>% - # customize slider and remove play button - animation_slider(currentvalue = list(prefix = "Minimum coverage: ", font = list(color = "black"))) %>% - animation_button(visible = FALSE) - - # assign div id to figure - shr <- ifelse(all_shared,"shared","all") - fig_id <- str_glue("fst-fig-{method$method}-{shr}") - fig$elementId <- fig_id - - # output the figure to html - print(tagList(fig)) - - # TODO: slider change event - # fig_script <- tags$script( - # HTML(str_glue(' - # $(function() {{ - # $("#{fig_id}").on("plotly_sliderchange",function(e,data) {{ - # var cutoff = parseInt(data.slider.steps[data.slider.active].label.replaceAll(/[^0-9]+/g,"")); - # console.log("Cutoff: " + cutoff); - # }}) - # }}); - # ')) - # ) - # print(fig_script) - - # print box plot header - cat("#### Box plot (distributions)\n\n") - - # get outlier points - outs <- fst_cov %>% - group_by(pop1,pop2,pair,cutoff) %>% - summarise(outs = boxplot.stats(fst)$out) %>% - ungroup() - - # generate boxplot figure - fig <- fst_grp %>% - plot_ly( - type = "box", - y = ~pair, - q1 = ~q1, - q3 = ~q3, - median = ~med, - frame = ~str_glue("≥{cutoff}"), - lowerfence = ~lwr, - upperfence = ~upr - ) %>% - add_trace( - data = outs, - inherit = FALSE, - y = ~pair, - x = ~outs, - type="scatter", - mode="markers", - frame = ~str_glue("≥{cutoff}") - ) %>% - layout(showlegend = FALSE) %>% - animation_slider(currentvalue = list(prefix = "Minimum coverage: ", font = list(color = "black"))) %>% - animation_button(visible = FALSE) %>% - layout( - #margin=list(l = 200, r = 20, b = 80, t = 40), - yaxis = list(tickangle=-35,tickfont=list(size=12),title=""), - xaxis = 
list(tickfont=list(size=12),title="Fst") + print(json_tag) + + # get outlier points, limit to a subsample of 30 per pair because + # too many makes the thing crash. + outs <- fst_data[avg_min_cov >= cutoff][ ,{ + outliers <- boxplot.stats(fst)$out + .( outs = sample(outliers,min(length(outliers),30)) ) + },by=.(pop1,pop2,pair)] + + # generate boxplot figure + fig <- fst_grp %>% + plot_ly( + type = "box", + y = ~pair, + q1 = ~q1, + q3 = ~q3, + median = ~med, + lowerfence = ~lwr, + upperfence = ~upr + ) %>% + add_trace( + data = outs, + inherit = FALSE, + y = ~pair, + x = ~outs, + type="scatter", + mode="markers", + frame = ~str_glue("≥{cutoff}") + ) %>% + layout(showlegend = FALSE) %>% + layout( + yaxis = list(tickangle=-35,tickfont=list(size=12),title=""), + xaxis = list(tickfont=list(size=12),title="Fst"), + title = list(text=str_glue("Fst distribution - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) + ) %>% + config(toImageButtonOptions = list( + format = "svg", + width = 900, + height = 600 + )) + + # save figure for output + if (is.null(figs$box_fig)) { + fig_id <- str_glue("box-fig-{method}-{shr}") + fig$elementId <- fig_id + figs$box_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("box-json-{method}-{shr}"), + class="fig-container", + tagList(fig) + ) + ) + } + + # get/print figure json + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + json_tag <- tags$script( + id = str_glue("box-json-{method}-{shr}-{cutoff}"), + type = 'application/json', + HTML(fig_json) ) - print(tagList(fig)) - }) - }) + print(json_tag) + return(figs) + },.init = list(scatter_fig = NULL, box_fig = NULL)) + + # show scatter plot tab + cat("#### Scatter plot (means)\n\n") + + print(fst_figs$scatter_fig) + + # show box plot tab + cat("#### Box plot (distributions)\n\n") + print(h5(tags$small(HTML( + "Note: subsampled to 30 outliers per pair." 
+ )))) + + print(fst_figs$box_fig) + + }) +},by=method] ``` -```{r, heatmap-setup, results='asis', eval=show_fst} -# TODO: add all snps/shared loci to heatmap plots +```{r heatmaps, results='asis', eval=show_fst} cat("# Summary statistic heatmaps {.tabset}\n\n") -# a list of parameters we'll use to plot each figure -# each item consists of: display name, bottom-left heatmap, and (optionally) top-right heatmap -# individual heatmap options are: x value, y value, variable name, color palette (from paletteer_c), -# number format (passed to sprintf), and missing color + +# configure variable pairs for heatmap plots var_pairs <- list( list( name="fst", @@ -786,269 +819,259 @@ cscale <- function(pal,n=10,na="darkgrey",min=0.00001) { }) if (!is.na(na)) { s[[1]][[1]] <- min - c(list(list(0,na), list(min,na)),s) + s <- c(list(list(0,na), list(min,na)),s) } + s } # replace "NA" (not NA) in a string vector fix_na <- function(x,na="n/a") replace(x,which(x == "NA"),na) -# summarize dataset (as above) by method and pool comparison -pool_data %>% - group_by(method) %>% - group_walk(\(fst_data,method) { - # output tab header - cat(str_glue("## {method$method} {{.tabset}}\n\n")) - - # now we walk through our different display pairs - var_pairs %>% - walk(\(trace_vars) { - # and show the tab header - cat(str_glue("### {trace_vars$title}\n\n")) - print(h5(HTML(trace_vars$description))) - - # create cutoff slider - cutoff_sel <- div( - class="coverage-slider", - tagList( - tags$label( - id=str_glue("heatmap-cutoff-label-{method$method}-{trace_vars$name}"), - `for`=str_glue("heatmap-cutoff-sel-{method$method}-{trace_vars$name}"), - str_glue('Minimum coverage: {params$nf$report_min_coverage}') - ), - div( - id=str_glue("heatmap-cutoff-sel-{method$method}-{trace_vars$name}") - ) - ) - ) - print(cutoff_sel) - - # cutoff slider handler code (js) - # note double curly braces because the whole thing is - # wrapped in str_glue - cutoff_script <- tags$script(HTML(str_glue( - ' - $(function() 
{{ - $("#heatmap-cutoff-sel-{method$method}-{trace_vars$name}").labeledslider({{ - min: {params$nf$report_min_coverage}, - max: {params$nf$report_max_coverage}, - step: {params$nf$report_coverage_step}, - tickInterval: {params$nf$report_coverage_step}, - change: function (e, ui) {{ - var fig = $("#heatmap-fig-{method$method}-{trace_vars$name}")[0]; - var figdata = $(`#heatmap-json-{method$method}-{trace_vars$name}-${{ui.value}}`).text(); - $("#heatmap-cutoff-label-{method$method}-{trace_vars$name}").text(`Minimum coverage: ${{ui.value}}`); - Plotly.react(fig,JSON.parse(figdata)); - }}, - slide: function (e, ui) {{ - var fig = $("#heatmap-fig-{method$method}-{trace_vars$name}")[0]; - var figdata = $(`#heatmap-json-{method$method}-{trace_vars$name}-${{ui.value}}`).text(); - $("#heatmap-cutoff-label-{method$method}-{trace_vars$name}").text(`Minimum coverage: ${{ui.value}}`); - Plotly.react(fig,JSON.parse(figdata)); - }} - }}); - - }}); - ' - ))) - print(cutoff_script) - - print(h5(tags$small(HTML( - "Note: Grey cells indicate missing data." 
- )))) - - # now walk through coverage cutoff levels - # and create figure and/or figure json for each one - cutoffs %>% - walk(\(cutoff) { - # filter data by coverage cutoff - # and summarize by pool comparison - fst_data <- fst_data %>% - filter(avg_min_cov >= cutoff) %>% - group_by(pop1,pop2) %>% - summarise( - fst_sd = sd(fst,na.rm=TRUE), - fst_se = fst_sd / sqrt(n()), - cov = mean(avg_min_cov,na.rm=TRUE), - cov_sd = sd(avg_min_cov,na.rm = TRUE), - cov_se = cov_sd / sqrt(n()), - snps = n_distinct(chrom,pos,na.rm=TRUE), - contigs = n_distinct(chrom,na.rm = TRUE), - snps_per_contig = snps / contigs, - fst = mean(fst,na.rm=TRUE) - ) %>% - ungroup() %>% - # retain the cutoff level in the data - mutate(cutoff=cutoff) - - # get all pool names - all_pops <- sort(union(unique(fst_data$pop1),unique(fst_data$pop2))) - - # create a tibble of all possible combinations (sorted) - full_table <- all_pops %>% - combn(2) %>% - apply(MARGIN = 2,FUN=\(x) sort(x)) %>% - t() %>% - as_tibble() %>% - rename(pop1=1,pop2=2) - - # convert pool names to factors with all possible levels - # make sure pop2 has levels in reverse so the figure looks right - # (i.e., a nice triangle) - # we left join from the table of all combos so we have all combos, even - # if some have missing data - hmd <- full_table %>% - left_join(fst_data,by=c("pop1","pop2")) %>% - arrange(pop1,desc(pop2)) %>% - mutate( +# walk through calculation methods +nothing <- pool_data[,{ + # show tab header + fst_data <- .SD + cat(str_glue("## {method} {{.tabset}}\n\n")) + + # walk through shared status + pools_shared %>% + iwalk(\(shared,hdr) { + # show tab header + cat(str_glue("### {hdr} {{.tabset}}\n\n")) + shr <- ifelse(shared,"shared","all") + + # filter data for all pools if necessary + if (shared) { + fst_data <- fst_data[ fst_data[, .I[all( unique(c(levels(fst_data$pop1),levels(pop2))) %in% c(pop1,pop2) )], by=c('chrom','pos')]$V1 ] + setorder(fst_data,chrom,pos,pop1,pop2) + } + + # walk through variable pairs + 
var_pairs %>% + walk(\(trace_vars) { + # show yet another tab header + cat(str_glue("#### {trace_vars$title}\n\n")) + + print(h5(HTML(trace_vars$description))) + print(h5(tags$small(HTML( + "Note: Grey cells indicate missing data." + )))) + + # generate figure and output + # all necessary figure json + heatmap_fig <- cutoffs %>% + reduce(\(plot_fig,cutoff) { + # summarize by pair and cutoff value + fst_grp <- fst_data[avg_min_cov >= cutoff][,{ + fst_sd <- sd(fst,na.rm=TRUE) + cov_sd <- sd(avg_min_cov,na.rm=TRUE) + n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) + n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) + .( + fst_sd = fst_sd, # std. dev. of fst + fst_se = fst_sd / sqrt(.N), # std. err. of fst + fst = mean(fst,na.rm=TRUE), # mean fst + cov = mean(avg_min_cov,na.rm=TRUE), # mean coverage + cov_sd = cov_sd, + cov_se = cov_sd / sqrt(.N), + snps = n_snps, # snp count + contigs = n_contigs, # contig count + snps_per_contig = n_snps / n_contigs # mean snps per contig + ) + },by=.(pop1,pop2)] + fst_grp[,cutoff := cutoff] + + # all this next bit is so that the heatmaps look good + # with all relevant rows of both the upper and lower triangles + + # get all possible pool names + all_pops <- sort(union(unique(fst_grp$pop1),unique(fst_grp$pop2))) + + # create a tibble of all possible combinations (sorted) + full_table <- all_pops %>% + combn(2) %>% + apply(MARGIN = 2,FUN=\(x) sort(x)) %>% + t() %>% + as.data.table() + setnames(full_table,c('pop1','pop2')) + + # convert pool names to factors with all possible levels + # make sure pop2 has levels in reverse so the figure looks right + # (i.e., a nice triangle) + # we left join from the table of all combos so we have all combos, even + # if some have missing data + hmd <- merge(full_table,fst_grp,by = c('pop1','pop2'),all=TRUE)[order(pop1,-pop2)] + hmd[,`:=`( pop1 = factor(pop1,levels=all_pops), pop2 = factor(pop2,levels=rev(all_pops)) - ) %>% - mutate( - # create columns for the tooltip text - across(where(is.numeric),\(x) 
x,.names = "{.col}_val"), - # replace misisng integers with -1 - across(where(is.integer) & !ends_with("_val"),\(x) replace_na(x,-1)), - # replace missing floats with min(x)*1e-5 - across(where(is.double) & !ends_with("_val"),\(x) replace_na(x,min(x,na.rm = TRUE)*0.00001)) - ) %>% - arrange(pop1,desc(pop2)) - - # get the factor level that would be the top row of the plot - top_level <- last(levels(hmd$pop2)) - - # now we make a dummy table for just that top row - top_row <- hmd %>% - filter(pop1 == top_level) %>% + )] + hmd[, c(paste0(names(.SD),'_na')) := lapply(.SD,is.na), .SDcols = is.numeric] + hmd[, c(paste0(names(.SD),'_val')) := .SD, .SDcols = is.numeric ] + setorder(hmd,pop1,-pop2) + + # get the factor level that would be the top row of the plot + top_level <- last(levels(hmd$pop2)) + + # now we make a dummy table for just that top row + # if we don't do this, the top triangle is missing its top row + # switch pop1 and pop2 - mutate(temp=pop1,pop1=pop2,pop2=temp) %>% - # drop the temp column - select(-temp) %>% + top_row <- hmd[pop1 == top_level] + top_row[, temp := pop1 ] + top_row[,`:=`( pop1=pop2, pop2=temp, temp=NULL )] + # set all numeric values to NA - mutate(across(-c(pop1,pop2),~NA)) - - # bind dummy values to plot data - # it seems to matter that we bind these - # to the end, rather than the other way around - hmd <- hmd %>% - bind_rows(top_row) - - # create an empty plotly figure - # don't show tooltips if there's no data - fig <- hmd %>% - plot_ly(hoverongaps=FALSE) - - # now we go through each figure pair - # and reduce them to a single plotly figure - # that c(1,2) is just to keep track of if we're on - # the lower or upper triangle - fig <- trace_vars$pairs %>% - reduce2(c(1,2),\(f,v,i) { - # if the element is valid - if (!all(is.na(v))) { - # add a heatmap trace - f %>% - add_trace( - type = "heatmap", - x = ~.data[[v[1]]], # x variable - y = ~.data[[v[2]]], # y variable - z = ~.data[[v[3]]], # z (color) variable - # this is what gets shown 
in the tooltip - customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), - # zmin and zmax are trying to make it so the "missing" - # color doesn't show up in the color scale legend - zmin = ~min(.data[[v[[3]]]],na.rm = TRUE)*0.00001, - zmax = ~max(.data[[v[3]]],na.rm = TRUE), - colorscale = cscale(v[4],n = 100,na = v[6]), # color palette, based on 100 gradations - # style the colorbar - colorbar = list( - # show the appropriate display name - title=list(text=value_map[v[3]]), - # determine the position of the colorbar - # bottom if it's for the bottom triangle, top if it's for the top - y = ifelse(i == 1,0,1), - # which side of the colorbar we reckon the y-anchor from - yanchor = ifelse(i == 1,"bottom","top"), - len = 0.5, - lenmode = "fraction" - ), - # text template - text = ~str_glue("{pop1} - {pop2}"), - # hover text template (hide the trace name with ) - hovertemplate=str_glue("%{{text}}
{value_map[v[3]]}: %{{customdata}}") + # cols = names(top_row)[!names(top_row) %in% c('pop1','pop2')] + # top_row[, c(cols) := NA ] + top_row[, (names(top_row)[sapply(top_row, is.numeric)]) := lapply(.SD, \(x) NA), .SDcols = is.numeric] + + # bind dummy values to plot data + # it seems to matter that we bind these + # to the end, rather than the other way around + hmd <- rbind(hmd,top_row) + + # start making the figure + fig <- plot_ly(hoverongaps=FALSE) + + # finish making the figure + fig <- trace_vars$pairs %>% + reduce2(c(1,2),\(plot_fig,v,i) { + if (!all(is.na(v))) { + # add the trace as either the upper or lower triangle + # depending on whether its #1 or #2 + col <- str_glue("{v[3]}_na") + hmdd <- hmd[get(col) | is.na(get(col))] + hmdd[,c(v[3]) := fifelse(.SD[[col]] == TRUE,0,NA) ] + + plot_fig %>% + { + if (sum(hmd[[str_glue("{v[3]}_na")]],na.rm = T) > 0 ) { + add_trace( + ., + type="heatmap", + hoverinfo = 'none', + data = hmdd, + x = ~.data[[v[1]]], + y = ~.data[[v[2]]], + z = ~.data[[v[3]]], + colorscale = 'Greys', + showscale = FALSE + ) + } else . 
+ } %>% + add_trace( + type = "heatmap", + data = hmd, + x = ~.data[[v[1]]], # x variable + y = ~.data[[v[2]]], # y variable + z = ~.data[[v[3]]], # z (color) variable + # this is what gets shown in the tooltip + customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), + colorscale = cscale(v[4],n = 10,na = NA), # color palette, based on 100 gradations + # style the colorbar + colorbar = list( + # show the appropriate display name + title=list(text=value_map[v[3]]), + # determine the position of the colorbar + # bottom if it's for the bottom triangle, top if it's for the top + y = ifelse(i == 1,0,1), + # which side of the colorbar we reckon the y-anchor from + yanchor = ifelse(i == 1,"bottom","top"), + len = 0.5, + lenmode = "fraction" + ), + # text template + text = ~str_glue("{pop1} - {pop2}"), + # hover text template (hide the trace name with ) + hovertemplate=str_glue("%{{text}}
{value_map[v[3]]}: %{{customdata}}") + ) + } else return(plot_fig) + },.init = fig) %>% + layout( + # name the figure axes and don't let the user zoom around + xaxis = list(title = "Pool 1", fixedrange = TRUE), + yaxis = list(title = "Pool 2", fixedrange = TRUE), + title = list( + text = str_glue("{trace_vars$title} (coverage cutoff: ≥{cutoff})"), + font = list( + size = 12 ) - } else return(f) - },.init = fig) %>% - layout( - # name the figure axes and don't let the user zoom around - xaxis = list(title = "Pool 1", fixedrange = TRUE), - yaxis = list(title = "Pool 2", fixedrange = TRUE) - ) %>% - # hide the mode bar - config(displayModeBar = FALSE) - - # assign id attribute to plotly div - fig_id <- str_glue("heatmap-fig-{method$method}-{trace_vars$name}") - fig$elementId <- fig_id - - if (cutoff == first(cutoffs)) { - # output the figure to the report, but only if it's the first coverage cutoff - print(tagList(fig)) - } - - # get minified figure json - fig_json <- fig %>% - plotly_json(jsonedit = FALSE) %>% - jsonlite::minify() - - # create and print script tag with figure data json - # we use this to draw the other cutoff level figures when the slider is dragged - script_tag <- tags$script( - id = str_glue("heatmap-json-{method$method}-{trace_vars$name}-{cutoff}"), - type = 'application/json', - HTML(fig_json) - ) - print(script_tag) - }) + ) + ) %>% + # hide the mode bar + config(displayModeBar = FALSE) + + # get the figure for output + if (is.null(plot_fig)) { + fig_id <- str_glue("heatmap-fig-{method}-{shr}-{trace_vars$name}") + fig$elementId <- fig_id + # we stick it in a div with class 'fig-container' so + # they're easy to find in the DOM using jquery + plot_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("heatmap-json-{method}-{shr}-{trace_vars$name}"), + class="fig-container", + tagList(fig) + ) + ) + } + + # get the figure json and output it + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + # 
create and print script tag with figure data json + # we use this to draw the other cutoff level figures when the slider is dragged + json_tag <- tags$script( + id = str_glue("heatmap-json-{method}-{shr}-{trace_vars$name}-{cutoff}"), + type = 'application/json', + HTML(fig_json) + ) + print(json_tag) + + return(plot_fig) + },.init = NULL) + + # output the heatmap figure + print(heatmap_fig) + }) + }) +},by=method] - }) - }) ``` -```{r, fst-correlation, results='asis', eval=show_fst} + +```{r fst-correlation, results='asis', eval=show_fst} cat("# Fst method correlations {.tabset}\n\n") # get calculation methods methods <- unique(pool_data$method) # split the dataset into subsets by calculation method -splot <- pool_data %>% - split(.$method) methods %>% combn(2) %>% array_branch(2) %>% - walk(\(pair) { - cat(str_glue("## Fst correlation: {pair[1]} vs {pair[2]}\n\n")) + walk(\(methods) { + cat(str_glue("## {methods[1]} vs {methods[2]}\n\n")) print(h5(tags$small(HTML("Note: Due to potentially large datasets, only a subsample of 1,000 Fst values is shown here.")))) - - # make a joined table with both methods and subsample to 1000 rows - corr <- splot[[pair[1]]] %>% - sample_n(1000) %>% - inner_join(splot[[pair[2]]],by=c("chrom","pos","pop1","pop2")) - + + # make a joined table with both methods and subsample to max 1000 rows + corr <- merge( + pool_data[ method == methods[1] ][sample(.N,1000),], + pool_data[ method == methods[2] ], + by = c('chrom','pos','pop1','pop2') + ) + # do a linear regression lmm <- lm(fst.y ~ fst.x, data=corr) # get predicted data for the trend line - corr <- corr %>% - mutate( - predicted = lmm %>% - predict(corr %>% select(fst.x)) - ) - + corr[,predicted := predict(lmm,corr[,.(fst.x)])] + # get method names - xtitle <- pair[1] - ytitle <- pair[2] - + xtitle <- methods[1] + ytitle <- methods[2] + # plot the figure fig <- corr %>% plot_ly() %>% @@ -1060,7 +1083,7 @@ methods %>% mode = 'lines', line = list(color = '#232323', width = 0.7) ) %>% - 
# add scatter + # add scatter add_trace( x = ~fst.x, y = ~fst.y, @@ -1082,100 +1105,75 @@ methods %>% yaxis = list(title = str_glue("Fst ({ytitle})")) ) %>% # disappear the legends - hide_guides() - + hide_guides() %>% + config(toImageButtonOptions = list( + format = 'svg', + width = 900, + height = 600 + )) + print(tagList(fig)) }) ``` -```{r, fisher, results='asis', eval=show_fisher} -cat("# Fisher test plots {.tabset}\n\n") +```{r fisher, results='asis', eval=show_fisher} +cat("# Fisher's test plots {.tabset .fisher-tabs}\n\n") print(h5(tags$small( - "Fisher test results are presented as static plots due to the potentially large size of the datasets." + "Fisher test results are presented as static plots since these datasets can get huge, making plotly go slow or crash." ))) -fisher_data %>% - group_by(method) %>% - group_walk(\(fishr,method) { - cat(str_glue("## {method$method}\n\n")) - - # make slider for coverage cutoff - cutoff_slider <- div( - class="coverage-slider", - tagList( - tags$label( - id=str_glue("fisher-cutoff-label-{method$method}"), - `for`=str_glue("fisher-cutoff-sel-{method$method}"), - str_glue('Minimum coverage: {params$nf$report_min_coverage}') - ), - div( - id=str_glue("fisher-cutoff-sel-{method$method}") +nothing <- fisher_data[,{ + fishr <- .SD + cat(str_glue("## {method}\n\n")) + + print(h6(HTML("Points above red line indicate p < 0.05."))) + + figs <- cutoffs %>% + # set_names() %>% + map(\(cov) { + + # filter by coverage cutoff and get mean fisher p-value by snp + figdata <- fishr[avg_min_cov >= cov][,.(log_fisher = mean(log_fisher,na.rm=TRUE)), by=.(chrom,pos)][order(chrom,pos)] + figdata[,snp := .I] + + fig <- ggplot(figdata) + + geom_point(aes(x=snp,y=log_fisher),size=1) + + geom_hline(yintercept = -log10(0.05), color="red") + + theme_bw() + + labs(x = 'SNP', y=expression(paste(-log[10]~"(p-value)"))) + + disp <- if(cov == first(cutoffs)) "" else "display: none" + plotTag( + fig, + alt=str_glue("Fisher's exast test p-values for 
coverage ≥{cov}"), + width = 672, + attribs = list( + `data-method` = method, + `data-cutoff` = cov, + id = str_glue("fisher-plot-{method}-{cov}"), + class = c(str_glue('fisher-plotz'),str_glue('fisher-plotz-{cov}')), + style = disp ) ) + }) + + # for some reason enclosing this in another div() + # call makes the image tags show as text + cat('
') + print( + tagList( + div( + class = 'visible-fisher', + tagList(figs[1]) + ), + div( + class = 'hidden-fisher', + style = 'display: none', + tagList(figs[-1]) + ) ) - print(cutoff_slider) - - cutoff_script <- tags$script(HTML(str_glue( - ' - $(function() {{ - $("#fisher-cutoff-sel-{method$method}").labeledslider({{ - min: {params$nf$report_min_coverage}, - max: {params$nf$report_max_coverage}, - step: {params$nf$report_coverage_step}, - tickInterval: {params$nf$report_coverage_step}, - change: function (e, ui) {{ - $(".fisher-plotz-{method$method}").hide(); - $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); - $("#fisher-cutoff-label-{method$method}").text(`Minimum coverage: ${{ui.value}}`); - }}, - slide: function (e, ui) {{ - $(".fisher-plotz-{method$method}").hide(); - $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); - $("#fisher-cutoff-label-{method$method}").text(`Minimum coverage: ${{ui.value}}`); - }} - }}); - }}); - ' - ))) - print(cutoff_script) - - print(h6(HTML("Points above red line indicate p < 0.05."))) - - figs <- cutoffs %>% - # set_names() %>% - map(\(cov) { - # cat(str_glue("### Coverage ≥{cov}\n\n")) - fig <- fishr %>% - # filter by average minimum coverage - filter(avg_min_cov >= cov) %>% - # retain the cutoff level in the data - # mutate(cutoff=cov) %>% - group_by(chrom,pos) %>% - summarise( log_fisher = mean(log_fisher) ) %>% - ungroup() %>% - arrange(chrom,pos) %>% - mutate(snp = row_number()) %>% - ggplot() + - geom_point(aes(x=snp,y=log_fisher),size=1) + - geom_hline(yintercept = -log10(0.05), color="red") + - theme_bw() + - labs(x = 'SNP', y=expression(paste(-log[10]~"(p-value)"))) - - disp <- if(cov == first(cutoffs)) "" else "display: none" - plotTag( - fig, - alt=str_glue("Fisher's exast test p-values for coverage ≥{cov}"), - width = 672, - attribs = list( - id = str_glue("fisher-plot-{method$method}-{cov}"), - class = str_glue('fisher-plotz-{method$method}'), - style = disp - ) - ) - }) - print(div( - 
id='fisher-container', - tagList( figs ) - )) - }) + ) + cat('
') + +},by=method] ``` diff --git a/assets/email_template.html b/assets/email_template.html index d00307e..d33c051 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,21 +4,21 @@ - - nf-core/assesspool Pipeline Report + + assessPool Pipeline Report
-

nf-core/assesspool ${version}

+

assessPool ${version}

Run Name: $runName

<% if (!success){ out << """
-

nf-core/assesspool execution completed unsuccessfully!

+

assessPool execution completed unsuccessfully!

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

${errorReport}
@@ -27,7 +27,7 @@

nf-core/assesspool execution completed } else { out << """
- nf-core/assesspool execution completed successfully! + assessPool execution completed successfully!
""" } @@ -44,8 +44,8 @@

Pipeline Configuration:

-

nf-core/assesspool

-

https://github.com/nf-core/assesspool

+

assessPool

+

https://github.com/tobodev/assesspool

diff --git a/assets/email_template.txt b/assets/email_template.txt index 6d43a5b..ac6600d 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -4,15 +4,15 @@ |\\ | |__ __ / ` / \\ |__) |__ } { | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, `._,._,' - nf-core/assesspool ${version} + assessPool ${version} ---------------------------------------------------- Run Name: $runName <% if (success){ - out << "## nf-core/assesspool execution completed successfully! ##" + out << "## assessPool execution completed successfully! ##" } else { out << """#################################################### -## nf-core/assesspool execution completed unsuccessfully! ## +## assessPool execution completed unsuccessfully! ## #################################################### The exit status of the task that caused the workflow execution to fail was: $exitStatus. The full error message was: @@ -35,5 +35,5 @@ Pipeline Configuration: <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> -- -nf-core/assesspool -https://github.com/nf-core/assesspool +assessPool +https://github.com/tobodev/assesspool diff --git a/assets/input.csv b/assets/input.csv new file mode 100644 index 0000000..182019d --- /dev/null +++ b/assets/input.csv @@ -0,0 +1,2 @@ +project,input,vcf_index,reference,pools,pool_sizes +poolseq_test,data/pools.vcf.gz,data/pools.vcf.gz.tbi,data/ref.fasta,,"35,38,22,52,17,19" diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5f653ab..0000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_input.json b/assets/schema_input.json index 8acf4a5..4bcda41 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,25 +13,25 @@ "errorMessage": "Project 
name must be provided and cannot contain spaces", "meta": ["id"] }, - "vcf": { + "input": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.vcf(\\.gz)?$", - "errorMessage": "VCF file must be provided in the `vcf` column, cannot contain spaces, and must have extension '.vcf' or '.vcf.gz'" + "pattern": "^\\S+\\.(vcf|sync)(\\.gz)?(\\?.+)?$", + "errorMessage": "Input file must be provided in the `input` column, cannot contain spaces, and must have extension '.vcf', '.sync', 'vcf.gz', or '.sync.gz'" }, "vcf_index": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.vcf\\.gz\\.tbi$", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi(\\?.+)?$", "errorMessage": "VCF index file, must look like vcf_filename.vcf.gz.tbi" }, "reference": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.fa(s?ta)?(\\.gz)?$", + "pattern": "^\\S+\\.fa(s?ta)?(\\.gz)?(\\?.+)?$", "errorMessage": "Reference FASTA must be provided in the `reference` column, cannot contain spaces, and must have extension '.fa/fa.gz' or '.fasta/fasta.gz'" }, "pools": { @@ -42,6 +42,6 @@ "errorMessage": "Must provide pool size(s), either a single number or a comma-separated list" } }, - "required": ["project","vcf", "reference","pool_sizes"] + "required": ["project","input", "reference","pool_sizes"] } } diff --git a/assets/slackreport.json b/assets/slackreport.json index 8d0feea..a8c8caf 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "nf-core/assesspool ${version} - ${runName}", + "author_name": "assesspool ${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/conf/base.config 
b/conf/base.config index 83d3666..bbc63fd 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/assesspool Nextflow base config file + assesspool Nextflow base config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A 'blank slate' config file, appropriate for general use on most high performance compute environments. Assumes that all software is installed and available on @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { 1 * task.attempt } memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } @@ -24,7 +23,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. 
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } diff --git a/conf/filters.config b/conf/filters.config index 54a19d3..d4e4847 100644 --- a/conf/filters.config +++ b/conf/filters.config @@ -1,6 +1,10 @@ process { // bcftools filters withName: MAX_ALLELE_LENGTH { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -17,6 +21,10 @@ process { } } withName: QUALITY_DEPTH_RATIO { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -33,6 +41,10 @@ process { } } withName: MIN_QUALITY { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -49,6 +61,10 @@ process { } } withName: VARIANT_TYPE { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -65,6 +81,10 @@ process { } } withName: MISPAIRED_READS { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -81,6 +101,10 @@ process { } } withName: ALTERNATE_OBSERVATIONS { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -97,6 +121,10 @@ process { } } withName: MIN_MAPPING_QUALITY { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -113,6 +141,10 @@ process { } } withName: MAPPING_RATIO { + publishDir = [ + path: { 
"${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -130,6 +162,10 @@ process { } } withName: MIN_DEPTH { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -146,6 +182,10 @@ process { } } withName: MIN_POOLS { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -162,6 +202,10 @@ process { } } withName: READ_BALANCE { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -181,6 +225,10 @@ process { // vcftools filters withName: MAX_MEAN_DEPTH { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -188,6 +236,10 @@ process { ].join(' ').trim() } } withName: MIN_MEAN_DEPTH { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -195,12 +247,20 @@ process { ].join(' ').trim() } } withName: HWE_CUTOFF { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ params.hwe_cutoff ? 
"--hwe ${params.hwe_cutoff}" : "" ].join(' ').trim() } } withName: MINOR_ALLELE_COUNT { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -208,6 +268,10 @@ process { ].join(' ').trim() } } withName: MAX_MISSING { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -215,6 +279,10 @@ process { ].join(' ').trim() } } withName: THIN { + publishDir = [ + path: { "${params.outdir}/filter/incremental" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -226,6 +294,10 @@ process { // ripgrep wrapper to count snps in a vcf // works whether zipped or not withName: 'COUNT_SNPS.*' { + publishDir = [ + path: '', + enabled: false + ] ext.prefix = { "${meta.filter}_filter_count" } ext.args = { [ '-z', diff --git a/conf/modules.config b/conf/modules.config index 8688e19..dd59259 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,17 +20,25 @@ process { withName: RMARKDOWNNOTEBOOK { conda = { [ - 'conda-forge::r-rmarkdown=2.29', - 'conda-forge::r-knitr=1.50', - 'conda-forge::r-yaml=2.3.10', - 'conda-forge::r-plotly=4.10.4', - 'conda-forge::r-dt=0.33', - 'conda-forge::r-dplyr=1.1.4', - 'conda-forge::r-readr=2.1.5', - 'conda-forge::r-forcats=1.0.0', - 'conda-forge::r-paletteer=1.6.0' + 'r-rmarkdown=2.29', + 'r-yaml=2.3.10', + 'r-tidyr=1.3.1', + 'r-data.table=1.17.8', + 'r-knitr=1.50', + 'r-plotly=4.11.0', + 'r-dt=0.33', + 'r-stringr=1.5.1', + 'r-purrr=1.1.0', + 'r-paletteer=1.6.0', + 'r-htmltools=0.5.8.1', + 'r-ggplot2=3.5.2', + 'r-forcats=1.0.0', + 'r-scales=1.4.0', + 'r-jsonlite=2.0.0' ].join(' ').trim() } - container = { "fishbotherer/buildreport:1.0.1" } + container = { "${ 
workflow.containerEngine == 'singularity' && !ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-b5ef69f92fe3bbdbe29f16d80675330b2bd45a66:a66dc1806b3ac5c904f1d0359e0504581f5495cd-0' : + 'biocontainers/mulled-v2-b5ef69f92fe3bbdbe29f16d80675330b2bd45a66:a66dc1806b3ac5c904f1d0359e0504581f5495cd-0' }" } publishDir = [ [ path: { "${params.outdir}/report" }, @@ -39,13 +47,17 @@ process { ], [ path: { "${params.outdir}/report" }, - mode: "symlink", + mode: params.publish_dir_mode, pattern: 'artifacts/*' ] ] } withName: REHEADER_FST { + publishDir = [ + path: { "${params.outdir}/fst/popoolation" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_" + meta.pools.keySet().sort().join("-") } ext.suffix = { "reheadered.fst" } ext.args = { @@ -53,27 +65,32 @@ process { .keySet() .sort() .withIndex() - .collect{ k,i -> "-v pop${i+1}='${k}'" } + - [ - """' - BEGIN{ - FS=OFS="\\t" - print "chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tfst\\tmethod" - } { - gsub(/^.+=/,"",\$NF) - fst=\$NF; nf=NF - \$(nf+3)="popoolation" - \$(nf+2)=fst - \$(nf+1)=pop2 - \$nf=pop1 - print - } - '""" - ] ).join(' ').trim() + .collect{ k,i -> "-v pop${i+1}='${k}'" } + ).join(' ').trim() + } + ext.args2 = { + """' + BEGIN{ + FS=OFS="\\t" + print "chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tfst\\tmethod" + } { + gsub(/^.+=/,"",\$NF) + fst=\$NF; nf=NF + \$(nf+3)="popoolation" + \$(nf+2)=fst + \$(nf+1)=pop2 + \$nf=pop1 + print + } + '""" } } withName: REHEADER_FISHER { + publishDir = [ + path: { "${params.outdir}/fisher/popoolation" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_" + meta.pools.keySet().sort().join("-") } ext.suffix = { "reheadered.fisher" } ext.args = { @@ -81,27 +98,32 @@ process { .keySet() .sort() .withIndex() - .collect{ k,i -> "-v pop${i+1}='${k}'" } + - [ - """' - BEGIN{ - FS=OFS="\\t" - print 
"chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tfisher\\tmethod" - } { - gsub(/^.+=/,"",\$NF) - fisher=\$NF; nf=NF - \$(nf+3)="popoolation" - \$(nf+2)=fisher - \$(nf+1)=pop2 - \$nf=pop1 - print - } - '""" - ] ).join(' ').trim() + .collect{ k,i -> "-v pop${i+1}='${k}'" } + ).join(' ').trim() + } + ext.args2 = { + """' + BEGIN{ + FS=OFS="\\t" + print "chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tlog_fisher\\tmethod" + } { + gsub(/^.+=/,"",\$NF) + fisher=\$NF; nf=NF + \$(nf+3)="popoolation" + \$(nf+2)=fisher + \$(nf+1)=pop2 + \$nf=pop1 + print + } + '""" } } withName: SPLIT_SYNC { + publishDir = [ + path: { "${params.outdir}/sync/pairwise" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_" + meta.pools.keySet().sort().join("-") } ext.suffix = { "sync" } ext.args = { @@ -109,24 +131,27 @@ process { .keySet() .sort() .withIndex() - .collect{ k,i -> "-v col${i+1}='${k}'" } + - [ - "-F \$'\\t'", - "-v OFS='\\t'", - """' - NR == 1 { - for (i=1; i <= NF; i++) { cols[\$i]=i } - # print \$1, \$2, \$3, \$cols[col1], \$cols[col2] - next - } - NR > 1 { print \$1, \$2, \$3, \$cols[col1], \$cols[col2] } - '""" - ] ).join(' ').trim() + .collect{ k,i -> "-v col${i+1}='${k}'" } + ).join(' ').trim() + } + ext.args2 = { + """' + BEGIN { FS=OFS="\\t" } + NR == 1 { + for (i=1; i <= NF; i++) { cols[\$i]=i } + next + } + NR > 1 { print \$1, \$2, \$3, \$cols[col1], \$cols[col2] } + '""" } } // grenedalf frequency options withName: GRENEDALF_FREQUENCY { + publishDir = [ + path: '', + enabled: false + ] ext.args = { [ "--allow-file-overwriting", "--write-sample-counts", @@ -141,6 +166,10 @@ process { // grenedalf sync options withName: GRENEDALF_SYNC { + publishDir = [ + path: { "${params.outdir}/sync" }, + mode: params.publish_dir_mode + ] ext.args = { [ params.missing_zeroes ? 
"--no-missing-marker" : "", "--filter-total-snp-min-count ${params.min_count}", @@ -151,6 +180,10 @@ process { // grenedalf fst options withName: GRENEDALF_FST { + publishDir = [ + path: { "${params.outdir}/fst/grenedalf" }, + mode: params.publish_dir_mode + ] ext.args = { ( [ "--method ${params.fst_method}", @@ -177,6 +210,10 @@ process { // r script to merge frequency and fst results withName: JOINFREQ { + publishDir = [ + path: '', + enabled: false + ] ext.prefix = { "${meta.id}_" + meta.pools.keySet().sort().join("-") } ext.args = { [ "--window-type ${params.window_type}", @@ -190,6 +227,10 @@ process { // r-based fisher test options withName: FISHERTEST { + publishDir = [ + path: { "${params.outdir}/fisher/assesspool" }, + mode: params.publish_dir_mode + ] ext.args = { [ "--window-type ${params.window_type}", "--min-count ${params.min_count}", @@ -202,6 +243,10 @@ process { // popoolation2 fisher test options withName: POPOOLATION2_FISHERTEST { + publishDir = [ + path: '', + enabled: false + ] ext.args = { [ "--min-count ${params.min_count}", "--min-coverage ${params.min_coverage}", @@ -212,6 +257,10 @@ process { // popoolation2 fst options withName: POPOOLATION2_FST { + publishDir = [ + path: '', + enabled: false + ] ext.args = { [ "--min-count ${params.min_count}", "--min-coverage ${params.min_coverage}", @@ -225,6 +274,10 @@ process { // poolfstat options withName: POOLFSTAT_FST { + publishDir = [ + path: { "${params.outdir}/fst/poolfstat" }, + mode: params.publish_dir_mode + ] ext.args = { [ "--min-coverage ${params.min_coverage}", "--max-coverage ${params.max_coverage}", @@ -237,6 +290,10 @@ process { } withName: COMPRESS_VCF { + publishDir = [ + path: '', + enabled: false + ] ext.args = { [ "--output-type z", "--write-index=tbi" @@ -244,6 +301,10 @@ process { } withName: BCFTOOLS_COMPRESS_INDEX_FILTERED { + publishDir = [ + path: '', + enabled: false + ] ext.prefix = { "filtered_final" } ext.args = { [ "--output-type z", @@ -252,6 +313,10 @@ process { } 
withName: BCFTOOLS_FILTER { + publishDir = [ + path: { "${params.outdir}/filter/bcftools" }, + mode: params.publish_dir_mode + ] ext.prefix = { "bcftools_filtered" } ext.args = { def f = params.filter @@ -287,6 +352,10 @@ process { } withName: THIN_SNPS { + publishDir = [ + path: { "${params.outdir}/filter/thin" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_vcftools_filtered_thinned" } ext.args = { [ "--recode --recode-INFO-all", @@ -295,6 +364,10 @@ process { } withName: VCFTOOLS_FILTER { + publishDir = [ + path: { "${params.outdir}/filter/vcftools" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_vcftools_filtered" } ext.args = { [ "--recode --recode-INFO-all", @@ -307,15 +380,30 @@ process { } withName: VCF_SAMPLES { + publishDir = [ + path: '', + enabled: false + ] ext.args = { "-l" } ext.prefix = { "sample_names" } ext.suffix = { "txt" } } withName: VCF_RENAME { + publishDir = [ + path: '', + enabled: false + ] ext.args2 = { [ "--output-type z", "--write-index=tbi" ].join(' ').trim() } } + + withName: EXTRACT_SEQUENCES { + publishDir = [ + path: { "${params.outdir}/sequences" }, + mode: params.publish_dir_mode + ] + } } diff --git a/conf/test.config b/conf/test.config index 85c1f3d..3eb19d7 100644 --- a/conf/test.config +++ b/conf/test.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a fast and simple pipeline test. 
Use as follows: - nextflow run nf-core/assesspool -profile test, --outdir + nextflow run tobodev/assesspool -profile test, --outdir ---------------------------------------------------------------------------------------- */ @@ -13,7 +13,7 @@ process { resourceLimits = [ cpus: 4, - memory: '15.GB', + memory: '24.GB', time: '1.h' ] } @@ -23,7 +23,37 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = params.pipelines_testdata_base_path + 'files/test.csv' + filter = true + max_missing = 0.5 + min_minor_allele_count = 2 + min_mapping_quality = 30 + min_mapping_quality_ref = 30 + min_mapping_ratio = 0.75 + max_mapping_ratio = 1.25 + quality_depth_ratio = 0.25 + mispaired_reads = true + read_balance_left = 0 + read_balance_right = 0 + min_pools = 5 + min_depth = 30 + max_allele_length = 10 + min_quality = 30 + variant_type = 'snp' + min_alternate_observations = 2 + min_mean_depth = 3 + max_mean_depth = 500 + grenedalf = true + all_fst_columns = true + popoolation2 = true + poolfstat = true + suppress_noninformative = true + match_allele_count = true + fisher_test = true + fisher_test_popoolation = true + outdir = 'output' + missing_zeroes = true + visualize_filters = true + filter_only = false + extract_sequences = true } diff --git a/conf/test_full.config b/conf/test_full.config index 00a8eb7..6cf77af 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a full size pipeline test. 
Use as follows: - nextflow run nf-core/assesspool -profile test_full, --outdir + nextflow run tobodev/assesspool -profile test_full, --outdir ---------------------------------------------------------------------------------------- */ @@ -15,10 +15,37 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Fasta references - fasta = params.pipelines_testdata_base_path + 'viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz' + input = params.pipelines_testdata_base_path + 'files/test_full.csv' + filter = true + max_missing = 0.5 + min_minor_allele_count = 2 + min_mapping_quality = 30 + min_mapping_quality_ref = 30 + min_mapping_ratio = 0.75 + max_mapping_ratio = 1.25 + quality_depth_ratio = 0.25 + mispaired_reads = true + read_balance_left = 0 + read_balance_right = 0 + min_pools = 5 + min_depth = 30 + max_allele_length = 10 + min_quality = 30 + variant_type = 'snp' + min_alternate_observations = 2 + min_mean_depth = 3 + max_mean_depth = 500 + grenedalf = true + all_fst_columns = true + popoolation2 = true + poolfstat = true + suppress_noninformative = true + match_allele_count = true + fisher_test = true + fisher_test_popoolation = true + outdir = 'output' + missing_zeroes = true + visualize_filters = true + filter_only = false + extract_sequences = true } diff --git a/defaults.yml b/defaults.yml new file mode 100644 index 0000000..2611293 --- /dev/null +++ b/defaults.yml @@ -0,0 +1,70 @@ +# general options +outdir: output + +# filtering options +filter: true +keep_indel: false +keep_multiallelic: false + +# vcftools filters +max_missing: 
0.5 +min_minor_allele_count: 2 +min_mean_depth: 3 +max_mean_depth: 500 +hwe_cutoff: null +thin_snps: null + +# vcflib filters +min_mapping_quality: 30 +min_mapping_quality_ref: 30 +min_mapping_ratio: 0.75 +max_mapping_ratio: 1.25 +read_balance_left: 0 +read_balance_right: 0 +quality_depth_ratio: 0.25 +mispaired_reads: true +# min_pools recommended value: 1/2 number of pools +min_pools: null +min_depth: 30 +max_allele_length: 10 +min_quality: 30 +variant_type: snp # snp, ins, del, complex +min_alternate_observations: 2 + +# visualization/output options +visualize_filters: true +filter_only: false +min_coverage_cutoff: 10 +max_coverage_cutoff: 70 +coverage_cutoff_step: 10 +extract_sequences: true +fst_cutoff: 0.7 + +#fst calculations +popoolation2: false +grenedalf: true +poolfstat: true +missing_zeroes: true +window_size: 1 +window_stride: 0 +window_type: 'single' +window_region: false +window_region_list: null +window_region_skip_empty: false + +# popoolation/poolfstat +min_count: 2 +min_coverage: 10 +max_coverage: 1000 +min_covered_fraction: 1 +suppress_noninformative: false +min_minor_allele_frequency: 0 + +# grenedalf +match_allele_count: true +fst_method: 'kofler' +all_fst_columns: false + +# fisher test +fisher_test: true +fisher_test_popoolation: true diff --git a/docs/README.md b/docs/README.md index 4c14826..38c1025 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ -# nf-core/assesspool: Documentation +# assessPool: Documentation -The nf-core/assesspool documentation is split into the following pages: +The assessPool documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. 
diff --git a/docs/images/logo.png b/docs/images/logo.png new file mode 100644 index 0000000..7768ffc Binary files /dev/null and b/docs/images/logo.png differ diff --git a/docs/output.md b/docs/output.md index bc7ee01..64136e2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,4 +1,4 @@ -# nf-core/assesspool: Output +# assessPool: Output ## Introduction @@ -14,6 +14,81 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +### Index + +
+Output files + +- `index/` + - `*.fai/gzi`: FAI index of FASTA reference genome/assembly. + - `*.tbi`: VCF index. + +
+ +Results from calling `samtools faidx` on the reference assembly and `tabix` on the input VCF. + +### Filtering results + +
+Output files + +- `filter/` + - `incremental/` + - VCF files (`*.vcf.gz`) and their associated index files (`*.tbi`) containing results of stepwise filtering operations. + - `bcftools/` + - Results of cumulative filtering operations performed using `bcftools` (`*.vcf.gz`, `*.tbi`) + - `vcftools/` + - Results of cumulative filtering operations performed using `cftools` (`*.vcf.gz`, `*.tbi`) + +
+ +Results of VCF filtering. Contains both stepwise and and cumulative filter operations. + +### grenedalf sync + +
+Output files + +- `sync/` + - `pairwise/` + - `_-.sync`: Individual (headerless) pairwise sync file. + - `_sync.sync`: Sync file (in grenedalf headered format) for all pools. + +
+ +Sync files generated from input VCF (or copied / filtered from input sync file). + +### Fisher's test results + +
+Output files + +- `fisher/` + - `assesspool/*.tsv`: pairwise Fisher's test results using built-in assessPool method + - `popoolation/*.fisher`: pairwise Fisher's test results using PoPoolation2 + - `combined/.fisher`: All concatenated pairwise Fisher's test results + + +
+ +Results of Fisher's exact test calculations for each selected calculation method, both concatenated and single pairwise comparisons. + +### Fst results + +
+Output files + +- `fst/` + - `grenedalf/*.tsv`: Fst results calculated using `grenedalf`. + - `popoolation/*.fst`: Fst results calculated using `PoPoolation2`. + - `poolfstat/*.tsv`: Fst results calculated using `poolfstat`. + - `.fst`: Concatenated pairwise Fst results for all methods + +
+ +Pairwise Fst results by individual calculation method and concatenated. + + ### Pipeline information
diff --git a/docs/usage.md b/docs/usage.md index f3afd2c..2e5788e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,63 +1,40 @@ -# nf-core/assesspool: Usage - -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/assesspool/usage](https://nf-co.re/assesspool/usage) - -> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ +# assessPool: Usage ## Introduction - - -## Samplesheet input - -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. - -```bash ---input '[path to samplesheet file]' -``` +## Input spreadsheet -### Multiple runs of the same sample +First, prepare an input spreadsheet with your input data that looks as follows. Both csv and tsv formats are supported. The order of columns may be arbitrary, but column names must match those given below. -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +`input.csv`: -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +```csv +project,input,vcf_index,reference,pools,pool_sizes +poolseq_test,data/pools.vcf.gz,data/pools.vcf.gz.tbi,data/ref.fasta,,"35,38,22,52,17,19" ``` -### Full samplesheet +Each row represents a complete pool-seq project or experiment. 
Column descriptions: -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` - -| Column | Description | +| Column | Required? | Description | | --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `project` | required | A brief unique identifier for this pool-seq project. 
| +| `input` | required | Variant description file in sync or VCF format (optionally compressed using `bgzip`) | +| `vcf_index` | optional | TABIX-format index of the input VCF file (generated if not supplied) | +| `reference` | required | Reference assembly (FASTA, optionally compressed using `bgzip`) | +| `pools` | optional | Comma-separated list of pool names (will replace any names in the input file) | +| `pool_sizes` | required | Number of individuals in each pool. Either a single number for uniform pool sizes or a comma-separated list of sizes for each pool. | -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +An [example input sheet](../assets/input.csv) has been provided with the pipeline. + +> [!NOTE] +> assessPool accepts compressed VCF and/or FASTA input, but it must be compressed using `bgzip` rather than `gzip`. ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/assesspool --input ./samplesheet.csv --outdir ./results -profile docker +nextflow run tobodev/assesspool --input ./input.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -67,7 +44,7 @@ Note that the pipeline will create the following files in your working directory ```bash work # Directory containing the nextflow working files # Finished results in specified location (defined with --outdir) -.nextflow_log # Log file from Nextflow +.nextflow.log.* # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. 
``` @@ -81,36 +58,37 @@ Pipeline settings can be provided in a `yaml` or `json` file via `-params-file < The above pipeline run specified with a params file in yaml format: ```bash -nextflow run nf-core/assesspool -profile docker -params-file params.yaml +nextflow run tobodev/assesspool -profile docker -params-file params.yaml ``` with: ```yaml title="params.yaml" -input: './samplesheet.csv' +# params.yaml +input: './input.csv' outdir: './results/' <...> ``` -You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: ```bash -nextflow pull nf-core/assesspool +nextflow pull tobodev/assesspool ``` ### Reproducibility It is a good idea to specify the pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [nf-core/assesspool releases page](https://github.com/nf-core/assesspool/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. +First, go to the [tobodev/assesspool releases page](https://github.com/tobodev/assesspool/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). 
Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. -To further assist in reproducibility, you can use share and reuse [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. +To further assist in reproducibility, you can share and reuse [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. > [!TIP] > If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. @@ -138,7 +116,12 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `test` - A profile with a complete configuration for automated testing - - Includes links to test data so needs no other parameters + - Reduced-size input data for fast execution + - Includes links to test data so needs no other parameters other than optional container/package system (e.g., singularity) +- `test_full` + - A profile with a complete configuration for automated testing + - Full-sized input data + - Includes links to test data so needs no other parameters other than optional container/package system (e.g., singularity) - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/) - `singularity` diff --git a/main.nf b/main.nf index ba59f81..278c271 100644 --- a/main.nf +++ b/main.nf @@ -1,11 +1,9 @@ #!/usr/bin/env nextflow /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/assesspool + assesspool 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/nf-core/assesspool - Website: https://nf-co.re/assesspool - Slack : https://nfcore.slack.com/channels/assesspool + Github : https://github.com/tobodev/assesspool ---------------------------------------------------------------------------------------- */ diff --git a/modules.json b/modules.json index d848745..74e4ad7 100644 --- a/modules.json +++ b/modules.json @@ -1,6 +1,6 @@ { - "name": "nf-core/assesspool", - "homePage": "https://github.com/nf-core/assesspool", + "name": "assessPool", + "homePage": "https://github.com/tobodev/assesspool", "repos": { "https://github.com/nf-core/modules.git": { "modules": { @@ -15,11 +15,6 @@ "git_sha": "0e9cb409c32d3ec4f0d3804588e4778971c09b7e", "installed_by": ["modules"] }, - "bcftools/reheader": { - "branch": "master", - "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] - }, "bcftools/view": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/local/extractsequences/environment.yml b/modules/local/extractsequences/environment.yml new file mode 100644 index 0000000..d7636b5 --- /dev/null +++ b/modules/local/extractsequences/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bioconductor-rsamtools=2.22.0 + - conda-forge::r-optparse=1.7.5 + - conda-forge::r-data.table=1.17.8 diff --git a/modules/local/extractsequences/main.nf b/modules/local/extractsequences/main.nf new file mode 100644 index 0000000..658a206 --- /dev/null +++ b/modules/local/extractsequences/main.nf @@ -0,0 +1,58 @@ +process EXTRACT_SEQUENCES { + tag "$meta.id" + label 'process_single_mem' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-24c2519e6f79268102473627bccca112fa9f03b6:b565a85a493491c260acda69b780ef6ffa1cda87-0': + 'biocontainers/mulled-v2-24c2519e6f79268102473627bccca112fa9f03b6:b565a85a493491c260acda69b780ef6ffa1cda87-0' }" + + input: + tuple val(meta), path(fasta), path(fai) + tuple val(meta), path(fst) + val(fst_cutoff) + + output: + tuple val(meta), path("*.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + extractsequences.R \\ + $args \\ + --fst-cutoff ${fst_cutoff} \\ + --output ${prefix}.fasta \\ + --index ${fai} \\ + $fasta \\ + $fst + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + Rsamtools: \$(Rscript -e "cat(paste(packageVersion('Rsamtools')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + + touch ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + Rsamtools: \$(Rscript -e "cat(paste(packageVersion('Rsamtools')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") + END_VERSIONS + """ +} diff --git a/modules/local/extractsequences/meta.yml b/modules/local/extractsequences/meta.yml new file mode 100644 index 0000000..e628be8 --- /dev/null +++ b/modules/local/extractsequences/meta.yml @@ -0,0 +1,68 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "extractsequences" +## TODO nf-core: Add a description of the module and list keywords +description: write your description here +keywords: + - sort + - example + - genomics +tools: + - "extractsequences": + ## TODO nf-core: Add a description and other details for the software below + description: "" + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: + identifier: + +## TODO nf-core: Add a description of all of the variables used as input +input: + # Only when we have meta + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + ## TODO nf-core: Delete / customise this example input + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + +## TODO nf-core: Add a description of all of the variables used as output +output: + - bam: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + ## TODO nf-core: Delete / customise this example output + - "*.bam": + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mhoban" +maintainers: + - "@mhoban" diff --git a/modules/local/extractsequences/resources/usr/bin/extractsequences.R b/modules/local/extractsequences/resources/usr/bin/extractsequences.R new file mode 100755 index 0000000..d0b2de6 --- /dev/null +++ b/modules/local/extractsequences/resources/usr/bin/extractsequences.R @@ -0,0 +1,97 @@ +#!/usr/bin/env Rscript + +library(optparse) +library(Rsamtools) +library(data.table) + +# help message formatter +nice_formatter <- function(object) { + cat(object@usage, fill = TRUE) + cat(object@description, fill = TRUE) + cat("Options:", sep = "\n") + + options_list <- object@options + for (ii in seq_along(options_list)) { + option <- options_list[[ii]] + cat(" ") + if (!is.na(option@short_flag)) { + cat(option@short_flag) + if (optparse:::option_needs_argument(option)) { + cat(" ", toupper(option@metavar), sep = "") + } + cat(", ") + } + if (!is.null(option@long_flag)) { + cat(option@long_flag) + if (optparse:::option_needs_argument(option)) { + cat("=", toupper(option@metavar), sep = "") + } + } + cat("\n ") + cat(sub("%default", optparse:::as_string(option@default), option@help)) + cat("\n\n") + } + cat(object@epilogue, fill = TRUE) + return(invisible(NULL)) +} + +option_list <- list( + make_option(c("-i", "--index"), action="store", default=NA, type='character', help="FASTA index file"), + make_option(c("-o", "--output"), action="store", default=NA, type='character', help="Output FASTA file"), + make_option(c("-f", "--fst-cutoff"), 
action="store", default=0.7, type='double', help="Fst cutoff value") +) + +# use debug arguments if we have 'em +opt_args <- if (exists("debug_args")) debug_args else commandArgs(TRUE) + +# parse command-line options +opt <- parse_args( + OptionParser( + option_list=option_list, + formatter=nice_formatter, + prog="extractsequences.R", + usage="%prog [options] <fasta> <fst>" + ), + convert_hyphens_to_underscores = TRUE, + positional_arguments = 2, + args = opt_args +) + +index_file <- opt$options$index +output_file <- opt$options$output +fst_cutoff <- opt$options$fst_cutoff + +if (!file.exists(opt$args[1])) { + stop(sprintf("FASTA file %s does not exist.",opt$args[1])) +} +if (!file.exists(opt$args[2])) { + stop(sprintf("Fst file %s does not exist.",opt$args[2])) +} +if (!is.na(index_file) & !file.exists(index_file)) { + stop(sprintf("Index file %s does not exist.",index_file)) +} + +# load indexed fasta file & get sequence lengths +fasta <- FaFile(file=opt$args[1],index=index_file) +lengths <- seqlengths(fasta) + +# use data.table for speed & memory (presumably) +# get sites with "strong" fst and create genomic ranges from them +fst <- fread(opt$args[2])[fst > fst_cutoff, ] +# group by contig and position, and string together calculation methods that got these high Fst values +fst <- fst[,.(methods = paste0(unique(method),collapse=',')), by=list(chrom,pos)][order(chrom,pos)] +# group by just contig and make a string that looks like pos1[methods];pos2[methods];etc. 
+fst <- fst[,.(positions = paste0(pos,"[",methods,"]",collapse=";")), by = chrom] +# pull in sequence lengths and make a 'name' column of the form '<chrom> positions=<positions>' +fst[, `:=`(len = lengths[chrom],name=paste0(chrom,' positions=',positions))] +# create range column +fst[,range := paste0(chrom,':1-',len)] + +# pull sequences from fasta using genomic ranges +ranges <- GRanges(fst$range) +seqs <- getSeq(fasta,ranges) +# assign new names to sequences including strongly differentiated positions +names(seqs) <- fst$name + +# save to fasta +writeXStringSet(seqs,output_file,format="fasta") diff --git a/modules/local/extractsequences/tests/main.nf.test b/modules/local/extractsequences/tests/main.nf.test new file mode 100644 index 0000000..cdebcf8 --- /dev/null +++ b/modules/local/extractsequences/tests/main.nf.test @@ -0,0 +1,73 @@ +// TODO nf-core: Once you have added the required tests, please run the following command to build this file: +// nf-core modules test extractsequences +nextflow_process { + + name "Test Process EXTRACT_SEQUENCES" + script "../main.nf" + process "EXTRACT_SEQUENCES" + + tag "modules" + tag "modules_" + tag "extractsequences" + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used + test("sarscov2 - bam") { + + // TODO nf-core: If you are creating a test for a chained module + // (the module requires running more than one process to generate the required output) + // add the 'setup' method here. + // You can find more information about how to use a 'setup' method in the docs (https://nf-co.re/docs/contributing/modules#steps-for-creating-nf-test-for-chained-modules). + + when { + process { + """ + // TODO nf-core: define inputs of the process here. 
Example: + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + //TODO nf-core: Add all required assertions to verify the test output. + // See https://nf-co.re/docs/contributing/tutorials/nf-test_assertions for more information and examples. + ) + } + + } + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used but keep the " - stub" suffix. + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + // TODO nf-core: define inputs of the process here. Example: + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + //TODO nf-core: Add all required assertions to verify the test output. 
+ ) + } + + } + +} diff --git a/modules/local/fishertest/environment.yml b/modules/local/fishertest/environment.yml index f101652..bbc8b6e 100644 --- a/modules/local/fishertest/environment.yml +++ b/modules/local/fishertest/environment.yml @@ -2,10 +2,5 @@ channels: - conda-forge - bioconda dependencies: - - conda-forge::r-dplyr=1.1.4 - - conda-forge::r-tidyr=1.3.1 - - conda-forge::r-readr=2.1.5 - - conda-forge::r-janitor=2.2.1 - - conda-forge::r-purrr=1.0.4 - conda-forge::r-optparse=1.7.5 - - conda-forge::r-matrixstats=1.5.0 + - conda-forge::r-data.table=1.17.8 diff --git a/modules/local/fishertest/main.nf b/modules/local/fishertest/main.nf index ca3d5cb..510687e 100644 --- a/modules/local/fishertest/main.nf +++ b/modules/local/fishertest/main.nf @@ -4,8 +4,8 @@ process FISHERTEST { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-dcb5179d6045c4df41d4319c99d446681271222e:4856497379063722ac7efef973f14ae775bd88ca-0': - 'biocontainers/mulled-v2-dcb5179d6045c4df41d4319c99d446681271222e:4856497379063722ac7efef973f14ae775bd88ca-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-91e4157032c00e9d9ac47f9837a67abee8f5afc2:7945c7dcdead60dfcbf06b19f4758634d6ad230a-0': + 'biocontainers/mulled-v2-91e4157032c00e9d9ac47f9837a67abee8f5afc2:7945c7dcdead60dfcbf06b19f4758634d6ad230a-0' }" input: tuple val(meta), val(pools), path(frequency) @@ -32,13 +32,8 @@ process FISHERTEST { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - optparse: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - janitor: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('janitor')),sep='.')") - purrr: \$(Rscript -e 
"cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('matrixstats')),sep='.')") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ @@ -51,13 +46,8 @@ process FISHERTEST { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - optparse: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - janitor: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('janitor')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('matrixstats')),sep='.')") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ } diff --git a/modules/local/fishertest/resources/usr/bin/fisher.R b/modules/local/fishertest/resources/usr/bin/fisher.R index b0ce9c1..70106dd 100755 --- a/modules/local/fishertest/resources/usr/bin/fisher.R +++ b/modules/local/fishertest/resources/usr/bin/fisher.R @@ -1,15 +1,7 @@ #!/usr/bin/env Rscript -suppressPackageStartupMessages(library(dplyr)) -suppressPackageStartupMessages(library(tidyr)) -suppressPackageStartupMessages(library(readr)) -suppressPackageStartupMessages(library(janitor)) -suppressPackageStartupMessages(library(purrr)) +suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(optparse)) -suppressPackageStartupMessages(library(matrixStats)) - -options(dplyr.summarise.inform = FALSE) - # help message formatter 
nice_formatter <- function(object) { @@ -59,12 +51,7 @@ gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ pval_callback <- function(opt, flag, val, parser, ...) { - val <- char.expand(val,c("multiply","geometric-mean")) - switch( - val, - 'multiply' = prod, - 'geometric-mean' = gm_mean - ) + char.expand(val,c("multiply","geometric-mean")) } expand_callback <- function(opt, flag, val, parser, args, ...) { @@ -91,7 +78,7 @@ option_list <- list( make_option(c("-C", "--min-count"), action="store", default=2, type='double', help="Minimum Minimal allowed read count per base"), make_option(c("-c", "--min-coverage"), action="store", default=0, type='double', help="Minimum coverage per pool"), make_option(c("-x", "--max-coverage"), action="store", default=1e03, type='double', help="Maximum coverage per pool"), - make_option(c("-a", "--pval-aggregate"), action="callback", default=prod, type='character', help="P-value aggregation method",callback=pval_callback), + make_option(c("-a", "--pval-aggregate"), action="callback", default='multiply', type='character', help="P-value aggregation method",callback=pval_callback), make_option(c("-A", "--all-snps"), action="store_true", default=FALSE, type='logical', help="Save fisher test results for all SNPs, regardless of window size"), make_option(c("-j", "--adjust-pval"), action="store_true", default=FALSE, type='logical', help="Adjust p-value for multiple comparisons"), make_option(c("-J", "--adjust-method"), action="store", default="bonferroni", type='character', help="P-value adjustment method"), @@ -136,7 +123,11 @@ threads <- opt$options$threads thread_lines <- opt$options$lines_per_thread file_prefix <- opt$options$prefix outdir <- opt$options$output_dir -p_combine <- opt$options$pval_aggregate +p_combine <- switch( + opt$options$pval_aggregate, + 'multiply' = prod, + 'geometric-mean' = gm_mean +) save_all <- opt$options$all_snps adjust_p <- opt$options$adjust_pval adjust_method <- opt$options$adjust_method @@ -158,20 
+149,12 @@ if (!file.exists(opt$args[1])) { stop(sprintf("Frequency file %s does not exist.",opt$args[1])) } -# filter frequency table down to just what we'd consider snps -# this filtering should match popoolation/poolfst - -# split(ceiling(seq_along(.)/50)) %>% - -freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) +freq_snps <- fread(opt$args[1]) if (all(is.na(pools))) { - pools <- freq_snps %>% - slice(1) %>% - pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% - pull(pool) %>% - sort() %>% - unique() + pools <- melt(freq_snps[1], measure = measure(pool,measure,pattern = r'[^(.+)\.(REF_CNT)$]'),value.name = 'count')[ + pool != 'TOTAL' + ][order(pool)]$pool } # make sure we're dealing with two pools @@ -181,88 +164,102 @@ if (npool != 2) { stop("Frequency file must contain exactly two pools") } -freq_snp_og <- freq_snps +cols <- names(freq_snps) +pr <- paste0('^(',paste0(pools,collapse='|'),')') + # continue filtering -freq_snps <- freq_snp_og %>% - select(CHROM:ALT,starts_with(paste0(pools,"."),ignore.case = FALSE)) %>% - mutate( - `TOTAL.REF_CNT` = rowSums(pick(ends_with(".REF_CNT",ignore.case = FALSE))), - `TOTAL.ALT_CNT` = rowSums(pick(ends_with(".ALT_CNT",ignore.case = FALSE))), - `TOTAL.DEPTH` = rowSums(pick(ends_with(".DEPTH",ignore.case = FALSE))), - ) %>% - mutate( - lwr = floor((POS-1)/window_step)*window_step+1, - upr = lwr+window_size-1, - middle=floor((upr+lwr)/2) - ) %>% - filter( if_all(ends_with(".DEPTH"),~.x >= min_depth) ) %>% - add_count(CHROM,middle,name="snp_count") %>% - rename_with(make_clean_names,.cols = starts_with("TOTAL.",ignore.case = FALSE)) %>% - filter( - if_all(starts_with("total_",ignore.case = FALSE) & ends_with("_cnt",ignore.case = FALSE),~.x >= min_count), - total_ref_cnt != total_depth, - total_depth >= min_depth - ) %>% - rename_with(CHROM:ALT,.fn=make_clean_names) %>% - 
select(chrom,pos,middle,ref:alt,ends_with(".REF_CNT"),ends_with(".ALT_CNT"),starts_with("total_"),everything()) - -fisher_results <- freq_snps %>% - select(order(colnames(.))) %>% - select( - chrom, pos, middle, snp_count,start=lwr,end=upr, - ends_with(".DEPTH",ignore.case = FALSE), - ends_with(".REF_CNT",ignore.case = FALSE), - ends_with(".ALT_CNT",ignore.case = FALSE) - ) %>% - pivot_longer( - -c(chrom,pos,middle,snp_count,start,end,ends_with(".DEPTH",ignore.case = FALSE)), - names_to = c("pop","measure"), - values_to = "count", - names_pattern = "^(.+)\\.([^.]+)$" - ) %>% - group_by(chrom,pos,middle,start,end,snp_count,across(ends_with(".DEPTH",ignore.case = FALSE))) %>% - summarise(pval = fisher.test(matrix(count,ncol=2))$p.value) %>% - ungroup() %>% - mutate(min_cov = matrixStats::rowMins(as.matrix(pick(ends_with(".DEPTH",ignore.case = FALSE))))) +# try to do as much data.table stuff in-place (using `:=`) as possible to increase memory efficiency + +# first, delete columns we don't care about +freq_snps[,cols[-c(1:4,grep(pr,cols))] := NULL ] +# add a total ref count +freq_snps[,TOTAL.REF_CNT := rowSums(.SD),.SDcols=patterns(r'[\.REF_CNT$]')] +# add a total alt count +freq_snps[,TOTAL.ALT_CNT := rowSums(.SD),.SDcols=patterns(r'[\.ALT_CNT$]')] +# add total depth +freq_snps[,TOTAL.DEPTH := rowSums(.SD),.SDcols=patterns(r'[\.DEPTH$]')] +# add window start +freq_snps[,start := floor((POS-1)/window_step)*window_step+1] +# add window end +freq_snps[,end := start+window_size-1] +# add window center (position) +freq_snps[,middle := floor((end+start)/2)] +# add window snp count +freq_snps[,snp_count := .N, by=.(CHROM,middle)] +# filter to minimum depth +freq_snps <- freq_snps[ freq_snps[,do.call(pmin,.SD) >= min_depth, .SDcols = patterns(r'[\.DEPTH$]')] ] +# filter to minimum count, remove invariant sites, and minimum depth +freq_snps <- freq_snps[ freq_snps[ , do.call(pmin,.SD) >= min_count, .SDcols = patterns(r'[^TOTAL\..+_CNT$]') ] ][ + TOTAL.REF_CNT != TOTAL.DEPTH & 
TOTAL.DEPTH >= min_depth +] +setnames(freq_snps,old=c("CHROM","POS","REF","ALT"),new=tolower) +setcolorder(freq_snps,neworder = c('chrom','pos','middle','ref','alt')) +freq_snps[,grep('total',names(freq_snps),ignore.case = TRUE) := NULL ] +cols <- names(freq_snps) +mv <- c(cols[1:3],'snp_count','start','end',grep(r'[\.DEPTH$]',cols,value=TRUE),grep(r'[_CNT$]',cols,value=TRUE)) +setcolorder(freq_snps,mv) + +# calculate fisher results + +# reshape frequency data and order ref and alt counts appropriately +fisher_results <- melt(freq_snps, measure = measure(pop,measure,pattern = r'[^(.+)\.(.+_CNT)]'),value.name = 'count')[order(chrom,pos,-measure,pop)] + +# get columns to group by +byc <- grep('^(pop|count|measure|ref|alt)$',names(fisher_results),invert = T,value = TRUE) +# calculate fisher tests for each combination +fisher_results <- fisher_results[,.(pval = fisher.test(matrix(count,ncol=2))$p.value),by=byc][order(chrom,pos)] +fisher_results[,avg_min_cov := do.call(pmin,.SD),.SDcols = patterns(r'[\.DEPTH$]')] if (adjust_p) { - fisher_results <- fisher_results %>% - mutate(p_adj = p.adjust(pval,method=adjust_method)) + fisher_results[, pval := p.adjust(pval,method=adjust_method)] } if (save_all & window_size > 1) { ss <- sprintf("%s/%s_%s_all_snps_fisher.tsv",outdir,file_prefix,paste0(pools,collapse="-")) - fisher_results %>% - mutate( - fisher = -log10(pval), - pop1 = pools[1], - pop2 = pools[2], - window_size = 1, - covered_fraction = 1, - start=pos, - end=pos, - ) %>% - select(chrom,pos,snps,covered_fraction,avg_min_cov = min_cov,pop1,pop2,fisher) %>% - write_tsv(ss) + fisher_results[,`:=`( + log_fisher = -log10(pval), + pop1 = pools[1], + pop2 = pools[2], + window_size = 1, + covered_fraction = 1, + start=pos, + end=pos + )] + + fwrite( + fisher_results[,c('chrom','pos','covered_fraction','avg_min_cov','pop1','pop2','fisher'),with=FALSE], + file = ss, + sep="\t" + ) } -fisher_results <- fisher_results %>% - group_by(chrom,middle,start,end,snp_count) %>% - 
summarise( - fisher = -log10(p_combine(pval)), +if (window_type != 'single') { + fisher_results <- fisher_results[,.( + log_fisher = -log10(p_combine(pval)), pop1 = pools[1], pop2 = pools[2], - window_size = n(), + window_size = .N, covered_fraction = unique(snp_count)/window_size, - avg_min_cov = mean(min_cov) - ) %>% - ungroup() %>% - mutate( - pos = floor((start+end)/2), - method = 'assesspool' - ) %>% - select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fisher,method) + avg_min_cov = mean(avg_min_cov) + ), by = .(chrom,middle,start,end,snp_count)] + fisher_results[,pos := floor((start+end)/2)] +} else { + fisher_results[,`:=`( + covered_fraction = 1, + window_size = 1, + pop1 = pools[1], + pop2 = pools[2], + log_fisher = -log10(pval) + )] +} + +fisher_results[,method := 'assesspool'] +cols <- names(fisher_results) +cn <- which(cols %in% c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','log_fisher','method')) +fisher_results[,cols[-cn] := NULL] +setcolorder(fisher_results,c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','log_fisher','method')) + ss <- sprintf("%s/%s_%s_window_%d_%d_fisher.tsv",outdir,file_prefix,paste0(pools,collapse="-"),window_size,window_step) -write_tsv(fisher_results,ss) \ No newline at end of file +fwrite(fisher_results,ss,sep="\t") \ No newline at end of file diff --git a/modules/local/grenedalf/frequency.nf b/modules/local/grenedalf/frequency.nf index 4973c2f..28c451b 100644 --- a/modules/local/grenedalf/frequency.nf +++ b/modules/local/grenedalf/frequency.nf @@ -25,10 +25,6 @@ process GRENEDALF_FREQUENCY { def fasta_arg = fasta ? "--reference-genome-fasta ${fasta}" : "" def fai_arg = fai ? 
"--reference-genome-fai ${fai}" : "" """ - # TODO: grenedalf command here - # // --sync-path ../two_pops.sync - # // --reference-genome-fai ../ref.fasta.fai - # // --file-prefix freq grenedalf frequency \\ --file-prefix "${prefix}_" \\ --threads ${task.cpus} \\ diff --git a/modules/local/grenedalf/sync.nf b/modules/local/grenedalf/sync.nf index b558d42..739f222 100644 --- a/modules/local/grenedalf/sync.nf +++ b/modules/local/grenedalf/sync.nf @@ -9,8 +9,10 @@ process GRENEDALF_SYNC { input: tuple val(meta), path(vcf), path(index) + tuple val(meta), path(sync) tuple val(meta), path(fasta) tuple val(meta), path(fai) + tuple val(meta), path(sample_map) output: tuple val(meta), path("*.sync"), emit: sync @@ -24,14 +26,18 @@ process GRENEDALF_SYNC { def prefix = task.ext.prefix ?: "${meta.id}" def fasta_arg = fasta ? "--reference-genome-fasta ${fasta}" : "" def fai_arg = fai ? "--reference-genome-fai ${fai}" : "" + def input_opt = vcf ? '--vcf-path' : (sync ? '--sync-path' : '') + def input_arg = vcf ?: (sync ?: '') + def remap_arg = sample_map ? 
"--rename-samples-list ${sample_map}" : "" """ grenedalf sync \\ --threads ${task.cpus} \\ ${args} \\ ${fasta_arg} \\ ${fai_arg} \\ - --file-prefix "${prefix}_" \\ - --vcf-path ${vcf} + ${remap_arg} \\ + ${input_opt} ${input_arg} \\ + --file-prefix "${prefix}_" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/joinfreq/environment.yml b/modules/local/joinfreq/environment.yml index 7ff4f81..6a14ca3 100644 --- a/modules/local/joinfreq/environment.yml +++ b/modules/local/joinfreq/environment.yml @@ -2,10 +2,6 @@ channels: - conda-forge - bioconda dependencies: - - conda-forge::r-dplyr=1.1.4 - - conda-forge::r-tidyr=1.3.1 - - conda-forge::r-readr=2.1.5 - - conda-forge::r-purrr=1.0.4 - - conda=forge::r-stringr=1.5.1 - conda-forge::r-optparse=1.7.5 + - conda-forge::r-data.table=1.17.8 diff --git a/modules/local/joinfreq/main.nf b/modules/local/joinfreq/main.nf index 050dd97..de88bb5 100644 --- a/modules/local/joinfreq/main.nf +++ b/modules/local/joinfreq/main.nf @@ -4,8 +4,8 @@ process JOINFREQ { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0': - 'biocontainers/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-91e4157032c00e9d9ac47f9837a67abee8f5afc2:7945c7dcdead60dfcbf06b19f4758634d6ad230a-0': + 'biocontainers/mulled-v2-91e4157032c00e9d9ac47f9837a67abee8f5afc2:7945c7dcdead60dfcbf06b19f4758634d6ad230a-0' }" input: tuple val(meta), path(frequency), path(fst), val(method) @@ -32,12 +32,8 @@ process JOINFREQ { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tidyr: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ @@ -50,12 +46,8 @@ process JOINFREQ { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tidyr: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ } diff --git 
a/modules/local/joinfreq/resources/usr/bin/joinfreq.R b/modules/local/joinfreq/resources/usr/bin/joinfreq.R index 05399e3..014398c 100755 --- a/modules/local/joinfreq/resources/usr/bin/joinfreq.R +++ b/modules/local/joinfreq/resources/usr/bin/joinfreq.R @@ -1,17 +1,10 @@ #!/usr/bin/env Rscript suppressPackageStartupMessages({ - library(dplyr) - library(tidyr) - library(readr) - library(purrr) - library(stringr) library(optparse) + library(data.table) }) -options(dplyr.summarise.inform = FALSE) - - # help message formatter nice_formatter <- function(object) { cat(object@usage, fill = TRUE) @@ -77,7 +70,7 @@ opt <- parse_args( OptionParser( option_list=option_list, formatter=nice_formatter, - prog="fisher.R", + prog="joinfreq.R", usage="%prog [options] " ), convert_hyphens_to_underscores = TRUE, @@ -105,62 +98,73 @@ if (!file.exists(opt$args[2])) { stop(sprintf("Fst file %s does not exist.",opt$args[1])) } -freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) -fst_wide <- read_tsv(opt$args[2],col_types = cols(),progress = FALSE) -if (all(c("start","end") %in% names(fst_wide))) { - fst_wide <- fst_wide %>% - mutate(pos = floor((start+end)/2)) %>% - select(-c(start,end)) %>% - select(chrom,pos,everything()) +freq_snps <- fread(opt$args[1]) +fst <- fread(opt$args[2]) + +if (all(c("start","end") %in% names(fst))) { + fst[, pos := floor((start+end)/2) ] + fst[, c('start','end') := NULL ] + setcolorder(fst,c('chrom','pos')) } # load fst file, pivot long, and filter out NAs -fst <- fst_wide %>% - { - if (calc_method == "grenedalf") { - select(.,chrom:pos,ends_with(".fst",ignore.case = FALSE)) %>% - rename_with(\(n) { - str_match(n,"^(.+):(.+)\\.(fst)$") %>% - array_branch(1) %>% - map_chr(\(x) { - f <- sort(x[2:3]) - str_c(f[1],":",f[2],".",x[4]) - }) - },.cols = -c(chrom:pos)) %>% - pivot_longer(-c(chrom:pos),names_to=c("pop1","pop2",".value"),names_pattern = "^([^:]+):(.+)\\.(fst)$") - } else select(.,chrom,pos,pop1,pop2,fst) - } %>% - 
filter(!is.na(fst)) - -freq_snps <- freq_snps %>% - pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% - select(CHROM,POS,pool,measure,count) %>% - pivot_wider(names_from="measure",values_from = "count") %>% - rename_with(str_to_lower,.cols = c(1:2)) - -# continue filtering -combined <- fst %>% - left_join(freq_snps,by=c("chrom","pos","pop1" = "pool")) %>% - left_join(freq_snps,by=c("chrom","pos","pop2" = "pool"),suffix=c(".pop1",".pop2")) %>% - mutate( - # summarize read depths - TOTAL.REF_CNT = REF_CNT.pop1 + REF_CNT.pop2, - TOTAL.ALT_CNT = ALT_CNT.pop1 + ALT_CNT.pop2, - TOTAL.DEPTH = DEPTH.pop1 + DEPTH.pop2 - ) %>% - filter( - # filter by minimum minimum depth and others - DEPTH.pop1 >= min_depth & DEPTH.pop2 >= min_depth, - TOTAL.ALT_CNT >= min_count & TOTAL.REF_CNT >= min_count, - TOTAL.REF_CNT != TOTAL.DEPTH, - TOTAL.DEPTH >= min_depth - ) %>% - mutate( - avg_min_cov = pmin(DEPTH.pop1, DEPTH.pop2), - method = calc_method - ) %>% - arrange(chrom,pos,pop1,pop2) %>% - rename_with(str_to_lower) +if (calc_method == "grenedalf") { + # delete unused columns + cols <- names(fst) + fst[,c(cols[grep(r'[\.(missing|numeric|passed|masked|empty|invariant)$]',cols)]) := NULL] + + # make sure the pool name pairs are sorted, so that they + # look like a:b.fst and not like b:a.fst + + # first, pull out fst column names and strip off '.fst' + old_cols <- grep(r'[\.fst$]',names(fst),value=TRUE) + fst_cols <- gsub(r'[\.fst$]','',old_cols) + # split them into pairs by colon, sort the pairs, smash them back together, and re-add '.fst' + fst_cols <- paste0(sapply(strsplit(fst_cols,':'),\(x) paste0(sort(x),collapse=':')),'.fst') + # rename the original columns to the sorted version + setnames(fst,old = old_cols, new=fst_cols) + + # now convert to long format + fst <- melt(fst, measure = measure(pop1,pop2,pattern = r'[^([^:]+):(.+)\.fst$]'),value.name = 'fst') +} else { + 
setcolorder(fst,c('chrom','pos','pop1','pop2','fst')) + fst[,c(names(fst)[-c(1:5)]) := NULL] +} +# pivot frequency table longer +freq_snps[,c(grep('^TOTAL',names(freq_snps))) := NULL ] +freq_snps <- melt(freq_snps, measure = measure(pool,measure,pattern = r'[^(.+)\.([A-Z_]+)$]'),value.name = 'count') +freq_snps <- dcast(freq_snps,CHROM+POS+pool ~ measure,value.var = "count")[order(CHROM,POS,pool)] +setnames(freq_snps,1:2,new=tolower) + +combined <- merge(fst,freq_snps,by.x = c('chrom','pos','pop1'),by.y = c('chrom','pos','pool')) +combined <- merge(combined,freq_snps,by.x = c('chrom','pos','pop2'),by.y = c('chrom','pos','pool')) +setnames( + combined, + c('REF_CNT.x','REF_CNT.y','ALT_CNT.x','ALT_CNT.y','DEPTH.x','DEPTH.y'), + c('REF_CNT.pop1','REF_CNT.pop2','ALT_CNT.pop1','ALT_CNT.pop2','DEPTH.pop1','DEPTH.pop2') +) +combined[,`:=`( + TOTAL.REF_CNT = REF_CNT.pop1 + REF_CNT.pop2, + TOTAL.ALT_CNT = ALT_CNT.pop1 + ALT_CNT.pop2, + TOTAL.DEPTH = DEPTH.pop1 + DEPTH.pop2 +)] +combined <- combined[ + DEPTH.pop1 >= min_depth & DEPTH.pop2 >= min_depth & + TOTAL.ALT_CNT >= min_count & TOTAL.REF_CNT >= min_count & + TOTAL.REF_CNT != TOTAL.DEPTH & + TOTAL.DEPTH >= min_depth +][order(chrom,pos,pop1,pop2)] +combined[,`:=`( + avg_min_cov = do.call(pmin,.SD), + method = calc_method +), .SDcols = patterns('^DEPTH\\.')] +setnames(combined,new=tolower) +setcolorder( + combined, + c('chrom','pos','pop1','pop2','fst','avg_min_cov','method','alt_cnt.pop1','alt_cnt.pop2', + 'ref_cnt.pop1','ref_cnt.pop2','depth.pop1','depth.pop2','total.alt_cnt','total.ref_cnt','total.depth') +) + outfile <- sprintf("%s/%s_fst_frequency.tsv",outdir,file_prefix) -write_tsv(combined,outfile) \ No newline at end of file +fwrite(combined,outfile,sep="\t") \ No newline at end of file diff --git a/modules/local/poolfstat/fst/environment.yml b/modules/local/poolfstat/fst/environment.yml index f2ca38b..6f1a026 100644 --- a/modules/local/poolfstat/fst/environment.yml +++ 
b/modules/local/poolfstat/fst/environment.yml @@ -4,7 +4,4 @@ channels: dependencies: - conda=forge::r-poolfstat=3.0.0 - conda=forge::r-optparse=1.7.5 - - conda=forge::r-tibble=3.2.1 - - conda=forge::r-stringr=1.5.1 - - conda=forge::r-readr=2.1.5 - - conda=forge::r-dplyr=1.1.4 + - conda-forge::r-data.table=1.17.8 diff --git a/modules/local/poolfstat/fst/main.nf b/modules/local/poolfstat/fst/main.nf index 79835ab..f4b7622 100644 --- a/modules/local/poolfstat/fst/main.nf +++ b/modules/local/poolfstat/fst/main.nf @@ -4,8 +4,8 @@ process POOLFSTAT_FST { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-5d425a6e26ab032a4dcba5bc997f43ab9299830a:a1958d454debf3df6f397d04d5f094df68682ef9-0': - 'biocontainers/mulled-v2-5d425a6e26ab032a4dcba5bc997f43ab9299830a:a1958d454debf3df6f397d04d5f094df68682ef9-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-618be27569d1bdadeed1044f0388638f6bfca2ae:095239e9e7e75cc2b7273c9a69167d844dd9b634-0': + 'biocontainers/mulled-v2-618be27569d1bdadeed1044f0388638f6bfca2ae:095239e9e7e75cc2b7273c9a69167d844dd9b634-0' }" input: tuple val(meta), val(pool_map), path(sync) @@ -39,11 +39,7 @@ process POOLFSTAT_FST { R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") poolfstat: \$(Rscript -e "cat(paste(packageVersion('poolfstat')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tibble')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ @@ -58,11 
+54,7 @@ process POOLFSTAT_FST { R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") poolfstat: \$(Rscript -e "cat(paste(packageVersion('poolfstat')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tibble')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ } diff --git a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R index 08ef2d1..e484d62 100755 --- a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R +++ b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R @@ -3,10 +3,7 @@ suppressPackageStartupMessages({ library(poolfstat) library(optparse) - library(tibble) - library(stringr) - library(readr) - library(dplyr) + library(data.table) }) @@ -99,33 +96,31 @@ pooldata <- popsync2pooldata( fst <- computeFST(pooldata,sliding.window.size=opt$options$window_size) # get population combo string -fn <- str_c(pooldata@poolnames,collapse='-') +fn <- paste0(pooldata@poolnames,collapse='-') # save global Fst -global_fst <- tibble(pop1=pooldata@poolnames[1],pop2=pooldata@poolnames[2],fst=fst$Fst[1]) -write_tsv(global_fst,str_glue("{opt$options$prefix}_global_{fn}.tsv")) +global_fst <- data.table(pop1=pooldata@poolnames[1],pop2=pooldata@poolnames[2],fst=fst$Fst[1]) +fwrite(global_fst,sprintf("%s_global_%s.tsv",opt$options$prefix,fn),sep="\t") # save sliding window Fst, if they exist if (!is.null(fst$sliding.windows.fvalues)) { - sliding <- as_tibble(fst$sliding.windows.fvalues) %>% - rename(chrom=1,start=2,end=3,mid=4,cum_mid=5,fst=6) - 
write_tsv(sliding,str_glue("{opt$options$prefix}_sliding-{opt$options$window_size}_{fn}.tsv")) + setDT(fst$sliding.windows.fvalues) + setnames(fst$sliding.windows.fvalues,new=c('chrom','start','end','mid','cum_mid','fst')) + fwrite(fst$sliding.windows.fvalues,sprintf("%s_sliding-%d_%s.tsv",opt$options$prefix,opt$options$window_size,fn),sep="\t") } -pools <- str_c(str_c(pooldata@poolnames,collapse=':'),".fst") - -# save per-snp Fst, match popoolation output -snp_fst <- pooldata@snp.info %>% - as_tibble() %>% - select(chrom=1,pos=2) %>% - mutate( - window_size = opt$options$window_size, - covered_fraction = 1, - # depth = rowSums(pooldata@readcoverage), - avg_min_cov = do.call(pmin,as.data.frame(pooldata@readcoverage)), - pop1 = opt$options$pool_names[1], - pop2 = opt$options$pool_names[2], - fst = fst$snp.Fstats$Fst - ) %>% - select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fst) -write_tsv(snp_fst,str_glue("{opt$options$prefix}_{fn}.fst"),col_names=opt$options$headers) \ No newline at end of file +pools <- paste0(paste0(pooldata@poolnames,collapse=':'),".fst") + +setDT(pooldata@snp.info) +cols <- names(pooldata@snp.info) +pooldata@snp.info[,cols[-c(1:2)] := NULL] +setnames(pooldata@snp.info,c('chrom','pos')) +pooldata@snp.info[,`:=`( + window_size = opt$options$window_size, + covered_fraction = 1, + avg_min_cov = do.call(pmin,as.data.table(pooldata@readcoverage)), + pop1 = opt$options$pool_names[1], + pop2 = opt$options$pool_names[2], + fst = fst$snp.Fstats$Fst +)] +fwrite(pooldata@snp.info,sprintf("%s_%s.fst",opt$options$prefix,fn),col.names = opt$options$headers) diff --git a/modules/nf-core/bcftools/reheader/environment.yml b/modules/nf-core/bcftools/reheader/environment.yml deleted file mode 100644 index 5c00b11..0000000 --- a/modules/nf-core/bcftools/reheader/environment.yml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - bioconda::bcftools=1.20 diff --git a/modules/nf-core/bcftools/reheader/main.nf 
b/modules/nf-core/bcftools/reheader/main.nf deleted file mode 100644 index 9cf6d0d..0000000 --- a/modules/nf-core/bcftools/reheader/main.nf +++ /dev/null @@ -1,79 +0,0 @@ -process BCFTOOLS_REHEADER { - tag "$meta.id" - label 'process_low' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0': - 'biocontainers/bcftools:1.20--h8b25389_0' }" - - input: - tuple val(meta), path(vcf), path(header), path(samples) - tuple val(meta2), path(fai) - - output: - tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf - tuple val(meta), path("*.{csi,tbi}") , emit: index, optional: true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def fai_argument = fai ? "--fai $fai" : "" - def header_argument = header ? "--header $header" : "" - def samples_argument = samples ? "--samples $samples" : "" - - def args2 = task.ext.args2 ?: '--output-type z' - def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : - args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : - args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : - args2.contains("--output-type v") || args2.contains("-Ov") ? 
"vcf" : - "vcf" - """ - bcftools \\ - reheader \\ - $fai_argument \\ - $header_argument \\ - $samples_argument \\ - $args \\ - --threads $task.cpus \\ - $vcf \\ - | bcftools view \\ - $args2 \\ - --output ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') - END_VERSIONS - """ - - stub: - def args2 = task.ext.args2 ?: '--output-type z' - def prefix = task.ext.prefix ?: "${meta.id}" - - def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : - args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : - args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : - args2.contains("--output-type v") || args2.contains("-Ov") ? "vcf" : - "vcf" - def index = args2.contains("--write-index=tbi") || args2.contains("-W=tbi") ? "tbi" : - args2.contains("--write-index=csi") || args2.contains("-W=csi") ? "csi" : - args2.contains("--write-index") || args2.contains("-W") ? "csi" : - "" - def create_cmd = extension.endsWith(".gz") ? "echo '' | gzip >" : "touch" - def create_index = extension.endsWith(".gz") && index.matches("csi|tbi") ? "touch ${prefix}.${extension}.${index}" : "" - - """ - ${create_cmd} ${prefix}.${extension} - ${create_index} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/bcftools/reheader/meta.yml b/modules/nf-core/bcftools/reheader/meta.yml deleted file mode 100644 index 47e5344..0000000 --- a/modules/nf-core/bcftools/reheader/meta.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: bcftools_reheader -description: Reheader a VCF file -keywords: - - reheader - - vcf - - update header -tools: - - reheader: - description: | - Modify header of VCF/BCF files, change sample names. 
- homepage: http://samtools.github.io/bcftools/bcftools.html - documentation: http://samtools.github.io/bcftools/bcftools.html#reheader - doi: 10.1093/gigascience/giab008 - licence: ["MIT"] - identifier: biotools:bcftools -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - vcf: - type: file - description: VCF/BCF file - pattern: "*.{vcf.gz,vcf,bcf}" - - header: - type: file - description: New header to add to the VCF - pattern: "*.{header.txt}" - - samples: - type: file - description: File containing sample names to update (one sample per line) - pattern: "*.{samples.txt}" - - - meta2: - type: map - description: | - Groovy Map containing reference information - e.g. [ id:'genome' ] - - fai: - type: file - description: Fasta index to update header sequences with - pattern: "*.{fai}" -output: - - vcf: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - "*.{vcf,vcf.gz,bcf,bcf.gz}": - type: file - description: VCF with updated header, bgzipped per default - pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}" - - index: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - "*.{csi,tbi}": - type: file - description: Index of VCF with updated header - pattern: "*.{csi,tbi}" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@bjohnnyd" - - "@jemten" - - "@ramprasadn" -maintainers: - - "@bjohnnyd" - - "@jemten" - - "@ramprasadn" diff --git a/modules/nf-core/bcftools/reheader/tests/bcf.config b/modules/nf-core/bcftools/reheader/tests/bcf.config deleted file mode 100644 index 2b7dff5..0000000 --- a/modules/nf-core/bcftools/reheader/tests/bcf.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.args2 = { "--no-version --output-type b" } - ext.prefix = "tested" -} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/main.nf.test b/modules/nf-core/bcftools/reheader/tests/main.nf.test deleted file mode 100644 index 96c1b7b..0000000 --- a/modules/nf-core/bcftools/reheader/tests/main.nf.test +++ /dev/null @@ -1,394 +0,0 @@ -nextflow_process { - - name "Test Process BCFTOOLS_REHEADER" - script "../main.nf" - process "BCFTOOLS_REHEADER" - tag "modules" - tag "modules_nfcore" - tag "bcftools" - tag "bcftools/reheader" - - test("sarscov2 - [vcf, [], []], fai - vcf output") { - - config "./vcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output") { - - config "./vcf.gz.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - index") { - - config "./vcf_gz_index.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.vcf, - process.out.index.collect { it.collect { it instanceof Map ? it : file(it).name } }, - process.out.versions - ).match() }, - { assert process.out.index[0][1].endsWith(".csi") } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index") { - - config "./vcf_gz_index_csi.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.vcf, - process.out.index.collect { it.collect { it instanceof Map ? 
it : file(it).name } }, - process.out.versions - ).match() }, - { assert process.out.index[0][1].endsWith(".csi") } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index") { - - config "./vcf_gz_index_tbi.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.vcf, - process.out.index.collect { it.collect { it instanceof Map ? it : file(it).name } }, - process.out.versions - ).match() }, - { assert process.out.index[0][1].endsWith(".tbi") } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - bcf output") { - - config "./bcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, header, []], []") { - - config "./vcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true), - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - [] - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert 
snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], samples], fai") { - - config "./vcf.config" - when { - - process { - """ - ch_no_samples = Channel.of([ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [] - ]) - ch_samples = Channel.of(["samples.txt", "new_name"]) - .collectFile(newLine:true) - input[0] = ch_no_samples.combine(ch_samples) - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - stub") { - - options "-stub" - config "./vcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - file(process.out.vcf[0][1]).name, - process.out.versions, - ).match() } - ) - } - - } - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - index -stub") { - - options "-stub" - config "./vcf_gz_index.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - 
} - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index -stub") { - - options "-stub" - config "./vcf_gz_index_csi.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index -stub") { - - options "-stub" - config "./vcf_gz_index_tbi.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - -} diff --git a/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap b/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap deleted file mode 100644 index 87a3654..0000000 --- a/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap +++ /dev/null @@ -1,469 +0,0 @@ -{ - "sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index": { - "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.tbi" - ] - ], - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T10:09:05.955833763" 
- }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - index -stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:52:41.444952182" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index -stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:53:04.314827944" - }, - "sarscov2 - [vcf, [], []], fai - vcf output": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "1": [ - - ], - "2": [ - 
"versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:50:41.983008108" - }, - "sarscov2 - [vcf, [], []], fai - bcf output": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.bcf.gz:md5,c8a304c8d2892039201154153c8cd536" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.bcf.gz:md5,c8a304c8d2892039201154153c8cd536" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:51:43.072513252" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:50:53.055630152" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - index": { - "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi" - ] - ], - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - 
"meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T10:08:37.999924355" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index -stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:52:52.512269206" - }, - "sarscov2 - [vcf, [], []], fai - stub": { - "content": [ - "tested.vcf", - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-05-31T15:16:36.337112514" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index": { - "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi" - ] - ], - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T10:08:55.434831174" - }, - "sarscov2 - [vcf, [], samples], fai": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,c64c373c10b0be24b29d6f18708ec1e8" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - 
], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,c64c373c10b0be24b29d6f18708ec1e8" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:52:12.216002665" - }, - "sarscov2 - [vcf, header, []], []": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,3189bc9a720d5d5d3006bf72d91300cb" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,3189bc9a720d5d5d3006bf72d91300cb" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:51:54.062386022" - } -} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/tags.yml b/modules/nf-core/bcftools/reheader/tests/tags.yml deleted file mode 100644 index c252941..0000000 --- a/modules/nf-core/bcftools/reheader/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -bcftools/reheader: - - modules/nf-core/bcftools/reheader/** diff --git a/modules/nf-core/bcftools/reheader/tests/vcf.config b/modules/nf-core/bcftools/reheader/tests/vcf.config deleted file mode 100644 index 820f2ae..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.args2 = { "--no-version" } - ext.prefix = "tested" -} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/vcf.gz.config b/modules/nf-core/bcftools/reheader/tests/vcf.gz.config deleted file mode 100644 index c3031c3..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf.gz.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.args2 = { "--no-version --output-type z" } - ext.prefix = "tested" -} \ No newline at end of file diff 
--git a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index.config b/modules/nf-core/bcftools/reheader/tests/vcf_gz_index.config deleted file mode 100644 index 1e050ec..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.prefix = { "${meta.id}_vcf" } - ext.args2 = "--output-type z --write-index --no-version" -} diff --git a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_csi.config b/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_csi.config deleted file mode 100644 index 536e4b4..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_csi.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.prefix = { "${meta.id}_vcf" } - ext.args2 = "--output-type z --write-index=csi --no-version" -} diff --git a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_tbi.config b/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_tbi.config deleted file mode 100644 index 91a80db..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_tbi.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - ext.prefix = { "${meta.id}_vcf" } - ext.args2 = "--output-type z --write-index=tbi --no-version" - -} diff --git a/nextflow.config b/nextflow.config index 6441ffa..d06e4e4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/assesspool Nextflow config file + assessPool Nextflow config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Default config options for all compute environments ---------------------------------------------------------------------------------------- @@ -9,7 +9,6 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options input = null @@ -46,16 +45,18 @@ params { visualize_filters = true filter_only = false - // report options - 
report_min_coverage = 10 - report_max_coverage = 70 - report_coverage_step = 10 + // report/output options + min_coverage_cutoff = 10 + max_coverage_cutoff = 70 + coverage_cutoff_step = 10 + extract_sequences = false // fst options popoolation2 = false grenedalf = false poolfstat = false missing_zeroes = false + fst_cutoff = 0.7 // popoolation/poolfstat min_count = 2 min_coverage = 10 @@ -81,7 +82,7 @@ params { // Boilerplate options - outdir = null + outdir = 'output' publish_dir_mode = 'symlink' email = null email_on_fail = null @@ -92,7 +93,7 @@ params { help_full = false show_hidden = false version = false - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' + pipelines_testdata_base_path = 'https://zenodo.org/records/19058481/' trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')// Config options config_profile_name = null config_profile_description = null @@ -232,12 +233,11 @@ profiles { // Load nf-core custom profiles from different institutions // If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. -// Load nf-core/assesspool custom profiles from different institutions. +// Load assesspool custom profiles from different institutions. includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" -// Load nf-core/assesspool custom profiles from different institutions. -// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs +// Load assesspool custom profiles from different institutions. // includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/pipeline/assesspool.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile @@ -298,10 +298,9 @@ dag { } manifest { - name = 'nf-core/assesspool' + name = 'assessPool' author = """Evan B Freel, Emily E Conklin, Mykle L Hoban, Derek W Kraft, Jonathan L Whitney, Ingrid SS Knapp, Zac H Forsman, Robert J Toonen""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead contributors = [ - // TODO nf-core: Update the field with the details of the contributors to your pipeline. New with Nextflow version 24.10.0 [ name: 'Evan B Freel', affiliation: '', @@ -367,8 +366,8 @@ manifest { orcid: '' ], ] - homePage = 'https://github.com/nf-core/assesspool' - description = """nextflow-enabled version of assessPool radseq processing pipeline""" + homePage = 'https://github.com/tobodev/assesspool' + description = """nextflow-enabled version of assessPool pooled RADseq processing pipeline""" mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=24.04.2' @@ -386,7 +385,7 @@ validation { monochromeLogs = params.monochrome_logs help { enabled = true - command = "nextflow run nf-core/assesspool -profile --input samplesheet.csv --outdir " + command = "nextflow run tobodev/assesspool -profile --input samplesheet.csv --outdir " fullParameter = "help_full" showHiddenParameter = "show_hidden" beforeText = """ @@ -396,7 +395,7 @@ validation { \033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m \033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m \033[0;32m`._,._,\'\033[0m -\033[0;35m nf-core/assesspool ${manifest.version}\033[0m +\033[0;35m assessPool ${manifest.version}\033[0m -\033[2m----------------------------------------------------\033[0m- """ afterText = """${manifest.doi ? 
"\n* The pipeline\n" : ""}${manifest.doi.tokenize(",").collect { " https://doi.org/${it.trim().replace('https://doi.org/','')}"}.join("\n")}${manifest.doi ? "\n" : ""} @@ -404,7 +403,7 @@ validation { https://doi.org/10.1038/s41587-020-0439-x * Software dependencies - https://github.com/nf-core/assesspool/blob/main/CITATIONS.md + https://github.com/tobodev/assesspool/blob/main/CITATIONS.md """ } summary { diff --git a/nextflow_schema.json b/nextflow_schema.json index e350264..7b54eea 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/assesspool/main/nextflow_schema.json", - "title": "nf-core/assesspool pipeline parameters", + "$id": "https://raw.githubusercontent.com/tobodev/assesspool/main/nextflow_schema.json", + "title": "assessPool pipeline parameters", "description": "nextflow-enabled version of assessPool radseq processing pipeline", "type": "object", "$defs": { @@ -19,7 +19,7 @@ "schema": "assets/schema_input.json", "mimetype": "text/csv", "pattern": "^\\S+\\.(csv|tsv)$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", + "description": "Path to comma/tab-separated file containing information about pools..", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/assesspool/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, @@ -147,7 +147,7 @@ "type": "string", "fa_icon": "far fa-check-circle", "description": "Base URL or local path to location of pipeline test dataset files", - "default": "https://raw.githubusercontent.com/nf-core/test-datasets/", + "default": "https://zenodo.org/records/19058481/", "hidden": true }, "trace_report_suffix": { @@ -201,7 +201,7 @@ "description": "p-value threshold for Hardy-Weinberg equlibrium test." }, "thin_snps": { - "type": "boolean", + "type": "integer", "description": "Thin sites so that no two sites are within the specified distance from one another." }, "min_mapping_quality": { @@ -287,6 +287,11 @@ "type": "boolean", "description": "Calculate Fst using `poolfstat`" }, + "fst_cutoff": { + "type": "number", + "default": 0.7, + "description": "Threshold at which Fst values are considered \"strongly differentiated.\"" + }, "match_allele_count": { "type": "boolean", "description": "Only keep sites where with matching allele counts across all pools (if something goes wrong early on, try this option)." @@ -378,20 +383,24 @@ "description": "", "default": "", "properties": { - "report_min_coverage": { + "min_coverage_cutoff": { "type": "integer", "default": 10, "description": "Minimum coverage cutoff for report visualizations." }, - "report_max_coverage": { + "max_coverage_cutoff": { "type": "integer", "default": 70, "description": "Maximum coverage cutoff for report visualizations." }, - "report_coverage_step": { + "coverage_cutoff_step": { "type": "integer", "default": 10, "description": "Report coverage cutoff step." + }, + "extract_sequences": { + "type": "boolean", + "description": "Extract reference contigs with sites having Fst > fst_cutoff (for any pairwise comparison)." 
} } } diff --git a/subworkflows/local/filter.nf b/subworkflows/local/filter.nf index 3e010da..896dfc3 100644 --- a/subworkflows/local/filter.nf +++ b/subworkflows/local/filter.nf @@ -6,12 +6,20 @@ include { BCFTOOLS_VIEW as BCFTOOLS_COMPRESS_INDEX_FILTERED } from '../../module workflow FILTER { take: - ch_vcf // channel: [ val(meta), [ vcf ] ] + ch_input // channel: [ val(meta), [ vcf ] ] main: ch_versions = Channel.empty() + ch_input = ch_input + .branch { meta, file, index -> + vcf: meta.format == 'vcf' + sync: meta.format == 'sync' + } + + ch_vcf = ch_input.vcf + // decide whether to run bcftools filter def bcff = params.match_allele_count || ( params.filter && ( params.min_mapping_quality || params.min_mapping_quality_ref || @@ -26,10 +34,8 @@ workflow FILTER { .map{ meta, vcf -> [ meta.id, meta, vcf ] } .join( BCFTOOLS_FILTER.out.tbi.map{ meta, index -> [ meta.id, index ] } ) .map{ id, meta, vcf, index -> [ meta, vcf, index ] } - .set{ ch_vcf2 } + .set{ ch_vcf } ch_versions = ch_versions.mix(BCFTOOLS_FILTER.out.versions.first()) - } else { - ch_vcf2 = ch_vcf } // decide whether to run vcftools filter @@ -37,42 +43,39 @@ workflow FILTER { params.max_mean_depth || params.hwe_cutoff ) if (vcft) { - VCFTOOLS_FILTER ( ch_vcf2.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) + VCFTOOLS_FILTER ( ch_vcf.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) ch_versions = ch_versions.mix(VCFTOOLS_FILTER.out.versions.first()) // TODO: re-compress and index output VCFTOOLS_FILTER.out.vcf .map{ meta, vcf -> [ meta, vcf, [] ] } - .set{ ch_vcf3 } - } else { - ch_vcf3 = ch_vcf2 + .set{ ch_vcf } } if (params.thin_snps) { - THIN_SNPS( ch_vcf3.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) + THIN_SNPS( ch_vcf.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) ch_versions = ch_versions.mix(THIN_SNPS.out.versions.first()) THIN_SNPS.out.vcf .map{ meta, vcf -> [ meta, vcf, [] ] } - .set{ ch_vcf4 } - } else { - ch_vcf4 = ch_vcf3 + .set{ ch_vcf } } if (vcft || params.thin_snps) { 
// re-index and compress if vcftools was run - BCFTOOLS_COMPRESS_INDEX_FILTERED( ch_vcf4, [], [], [] ).vcf + BCFTOOLS_COMPRESS_INDEX_FILTERED( ch_vcf, [], [], [] ).vcf .map{ meta,vcf -> [ meta.id, meta, vcf ] } .join( BCFTOOLS_COMPRESS_INDEX_FILTERED.out.tbi.map{ meta,index -> [ meta.id, index ] } ) .map{ id, meta, vcf, index -> [ meta, vcf, index ] } - .set{ ch_vcf_final } + .set{ ch_vcf } ch_versions = ch_versions.mix(BCFTOOLS_COMPRESS_INDEX_FILTERED.out.versions.first()) - } else { - ch_vcf_final = ch_vcf4 } + ch_vcf.mix( ch_input.sync ).set{ ch_output } + + // TODO: visualize filter results? (rmd section 'filter_visualization') emit: - vcf = ch_vcf_final + output = ch_output versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/filter_sim/meta.yml b/subworkflows/local/filter_sim/meta.yml deleted file mode 100644 index 35caf11..0000000 --- a/subworkflows/local/filter_sim/meta.yml +++ /dev/null @@ -1,51 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json -name: "filter_sim" -## TODO nf-core: Add a description of the subworkflow and list keywords -description: Sort SAM/BAM/CRAM file -keywords: - - sort - - bam - - sam - - cram -## TODO nf-core: Add a list of the modules and/or subworkflows used in the subworkflow -components: - - samtools/sort - - samtools/index -## TODO nf-core: List all of the channels used as input with a description and their structure -input: - - ch_bam: - type: file - description: | - The input channel containing the BAM/CRAM/SAM files - Structure: [ val(meta), path(bam) ] - pattern: "*.{bam/cram/sam}" -## TODO nf-core: List all of the channels used as output with a descriptions and their structure -output: - - bam: - type: file - description: | - Channel containing BAM files - Structure: [ val(meta), path(bam) ] - pattern: "*.bam" - - bai: - type: file - description: | - Channel containing indexed BAM (BAI) files - Structure: [ 
val(meta), path(bai) ] - pattern: "*.bai" - - csi: - type: file - description: | - Channel containing CSI files - Structure: [ val(meta), path(csi) ] - pattern: "*.csi" - - versions: - type: file - description: | - File containing software versions - Structure: [ path(versions.yml) ] - pattern: "versions.yml" -authors: - - "@mhoban" -maintainers: - - "@mhoban" diff --git a/subworkflows/local/filter_sim/main.nf b/subworkflows/local/filter_sim_vcf/main.nf similarity index 96% rename from subworkflows/local/filter_sim/main.nf rename to subworkflows/local/filter_sim_vcf/main.nf index b8671fa..72f8a7b 100644 --- a/subworkflows/local/filter_sim/main.nf +++ b/subworkflows/local/filter_sim_vcf/main.nf @@ -19,7 +19,7 @@ include { RIPGREP as COUNT_SNPS } from '../../../modules/nf include { BCFTOOLS_VIEW as BCFTOOLS_COMPRESS_INDEX_FILTERED } from '../../../modules/nf-core/bcftools/view/main' -workflow FILTER_SIM { +workflow FILTER_SIM_VCF { take: ch_vcf @@ -179,11 +179,7 @@ workflow FILTER_SIM { // turn all the count maps into tsv files describing filtering results ch_filter_summary = ch_filter_summary - .map{ meta, count -> meta.subMap('id') } - .unique() - .map{ meta -> [ meta, [ filter: 'filter', count: 'count' ] ] } - .concat( ch_filter_summary ) - .collectFile(newLine: true, sort: false) { meta, filter -> [ "${meta.id}.filter", "${filter.filter}\t${filter.count}" ] } + .collectFile(newLine: true, sort: true) { meta, filter -> [ "${meta.id}.filter", "${filter.filter}\t${filter.count}" ] } .map{ [it.baseName, it] } // join back to meta tag diff --git a/subworkflows/local/filter_sim_vcf/meta.yml b/subworkflows/local/filter_sim_vcf/meta.yml new file mode 100644 index 0000000..19e78a4 --- /dev/null +++ b/subworkflows/local/filter_sim_vcf/meta.yml @@ -0,0 +1,34 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "filter_sim" +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - 
cram +components: + - vcftools + - bcftools/filter + - ripgrep +input: + - ch_vcf: + type: file + description: | + Input VCF file on which to perform stepwise filtering + pattern: "*.{vcf/vcf.gz}" +output: + - filter_summary: + type: file + description: | + Tabular data showing SNPs remaining after various filtering steps + pattern: "*.filter" + - versions: + type: file + description: | + File containing software versions + Structure: [ path(versions.yml) ] + pattern: "versions.yml" +authors: + - "@mhoban" +maintainers: + - "@mhoban" diff --git a/subworkflows/local/filter_sim/tests/main.nf.test b/subworkflows/local/filter_sim_vcf/tests/main.nf.test similarity index 95% rename from subworkflows/local/filter_sim/tests/main.nf.test rename to subworkflows/local/filter_sim_vcf/tests/main.nf.test index e5956fb..5f41de6 100644 --- a/subworkflows/local/filter_sim/tests/main.nf.test +++ b/subworkflows/local/filter_sim_vcf/tests/main.nf.test @@ -2,9 +2,9 @@ // nf-core subworkflows test filter_sim nextflow_workflow { - name "Test Subworkflow FILTER_SIM" + name "Test Subworkflow FILTER_SIM_VCF" script "../main.nf" - workflow "FILTER_SIM" + workflow "FILTER_SIM_VCF" tag "subworkflows" tag "subworkflows_" diff --git a/subworkflows/local/poolstats.nf b/subworkflows/local/poolstats.nf index 2d714e4..8e159a6 100644 --- a/subworkflows/local/poolstats.nf +++ b/subworkflows/local/poolstats.nf @@ -13,7 +13,7 @@ include { combn } from "../../modules/local/popoolation2/combn workflow POOLSTATS { take: - ch_vcf // channel: [ val(meta), path(vcf), path(index) ] + ch_input // channel: [ val(meta), path(vcf), path(index) ] ch_ref // channel: path(ref) main: @@ -25,34 +25,61 @@ workflow POOLSTATS { // prepare vcf channel for input to grenedalf sync // (join fasta reference index) - ch_vcf - .map{ meta, vcf, index -> [ meta.id, meta, vcf, index ] } + ch_input + .map{ meta, file, index -> [ meta.id, meta, file, index ] } .join( ch_ref.map{ meta, ref, fai -> [ meta.id, ref, fai ] } ) - .map{ id, 
meta, vcf, index, ref, fai -> [ meta, vcf, index, ref, fai ] } + .map{ id, meta, file, index, ref, fai -> [ meta, file, index, ref, fai ] } .set{ ch_prep } - // create sync file using grenedalf + ch_prep + .map{ meta, file, index, ref, fai -> [ meta, meta.rename ? meta.pool_map : [:] ] } + // .filter { it[0].rename } + .collectFile(sort: true) { meta, pool_map -> [ "${meta.id}.map", pool_map ? pool_map.collect{k,v -> "${k}\t${v}"}.join("\n") : "" ] } + .map{ [ it.baseName, it ]} + .set{ ch_samplemap } + + ch_prep + .map{ [ it[0].id ] + it } + .join( ch_samplemap ) + .map{ id, meta, file, index, ref, fai, pool_map -> + [meta, file, index, ref, fai, pool_map.size() ? pool_map : [] ] + } + .set{ ch_prep } + + + // create/modify sync files using grenedalf GRENEDALF_SYNC( - ch_prep.map{ meta, vcf, index, ref, fai -> [ meta, vcf, index ] }, - ch_prep.map{ meta, vcf, index, ref, fai -> [ meta, [] ] }, - ch_prep.map{ meta, vcf, index, ref, fai -> [ meta, fai ] } + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, meta.format == 'vcf' ? file : [], index ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, meta.format == 'sync' ? 
file : [] ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, [] ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, fai ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, map ] } ) ch_versions = ch_versions.mix(GRENEDALF_SYNC.out.versions.first()) - // prepare input channel to split sync file into pairwise comparisons - GRENEDALF_SYNC.out.sync - .map{ meta, sync -> [ meta, sync, combn(meta.pools.keySet() as String[],2) ] } - .transpose() - .map{ meta, sync, pair -> - [ [ id: meta.id, pools: meta.pools.subMap(pair).sort{ it.key } ], sync ] - } - .set{ ch_split_sync } + // prepare and split input channel into pairwise comparisons + if (params.popoolation2 || params.poolfstat || params.fisher_test_popoolation) { + GRENEDALF_SYNC.out.sync + .map{ meta, sync -> [ meta, sync, combn(meta.pools.keySet() as String[],2) ] } + .transpose() + .map{ meta, sync, pair -> + // println meta.pools.subMap(pair).sort{ it.key }.values().toArray().getClass() + [ + meta + [ + pools: meta.pools.subMap(pair).sort{ it.key }, + pool_map: meta.pool_map.findAll{ it.value as String in pair as String[] }.sort{ it.key } + ], + sync + ] + } + .set{ ch_split_sync } - // pairwise split sync file - SPLIT_SYNC( ch_split_sync, [] ).output - .map{ meta, sync -> [ meta, meta.pools, sync ] } - .set{ ch_split_sync } - ch_versions = ch_versions.mix(SPLIT_SYNC.out.versions.first()) + // pairwise split sync file + SPLIT_SYNC( ch_split_sync, [] ).output + .map{ meta, sync -> [ meta, meta.pools, sync ] } + .set{ ch_split_sync } + ch_versions = ch_versions.mix(SPLIT_SYNC.out.versions.first()) + } // prepare input for frequency calculation GRENEDALF_SYNC.out.sync @@ -76,8 +103,6 @@ workflow POOLSTATS { ch_versions = ch_versions.mix(POPOOLATION2_FST.out.versions.first()) ch_versions = ch_versions.mix(REHEADER_FST.out.versions.first()) - // mix fst results - // ch_fst = ch_fst.mix( POPOOLATION2_FST.out.fst.map { meta, fst -> [ meta, fst, "popoolation" ] } ) ch_fst = ch_fst.mix( 
REHEADER_FST.out.output.map { meta, fst -> [ meta, fst, "popoolation" ] } ) } @@ -92,7 +117,7 @@ workflow POOLSTATS { // run grenedalf fst if requested if (params.grenedalf) { - // create pool size map + // create pool size map file GRENEDALF_SYNC.out.sync .collectFile { meta, sync -> [ "${meta.id}.ps", meta.pools.collect{ k, v -> "${k}\t${v}"}.join("\n") ] @@ -129,17 +154,17 @@ workflow POOLSTATS { // join pairwise fst results to snp frequencies and concatenate into one big file per input ch_join = JOINFREQ( ch_join ).fst_freq - .collectFile( keepHeader: true, sort: false, storeDir: 'output/fst' ){ meta, joined -> [ "${meta.id}.fst", joined ] } + .collectFile( keepHeader: true, sort: true, storeDir: 'output/fst', cache: true ){ meta, joined -> [ "${meta.id}.fst", joined ] } .map{ [it.baseName, it ] } + // rejoin meta tags - ch_join = ch_vcf + ch_join = ch_input .map{ it[0] } .unique() .map{ [ it.id, it ] } .join( ch_join ) .map{ id, meta, f -> [ meta, f ] } - ch_versions = ch_versions.mix(JOINFREQ.out.versions.first()) // run fisher tests if requested @@ -149,7 +174,14 @@ workflow POOLSTATS { .map{ meta, freq -> [ meta, freq, combn(meta.pools.keySet() as String[],2) ] } .transpose() .map{ meta, freq, pair -> - [ [ id: meta.id, pools: meta.pools.subMap(pair).sort{ it.key } ], pair.sort(), freq ] + [ + meta + [ + pools: meta.pools.subMap(pair).sort{ it.key }, + pool_map: meta.pool_map.findAll{ it.value as String in pair as String[] }.sort{ it.key } + ], + pair.sort(), + freq + ] } .set{ ch_split_freq } @@ -159,6 +191,7 @@ workflow POOLSTATS { // mix fisher results & versions ch_fisher = ch_fisher.mix( FISHERTEST.out.fisher ) ch_versions = ch_versions.mix(FISHERTEST.out.versions.first()) + } // run fisher tests with popoolation2 diff --git a/subworkflows/local/postprocess.nf b/subworkflows/local/postprocess.nf index ec12ef9..511eb6d 100644 --- a/subworkflows/local/postprocess.nf +++ b/subworkflows/local/postprocess.nf @@ -1,10 +1,11 @@ -include { RMARKDOWNNOTEBOOK as 
CREATE_REPORT } from '../../modules/nf-core/rmarkdownnotebook/main' -include { RIPGREP as COUNT_SNPS_FINAL } from '../../modules/nf-core/ripgrep/main' +include { RMARKDOWNNOTEBOOK as CREATE_REPORT } from '../../modules/nf-core/rmarkdownnotebook/main' +include { RIPGREP as COUNT_SNPS_FINAL } from '../../modules/nf-core/ripgrep/main' +include { EXTRACT_SEQUENCES } from '../../modules/local/extractsequences/main' workflow POSTPROCESS { take: - ch_vcf + ch_input ch_unfiltered ch_fst ch_ref @@ -18,21 +19,29 @@ workflow POSTPROCESS { ch_versions = Channel.empty() + // extract reference contigs with "strongly differentiated" snps + if (params.extract_sequences) { + EXTRACT_SEQUENCES( ch_ref, ch_fst, params.fst_cutoff ) + ch_versions = ch_versions.mix(EXTRACT_SEQUENCES.out.versions.first()) + } + + // collapse pairwise fisher tests into single files ch_fisher_collapsed = ch_fisher - .collectFile( keepHeader: true, sort: false, storeDir: 'output/fishertest' ){ meta, fish -> [ "${meta.id}.fisher", fish ] } + .collectFile( keepHeader: true, sort: true, storeDir: 'output/fisher/combined' ){ meta, fish -> [ "${meta.id}.fisher", fish ] } .map{ [ it.baseName, it ] } + // join them back to meta tags - ch_fisher_collapsed = ch_vcf + ch_fisher_collapsed = ch_input .map{ it[0] } .unique() .map{ [ it.id, it ] } .join( ch_fisher_collapsed ) .map{ id, meta, f -> [ meta, f ] } - ch_count = ch_vcf - .map{ meta, vcf, index -> [ meta + [filter: 'cumulative'], vcf ] } - .mix( ch_unfiltered.map{ meta, vcf, index -> [ meta + [filter: 'before'], vcf ] } ) + ch_count = ch_input + .map{ meta, input, index -> [ meta + [filter: 'cumulative'], input ] } + .mix( ch_unfiltered.map{ meta, input, index -> [ meta + [filter: 'before'], input ] } ) // count final filtered SNPs into map COUNT_SNPS_FINAL( ch_count, '^#', false ) @@ -40,14 +49,11 @@ workflow POSTPROCESS { // collect final filter summary into tsv files ch_filter_final = ch_filter_final - .map{ meta, count -> meta.subMap('id') } - .unique() - 
.map{ meta -> [ meta, [ filter: 'filter', count: 'count' ] ] } - .concat( ch_filter_final ) - .collectFile(newLine: true, sort: false) { meta, filter -> [ "${meta.id}.final_filter", "${filter.filter}\t${filter.count}" ] } + .collectFile(newLine: true, sort: true ) { meta, filter -> [ "${meta.id}.final_filter", "${filter.filter}\t${filter.count}" ] } .map{ [ it.baseName, it ] } + // join them back to the meta tags - ch_filter_final = ch_vcf + ch_filter_final = ch_input .map{ it[0] } .unique() .map{ [ it.id, it ] } @@ -55,9 +61,9 @@ workflow POSTPROCESS { .map{ id, meta, f -> [ meta, f ] } // build rmarkdown report input and params - def file_keys = [ 'fst', 'fisher', 'filter', 'final_filter' ] + // get report file channel as [ meta, reportfile ] - ch_report = ch_vcf.map { [ it[0], file("${projectDir}/assets/assesspool_report.Rmd") ] } + ch_report = ch_input.map { [ it[0].id, it[0], file("${projectDir}/assets/assesspool_report.Rmd") ] } // generate input files channel ch_input_files = ch_fst.ifEmpty{ [] } @@ -65,20 +71,43 @@ workflow POSTPROCESS { .mix( ch_filter.ifEmpty{ [] } ) .mix( ch_filter_final.ifEmpty{ [] } ) .groupTuple() - // .collect() + .map{ [it[0].id] + it[1..-1] } + + + // subset the params object because there's at least one value that changes + // every time, which invalidates the caching + nf_params = [ + 'coverage_cutoff_step', + 'max_coverage_cutoff', + 'min_coverage_cutoff', + 'visualize_filters' + ] ch_params = ch_input_files. 
map{ meta, files -> [ meta, files.collect{ [ it.extension, it.name ] }.collectEntries() ] } .map{ meta, p -> [ meta, p + [ - nf: params, + nf: params.subMap(nf_params), tz: TimeZone.getDefault().getID() ]]} - CREATE_REPORT( ch_report, ch_params.map{meta, p -> p}, ch_input_files.map{meta, f -> f} ) + // join everything together + ch_report + .join( ch_params ) + .join( ch_input_files ) + .map { it[1..-1] } + .set{ ch_report } - emit: - // // TODO nf-core: edit emitted channels + // tuple val(meta), path(notebook) + // val parameters + // path input_files + CREATE_REPORT( + ch_report.map{ meta, report, params, files -> [meta, report] }, + ch_report.map{ meta, report, params, files -> params }, + ch_report.map{ meta, report, params, files -> files } + ) + ch_versions = ch_versions.mix(CREATE_REPORT.out.versions.first()) + emit: versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/preprocess.nf b/subworkflows/local/preprocess.nf index bb19ca8..7f84061 100644 --- a/subworkflows/local/preprocess.nf +++ b/subworkflows/local/preprocess.nf @@ -1,9 +1,19 @@ include { BCFTOOLS_QUERY as VCF_SAMPLES } from '../../modules/nf-core/bcftools/query/main' -include { BCFTOOLS_REHEADER as VCF_RENAME } from '../../modules/nf-core/bcftools/reheader/main' include { BCFTOOLS_VIEW as COMPRESS_VCF } from '../../modules/nf-core/bcftools/view/main' include { TABIX_TABIX as INDEX_VCF } from '../../modules/nf-core/tabix/tabix/main' include { SAMTOOLS_FAIDX as INDEX_REFERENCE } from '../../modules/nf-core/samtools/faidx/main' +// read the first line from a gzip'd file +def gz_head(Path file) { + new java.io.BufferedReader( + new java.io.InputStreamReader( + new java.util.zip.GZIPInputStream(new java.io.FileInputStream(file.toFile())) + ) + ).withCloseable { reader -> + return reader.readLine() + } +} + workflow PREPROCESS { take: @@ -15,7 +25,7 @@ workflow PREPROCESS { // save ref genome channel ch_samplesheet - .map { meta, vcf, index, ref -> [ meta, ref ] } + 
.map { meta, input, index, ref -> [ meta, ref ] } .set { ch_ref } // generate fasta ref index from reference fasta INDEX_REFERENCE( ch_ref, ch_ref.map{ meta, ref -> [ meta, [] ] } ) @@ -26,68 +36,75 @@ workflow PREPROCESS { .map{ id, meta, fasta, fai -> [ meta, fasta, fai ] } .set{ ch_ref } - // save vcf channel + // save input channel ch_samplesheet - .map { meta, vcf, index, ref -> [ meta, vcf, index ] } - .set { ch_vcf } + .map { meta, input, index, ref -> [ meta, input, index ] } + .set { ch_input } + // branch inputs into sync and/or vcf + ch_input + .branch { meta, input, index -> + vcf: input.toString() =~ /(?i)\.vcf(\.gz)?$/ + sync: input.toString() =~ /(?i)\.sync(\.gz)?$/ + } + .set{ ch_input } + + // preprocess VCF input(s) // Get VCF sample names from header regardless of whether we're renaming them - VCF_SAMPLES(ch_vcf,[],[],[]).output + VCF_SAMPLES(ch_input.vcf,[],[],[]).output .splitText() .map { meta, pool -> [ meta.id, pool.trim() ] } .groupTuple() .set { ch_samplenames } - ch_versions = ch_versions.mix(VCF_SAMPLES.out.versions.first()) + ch_versions = ch_versions.mix(VCF_SAMPLES.out.versions.first()) // Extract VCF sample names and smash them into a pool map - ch_vcf + ch_input.vcf .map{ meta, vcf, index -> [ meta.id, meta, vcf, index ] } .join( ch_samplenames ) .map { id, meta, vcf, index, pools -> def pp = meta.pools ?: pools def ps = meta.pool_sizes.size() > 1 ? 
meta.pool_sizes : (1..pools.size()).collect{ meta.pool_sizes[0] } - [ [ id: meta.id, rename: meta.rename, pools: [pp,ps].transpose().collectEntries(), pool_map: [pools,pp].transpose().collectEntries() ], vcf, index ] + [ [ id: meta.id, format: 'vcf', rename: meta.rename, pools: [pp,ps].transpose().collectEntries(), pool_map: [pools,pp].transpose().collectEntries() ], vcf, index ] } .set{ ch_vcf } - // First, branch out which ones we need to rename, zip or index + // preprocess sync input(s) + // Extract/get sync sample names the same way, more or less + ch_input.sync + .map{ meta, sync, index -> + def pools = (sync.extension.toLowerCase() == 'gz' ? gz_head(sync) : sync.withReader { it.readLine() }).split(/\t/) + def sync_hdr = !!(pools[0] =~ /^#chr/) + pools = sync_hdr ? pools[3..-1] : (1..(pools.size()-3)).collect{ "${sync.baseName}.${it}"} + + def pp = meta.pools ?: pools + assert pp.size() == pools.size() + def ps = meta.pool_sizes.size() > 1 ? meta.pool_sizes : (1..pools.size()).collect{ meta.pool_sizes[0] } + [ + meta + [ + format: 'sync', + sync_hdr: sync_hdr, + pools: [pp,ps].transpose().collectEntries{k,v -> [ k.toString(), v ]}, + pool_map: [pools,pp].transpose().collectEntries{k,v -> [ k.toString(), v.toString() ]} + ], + sync, + [] + ] + } + .set { ch_sync } + + // Figure out how to process vcf samples ch_vcf .branch { meta, vcf, index -> - rename: meta.rename - zip: !meta.rename && vcf.extension != "gz" - index: !meta.rename && vcf.extension == "gz" && !index - keep_as_is: !meta.rename && vcf.extension == "gz" && index + // rename: meta.rename + zip: /*!meta.rename &&*/ vcf.extension != "gz" + index: /*!meta.rename &&*/ vcf.extension == "gz" && !index + keep_as_is: /*!meta.rename &&*/ vcf.extension == "gz" && index } .set{ ch_vcf } - // rename VCF samples ----------------------------------- - // Create a sample name map file for bcftools reheader - ch_vcf.rename - // mapfiles named for meta.id - .collectFile { meta, vcf, index -> - [ "${meta.id}.map", 
meta.pool_map.collect{ k, v -> "${k} ${v}"}.join("\n") ] - } - // use mapfile basename for meta.id to join below - .map { f -> [ f.baseName, f ] } - .set{ ch_remap } - - // Join sample map back to VCF channel - ch_vcf.rename - .map { meta, vcf, index -> [ meta.id, meta, vcf ] } - .join( ch_remap ) - .map { id, meta, vcf, mapfile -> [ meta, vcf, [], mapfile ] } - .set{ ch_remap } - - // Rename samples in the VCF header - VCF_RENAME( ch_remap,ch_remap.map{ [it[0],[]]} ).vcf - .map{ meta, vcf -> [meta.id, meta, vcf] } - .join( VCF_RENAME.out.index.map{ meta, index -> [ meta.id, index ] } ) - .map{ id, meta, vcf, index -> [ meta, vcf, index ] } - .set { ch_renamed } - ch_versions = ch_versions.mix(VCF_RENAME.out.versions.first()) - // rename VCF samples ----------------------------------- // zip unzipped VCF ------------------------------------- COMPRESS_VCF( ch_vcf.zip,[],[],[] ) @@ -97,7 +114,6 @@ workflow PREPROCESS { .map{ id, meta, vcf, index -> [meta,vcf,index] } .set{ ch_zipped } ch_versions = ch_versions.mix(COMPRESS_VCF.out.versions.first()) - // zip unzipped VCF ------------------------------------- // index unindexed VCF ---------------------------------- INDEX_VCF( ch_vcf.index.map{ meta, vcf, index -> [ meta,vcf ] } ) @@ -106,19 +122,17 @@ workflow PREPROCESS { .map{ id, meta, vcf, index -> [meta, vcf, index] } .set{ ch_indexed } ch_versions = ch_versions.mix(INDEX_VCF.out.versions.first()) - // index unindexed VCF ---------------------------------- - // // Mix back in any renamed bits + // Mix back in any renamed bits ch_vcf.keep_as_is - .mix( ch_renamed ) .mix( ch_zipped ) .mix( ch_indexed ) - .set{ ch_vcf } - + .mix( ch_sync ) + .set{ ch_input } emit: - vcf = ch_vcf + output = ch_input ref = ch_ref versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf b/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf index 82b585f..dc5780d 100644 --- 
a/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf @@ -1,5 +1,5 @@ // -// Subworkflow with functionality specific to the nf-core/assesspool pipeline +// Subworkflow with functionality specific to the assessPool pipeline // /* @@ -73,17 +73,17 @@ workflow PIPELINE_INITIALISATION { Channel .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) .map { - meta, vcf, index, ref, pools, pool_sizes -> + meta, input, index, ref, pools, pool_sizes -> def pp = pools ? pools.split(/,/) : [] def ps = (pool_sizes instanceof Number) ? [pool_sizes] : pool_sizes.split(/,/) if (pp.size() != ps.size()) { if (pp && ps.size() != 1) { error "Pool sizes must either be a single number or a list the same length as `pools`" } else if (pp && ps.size() == 1) { - ps = [1..pp.size()].collect{ ps[0] } + ps = (1..pp.size()).collect{ ps[0] } } } - return [ meta + [ pools: pp, pool_sizes: ps, rename: !(!pools) ], vcf, index, ref ] + return [ meta + [ pools: pp, pool_sizes: ps, rename: !(!pools) ], input, index, ref ] } .set { ch_samplesheet } @@ -163,11 +163,17 @@ def validateInputSamplesheet(input) { // Generate methods description for MultiQC // def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", + "PoPoolation2 (Kofler et al., 2011)", + "poolfstat (Gautier et al., 2022)", + "grenedalf (Czech & Expósito-Alonso, 2024)", + "bcftools (Danecek et al., 2021)", + "vcftools (Danecek et al., 2011)", + "samtools (Danecek et al., 2021)", + "Rsamtools (Morgan et al., 2024)", "." 
].join(' ').trim() @@ -175,10 +181,13 @@ def toolCitationText() { } def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI
</li>" : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ + "Kofler, R., Pandey, R. V., & Schlötterer, C. (2011). PoPoolation2: identifying differentiation between populations using sequencing of pooled DNA samples (Pool-Seq). Bioinformatics, 27(24), 3435-3436", + "Gautier, M., Vitalis, R., Flori, L., & Estoup, A. (2022). f-Statistics estimation and admixture graph construction with Pool-Seq or allele count data using the R package poolfstat. Molecular Ecology Resources, 22(4), 1394-1416", + "Czech, L., Spence, J. P., & Expósito-Alonso, M. (2024). grenedalf: population genetic statistics for the next generation of pool sequencing. Bioinformatics, 40(8), btae508", + "Danecek, P., Bonfield, J. K., Liddle, J., Marshall, J., Ohan, V., Pollard, M. O., ... & Li, H. (2021). Twelve years of SAMtools and BCFtools. Gigascience, 10(2), giab008", + "Danecek, P., Auton, A., Abecasis, G., Albers, C. A., Banks, E., DePristo, M. A., ... & 1000 Genomes Project Analysis Group. (2011). The variant call format and VCFtools. Bioinformatics, 27(15), 2156-2158", + "Morgan M, Pagès H, Obenchain V, Hayden N (2024). Rsamtools: Binary alignment (BAM), FASTA, variant call (BCF), and tabix file import. doi:10.18129/B9.bioc.Rsamtools, R package version 2.22.0, <https://bioconductor.org/packages/Rsamtools>" ].join(' ').trim() return reference_text @@ -221,12 +230,8 @@ def methodsDescriptionText(mqc_methods_yaml) { meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
</li>" // Tool references - meta["tool_citations"] = "" - meta["tool_bibliography"] = "" - - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! - // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") - // meta["tool_bibliography"] = toolBibliographyText() + meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + meta["tool_bibliography"] = toolBibliographyText() def methods_text = mqc_methods_yaml.text diff --git a/workflows/assesspool.nf b/workflows/assesspool.nf index 8980583..281bf69 100644 --- a/workflows/assesspool.nf +++ b/workflows/assesspool.nf @@ -7,10 +7,10 @@ include { paramsSummaryMap } from 'plugin/nf-schema' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_assesspool_pipeline' include { FILTER } from '../subworkflows/local/filter.nf' -include { FILTER_SIM } from '../subworkflows/local/filter_sim/main.nf' +include { FILTER_SIM_VCF } from '../subworkflows/local/filter_sim_vcf/main.nf' include { PREPROCESS } from '../subworkflows/local/preprocess.nf' include { POOLSTATS } from '../subworkflows/local/poolstats.nf' -include { POSTPROCESS } from '../subworkflows/local/postprocess.nf' +include { POSTPROCESS } from '../subworkflows/local/postprocess.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -24,11 +24,12 @@ workflow ASSESSPOOL { ch_samplesheet // channel: samplesheet read in from --input main: + ch_versions = Channel.empty() // run VCF preprocessing PREPROCESS( ch_samplesheet ) - ch_vcf = PREPROCESS.out.vcf + ch_input = PREPROCESS.out.output ch_ref = PREPROCESS.out.ref ch_versions = ch_versions.mix(PREPROCESS.out.versions.first()) @@ -41,21 +42,21 @@ workflow ASSESSPOOL { ch_fisher = Channel.empty() 
ch_filtered = Channel.empty() - // perform stepwise filtration - // for visualization and evaluation - - + // // perform stepwise filtration + // // for visualization and evaluation if (params.visualize_filters) { - FILTER_SIM( ch_vcf ) - ch_filter_sim = FILTER_SIM.out.filter_summary + FILTER_SIM_VCF( ch_input.filter { meta, f, index -> meta.format == 'vcf' } ) + ch_filter_sim = FILTER_SIM_VCF.out.filter_summary } + // TODO: add cumulative filtering to the filter_only workflow + // run downstream processing unless we're only // visualizing filters if (!params.filter_only) { // run filtering if desired - FILTER( ch_vcf ) - ch_filtered = FILTER.out.vcf + FILTER( ch_input ) + ch_filtered = FILTER.out.output ch_versions = ch_versions.mix(FILTER.out.versions.first()) // calculate pool statistics (fst, fisher, etc.) @@ -72,7 +73,7 @@ workflow ASSESSPOOL { // run post-processing steps POSTPROCESS( ch_filtered, - ch_vcf, + ch_input, ch_fst, ch_ref, ch_sync,