From 546591fa9af508b63d47215a1f7f78cfdcf1a271 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Fri, 18 Jul 2025 12:20:50 -1000 Subject: [PATCH 01/39] add fst SD as error bars in line graph instead of separate trace --- assets/assesspool_report.Rmd | 54 +++++++++--------------------------- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index e1d7549..03c881e 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -186,7 +186,7 @@ filter_headers <- c( ) filter_notes <- c( 'Note: filtering results displayed are independent, not cumulative.', - 'Cumulative filter results' + 'Cumulative filter results
(Note: filtered SNP count may differ from SNP counts associated with Fst calculations).' ) report_data <- list(filters,final_filters) @@ -267,7 +267,7 @@ c(show_filter,show_final_filter) %>% ```{r, summaries, results='asis', eval=show_fst} cat("# High-level summaries {.tabset}\n\n") -summary_datasets <- pool_data %>% +pool_data %>% group_by(method) %>% group_walk(\(fst_data,method) { # output tab header @@ -350,35 +350,20 @@ summary_datasets <- pool_data %>% print(h5("Global summary statistics by coverage cutoff")) - # TODO: use error_y for SD of fst - # plot_ly( - # x = ~cov, - # y = ~fst, - # type='scatter', - # mode='markers+lines', - # error_y = ~list( - # type='data', - # array = fst_sd - # ) - # ) - # create interactive plot and add traces fig <- fst_grp %>% - plot_ly( + plot_ly() %>% + add_trace( y = ~ `Mean Fst`, x = ~`Coverage cutoff`, name = "Mean Fst", visible = T, mode = "lines+markers", - type = "scatter" - ) %>% - add_trace( - y = ~ `Fst (SD)`, - x = ~`Coverage cutoff`, - name = "Fst (SD)", - visible = F, - mode = "lines+markers", - type = "scatter" + type = "scatter", + error_y = ~list( + type='data', + array = `Fst (SD)` + ) ) %>% add_trace( y = ~SNPs, @@ -420,7 +405,7 @@ summary_datasets <- pool_data %>% args = list( list( visible = list( - T, F, F, F, F + T, F, F, F ) ), list( @@ -434,20 +419,7 @@ summary_datasets <- pool_data %>% args = list( list( visible = list( - F, T, F, F, F - ) - ), - list(yaxis = list(title = "Fst (standard deviation)") - ) - ), - label = "Fst (SD)" - ), - list( - method = "update", - args = list( - list( - visible = list( - F, F, T, F, F + F, T, F, F ) ), list(yaxis = list(title = "SNPs") @@ -460,7 +432,7 @@ summary_datasets <- pool_data %>% args = list( list( visible = list( - F, F, F, T, F + F, F, T, F ) ), list( @@ -474,7 +446,7 @@ summary_datasets <- pool_data %>% args = list( list( visible = list( - F, F, F, F, T + F, F, F, T ) ), list( From b3b0d909720a4583061ea2224921d2c955d13ff4 Mon Sep 17 00:00:00 2001 From: 
mykle hoban Date: Wed, 23 Jul 2025 16:00:01 -1000 Subject: [PATCH 02/39] report updates, postprocess cleanup --- assets/assesspool_report.Rmd | 934 ++++++++++++++++-------------- subworkflows/local/postprocess.nf | 6 +- 2 files changed, 501 insertions(+), 439 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 03c881e..6121fb6 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -24,6 +24,75 @@ params: fisher: null tz: null --- +```{css, echo=FALSE} +.sliderbar { + position: fixed; + bottom: 20%; + margin-left: 0%; + padding: 10px; +} +.ui-slider-wrapper { + width: 95% +} +``` + + + + + + + + + + + ```{r setup, echo=FALSE, include=FALSE} # when run in a container, there are # weird issues with the timezone and lubridate @@ -78,19 +149,6 @@ datatable( # load the plotly js plot_ly() -# plotly horizontal line -hline <- function(y = 0, color = "black") { - list( - type = "line", - x0 = 0, - x1 = 1, - xref = "paper", - y0 = y, - y1 = y, - line = list(color = color) - ) -} - # geometric mean function gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ if(any(x < 0, na.rm = TRUE)){ @@ -113,7 +171,8 @@ cutoffs <- seq(params$nf$report_min_coverage,params$nf$report_max_coverage,param # start with fst show_fst <- !is.null(params$fst) if (show_fst) { - pool_data <- read_tsv(params$fst) + pool_data <- read_tsv(params$fst) %>% + mutate(pair = str_glue("{pop1} - {pop2}")) } else { pool_data <- FALSE } @@ -135,77 +194,64 @@ if (show_filter) { filters <- NULL } +# whether to show cumulative filtering show_final_filter <- params$nf$visualize_filters & !is.null(params$final_filter) -if (show_final_filter) { +if (show_final_filter) { final_filters <- read_tsv(params$final_filter) } else { final_filters <- NULL } +# whether to show any filtering at all any_filter <- show_filter | show_final_filter # used for tab selection -all_pools <- c('All SNPs'=FALSE,'Shared loci (all pools)'=TRUE) - -# place where stuff gets 
saved -data_dir <- "artifacts" - -# create artifacts dir -# if it already exists nobody will care -dir_create(data_dir) +pools_shared <- c('All SNPs'=FALSE,'Shared loci (all pools)'=TRUE) -# move fst file into artifacts dir -# file_move(params$fst,data_dir) ``` - -```{r, results='asis', eval=params$debug} -cat("# Debug info\n") -``` - -```{r, eval=params$debug} -if (params$debug) { - # print(params$meta$pools) - cat("params\n\n") - str(params) -} -``` - - # Abstract This report summarizes the output of population genetic analyses of a pool-seq RAD library. -```{r, results='asis', eval=any_filter} +```{r filtering, results='asis', eval=any_filter} # TODO: show filter option values on plot/table cat("# Filtering {.tabset}\n") +# filtering tabs filter_headers <- c( "## Stepwise filtering reuslts", "## Cumulative filtering results" ) +# notes to display in filter tabs filter_notes <- c( 'Note: filtering results displayed are independent, not cumulative.', 'Cumulative filter results
(Note: filtered SNP count may differ from SNP counts associated with Fst calculations).' ) +# list of filter data to use in the loop below report_data <- list(filters,final_filters) +# walk through different filter tabs c(show_filter,show_final_filter) %>% iwalk(\(show_report,i) { if (show_report) { # show tab header cat(filter_headers[i],"\n\n") - + # get relevant filtering data filter_data <- report_data[[i]] - + + # show some header info and notes print( tagList( h5('SNPs remaining after each filtering step.'), h5(tags$small(HTML(filter_notes[i]))) ) ) + + # rearrange filter data a bit + # make sure 'Unfiltered' is in bold and comes first filter_data <- filter_data %>% arrange(desc(count)) %>% mutate( @@ -219,8 +265,9 @@ c(show_filter,show_final_filter) %>% .default = filtered ), changed = filtered > 0 - ) #%>% - # filter(filtered > 0) + ) + + # figure out which options did nothing changed <- filter_data %>% filter(changed) unchanged <- filter_data %>% @@ -229,19 +276,27 @@ c(show_filter,show_final_filter) %>% as.character() unchanged <- str_glue("{unchanged}") unchanged <- str_c(unchanged,collapse=", ") + # show options that didn't do anything if (unchanged != "") { print(h5(tags$small(HTML(str_glue( "Filter options with no effect: {unchanged}." ))))) } - + + # fixed-width font for tooltip font <- list( family = "Courier New", size = 14 ) - label <- list( font = font ) + + # quick formatting function num <- scales::comma + # tooltip template t_fmt <- 'Before filtering: {num(count[filter == "Unfiltered"])} SNPs
After filtering: {num(count)} SNPs
' + + # make the bar chart + # we use ggplot because it's better at doing continuous color scales + # for bar plots, then convert it to a plotly plot after the fact fig <- ggplot(changed) + geom_bar(aes(x=filter,y=count,fill=count,text=str_glue(t_fmt)),stat="identity") + scale_fill_paletteer_c("grDevices::Turku",direction = -1,name="SNPs remaining",labels=scales::comma) + @@ -251,6 +306,7 @@ c(show_filter,show_final_filter) %>% axis.title.x = element_text(margin = margin(t = -20)), plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.2,0.2,0.2,0.2), "cm")) + # plotly-ize it and do a little configuration fig <- fig %>% ggplotly(tooltip="text") %>% config(displayModeBar = FALSE) %>% @@ -258,34 +314,34 @@ c(show_filter,show_final_filter) %>% xaxis = list(title = "Filter option", fixedrange = TRUE), yaxis = list(title = "SNPs remaining", fixedrange = TRUE) ) %>% - style(hoverlabel = label) + style(hoverlabel = list(font = font)) print(tagList(fig)) - } + } }) ``` -```{r, summaries, results='asis', eval=show_fst} +```{r summaries, summaries, results='asis', eval=show_fst} cat("# High-level summaries {.tabset}\n\n") +# walk through data by method pool_data %>% group_by(method) %>% group_walk(\(fst_data,method) { # output tab header cat(str_glue("## {method$method} {{.tabset}}\n\n")) - # walk through all_shared elements - # (whether or not we're looking only at snps shared across all pools) - all_pools %>% - iwalk(\(all_shared,hdr) { + # walk through whether or not we're looking only at snps shared across all pools + pools_shared %>% + iwalk(\(shared,hdr) { # output tab header cat(str_glue("### {hdr} {{.tabset}}\n\n")) # filter data for all pools if necessary - if (all_shared) { + if (shared) { fst_data <- fst_data %>% mutate(pop1=factor(pop1),pop2=factor(pop2)) %>% group_by(chrom,pos) %>% - filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% + filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% ungroup() %>% 
arrange(chrom,pos,pop1,pop2) } @@ -303,7 +359,7 @@ pool_data %>% # and bind resulting subsets together into a superset list_rbind() - # summarize the data by comparison and cutoff level + # summarize the data by cutoff level fst_grp <- fst_cov %>% group_by(cutoff) %>% summarise( @@ -319,6 +375,7 @@ pool_data %>% ) %>% arrange(cutoff) %>% ungroup() %>% + # give the columns friendly names select( `Coverage cutoff`=cutoff, `Mean Fst`=fst, @@ -332,10 +389,11 @@ pool_data %>% `CI (upper)`=ci_upper ) - shared <- ifelse(all_shared,"all","shared") + # construct csv download filename + shared <- ifelse(shared,"all","shared") filename <- str_glue("{method$method}_{shared}_summary") - # a javascript callback that rounds numbers to the + # a javascript callback that rounds numbers to the # nearest thousand. otherwise the table is full of stuff like 0.3412341234123412341234 js <- DT::JS( "function ( data, type, row, meta ) {", @@ -345,7 +403,7 @@ pool_data %>% "}" ) - # show plot table + # show plot tab cat(str_glue("#### Plot\n\n")) print(h5("Global summary statistics by coverage cutoff")) @@ -471,14 +529,15 @@ pool_data %>% ) # add layout and annotation to figure - fig <- fig %>% layout( - yaxis = list(title = "Mean Fst",fixedrange=TRUE), - xaxis = list(title = "Coverage cutoff",fixedrange=TRUE), - title = list(text=str_glue("Summary statistics by coverage cutoff - {method$method}"), yref= 'paper', y=1), - showlegend = F, - updatemenus = list(button_layout), - annotations = annotation - ) %>% + fig <- fig %>% + layout( + yaxis = list(title = "Mean Fst",fixedrange=TRUE), + xaxis = list(title = "Coverage cutoff",fixedrange=TRUE), + title = list(text=str_glue("Summary statistics by coverage cutoff - {method$method}"), yref= 'paper', y=1), + showlegend = F, + updatemenus = list(button_layout), + annotations = annotation + ) %>% config(displayModeBar = FALSE) # display figure @@ -529,9 +588,6 @@ pool_data %>% ```{r fst-cov-all, results='asis', eval=show_fst} cat("# Pairwise Fst by 
minimum coverage {.tabset}\n\n") -# TODO: tie slider position in this plot to heatmap coverage slider -# (corresponding to method, shared snps, etc.) - # walk through calculation methods pool_data %>% group_by(method) %>% @@ -539,175 +595,201 @@ pool_data %>% # output tab header cat(str_glue("## {method$method} {{.tabset}}\n\n")) - # walk through all_shared elements - # (whether or not we're looking only at snps shared across all pools) - all_pools %>% - iwalk(\(all_shared,hdr) { + # walk through whether or not we're looking only at snps shared across all pools + pools_shared %>% + iwalk(\(shared,hdr) { # output tab header cat(str_glue("### {hdr} {{.tabset}}\n\n")) + shr <- ifelse(shared,"shared","all") # filter data for all pools if necessary - if (all_shared) { - # get all unique pool names - all_pools <- sort( - union( - unique(fst_data$pop1), - unique(fst_data$pop2) - ) - ) - - # group data by snp (chrom/pos) and retain - # snps where all populations are represented in comparisons - # (either side) + if (shared) { fst_data <- fst_data %>% + mutate(pop1=factor(pop1),pop2=factor(pop2)) %>% group_by(chrom,pos) %>% - filter(all(all_pools %in% unique(c(unique(pop1),unique(pop2))))) %>% - ungroup() + filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% + ungroup() %>% + arrange(chrom,pos,pop1,pop2) } + # calculate plot x-axis range + # basically, get the min and max mean fst across all coverage cutoffs + # we're doing this so all the different versions of the plot have the + # same x-axis. 
(it might be a bit data intensive but worthwhile) + xrange <- reduce(cutoffs,\(range,cutoff) { + r <- fst_data %>% + filter(avg_min_cov >= cutoff) %>% + group_by(pop1,pop2) %>% + summarise(fst = mean(fst)) %>% + pull(fst) %>% + range() + c(min(range[1],r[1]),max(range[2],r[2])) + },.init = c(0,0) ) + xrange <- c(min(xrange[1]-abs(0.1*xrange[1]),0),min(xrange[2]+abs(0.1*xrange[2]),1)) + + # get initial figures and output json script tags + fst_figs <- cutoffs %>% + reduce(\(figs,cutoff) { + # summarize the data by comparison and cutoff level + # precompute box plot stats because the datasets are too big + # and pandoc runs out of memory + fst_grp <- fst_data %>% + filter(avg_min_cov >= cutoff) %>% + group_by(pop1,pop2,pair) %>% + summarise( + fst_sd = sd(fst), # std. dev. of fst + q1 = as.numeric(quantile(fst)[2]), + q3 = as.numeric(quantile(fst)[4]), + lwr = boxplot.stats(fst)$stats[1], + upr = boxplot.stats(fst)$stats[5], + med = median(fst), + min = min(fst), + max = max(fst), + fst = mean(fst), # mean fst + fst_se = fst_sd / sqrt(n()), # std. err. of fst + cov = mean(avg_min_cov), # mean coverage + snps = n_distinct(chrom,pos,na.rm=TRUE), # snp count + contigs = n_distinct(chrom,na.rm=TRUE), # contig count + snps_per_contig = snps / contigs # mean snps per contig + ) %>% + # sort by cutoff and comparison + arrange(cutoff,pair) %>% + ungroup() + + # plot the scatterplot + fig <- fst_grp %>% + plot_ly( + x = ~ fst, + y = ~ pair, + color = ~snps, + # frame = ~str_glue("≥{cutoff}"), + # hover text templates + text = ~str_c(snps," SNPs
",contigs," contigs"), + hovertemplate = "Fst (%{y}): %{x}
%{text}", + type = 'scatter', + mode = 'markers', + marker = list(size = 12) + ) %>% + colorbar(title = "SNPs") %>% + layout( + margin=list(l = 100, r = 20, b = 10, t = 30), + yaxis = list(title="",tickangle=-35,tickfont=list(size=12)), + xaxis = list(title="Fst",tickfont=list(size=12), range=as.list(xrange)), + title = list(text=str_glue("Fst - {hdr}"), y=0.95) + ) + # output the figure to html + if (is.null(figs$scatter_fig)) { + # assign div id to figure + fig_id <- str_glue("fst-fig-{method$method}-{shr}") + fig$elementId <- fig_id + + figs$scatter_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("scatter-json-{method$method}-{shr}"), + class="fig-container", + tagList(fig) + ) + ) + } - # repeatedly filter the dataset for different coverage levels - # and bind those together into a new dataset - fst_cov <- cutoffs %>% - map(\(cov) { - fst_data %>% - # filter by average minimum coverage - filter(avg_min_cov >= cov) %>% - # retain the cutoff level in the data - mutate(cutoff=cov) - }) %>% - # and bind resulting subsets together into a superset - list_rbind() %>% - # create a text representation of the comparison - mutate(pair = str_glue("{pop1} - {pop2}")) + # get and print the json for the figure + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + json_tag <- tags$script( + id = str_glue("scatter-json-{method$method}-{shr}-{cutoff}"), + type = 'application/json', + HTML(fig_json) + ) + print(json_tag) - # summarize the data by comparison and cutoff level - # precompute box plot stats because the datasets are too big - # and pandoc runs out of memory - fst_grp <- fst_cov %>% - group_by(pop1,pop2,pair,cutoff) %>% - summarise( - fst_sd = sd(fst), # std. dev. 
of fst - q1 = as.numeric(quantile(fst)[2]), - q3 = as.numeric(quantile(fst)[4]), - lwr = boxplot.stats(fst)$stats[1], - upr = boxplot.stats(fst)$stats[5], - med = median(fst), - min = min(fst), - max = max(fst), - fst = mean(fst), # mean fst - fst_se = fst_sd / sqrt(n()), # std. err. of fst - cov = mean(avg_min_cov), # mean coverage - snps = n_distinct(chrom,pos,na.rm=TRUE), # snp count - contigs = n_distinct(chrom,na.rm=TRUE), # contig count - snps_per_contig = snps / contigs # mean snps per contig - ) %>% - # sort by cutoff and comparison - arrange(cutoff,pair) %>% - ungroup() + # get outlier points, limit to a subsample of 30 per pair because + # too many makes the thing crash. + outs <- fst_data %>% + filter(avg_min_cov >= cutoff) %>% + group_by(pop1,pop2,pair) %>% + summarise(outs = boxplot.stats(fst)$out) %>% + slice_sample(n=30) %>% + ungroup() + + # generate boxplot figure + fig <- fst_grp %>% + plot_ly( + type = "box", + y = ~pair, + q1 = ~q1, + q3 = ~q3, + median = ~med, + lowerfence = ~lwr, + upperfence = ~upr + ) %>% + add_trace( + data = outs, + inherit = FALSE, + y = ~pair, + x = ~outs, + type="scatter", + mode="markers", + frame = ~str_glue("≥{cutoff}") + ) %>% + layout(showlegend = FALSE) %>% + layout( + yaxis = list(tickangle=-35,tickfont=list(size=12),title=""), + xaxis = list(tickfont=list(size=12),title="Fst") + ) - # plot scatter plot header - cat("#### Scatter plot (means)\n\n") + # save figure for output + if (is.null(figs$box_fig)) { + fig_id <- str_glue("box-fig-{method$method}-{shr}") + fig$elementId <- fig_id + figs$box_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("box-json-{method$method}-{shr}"), + class="fig-container", + tagList(fig) + ) + ) + } - # plot the scatterplot - fig <- fst_grp %>% - plot_ly( - x = ~ fst, - y = ~ pair, - color = ~snps, - frame = ~str_glue("≥{cutoff}"), - # hover text templates - text = ~str_c(snps," SNPs
",contigs," contigs"), - hovertemplate = "Fst (%{y}): %{x}
%{text}", - type = 'scatter', - mode = 'markers', - marker = list(size = 12) - ) %>% - colorbar(title = "SNPs") %>% - layout( - margin=list(l = 100, r = 20, b = 10, t = 30), - yaxis = list(title="",tickangle=-35,tickfont=list(size=12)), - xaxis = list(title="Fst",tickfont=list(size=12)), - title = list(text=str_glue("Fst - {hdr}"), y=0.95) - ) %>% - # customize slider and remove play button - animation_slider(currentvalue = list(prefix = "Minimum coverage: ", font = list(color = "black"))) %>% - animation_button(visible = FALSE) + # get/print figure json + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + json_tag <- tags$script( + id = str_glue("box-json-{method$method}-{shr}-{cutoff}"), + type = 'application/json', + HTML(fig_json) + ) + print(json_tag) + return(figs) + },.init = list(scatter_fig = NULL, box_fig = NULL)) - # assign div id to figure - shr <- ifelse(all_shared,"shared","all") - fig_id <- str_glue("fst-fig-{method$method}-{shr}") - fig$elementId <- fig_id + # show scatter plot tab + cat("#### Scatter plot (means)\n\n") - # output the figure to html - print(tagList(fig)) + print(fst_figs$scatter_fig) - # TODO: slider change event - # fig_script <- tags$script( - # HTML(str_glue(' - # $(function() {{ - # $("#{fig_id}").on("plotly_sliderchange",function(e,data) {{ - # var cutoff = parseInt(data.slider.steps[data.slider.active].label.replaceAll(/[^0-9]+/g,"")); - # console.log("Cutoff: " + cutoff); - # }}) - # }}); - # ')) - # ) - # print(fig_script) - - # print box plot header + # show box plot tab cat("#### Box plot (distributions)\n\n") + print(h5(tags$small(HTML( + "Note: subsampled to 30 outliers per pair." 
+ )))) - # get outlier points - outs <- fst_cov %>% - group_by(pop1,pop2,pair,cutoff) %>% - summarise(outs = boxplot.stats(fst)$out) %>% - ungroup() + print(fst_figs$box_fig) - # generate boxplot figure - fig <- fst_grp %>% - plot_ly( - type = "box", - y = ~pair, - q1 = ~q1, - q3 = ~q3, - median = ~med, - frame = ~str_glue("≥{cutoff}"), - lowerfence = ~lwr, - upperfence = ~upr - ) %>% - add_trace( - data = outs, - inherit = FALSE, - y = ~pair, - x = ~outs, - type="scatter", - mode="markers", - frame = ~str_glue("≥{cutoff}") - ) %>% - layout(showlegend = FALSE) %>% - animation_slider(currentvalue = list(prefix = "Minimum coverage: ", font = list(color = "black"))) %>% - animation_button(visible = FALSE) %>% - layout( - #margin=list(l = 200, r = 20, b = 80, t = 40), - yaxis = list(tickangle=-35,tickfont=list(size=12),title=""), - xaxis = list(tickfont=list(size=12),title="Fst") - ) - print(tagList(fig)) }) }) ``` -```{r, heatmap-setup, results='asis', eval=show_fst} -# TODO: add all snps/shared loci to heatmap plots +```{r heatmaps, results='asis', eval=show_fst} cat("# Summary statistic heatmaps {.tabset}\n\n") -# a list of parameters we'll use to plot each figure -# each item consists of: display name, bottom-left heatmap, and (optionally) top-right heatmap -# individual heatmap options are: x value, y value, variable name, color palette (from paletteer_c), -# number format (passed to sprintf), and missing color + +# configure variable pairs for heatmap plots var_pairs <- list( list( name="fst", @@ -765,229 +847,210 @@ cscale <- function(pal,n=10,na="darkgrey",min=0.00001) { # replace "NA" (not NA) in a string vector fix_na <- function(x,na="n/a") replace(x,which(x == "NA"),na) -# summarize dataset (as above) by method and pool comparison +# walk through calculation methods pool_data %>% group_by(method) %>% group_walk(\(fst_data,method) { - # output tab header + # show tab header cat(str_glue("## {method$method} {{.tabset}}\n\n")) - # now we walk through our 
different display pairs - var_pairs %>% - walk(\(trace_vars) { - # and show the tab header - cat(str_glue("### {trace_vars$title}\n\n")) - print(h5(HTML(trace_vars$description))) - - # create cutoff slider - cutoff_sel <- div( - class="coverage-slider", - tagList( - tags$label( - id=str_glue("heatmap-cutoff-label-{method$method}-{trace_vars$name}"), - `for`=str_glue("heatmap-cutoff-sel-{method$method}-{trace_vars$name}"), - str_glue('Minimum coverage: {params$nf$report_min_coverage}') - ), - div( - id=str_glue("heatmap-cutoff-sel-{method$method}-{trace_vars$name}") - ) - ) - ) - print(cutoff_sel) - - # cutoff slider handler code (js) - # note double curly braces because the whole thing is - # wrapped in str_glue - cutoff_script <- tags$script(HTML(str_glue( - ' - $(function() {{ - $("#heatmap-cutoff-sel-{method$method}-{trace_vars$name}").labeledslider({{ - min: {params$nf$report_min_coverage}, - max: {params$nf$report_max_coverage}, - step: {params$nf$report_coverage_step}, - tickInterval: {params$nf$report_coverage_step}, - change: function (e, ui) {{ - var fig = $("#heatmap-fig-{method$method}-{trace_vars$name}")[0]; - var figdata = $(`#heatmap-json-{method$method}-{trace_vars$name}-${{ui.value}}`).text(); - $("#heatmap-cutoff-label-{method$method}-{trace_vars$name}").text(`Minimum coverage: ${{ui.value}}`); - Plotly.react(fig,JSON.parse(figdata)); - }}, - slide: function (e, ui) {{ - var fig = $("#heatmap-fig-{method$method}-{trace_vars$name}")[0]; - var figdata = $(`#heatmap-json-{method$method}-{trace_vars$name}-${{ui.value}}`).text(); - $("#heatmap-cutoff-label-{method$method}-{trace_vars$name}").text(`Minimum coverage: ${{ui.value}}`); - Plotly.react(fig,JSON.parse(figdata)); - }} - }}); - - }}); - ' - ))) - print(cutoff_script) + # walk through shared status + pools_shared %>% + iwalk(\(shared,hdr) { + # show tab header + cat(str_glue("### {hdr} {{.tabset}}\n\n")) + shr <- ifelse(shared,"shared","all") - print(h5(tags$small(HTML( - "Note: Grey cells 
indicate missing data." - )))) + # filter data for all pools if necessary + if (shared) { + fst_data <- fst_data %>% + mutate(pop1=factor(pop1),pop2=factor(pop2)) %>% + group_by(chrom,pos) %>% + filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% + ungroup() %>% + arrange(chrom,pos,pop1,pop2) + } - # now walk through coverage cutoff levels - # and create figure and/or figure json for each one - cutoffs %>% - walk(\(cutoff) { - # filter data by coverage cutoff - # and summarize by pool comparison - fst_data <- fst_data %>% - filter(avg_min_cov >= cutoff) %>% - group_by(pop1,pop2) %>% - summarise( - fst_sd = sd(fst,na.rm=TRUE), - fst_se = fst_sd / sqrt(n()), - cov = mean(avg_min_cov,na.rm=TRUE), - cov_sd = sd(avg_min_cov,na.rm = TRUE), - cov_se = cov_sd / sqrt(n()), - snps = n_distinct(chrom,pos,na.rm=TRUE), - contigs = n_distinct(chrom,na.rm = TRUE), - snps_per_contig = snps / contigs, - fst = mean(fst,na.rm=TRUE) - ) %>% - ungroup() %>% - # retain the cutoff level in the data - mutate(cutoff=cutoff) - - # get all pool names - all_pops <- sort(union(unique(fst_data$pop1),unique(fst_data$pop2))) - - # create a tibble of all possible combinations (sorted) - full_table <- all_pops %>% - combn(2) %>% - apply(MARGIN = 2,FUN=\(x) sort(x)) %>% - t() %>% - as_tibble() %>% - rename(pop1=1,pop2=2) - - # convert pool names to factors with all possible levels - # make sure pop2 has levels in reverse so the figure looks right - # (i.e., a nice triangle) - # we left join from the table of all combos so we have all combos, even - # if some have missing data - hmd <- full_table %>% - left_join(fst_data,by=c("pop1","pop2")) %>% - arrange(pop1,desc(pop2)) %>% - mutate( - pop1 = factor(pop1,levels=all_pops), - pop2 = factor(pop2,levels=rev(all_pops)) - ) %>% - mutate( - # create columns for the tooltip text - across(where(is.numeric),\(x) x,.names = "{.col}_val"), - # replace misisng integers with -1 - across(where(is.integer) & !ends_with("_val"),\(x) 
replace_na(x,-1)), - # replace missing floats with min(x)*1e-5 - across(where(is.double) & !ends_with("_val"),\(x) replace_na(x,min(x,na.rm = TRUE)*0.00001)) - ) %>% - arrange(pop1,desc(pop2)) - - # get the factor level that would be the top row of the plot - top_level <- last(levels(hmd$pop2)) - - # now we make a dummy table for just that top row - top_row <- hmd %>% - filter(pop1 == top_level) %>% - # switch pop1 and pop2 - mutate(temp=pop1,pop1=pop2,pop2=temp) %>% - # drop the temp column - select(-temp) %>% - # set all numeric values to NA - mutate(across(-c(pop1,pop2),~NA)) - - # bind dummy values to plot data - # it seems to matter that we bind these - # to the end, rather than the other way around - hmd <- hmd %>% - bind_rows(top_row) - - # create an empty plotly figure - # don't show tooltips if there's no data - fig <- hmd %>% - plot_ly(hoverongaps=FALSE) - - # now we go through each figure pair - # and reduce them to a single plotly figure - # that c(1,2) is just to keep track of if we're on - # the lower or upper triangle - fig <- trace_vars$pairs %>% - reduce2(c(1,2),\(f,v,i) { - # if the element is valid - if (!all(is.na(v))) { - # add a heatmap trace - f %>% - add_trace( - type = "heatmap", - x = ~.data[[v[1]]], # x variable - y = ~.data[[v[2]]], # y variable - z = ~.data[[v[3]]], # z (color) variable - # this is what gets shown in the tooltip - customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), - # zmin and zmax are trying to make it so the "missing" - # color doesn't show up in the color scale legend - zmin = ~min(.data[[v[[3]]]],na.rm = TRUE)*0.00001, - zmax = ~max(.data[[v[3]]],na.rm = TRUE), - colorscale = cscale(v[4],n = 100,na = v[6]), # color palette, based on 100 gradations - # style the colorbar - colorbar = list( - # show the appropriate display name - title=list(text=value_map[v[3]]), - # determine the position of the colorbar - # bottom if it's for the bottom triangle, top if it's for the top - y = 
ifelse(i == 1,0,1), - # which side of the colorbar we reckon the y-anchor from - yanchor = ifelse(i == 1,"bottom","top"), - len = 0.5, - lenmode = "fraction" - ), - # text template - text = ~str_glue("{pop1} - {pop2}"), - # hover text template (hide the trace name with ) - hovertemplate=str_glue("%{{text}}
{value_map[v[3]]}: %{{customdata}}") + # walk through variable pairs + var_pairs %>% + walk(\(trace_vars) { + # show yet another tab header + cat(str_glue("#### {trace_vars$title}\n\n")) + + print(h5(HTML(trace_vars$description))) + print(h5(tags$small(HTML( + "Note: Grey cells indicate missing data." + )))) + + # generate figure and output + # all necessary figure json + heatmap_fig <- cutoffs %>% + reduce(\(plot_fig,cutoff) { + # summarize by pair and cutoff value + fst_grp <- fst_data %>% + filter(avg_min_cov >= cutoff) %>% + group_by(pop1,pop2) %>% + summarise( + fst_sd = sd(fst,na.rm=TRUE), + fst_se = fst_sd / sqrt(n()), + cov = mean(avg_min_cov,na.rm=TRUE), + cov_sd = sd(avg_min_cov,na.rm = TRUE), + cov_se = cov_sd / sqrt(n()), + snps = n_distinct(chrom,pos,na.rm=TRUE), + contigs = n_distinct(chrom,na.rm = TRUE), + snps_per_contig = snps / contigs, + fst = mean(fst,na.rm=TRUE) + ) %>% + ungroup() %>% + # retain the cutoff level in the data + mutate(cutoff=cutoff) + + # all this next bit is so that the heatmaps look good + # with all relevant rows of both the upper and lower triangles + + # get all possible pool names + all_pops <- sort(union(unique(fst_grp$pop1),unique(fst_grp$pop2))) + + # create a tibble of all possible combinations (sorted) + full_table <- all_pops %>% + combn(2) %>% + apply(MARGIN = 2,FUN=\(x) sort(x)) %>% + t() %>% + as_tibble() %>% + rename(pop1=1,pop2=2) + + # convert pool names to factors with all possible levels + # make sure pop2 has levels in reverse so the figure looks right + # (i.e., a nice triangle) + # we left join from the table of all combos so we have all combos, even + # if some have missing data + hmd <- full_table %>% + left_join(fst_grp,by=c("pop1","pop2")) %>% + arrange(pop1,desc(pop2)) %>% + mutate( + pop1 = factor(pop1,levels=all_pops), + pop2 = factor(pop2,levels=rev(all_pops)) + ) %>% + mutate( + # create columns for the tooltip text + across(where(is.numeric),\(x) x,.names = "{.col}_val"), + # replace misisng 
integers with -1 + across(where(is.integer) & !ends_with("_val"),\(x) replace_na(x,-1)), + # replace missing floats with min(x)*1e-5 + # this is so we can show grey squares for missing data + across(where(is.double) & !ends_with("_val"),\(x) replace_na(x,min(x,na.rm = TRUE)*0.00001)) + ) %>% + arrange(pop1,desc(pop2)) + # get the factor level that would be the top row of the plot + top_level <- last(levels(hmd$pop2)) + # now we make a dummy table for just that top row + # if we don't do this, the top triangle is missing its top row + top_row <- hmd %>% + filter(pop1 == top_level) %>% + # switch pop1 and pop2 + mutate(temp=pop1,pop1=pop2,pop2=temp) %>% + # drop the temp column + select(-temp) %>% + # set all numeric values to NA + mutate(across(-c(pop1,pop2),~NA)) + # bind dummy values to plot data + # it seems to matter that we bind these + # to the end, rather than the other way around + hmd <- hmd %>% + bind_rows(top_row) + + # start making the figure + fig <- hmd %>% + plot_ly(hoverongaps=FALSE) + + # finish making the figure + fig <- trace_vars$pairs %>% + reduce2(c(1,2),\(plot_fig,v,i) { + if (!all(is.na(v))) { + # add the trace as either the upper or lower triangle + # depending on whether its #1 or #2 + plot_fig %>% + add_trace( + type = "heatmap", + x = ~.data[[v[1]]], # x variable + y = ~.data[[v[2]]], # y variable + z = ~.data[[v[3]]], # z (color) variable + # this is what gets shown in the tooltip + customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), + # zmin and zmax are trying to make it so the "missing" + # color doesn't show up in the color scale legend + zmin = ~min(.data[[v[[3]]]],na.rm = TRUE)*0.00001, + zmax = ~max(.data[[v[3]]],na.rm = TRUE), + colorscale = cscale(v[4],n = 10,na = v[6]), # color palette, based on 100 gradations + # style the colorbar + colorbar = list( + # show the appropriate display name + title=list(text=value_map[v[3]]), + # determine the position of the colorbar + # bottom if it's for the 
bottom triangle, top if it's for the top + y = ifelse(i == 1,0,1), + # which side of the colorbar we reckon the y-anchor from + yanchor = ifelse(i == 1,"bottom","top"), + len = 0.5, + lenmode = "fraction" + ), + # text template + text = ~str_glue("{pop1} - {pop2}"), + # hover text template (hide the trace name with ) + hovertemplate=str_glue("%{{text}}
{value_map[v[3]]}: %{{customdata}}") + ) + } else return(plot_fig) + },.init = fig) %>% + layout( + # name the figure axes and don't let the user zoom around + xaxis = list(title = "Pool 1", fixedrange = TRUE), + yaxis = list(title = "Pool 2", fixedrange = TRUE) + ) %>% + # hide the mode bar + config(displayModeBar = FALSE) + + # get the figure for output + if (is.null(plot_fig)) { + fig_id <- str_glue("heatmap-fig-{method$method}-{shr}-{trace_vars$name}") + fig$elementId <- fig_id + # we stick it in a div with class 'fig-container' so + # they're easy to find in the DOM using jquery + plot_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("heatmap-json-{method$method}-{shr}-{trace_vars$name}"), + class="fig-container", + tagList(fig) ) - } else return(f) - },.init = fig) %>% - layout( - # name the figure axes and don't let the user zoom around - xaxis = list(title = "Pool 1", fixedrange = TRUE), - yaxis = list(title = "Pool 2", fixedrange = TRUE) - ) %>% - # hide the mode bar - config(displayModeBar = FALSE) - - # assign id attribute to plotly div - fig_id <- str_glue("heatmap-fig-{method$method}-{trace_vars$name}") - fig$elementId <- fig_id - - if (cutoff == first(cutoffs)) { - # output the figure to the report, but only if it's the first coverage cutoff - print(tagList(fig)) - } + ) + } + + # get the figure json and output it + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + # create and print script tag with figure data json + # we use this to draw the other cutoff level figures when the slider is dragged + json_tag <- tags$script( + id = str_glue("heatmap-json-{method$method}-{shr}-{trace_vars$name}-{cutoff}"), + type = 'application/json', + HTML(fig_json) + ) + print(json_tag) - # get minified figure json - fig_json <- fig %>% - plotly_json(jsonedit = FALSE) %>% - jsonlite::minify() + return(plot_fig) + },.init = NULL) - # create and print script tag with figure data json - # we use this to draw the other 
cutoff level figures when the slider is dragged - script_tag <- tags$script( - id = str_glue("heatmap-json-{method$method}-{trace_vars$name}-{cutoff}"), - type = 'application/json', - HTML(fig_json) - ) - print(script_tag) + # output the heatmap figure + print(heatmap_fig) }) - }) }) + ``` -```{r, fst-correlation, results='asis', eval=show_fst} + +```{r fst-correlation, results='asis', eval=show_fst} cat("# Fst method correlations {.tabset}\n\n") # get calculation methods @@ -1004,18 +1067,18 @@ methods %>% # make a joined table with both methods and subsample to 1000 rows corr <- splot[[pair[1]]] %>% - sample_n(1000) %>% + slice_sample(n=1000) %>% inner_join(splot[[pair[2]]],by=c("chrom","pos","pop1","pop2")) # do a linear regression lmm <- lm(fst.y ~ fst.x, data=corr) - + # get predicted data for the trend line corr <- corr %>% mutate( predicted = lmm %>% predict(corr %>% select(fst.x)) - ) + ) # get method names xtitle <- pair[1] @@ -1032,7 +1095,7 @@ methods %>% mode = 'lines', line = list(color = '#232323', width = 0.7) ) %>% - # add scatter + # add scatter add_trace( x = ~fst.x, y = ~fst.y, @@ -1060,17 +1123,18 @@ methods %>% }) ``` -```{r, fisher, results='asis', eval=show_fisher} +```{r fisher, results='asis', eval=show_fisher} cat("# Fisher test plots {.tabset}\n\n") print(h5(tags$small( - "Fisher test results are presented as static plots due to the potentially large size of the datasets." + "Fisher test results are presented as static plots since these datasets can get huge, making plotly go slow or crash." 
))) fisher_data %>% group_by(method) %>% group_walk(\(fishr,method) { cat(str_glue("## {method$method}\n\n")) - + + # TODO: fix cutoff slider for fisher plots # make slider for coverage cutoff cutoff_slider <- div( class="coverage-slider", @@ -1086,7 +1150,7 @@ fisher_data %>% ) ) print(cutoff_slider) - + cutoff_script <- tags$script(HTML(str_glue( ' $(function() {{ @@ -1097,12 +1161,12 @@ fisher_data %>% tickInterval: {params$nf$report_coverage_step}, change: function (e, ui) {{ $(".fisher-plotz-{method$method}").hide(); - $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); + $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); $("#fisher-cutoff-label-{method$method}").text(`Minimum coverage: ${{ui.value}}`); }}, slide: function (e, ui) {{ $(".fisher-plotz-{method$method}").hide(); - $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); + $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); $("#fisher-cutoff-label-{method$method}").text(`Minimum coverage: ${{ui.value}}`); }} }}); @@ -1110,9 +1174,9 @@ fisher_data %>% ' ))) print(cutoff_script) - + print(h6(HTML("Points above red line indicate p < 0.05."))) - + figs <- cutoffs %>% # set_names() %>% map(\(cov) { @@ -1132,7 +1196,7 @@ fisher_data %>% geom_hline(yintercept = -log10(0.05), color="red") + theme_bw() + labs(x = 'SNP', y=expression(paste(-log[10]~"(p-value)"))) - + disp <- if(cov == first(cutoffs)) "" else "display: none" plotTag( fig, @@ -1141,7 +1205,7 @@ fisher_data %>% attribs = list( id = str_glue("fisher-plot-{method$method}-{cov}"), class = str_glue('fisher-plotz-{method$method}'), - style = disp + style = disp ) ) }) diff --git a/subworkflows/local/postprocess.nf b/subworkflows/local/postprocess.nf index ec12ef9..53ae84e 100644 --- a/subworkflows/local/postprocess.nf +++ b/subworkflows/local/postprocess.nf @@ -55,7 +55,7 @@ workflow POSTPROCESS { .map{ id, meta, f -> [ meta, f ] } // build rmarkdown report input and params - def file_keys = [ 'fst', 'fisher', 'filter', 'final_filter' ] 
+ // get report file channel as [ meta, reportfile ] ch_report = ch_vcf.map { [ it[0], file("${projectDir}/assets/assesspool_report.Rmd") ] } @@ -65,7 +65,6 @@ workflow POSTPROCESS { .mix( ch_filter.ifEmpty{ [] } ) .mix( ch_filter_final.ifEmpty{ [] } ) .groupTuple() - // .collect() ch_params = ch_input_files. map{ meta, files -> [ meta, files.collect{ [ it.extension, it.name ] }.collectEntries() ] } @@ -75,10 +74,9 @@ workflow POSTPROCESS { ]]} CREATE_REPORT( ch_report, ch_params.map{meta, p -> p}, ch_input_files.map{meta, f -> f} ) + ch_versions = ch_versions.mix(CREATE_REPORT.out.versions.first()) emit: - // // TODO nf-core: edit emitted channels - versions = ch_versions // channel: [ versions.yml ] } From d1be175d81390d0c25d7309ab099f0f5557b614d Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Tue, 29 Jul 2025 19:02:58 -0700 Subject: [PATCH 03/39] extract sequences, update report, fix caching issues --- assets/assesspool_report.Rmd | 148 +++++++++++------- conf/filters.config | 69 ++++++++ conf/modules.config | 9 +- .../local/extractsequences/environment.yml | 19 +++ modules/local/extractsequences/main.nf | 52 ++++++ modules/local/extractsequences/meta.yml | 68 ++++++++ .../resources/usr/bin/extractsequences.R | 94 +++++++++++ .../local/extractsequences/tests/main.nf.test | 73 +++++++++ .../joinfreq/resources/usr/bin/joinfreq.R | 2 +- nextflow.config | 12 +- nextflow_schema.json | 19 ++- subworkflows/local/filter_sim/main.nf | 6 +- subworkflows/local/poolstats.nf | 4 +- subworkflows/local/postprocess.nf | 33 ++-- 14 files changed, 524 insertions(+), 84 deletions(-) create mode 100644 modules/local/extractsequences/environment.yml create mode 100644 modules/local/extractsequences/main.nf create mode 100644 modules/local/extractsequences/meta.yml create mode 100755 modules/local/extractsequences/resources/usr/bin/extractsequences.R create mode 100644 modules/local/extractsequences/tests/main.nf.test diff --git a/assets/assesspool_report.Rmd 
b/assets/assesspool_report.Rmd index 6121fb6..595a7fa 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -27,7 +27,7 @@ params: ```{css, echo=FALSE} .sliderbar { position: fixed; - bottom: 20%; + bottom: 40%; margin-left: 0%; padding: 10px; } @@ -35,13 +35,13 @@ params: width: 95% } ``` - @@ -165,7 +164,7 @@ gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ } # get a sequence of coverage cutoffs from input parameters -cutoffs <- seq(params$nf$report_min_coverage,params$nf$report_max_coverage,params$nf$report_coverage_step) +cutoffs <- seq(params$nf$min_coverage_cutoff,params$nf$max_coverage_cutoff,params$nf$coverage_cutoff_step) # load pool data # start with fst @@ -189,7 +188,7 @@ if (show_fisher) { # whether to show VCF filtering show_filter <- params$nf$visualize_filters & !is.null(params$filter) if (show_filter) { - filters <- read_tsv(params$filter) + filters <- read_tsv(params$filter,col_names = c('filter','count')) } else { filters <- NULL } @@ -197,7 +196,7 @@ if (show_filter) { # whether to show cumulative filtering show_final_filter <- params$nf$visualize_filters & !is.null(params$final_filter) if (show_final_filter) { - final_filters <- read_tsv(params$final_filter) + final_filters <- read_tsv(params$final_filter,col_names = c('filter','count')) } else { final_filters <- NULL } @@ -249,7 +248,7 @@ c(show_filter,show_final_filter) %>% h5(tags$small(HTML(filter_notes[i]))) ) ) - + # rearrange filter data a bit # make sure 'Unfiltered' is in bold and comes first filter_data <- filter_data %>% @@ -265,8 +264,8 @@ c(show_filter,show_final_filter) %>% .default = filtered ), changed = filtered > 0 - ) - + ) + # figure out which options did nothing changed <- filter_data %>% filter(changed) @@ -288,12 +287,12 @@ c(show_filter,show_final_filter) %>% family = "Courier New", size = 14 ) - + # quick formatting function num <- scales::comma # tooltip template t_fmt <- 'Before filtering: {num(count[filter == "Unfiltered"])} SNPs
After filtering: {num(count)} SNPs
' - + # make the bar chart # we use ggplot because it's better at doing continuous color scales # for bar plots, then convert it to a plotly plot after the fact @@ -529,11 +528,11 @@ pool_data %>% ) # add layout and annotation to figure - fig <- fig %>% + fig <- fig %>% layout( yaxis = list(title = "Mean Fst",fixedrange=TRUE), xaxis = list(title = "Coverage cutoff",fixedrange=TRUE), - title = list(text=str_glue("Summary statistics by coverage cutoff - {method$method}"), yref= 'paper', y=1), + title = list(text=str_glue("Summary statistics by coverage cutoff - {method$method}"), yref= 'paper', y=1, font=list(size=12)), showlegend = F, updatemenus = list(button_layout), annotations = annotation @@ -675,8 +674,13 @@ pool_data %>% margin=list(l = 100, r = 20, b = 10, t = 30), yaxis = list(title="",tickangle=-35,tickfont=list(size=12)), xaxis = list(title="Fst",tickfont=list(size=12), range=as.list(xrange)), - title = list(text=str_glue("Fst - {hdr}"), y=0.95) - ) + title = list(text=str_glue("Fst - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) + ) %>% + config(toImageButtonOptions = list( + format = "svg", + width = 900, + height = 600 + )) # output the figure to html if (is.null(figs$scatter_fig)) { @@ -712,8 +716,8 @@ pool_data %>% group_by(pop1,pop2,pair) %>% summarise(outs = boxplot.stats(fst)$out) %>% slice_sample(n=30) %>% - ungroup() - + ungroup() + # generate boxplot figure fig <- fst_grp %>% plot_ly( @@ -737,8 +741,14 @@ pool_data %>% layout(showlegend = FALSE) %>% layout( yaxis = list(tickangle=-35,tickfont=list(size=12),title=""), - xaxis = list(tickfont=list(size=12),title="Fst") - ) + xaxis = list(tickfont=list(size=12),title="Fst"), + title = list(text=str_glue("Fst distribution - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) + ) %>% + config(toImageButtonOptions = list( + format = "svg", + width = 900, + height = 600 + )) # save figure for output if (is.null(figs$box_fig)) { @@ -840,8 +850,9 @@ cscale <- 
function(pal,n=10,na="darkgrey",min=0.00001) { }) if (!is.na(na)) { s[[1]][[1]] <- min - c(list(list(0,na), list(min,na)),s) + s <- c(list(list(0,na), list(min,na)),s) } + s } # replace "NA" (not NA) in a string vector @@ -882,7 +893,7 @@ pool_data %>% "Note: Grey cells indicate missing data." )))) - # generate figure and output + # generate figure and output # all necessary figure json heatmap_fig <- cutoffs %>% reduce(\(plot_fig,cutoff) { @@ -904,13 +915,13 @@ pool_data %>% ungroup() %>% # retain the cutoff level in the data mutate(cutoff=cutoff) - + # all this next bit is so that the heatmaps look good # with all relevant rows of both the upper and lower triangles # get all possible pool names all_pops <- sort(union(unique(fst_grp$pop1),unique(fst_grp$pop2))) - + # create a tibble of all possible combinations (sorted) full_table <- all_pops %>% combn(2) %>% @@ -932,13 +943,10 @@ pool_data %>% pop2 = factor(pop2,levels=rev(all_pops)) ) %>% mutate( + # record whether value was NA + across(where(is.numeric),\(x) is.na(x),.names = "{.col}_na"), # create columns for the tooltip text - across(where(is.numeric),\(x) x,.names = "{.col}_val"), - # replace misisng integers with -1 - across(where(is.integer) & !ends_with("_val"),\(x) replace_na(x,-1)), - # replace missing floats with min(x)*1e-5 - # this is so we can show grey squares for missing data - across(where(is.double) & !ends_with("_val"),\(x) replace_na(x,min(x,na.rm = TRUE)*0.00001)) + across(where(is.numeric),\(x) x,.names = "{.col}_val") ) %>% arrange(pop1,desc(pop2)) # get the factor level that would be the top row of the plot @@ -958,11 +966,10 @@ pool_data %>% # to the end, rather than the other way around hmd <- hmd %>% bind_rows(top_row) - + # start making the figure - fig <- hmd %>% - plot_ly(hoverongaps=FALSE) - + fig <- plot_ly(hoverongaps=FALSE) + # finish making the figure fig <- trace_vars$pairs %>% reduce2(c(1,2),\(plot_fig,v,i) { @@ -970,8 +977,27 @@ pool_data %>% # add the trace as either the 
upper or lower triangle # depending on whether its #1 or #2 plot_fig %>% + { + if (sum(hmd[[str_glue("{v[3]}_na")]],na.rm = T) > 0 ) { + add_trace( + ., + type="heatmap", + hoverinfo = 'none', + data = hmd %>% + filter( across( all_of(str_glue("{v[3]}_na")), \(na) na | is.na(na) ) ) %>% + mutate( "{v[3]}" := if_else(.data[[str_glue("{v[3]}_na")]] == TRUE,0,NA) ), + x = ~.data[[v[1]]], + y = ~.data[[v[2]]], + z = ~.data[[v[3]]], + colorscale = 'Greys', + showscale = FALSE + ) + } else . + } %>% add_trace( type = "heatmap", + data = hmd, #%>% + # filter( across( all_of(str_glue("{v[3]}_na")), \(na) !na | is.na(na) ) ), x = ~.data[[v[1]]], # x variable y = ~.data[[v[2]]], # y variable z = ~.data[[v[3]]], # z (color) variable @@ -979,9 +1005,10 @@ pool_data %>% customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), # zmin and zmax are trying to make it so the "missing" # color doesn't show up in the color scale legend - zmin = ~min(.data[[v[[3]]]],na.rm = TRUE)*0.00001, - zmax = ~max(.data[[v[3]]],na.rm = TRUE), - colorscale = cscale(v[4],n = 10,na = v[6]), # color palette, based on 100 gradations + # zmin = ~min(.data[[v[[3]]]],na.rm = TRUE),#*0.00001, + # zmin = ~min(.data[[ str_glue("{v[3]}_val") ]],na.rm = TRUE),#*0.00001, + # zmax = ~max(.data[[v[3]]],na.rm = TRUE), + colorscale = cscale(v[4],n = 10,na = NA), # color palette, based on 100 gradations # style the colorbar colorbar = list( # show the appropriate display name @@ -1004,7 +1031,14 @@ pool_data %>% layout( # name the figure axes and don't let the user zoom around xaxis = list(title = "Pool 1", fixedrange = TRUE), - yaxis = list(title = "Pool 2", fixedrange = TRUE) + yaxis = list(title = "Pool 2", fixedrange = TRUE), + # title = str_glue("{trace_vars$title} (coverage cutoff: ≥{cutoff}") + title = list( + text = str_glue("{trace_vars$title} (coverage cutoff: ≥{cutoff})"), + font = list( + size = 12 + ) + ) ) %>% # hide the mode bar config(displayModeBar = FALSE) @@ -1013,7 
+1047,7 @@ pool_data %>% if (is.null(plot_fig)) { fig_id <- str_glue("heatmap-fig-{method$method}-{shr}-{trace_vars$name}") fig$elementId <- fig_id - # we stick it in a div with class 'fig-container' so + # we stick it in a div with class 'fig-container' so # they're easy to find in the DOM using jquery plot_fig <- tagList( div( @@ -1039,7 +1073,7 @@ pool_data %>% print(json_tag) return(plot_fig) - },.init = NULL) + },.init = NULL) # output the heatmap figure print(heatmap_fig) @@ -1062,7 +1096,7 @@ methods %>% combn(2) %>% array_branch(2) %>% walk(\(pair) { - cat(str_glue("## Fst correlation: {pair[1]} vs {pair[2]}\n\n")) + cat(str_glue("## {pair[1]} vs {pair[2]}\n\n")) print(h5(tags$small(HTML("Note: Due to potentially large datasets, only a subsample of 1,000 Fst values is shown here.")))) # make a joined table with both methods and subsample to 1000 rows @@ -1142,7 +1176,7 @@ fisher_data %>% tags$label( id=str_glue("fisher-cutoff-label-{method$method}"), `for`=str_glue("fisher-cutoff-sel-{method$method}"), - str_glue('Minimum coverage: {params$nf$report_min_coverage}') + str_glue('Minimum coverage: {params$nf$min_coverage_cutoff}') ), div( id=str_glue("fisher-cutoff-sel-{method$method}") @@ -1155,10 +1189,10 @@ fisher_data %>% ' $(function() {{ $("#fisher-cutoff-sel-{method$method}").labeledslider({{ - min: {params$nf$report_min_coverage}, - max: {params$nf$report_max_coverage}, - step: {params$nf$report_coverage_step}, - tickInterval: {params$nf$report_coverage_step}, + min: {params$nf$min_coverage_cutoff}, + max: {params$nf$max_coverage_cutoff}, + step: {params$nf$coverage_cutoff_step}, + tickInterval: {params$nf$coverage_cutoff_step}, change: function (e, ui) {{ $(".fisher-plotz-{method$method}").hide(); $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); diff --git a/conf/filters.config b/conf/filters.config index 54a19d3..ef96a8c 100644 --- a/conf/filters.config +++ b/conf/filters.config @@ -1,6 +1,10 @@ process { // bcftools filters withName: 
MAX_ALLELE_LENGTH { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -17,6 +21,10 @@ process { } } withName: QUALITY_DEPTH_RATIO { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -33,6 +41,10 @@ process { } } withName: MIN_QUALITY { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -49,6 +61,10 @@ process { } } withName: VARIANT_TYPE { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -65,6 +81,10 @@ process { } } withName: MISPAIRED_READS { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -81,6 +101,10 @@ process { } } withName: ALTERNATE_OBSERVATIONS { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -97,6 +121,10 @@ process { } } withName: MIN_MAPPING_QUALITY { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -113,6 +141,10 @@ process { } } withName: MAPPING_RATIO { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -130,6 +162,10 @@ process { } } withName: MIN_DEPTH { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -146,6 
+182,10 @@ process { } } withName: MIN_POOLS { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -162,6 +202,10 @@ process { } } withName: READ_BALANCE { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ @@ -181,6 +225,10 @@ process { // vcftools filters withName: MAX_MEAN_DEPTH { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -188,6 +236,10 @@ process { ].join(' ').trim() } } withName: MIN_MEAN_DEPTH { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -195,12 +247,20 @@ process { ].join(' ').trim() } } withName: HWE_CUTOFF { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ params.hwe_cutoff ? 
"--hwe ${params.hwe_cutoff}" : "" ].join(' ').trim() } } withName: MINOR_ALLELE_COUNT { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -208,6 +268,10 @@ process { ].join(' ').trim() } } withName: MAX_MISSING { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -215,6 +279,10 @@ process { ].join(' ').trim() } } withName: THIN { + publishDir = [ + path: { "${params.outdir}/test_filter" }, + mode: params.publish_dir_mode + ] ext.prefix = { "${meta.id}_${meta.filter}_filter" } ext.args = { [ "--recode --recode-INFO-all", @@ -226,6 +294,7 @@ process { // ripgrep wrapper to count snps in a vcf // works whether zipped or not withName: 'COUNT_SNPS.*' { + publishDir = [ path: { "${params.outdir}/count" } ] ext.prefix = { "${meta.filter}_filter_count" } ext.args = { [ '-z', diff --git a/conf/modules.config b/conf/modules.config index 8688e19..7150351 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -39,7 +39,7 @@ process { ], [ path: { "${params.outdir}/report" }, - mode: "symlink", + mode: params.publish_dir_mode, pattern: 'artifacts/*' ] ] @@ -318,4 +318,11 @@ process { "--write-index=tbi" ].join(' ').trim() } } + + withName: EXTRACT_SEQUENCES { + publishDir = [ + path: { "${params.outdir}/sequences" }, + mode: params.publish_dir_mode + ] + } } diff --git a/modules/local/extractsequences/environment.yml b/modules/local/extractsequences/environment.yml new file mode 100644 index 0000000..d360ea3 --- /dev/null +++ b/modules/local/extractsequences/environment.yml @@ -0,0 +1,19 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - 
conda-forge::gmp=6.2.1 + - conda-forge::pandoc=2.19.2 + - conda-forge::r-optparse=1.7.3 + - conda-forge::r-dplyr=1.0.10 + - conda-forge::r-plyr=1.8.7 + - conda-forge::r-stringr=1.4.1 + - conda-forge::r-data.table=1.14.4 + - conda-forge::r-plotly=4.10.0 + - bioconda::r-seqinr=4.2_16 + - bioconda::bioconductor-rsamtools=2.10.0 + - bioconda::bioconductor-shortread=1.52.0 + - bioconda::bioconductor-genomicalignments=1.30.0 + - bioconda::bioconductor-biomart=2.50.0 diff --git a/modules/local/extractsequences/main.nf b/modules/local/extractsequences/main.nf new file mode 100644 index 0000000..8b3af59 --- /dev/null +++ b/modules/local/extractsequences/main.nf @@ -0,0 +1,52 @@ +process EXTRACT_SEQUENCES { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-61c59287265e27f2c4589cfc90013ef6c2c6acf1:fb3e48060a8c0e5108b1b60a2ad7e090cfb9eee5-0': + 'biocontainers/mulled-v2-61c59287265e27f2c4589cfc90013ef6c2c6acf1:fb3e48060a8c0e5108b1b60a2ad7e090cfb9eee5-0' }" + + input: + tuple val(meta), path(fasta), path(fai) + tuple val(meta), path(fst) + val(fst_cutoff) + + output: + tuple val(meta), path("*.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + extractsequences.R \\ + $args \\ + --fst-cutoff ${fst_cutoff} \\ + --output ${prefix}.fasta \\ + --index ${fai} \\ + $fasta \\ + $fst + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + extractsequences: \$(extractsequences --version) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + + touch ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + extractsequences: \$(extractsequences 
--version) + END_VERSIONS + """ +} diff --git a/modules/local/extractsequences/meta.yml b/modules/local/extractsequences/meta.yml new file mode 100644 index 0000000..e628be8 --- /dev/null +++ b/modules/local/extractsequences/meta.yml @@ -0,0 +1,68 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "extractsequences" +## TODO nf-core: Add a description of the module and list keywords +description: write your description here +keywords: + - sort + - example + - genomics +tools: + - "extractsequences": + ## TODO nf-core: Add a description and other details for the software below + description: "" + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: + identifier: + +## TODO nf-core: Add a description of all of the variables used as input +input: + # Only when we have meta + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + ## TODO nf-core: Delete / customise this example input + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + +## TODO nf-core: Add a description of all of the variables used as output +output: + - bam: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + ## TODO nf-core: Delete / customise this example output + - "*.bam": + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - edam: "http://edamontology.org/format_2573" # CRAM + - edam: "http://edamontology.org/format_3462" # SAM + + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mhoban" +maintainers: + - "@mhoban" diff --git a/modules/local/extractsequences/resources/usr/bin/extractsequences.R b/modules/local/extractsequences/resources/usr/bin/extractsequences.R new file mode 100755 index 0000000..4056687 --- /dev/null +++ b/modules/local/extractsequences/resources/usr/bin/extractsequences.R @@ -0,0 +1,94 @@ +#!/usr/bin/env Rscript + +library(optparse) +library(Rsamtools) +library(data.table) +library(stringr) + +# help message formatter +nice_formatter <- function(object) { + cat(object@usage, fill = TRUE) + cat(object@description, fill = TRUE) + cat("Options:", sep = "\n") + + options_list <- object@options + for (ii in seq_along(options_list)) { + option <- options_list[[ii]] + cat(" ") + if (!is.na(option@short_flag)) { + cat(option@short_flag) + if (optparse:::option_needs_argument(option)) { + cat(" ", toupper(option@metavar), sep = "") + } + cat(", ") + } + if (!is.null(option@long_flag)) { + cat(option@long_flag) + if (optparse:::option_needs_argument(option)) { + cat("=", toupper(option@metavar), sep = "") + } + } + cat("\n ") + cat(sub("%default", optparse:::as_string(option@default), option@help)) + cat("\n\n") + } + cat(object@epilogue, fill = TRUE) + return(invisible(NULL)) +} + +option_list <- list( + make_option(c("-i", "--index"), action="store", default=NA, type='character', help="FASTA index file"), + make_option(c("-o", "--output"), action="store", default=NA, type='character', help="Output FASTA file"), + make_option(c("-f", 
"--fst-cutoff"), action="store", default=0.7, type='double', help="Fst cutoff value") +) + +# use debug arguments if we have 'em +opt_args <- if (exists("debug_args")) debug_args else commandArgs(TRUE) + +# parse command-line options +opt <- parse_args( + OptionParser( + option_list=option_list, + formatter=nice_formatter, + prog="extractsequences.R", + usage="%prog [options] <fasta> <fst>" + ), + convert_hyphens_to_underscores = TRUE, + positional_arguments = 2, + args = opt_args +) + +index_file <- opt$options$index +output_file <- opt$options$output +fst_cutoff <- opt$options$fst_cutoff + +if (!file.exists(opt$args[1])) { + stop(sprintf("FASTA file %s does not exist.",opt$args[1])) +} +if (!file.exists(opt$args[2])) { + stop(sprintf("Fst file %s does not exist.",opt$args[2])) +} +if (!is.na(index_file) & !file.exists(index_file)) { + stop(sprintf("Index file %s does not exist.",index_file)) +} + +# load indexed fasta file & get sequence lengths +fasta <- FaFile(file=opt$args[1],index=index_file) +lengths <- seqlengths(fasta) + +# use data.table for speed & memory (presumably) +# get sites with "strong" fst and create genomic ranges from them +fst <- fread(opt$args[2])[fst > fst_cutoff, ] +fst <- fst[,.(methods = str_c(unique(method),collapse=',')), by=list(chrom,pos)][order(chrom,pos)] +fst <- fst[,.(positions = str_c(pos,"[",methods,"]",collapse=";")), by = chrom] +fst[, `:=`(len = lengths[chrom],name=str_c(chrom,' positions=',positions))] +fst[,range := str_c(chrom,':1-',len)] + +# pull sequences from fasta using genomic ranges +ranges <- GRanges(fst$range) +seqs <- getSeq(fasta,ranges) +# assign new names to sequences including strongly differentiated positions +names(seqs) <- fst$name + +# save to fasta +writeXStringSet(seqs,output_file,format="fasta") diff --git a/modules/local/extractsequences/tests/main.nf.test new file mode 100644 index 0000000..cdebcf8 --- /dev/null +++ 
b/modules/local/extractsequences/tests/main.nf.test @@ -0,0 +1,73 @@ +// TODO nf-core: Once you have added the required tests, please run the following command to build this file: +// nf-core modules test extractsequences +nextflow_process { + + name "Test Process EXTRACT_SEQUENCES" + script "../main.nf" + process "EXTRACT_SEQUENCES" + + tag "modules" + tag "modules_" + tag "extractsequences" + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used + test("sarscov2 - bam") { + + // TODO nf-core: If you are created a test for a chained module + // (the module requires running more than one process to generate the required output) + // add the 'setup' method here. + // You can find more information about how to use a 'setup' method in the docs (https://nf-co.re/docs/contributing/modules#steps-for-creating-nf-test-for-chained-modules). + + when { + process { + """ + // TODO nf-core: define inputs of the process here. Example: + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + //TODO nf-core: Add all required assertions to verify the test output. + // See https://nf-co.re/docs/contributing/tutorials/nf-test_assertions for more information and examples. + ) + } + + } + + // TODO nf-core: Change the test name preferably indicating the test-data and file-format used but keep the " - stub" suffix. + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + // TODO nf-core: define inputs of the process here. 
Example: + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + //TODO nf-core: Add all required assertions to verify the test output. + ) + } + + } + +} diff --git a/modules/local/joinfreq/resources/usr/bin/joinfreq.R b/modules/local/joinfreq/resources/usr/bin/joinfreq.R index 05399e3..ac58578 100755 --- a/modules/local/joinfreq/resources/usr/bin/joinfreq.R +++ b/modules/local/joinfreq/resources/usr/bin/joinfreq.R @@ -77,7 +77,7 @@ opt <- parse_args( OptionParser( option_list=option_list, formatter=nice_formatter, - prog="fisher.R", + prog="joinfreq.R", usage="%prog [options] " ), convert_hyphens_to_underscores = TRUE, diff --git a/nextflow.config b/nextflow.config index 6441ffa..75a788f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -46,16 +46,18 @@ params { visualize_filters = true filter_only = false - // report options - report_min_coverage = 10 - report_max_coverage = 70 - report_coverage_step = 10 + // report/output options + min_coverage_cutoff = 10 + max_coverage_cutoff = 70 + coverage_cutoff_step = 10 + extract_sequences = false // fst options popoolation2 = false grenedalf = false poolfstat = false missing_zeroes = false + fst_cutoff = 0.7 // popoolation/poolfstat min_count = 2 min_coverage = 10 @@ -81,7 +83,7 @@ params { // Boilerplate options - outdir = null + outdir = 'output' publish_dir_mode = 'symlink' email = null email_on_fail = null diff --git a/nextflow_schema.json b/nextflow_schema.json index e350264..f02d887 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ "schema": "assets/schema_input.json", "mimetype": "text/csv", "pattern": "^\\S+\\.(csv|tsv)$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", + 
"description": "Path to comma/tab-separated file containing information about pools..", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/assesspool/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, @@ -201,7 +201,7 @@ "description": "p-value threshold for Hardy-Weinberg equlibrium test." }, "thin_snps": { - "type": "boolean", + "type": "integer", "description": "Thin sites so that no two sites are within the specified distance from one another." }, "min_mapping_quality": { @@ -287,6 +287,11 @@ "type": "boolean", "description": "Calculate Fst using `poolfstat`" }, + "fst_cutoff": { + "type": "number", + "default": 0.7, + "description": "Threshold at which Fst values are considered \"strongly differentiated.\"" + }, "match_allele_count": { "type": "boolean", "description": "Only keep sites where with matching allele counts across all pools (if something goes wrong early on, try this option)." @@ -378,20 +383,24 @@ "description": "", "default": "", "properties": { - "report_min_coverage": { + "min_coverage_cutoff": { "type": "integer", "default": 10, "description": "Minimum coverage cutoff for report visualizations." }, - "report_max_coverage": { + "max_coverage_cutoff": { "type": "integer", "default": 70, "description": "Maximum coverage cutoff for report visualizations." }, - "report_coverage_step": { + "coverage_cutoff_step": { "type": "integer", "default": 10, "description": "Report coverage cutoff step." + }, + "extract_sequences": { + "type": "boolean", + "description": "Extract reference contigs with sites having Fst > fst_cutoff (for any pairwise comparison)." 
} } } diff --git a/subworkflows/local/filter_sim/main.nf b/subworkflows/local/filter_sim/main.nf index b8671fa..31b4bed 100644 --- a/subworkflows/local/filter_sim/main.nf +++ b/subworkflows/local/filter_sim/main.nf @@ -179,11 +179,7 @@ workflow FILTER_SIM { // turn all the count maps into tsv files describing filtering results ch_filter_summary = ch_filter_summary - .map{ meta, count -> meta.subMap('id') } - .unique() - .map{ meta -> [ meta, [ filter: 'filter', count: 'count' ] ] } - .concat( ch_filter_summary ) - .collectFile(newLine: true, sort: false) { meta, filter -> [ "${meta.id}.filter", "${filter.filter}\t${filter.count}" ] } + .collectFile(newLine: true, sort: true) { meta, filter -> [ "${meta.id}.filter", "${filter.filter}\t${filter.count}" ] } .map{ [it.baseName, it] } // join back to meta tag diff --git a/subworkflows/local/poolstats.nf b/subworkflows/local/poolstats.nf index 2d714e4..1fc2062 100644 --- a/subworkflows/local/poolstats.nf +++ b/subworkflows/local/poolstats.nf @@ -127,10 +127,12 @@ workflow POOLSTATS { .map { id, meta, fst, method, freq -> [ meta, freq, fst, method ] } .set { ch_join } + // join pairwise fst results to snp frequencies and concatenate into one big file per input ch_join = JOINFREQ( ch_join ).fst_freq - .collectFile( keepHeader: true, sort: false, storeDir: 'output/fst' ){ meta, joined -> [ "${meta.id}.fst", joined ] } + .collectFile( keepHeader: true, sort: true, storeDir: 'output/fst', cache: true ){ meta, joined -> [ "${meta.id}.fst", joined ] } .map{ [it.baseName, it ] } + // rejoin meta tags ch_join = ch_vcf .map{ it[0] } diff --git a/subworkflows/local/postprocess.nf b/subworkflows/local/postprocess.nf index 53ae84e..c1e6176 100644 --- a/subworkflows/local/postprocess.nf +++ b/subworkflows/local/postprocess.nf @@ -1,5 +1,6 @@ -include { RMARKDOWNNOTEBOOK as CREATE_REPORT } from '../../modules/nf-core/rmarkdownnotebook/main' -include { RIPGREP as COUNT_SNPS_FINAL } from '../../modules/nf-core/ripgrep/main' +include { 
RMARKDOWNNOTEBOOK as CREATE_REPORT } from '../../modules/nf-core/rmarkdownnotebook/main' +include { RIPGREP as COUNT_SNPS_FINAL } from '../../modules/nf-core/ripgrep/main' +include { EXTRACT_SEQUENCES } from '../../modules/local/extractsequences/main' workflow POSTPROCESS { @@ -18,10 +19,18 @@ workflow POSTPROCESS { ch_versions = Channel.empty() + // extract reference contigs with "strongly differentiated" snps + if (params.extract_sequences) { + EXTRACT_SEQUENCES( ch_ref, ch_fst, params.fst_cutoff ) + ch_versions = ch_versions.mix(EXTRACT_SEQUENCES.out.versions.first()) + } + + // collapse pairwise fisher tests into single files ch_fisher_collapsed = ch_fisher - .collectFile( keepHeader: true, sort: false, storeDir: 'output/fishertest' ){ meta, fish -> [ "${meta.id}.fisher", fish ] } + .collectFile( keepHeader: true, sort: true, storeDir: 'output/fishertest' ){ meta, fish -> [ "${meta.id}.fisher", fish ] } .map{ [ it.baseName, it ] } + // join them back to meta tags ch_fisher_collapsed = ch_vcf .map{ it[0] } @@ -40,12 +49,9 @@ workflow POSTPROCESS { // collect final filter summary into tsv files ch_filter_final = ch_filter_final - .map{ meta, count -> meta.subMap('id') } - .unique() - .map{ meta -> [ meta, [ filter: 'filter', count: 'count' ] ] } - .concat( ch_filter_final ) - .collectFile(newLine: true, sort: false) { meta, filter -> [ "${meta.id}.final_filter", "${filter.filter}\t${filter.count}" ] } + .collectFile(newLine: true, sort: true ) { meta, filter -> [ "${meta.id}.final_filter", "${filter.filter}\t${filter.count}" ] } .map{ [ it.baseName, it ] } + // join them back to the meta tags ch_filter_final = ch_vcf .map{ it[0] } @@ -66,10 +72,19 @@ workflow POSTPROCESS { .mix( ch_filter_final.ifEmpty{ [] } ) .groupTuple() + // subset the params object because there's at least one value that changes + // every time, which invalidates the caching + nf_params = [ + 'coverage_cutoff_step', + 'max_coverage_cutoff', + 'min_coverage_cutoff', + 'visualize_filters' + ] 
+ ch_params = ch_input_files. map{ meta, files -> [ meta, files.collect{ [ it.extension, it.name ] }.collectEntries() ] } .map{ meta, p -> [ meta, p + [ - nf: params, + nf: params.subMap(nf_params), tz: TimeZone.getDefault().getID() ]]} From 27cca7ee90bb552f5c41ba7e229dc5b63d7dd049 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Tue, 29 Jul 2025 19:12:48 -0700 Subject: [PATCH 04/39] cut some extra junk out --- assets/assesspool_report.Rmd | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 595a7fa..8b38d63 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -996,18 +996,12 @@ pool_data %>% } %>% add_trace( type = "heatmap", - data = hmd, #%>% - # filter( across( all_of(str_glue("{v[3]}_na")), \(na) !na | is.na(na) ) ), + data = hmd, x = ~.data[[v[1]]], # x variable y = ~.data[[v[2]]], # y variable z = ~.data[[v[3]]], # z (color) variable # this is what gets shown in the tooltip customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), - # zmin and zmax are trying to make it so the "missing" - # color doesn't show up in the color scale legend - # zmin = ~min(.data[[v[[3]]]],na.rm = TRUE),#*0.00001, - # zmin = ~min(.data[[ str_glue("{v[3]}_val") ]],na.rm = TRUE),#*0.00001, - # zmax = ~max(.data[[v[3]]],na.rm = TRUE), colorscale = cscale(v[4],n = 10,na = NA), # color palette, based on 100 gradations # style the colorbar colorbar = list( @@ -1032,7 +1026,6 @@ pool_data %>% # name the figure axes and don't let the user zoom around xaxis = list(title = "Pool 1", fixedrange = TRUE), yaxis = list(title = "Pool 2", fixedrange = TRUE), - # title = str_glue("{trace_vars$title} (coverage cutoff: ≥{cutoff}") title = list( text = str_glue("{trace_vars$title} (coverage cutoff: ≥{cutoff})"), font = list( From b19112d70f48d70f3c4774399348bba385624a80 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Fri, 1 Aug 2025 15:43:21 -1000 Subject: 
[PATCH 05/39] conver fisher test and sequence extract to use data.table --- .../resources/usr/bin/extractsequences.R | 6 +- .../fishertest/resources/usr/bin/fisher.R | 257 ++++++++++++------ 2 files changed, 180 insertions(+), 83 deletions(-) diff --git a/modules/local/extractsequences/resources/usr/bin/extractsequences.R b/modules/local/extractsequences/resources/usr/bin/extractsequences.R index 4056687..71b08dd 100755 --- a/modules/local/extractsequences/resources/usr/bin/extractsequences.R +++ b/modules/local/extractsequences/resources/usr/bin/extractsequences.R @@ -79,12 +79,16 @@ lengths <- seqlengths(fasta) # use data.table for speed & memory (presumably) # get sites with "strong" fst and create genomic ranges from them fst <- fread(opt$args[2])[fst > fst_cutoff, ] +# group by contig and position, and string together calculation methods that got these high Fst values fst <- fst[,.(methods = str_c(unique(method),collapse=',')), by=list(chrom,pos)][order(chrom,pos)] +# group by just contig and make a string that looks like pos1[];pos2[];etc. 
fst <- fst[,.(positions = str_c(pos,"[",methods,"]",collapse=";")), by = chrom] +# pull in sequence lengths and make a 'name' column that's positions= fst[, `:=`(len = lengths[chrom],name=str_c(chrom,' positions=',positions))] +# create range column fst[,range := str_c(chrom,':1-',len)] -# get pull sequences from fasta using genomic ranges +# pull sequences from fasta using genomic ranges ranges <- GRanges(fst$range) seqs <- getSeq(fasta,ranges) # assign new names to sequences including strongly differentiated positions diff --git a/modules/local/fishertest/resources/usr/bin/fisher.R b/modules/local/fishertest/resources/usr/bin/fisher.R index b0ce9c1..85e5f18 100755 --- a/modules/local/fishertest/resources/usr/bin/fisher.R +++ b/modules/local/fishertest/resources/usr/bin/fisher.R @@ -1,14 +1,15 @@ #!/usr/bin/env Rscript -suppressPackageStartupMessages(library(dplyr)) -suppressPackageStartupMessages(library(tidyr)) -suppressPackageStartupMessages(library(readr)) -suppressPackageStartupMessages(library(janitor)) -suppressPackageStartupMessages(library(purrr)) +# suppressPackageStartupMessages(library(dplyr)) +# suppressPackageStartupMessages(library(tidyr)) +# suppressPackageStartupMessages(library(readr)) +# suppressPackageStartupMessages(library(janitor)) +# suppressPackageStartupMessages(library(purrr)) +suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(optparse)) suppressPackageStartupMessages(library(matrixStats)) -options(dplyr.summarise.inform = FALSE) +# options(dplyr.summarise.inform = FALSE) # help message formatter @@ -163,15 +164,19 @@ if (!file.exists(opt$args[1])) { # split(ceiling(seq_along(.)/50)) %>% -freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) +# freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) +freq_snps <- fread(opt$args[1]) if (all(is.na(pools))) { - pools <- freq_snps %>% - slice(1) %>% - pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = 
FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% - pull(pool) %>% - sort() %>% - unique() + pools <- melt(freq_snps[1], measure = measure(pool,measure,pattern = r'[^(.+)\.(REF_CNT)$]'),value.name = 'count')[ + pool != 'TOTAL' + ][order(pool)]$pool + # pools <- freq_snps %>% + # slice(1) %>% + # pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% + # pull(pool) %>% + # sort() %>% + # unique() } # make sure we're dealing with two pools @@ -181,88 +186,176 @@ if (npool != 2) { stop("Frequency file must contain exactly two pools") } -freq_snp_og <- freq_snps +cols <- names(freq_snps) +pr <- paste0('^(',paste0(pools,collapse='|'),')') + # continue filtering -freq_snps <- freq_snp_og %>% - select(CHROM:ALT,starts_with(paste0(pools,"."),ignore.case = FALSE)) %>% - mutate( - `TOTAL.REF_CNT` = rowSums(pick(ends_with(".REF_CNT",ignore.case = FALSE))), - `TOTAL.ALT_CNT` = rowSums(pick(ends_with(".ALT_CNT",ignore.case = FALSE))), - `TOTAL.DEPTH` = rowSums(pick(ends_with(".DEPTH",ignore.case = FALSE))), - ) %>% - mutate( - lwr = floor((POS-1)/window_step)*window_step+1, - upr = lwr+window_size-1, - middle=floor((upr+lwr)/2) - ) %>% - filter( if_all(ends_with(".DEPTH"),~.x >= min_depth) ) %>% - add_count(CHROM,middle,name="snp_count") %>% - rename_with(make_clean_names,.cols = starts_with("TOTAL.",ignore.case = FALSE)) %>% - filter( - if_all(starts_with("total_",ignore.case = FALSE) & ends_with("_cnt",ignore.case = FALSE),~.x >= min_count), - total_ref_cnt != total_depth, - total_depth >= min_depth - ) %>% - rename_with(CHROM:ALT,.fn=make_clean_names) %>% - select(chrom,pos,middle,ref:alt,ends_with(".REF_CNT"),ends_with(".ALT_CNT"),starts_with("total_"),everything()) - -fisher_results <- freq_snps %>% - select(order(colnames(.))) %>% - select( - chrom, pos, middle, snp_count,start=lwr,end=upr, - ends_with(".DEPTH",ignore.case = 
FALSE), - ends_with(".REF_CNT",ignore.case = FALSE), - ends_with(".ALT_CNT",ignore.case = FALSE) - ) %>% - pivot_longer( - -c(chrom,pos,middle,snp_count,start,end,ends_with(".DEPTH",ignore.case = FALSE)), - names_to = c("pop","measure"), - values_to = "count", - names_pattern = "^(.+)\\.([^.]+)$" - ) %>% - group_by(chrom,pos,middle,start,end,snp_count,across(ends_with(".DEPTH",ignore.case = FALSE))) %>% - summarise(pval = fisher.test(matrix(count,ncol=2))$p.value) %>% - ungroup() %>% - mutate(min_cov = matrixStats::rowMins(as.matrix(pick(ends_with(".DEPTH",ignore.case = FALSE))))) +# freq_snps <- freq_snps %>% +# select(CHROM:ALT,starts_with(paste0(pools,"."),ignore.case = FALSE)) %>% +# mutate( +# `TOTAL.REF_CNT` = rowSums(pick(ends_with(".REF_CNT",ignore.case = FALSE))), +# `TOTAL.ALT_CNT` = rowSums(pick(ends_with(".ALT_CNT",ignore.case = FALSE))), +# `TOTAL.DEPTH` = rowSums(pick(ends_with(".DEPTH",ignore.case = FALSE))), +# ) %>% +# mutate( +# lwr = floor((POS-1)/window_step)*window_step+1, +# upr = lwr+window_size-1, +# middle=floor((upr+lwr)/2) +# ) %>% +# filter( if_all(ends_with(".DEPTH"),~.x >= min_depth) ) %>% +# add_count(CHROM,middle,name="snp_count") %>% +# rename_with(make_clean_names,.cols = starts_with("TOTAL.",ignore.case = FALSE)) %>% +# filter( +# if_all(starts_with("total_",ignore.case = FALSE) & ends_with("_cnt",ignore.case = FALSE),~.x >= min_count), +# total_ref_cnt != total_depth, +# total_depth >= min_depth +# ) %>% +# rename_with(CHROM:ALT,.fn=make_clean_names) %>% +# select(chrom,pos,middle,ref:alt,ends_with(".REF_CNT"),ends_with(".ALT_CNT"),starts_with("total_"),everything()) + +# try to do as much data.table stuff in-place (using `:=`) as possible to increase memory efficiency + +# first, delete columns we don't care about +freq_snps[,cols[-c(1:4,grep(pr,cols))] := NULL ] +# add a total ref count +freq_snps[,TOTAL.REF_CNT := rowSums(.SD),.SDcols=patterns(r'[\.REF_CNT$]')] +# add a total alt count +freq_snps[,TOTAL.ALT_CNT := 
rowSums(.SD),.SDcols=patterns(r'[\.ALT_CNT$]')] +# add total depth +freq_snps[,TOTAL.DEPTH := rowSums(.SD),.SDcols=patterns(r'[\.DEPTH$]')] +# add window start +freq_snps[,start := floor((POS-1)/window_step)*window_step+1] +# add window end +freq_snps[,end := start+window_size-1] +# add window center (position) +freq_snps[,middle := floor((end+start)/2)] +# add window snp count +freq_snps[,snp_count := .N, by=.(CHROM,middle)] +# filter to minimum depth +freq_snps <- freq_snps[ freq_snps[,do.call(pmin,.SD) >= min_depth, .SDcols = patterns(r'[\.DEPTH$]')] ] +# filter to minimum count, remove invariant sites, and minimum depth +freq_snps <- freq_snps[ freq_snps[ , do.call(pmin,.SD) >= min_count, .SDcols = patterns(r'[^TOTAL\..+_CNT$]') ] ][ + TOTAL.REF_CNT != TOTAL.DEPTH & TOTAL.DEPTH >= min_depth +] +setnames(freq_snps,old=c("CHROM","POS","REF","ALT"),new=tolower) +setcolorder(freq_snps,neworder = c('chrom','pos','middle','ref','alt')) +freq_snps[,grep('total',names(freq_snps),ignore.case = TRUE) := NULL ] +cols <- names(freq_snps) +mv <- c(cols[1:3],'snp_count','start','end',grep(r'[\.DEPTH$]',cols,value=TRUE),grep(r'[_CNT$]',cols,value=TRUE)) +setcolorder(freq_snps,mv) + +# fisher_results <- freq_snps %>% +# select(order(colnames(.))) %>% +# select( +# chrom, pos, middle, snp_count,start=lwr,end=upr, +# ends_with(".DEPTH",ignore.case = FALSE), +# ends_with(".REF_CNT",ignore.case = FALSE), +# ends_with(".ALT_CNT",ignore.case = FALSE) +# ) %>% +# pivot_longer( +# -c(chrom,pos,middle,snp_count,start,end,ends_with(".DEPTH",ignore.case = FALSE)), +# names_to = c("pop","measure"), +# values_to = "count", +# names_pattern = "^(.+)\\.([^.]+)$" +# ) %>% +# group_by(chrom,pos,middle,start,end,snp_count,across(ends_with(".DEPTH",ignore.case = FALSE))) %>% +# summarise(pval = fisher.test(matrix(count,ncol=2))$p.value) %>% +# ungroup() %>% +# mutate(min_cov = matrixStats::rowMins(as.matrix(pick(ends_with(".DEPTH",ignore.case = FALSE))))) + +# calculate fisher results + +# 
reshape frequency data and order ref and alt counts appropriately +fisher_results <- melt(freq_snps, measure = measure(pop,measure,pattern = r'[^(.+)\.(.+_CNT)]'),value.name = 'count')[order(chrom,pos,-measure,pop)] + +# get columns to group by +byc <- grep('^(pop|count|measure|ref|alt)$',names(fisher_results),invert = T,value = TRUE) +# calculate fisher tests for each combination +fisher_results <- fisher_results[,.(pval = fisher.test(matrix(count,ncol=2))$p.value),by=byc][order(chrom,pos)] +fisher_results[,avg_min_cov := do.call(pmin,.SD),.SDcols = patterns(r'[\.DEPTH$]')] if (adjust_p) { - fisher_results <- fisher_results %>% - mutate(p_adj = p.adjust(pval,method=adjust_method)) + fisher_results[, pval := p.adjust(pval,method=adjust_method)] + # fisher_results <- fisher_results %>% + # mutate(p_adj = p.adjust(pval,method=adjust_method)) } if (save_all & window_size > 1) { ss <- sprintf("%s/%s_%s_all_snps_fisher.tsv",outdir,file_prefix,paste0(pools,collapse="-")) - fisher_results %>% - mutate( - fisher = -log10(pval), - pop1 = pools[1], - pop2 = pools[2], - window_size = 1, - covered_fraction = 1, - start=pos, - end=pos, - ) %>% - select(chrom,pos,snps,covered_fraction,avg_min_cov = min_cov,pop1,pop2,fisher) %>% - write_tsv(ss) + fisher_results[,`:=`( + fisher = -log10(pval), + pop1 = pools[1], + pop2 = pools[2], + window_size = 1, + covered_fraction = 1, + start=pos, + end=pos + )] + + fwrite( + fisher_results[,c('chrom','pos','covered_fraction','avg_min_cov','pop1','pop2','fisher'),with=FALSE], + file = ss, + sep="\t" + ) + # fisher_results %>% + # mutate( + # fisher = -log10(pval), + # pop1 = pools[1], + # pop2 = pools[2], + # window_size = 1, + # covered_fraction = 1, + # start=pos, + # end=pos, + # ) %>% + # select(chrom,pos,snps,covered_fraction,avg_min_cov = min_cov,pop1,pop2,fisher) %>% + # write_tsv(ss) } -fisher_results <- fisher_results %>% - group_by(chrom,middle,start,end,snp_count) %>% - summarise( +if (window_type != 'single') { + fisher_results <- 
fisher_results[,.( fisher = -log10(p_combine(pval)), pop1 = pools[1], pop2 = pools[2], - window_size = n(), + window_size = .N, covered_fraction = unique(snp_count)/window_size, - avg_min_cov = mean(min_cov) - ) %>% - ungroup() %>% - mutate( - pos = floor((start+end)/2), - method = 'assesspool' - ) %>% - select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fisher,method) + avg_min_cov = mean(avg_min_cov) + ), by = .(chrom,middle,start,end,snp_count)] + fisher_results[,pos := floor((start+end)/2)] +} else { + fisher_results[,`:=`( + covered_fraction = 1, + window_size = 1, + pop1 = pools[1], + pop2 = pools[2], + fisher = pval + )] +} + +fisher_results[,method := 'assesspool'] +cols <- names(fisher_results) +cn <- which(cols %in% c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','fisher','method')) +fisher_results[,cols[-cn] := NULL] +setcolorder(fisher_results,c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','fisher','method')) + + +# fisher_results <- fisher_results %>% +# group_by(chrom,middle,start,end,snp_count) %>% +# summarise( +# fisher = -log10(p_combine(pval)), +# pop1 = pools[1], +# pop2 = pools[2], +# window_size = n(), +# covered_fraction = unique(snp_count)/window_size, +# avg_min_cov = mean(min_cov) +# ) %>% +# ungroup() %>% +# mutate( +# pos = floor((start+end)/2), +# method = 'assesspool' +# ) %>% +# select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fisher,method) ss <- sprintf("%s/%s_%s_window_%d_%d_fisher.tsv",outdir,file_prefix,paste0(pools,collapse="-"),window_size,window_step) -write_tsv(fisher_results,ss) \ No newline at end of file +fwrite(fisher_results,ss,sep="\t") +# write_tsv(fisher_results,ss) \ No newline at end of file From b0e5f5316c6f322a185956dda1cb247b4b9a1a9b Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Fri, 1 Aug 2025 16:07:29 -1000 Subject: [PATCH 06/39] convert poolfstat to use data.table --- .../fst/resources/usr/bin/poolfstat.R | 72 
++++++++++++------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R index 08ef2d1..e84129a 100755 --- a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R +++ b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R @@ -3,10 +3,11 @@ suppressPackageStartupMessages({ library(poolfstat) library(optparse) - library(tibble) - library(stringr) - library(readr) - library(dplyr) + # library(tibble) + # library(stringr) + # library(readr) + # library(dplyr) + library(data.table) }) @@ -99,33 +100,54 @@ pooldata <- popsync2pooldata( fst <- computeFST(pooldata,sliding.window.size=opt$options$window_size) # get population combo string -fn <- str_c(pooldata@poolnames,collapse='-') +# fn <- str_c(pooldata@poolnames,collapse='-') +fn <- paste0(pooldata@poolnames,collapse='-') # save global Fst -global_fst <- tibble(pop1=pooldata@poolnames[1],pop2=pooldata@poolnames[2],fst=fst$Fst[1]) -write_tsv(global_fst,str_glue("{opt$options$prefix}_global_{fn}.tsv")) +# global_fst <- tibble(pop1=pooldata@poolnames[1],pop2=pooldata@poolnames[2],fst=fst$Fst[1]) +# write_tsv(global_fst,str_glue("{opt$options$prefix}_global_{fn}.tsv")) +global_fst <- data.table(pop1=pooldata@poolnames[1],pop2=pooldata@poolnames[2],fst=fst$Fst[1]) +fwrite(global_fst,sprintf("%s_global_%s.tsv",opt$options$prefix,fn),sep="\t") # save sliding window Fst, if they exist if (!is.null(fst$sliding.windows.fvalues)) { - sliding <- as_tibble(fst$sliding.windows.fvalues) %>% - rename(chrom=1,start=2,end=3,mid=4,cum_mid=5,fst=6) - write_tsv(sliding,str_glue("{opt$options$prefix}_sliding-{opt$options$window_size}_{fn}.tsv")) + # sliding <- as_tibble(fst$sliding.windows.fvalues) %>% + # rename(chrom=1,start=2,end=3,mid=4,cum_mid=5,fst=6) + # write_tsv(sliding,str_glue("{opt$options$prefix}_sliding-{opt$options$window_size}_{fn}.tsv")) + sliding <- 
as.data.table(fst$sliding.windows.fvalues) + setnames(sliding,new=c('chrom','start','end','mid','cum_mid','fst')) + fwrite(sliding,sprintf("%s_sliding-%d_%s.tsv",opt$options$prefix,opt$options$window_size,fn),sep="\t") } -pools <- str_c(str_c(pooldata@poolnames,collapse=':'),".fst") +# pools <- str_c(str_c(pooldata@poolnames,collapse=':'),".fst") +pools <- paste0(paste0(pooldata@poolnames,collapse=':'),".fst") # save per-snp Fst, match popoolation output -snp_fst <- pooldata@snp.info %>% - as_tibble() %>% - select(chrom=1,pos=2) %>% - mutate( - window_size = opt$options$window_size, - covered_fraction = 1, - # depth = rowSums(pooldata@readcoverage), - avg_min_cov = do.call(pmin,as.data.frame(pooldata@readcoverage)), - pop1 = opt$options$pool_names[1], - pop2 = opt$options$pool_names[2], - fst = fst$snp.Fstats$Fst - ) %>% - select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fst) -write_tsv(snp_fst,str_glue("{opt$options$prefix}_{fn}.fst"),col_names=opt$options$headers) \ No newline at end of file +# snp_fst <- pooldata@snp.info %>% +# as_tibble() %>% +# select(chrom=1,pos=2) %>% +# mutate( +# window_size = opt$options$window_size, +# covered_fraction = 1, +# # depth = rowSums(pooldata@readcoverage), +# avg_min_cov = do.call(pmin,as.data.frame(pooldata@readcoverage)), +# pop1 = opt$options$pool_names[1], +# pop2 = opt$options$pool_names[2], +# fst = fst$snp.Fstats$Fst +# ) %>% +# select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fst) +# write_tsv(snp_fst,str_glue("{opt$options$prefix}_{fn}.fst"),col_names=opt$options$headers) + +snp_fst <- as.data.table(pooldata@snp.info) +cols <- names(snp_fst) +snp_fst[,cols[-c(1:2)] := NULL] +setnames(snp_fst,c('chrom','pos')) +snp_fst[,`:=`( + window_size = opt$options$window_size, + covered_fraction = 1, + avg_min_cov = do.call(pmin,as.data.table(pooldata@readcoverage)), + pop1 = opt$options$pool_names[1], + pop2 = opt$options$pool_names[2], + fst = fst$snp.Fstats$Fst +)] 
+fwrite(snp_fst,sprintf("%s_%s.fst",opt$options$prefix,fn),col.names = opt$options$headers) From 6b1255310828052059026e609dbcd53be7e5db3a Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Fri, 1 Aug 2025 19:11:21 -1000 Subject: [PATCH 07/39] convert joinfreq to data.table --- .../joinfreq/resources/usr/bin/joinfreq.R | 191 ++++++++++++------ 1 file changed, 131 insertions(+), 60 deletions(-) diff --git a/modules/local/joinfreq/resources/usr/bin/joinfreq.R b/modules/local/joinfreq/resources/usr/bin/joinfreq.R index ac58578..7ccf000 100755 --- a/modules/local/joinfreq/resources/usr/bin/joinfreq.R +++ b/modules/local/joinfreq/resources/usr/bin/joinfreq.R @@ -1,15 +1,16 @@ #!/usr/bin/env Rscript suppressPackageStartupMessages({ - library(dplyr) - library(tidyr) - library(readr) - library(purrr) - library(stringr) + # library(dplyr) + # library(tidyr) + # library(readr) + # library(purrr) + # library(stringr) library(optparse) + library(data.table) }) -options(dplyr.summarise.inform = FALSE) +# options(dplyr.summarise.inform = FALSE) # help message formatter @@ -105,62 +106,132 @@ if (!file.exists(opt$args[2])) { stop(sprintf("Fst file %s does not exist.",opt$args[1])) } -freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) -fst_wide <- read_tsv(opt$args[2],col_types = cols(),progress = FALSE) -if (all(c("start","end") %in% names(fst_wide))) { - fst_wide <- fst_wide %>% - mutate(pos = floor((start+end)/2)) %>% - select(-c(start,end)) %>% - select(chrom,pos,everything()) +freq_snps <- fread(opt$args[1]) +fst <- fread(opt$args[2]) + +# freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) +# fst_wide <- read_tsv(opt$args[2],col_types = cols(),progress = FALSE) +if (all(c("start","end") %in% names(fst))) { + fst[, pos := floor((start+end)/2) ] + fst[, c('start','end') := NULL ] + setcolorder(fst,c('chrom','pos')) + + # fst_wide <- fst_wide %>% + # mutate(pos = floor((start+end)/2)) %>% + # select(-c(start,end)) %>% + # 
select(chrom,pos,everything()) } # load fst file, pivot long, and filter out NAs -fst <- fst_wide %>% - { - if (calc_method == "grenedalf") { - select(.,chrom:pos,ends_with(".fst",ignore.case = FALSE)) %>% - rename_with(\(n) { - str_match(n,"^(.+):(.+)\\.(fst)$") %>% - array_branch(1) %>% - map_chr(\(x) { - f <- sort(x[2:3]) - str_c(f[1],":",f[2],".",x[4]) - }) - },.cols = -c(chrom:pos)) %>% - pivot_longer(-c(chrom:pos),names_to=c("pop1","pop2",".value"),names_pattern = "^([^:]+):(.+)\\.(fst)$") - } else select(.,chrom,pos,pop1,pop2,fst) - } %>% - filter(!is.na(fst)) - -freq_snps <- freq_snps %>% - pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% - select(CHROM,POS,pool,measure,count) %>% - pivot_wider(names_from="measure",values_from = "count") %>% - rename_with(str_to_lower,.cols = c(1:2)) - -# continue filtering -combined <- fst %>% - left_join(freq_snps,by=c("chrom","pos","pop1" = "pool")) %>% - left_join(freq_snps,by=c("chrom","pos","pop2" = "pool"),suffix=c(".pop1",".pop2")) %>% - mutate( - # summarize read depths - TOTAL.REF_CNT = REF_CNT.pop1 + REF_CNT.pop2, - TOTAL.ALT_CNT = ALT_CNT.pop1 + ALT_CNT.pop2, - TOTAL.DEPTH = DEPTH.pop1 + DEPTH.pop2 - ) %>% - filter( - # filter by minimum minimum depth and others - DEPTH.pop1 >= min_depth & DEPTH.pop2 >= min_depth, - TOTAL.ALT_CNT >= min_count & TOTAL.REF_CNT >= min_count, - TOTAL.REF_CNT != TOTAL.DEPTH, - TOTAL.DEPTH >= min_depth - ) %>% - mutate( - avg_min_cov = pmin(DEPTH.pop1, DEPTH.pop2), - method = calc_method - ) %>% - arrange(chrom,pos,pop1,pop2) %>% - rename_with(str_to_lower) +if (calc_method == "grenedalf") { + # delete unused columns + cols <- names(fst) + fst[,c(cols[grep(r'[\.(missing|numeric|passed|masked|empty|invariant)$]',cols)]) := NULL] + + # make sure the pool name pairs are sorted, so that they + # look like a:b.fst and not like b:a.fst + + # first, pull out fst column names and strip 
off '.fst' + old_cols <- grep(r'[\.fst$]',names(fst),value=TRUE) + fst_cols <- gsub(r'[\.fst$]','',old_cols) + # split them into pairs by colon, sort the pairs, smash them back together, and re-add '.fst' + fst_cols <- paste0(sapply(strsplit(fst_cols,':'),\(x) paste0(sort(x),collapse=':')),'.fst') + # rename the original columns to the sorted version + setnames(fst,old = old_cols, new=fst_cols) + + # now convert to long format + fst <- melt(fst, measure = measure(pop1,pop2,pattern = r'[^([^:]+):(.+)\.fst$]'),value.name = 'fst') +} else { + setcolorder(fst,c('chrom','pos','pop1','pop2','fst')) + fst[,c(names(fst)[-c(1:5)]) := NULL] +} +# fst <- fst_wide %>% +# { +# if (calc_method == "grenedalf") { +# select(.,chrom:pos,ends_with(".fst",ignore.case = FALSE)) %>% +# rename_with(\(n) { +# str_match(n,"^(.+):(.+)\\.(fst)$") %>% +# array_branch(1) %>% +# map_chr(\(x) { +# f <- sort(x[2:3]) +# str_c(f[1],":",f[2],".",x[4]) +# }) +# },.cols = -c(chrom:pos)) %>% +# pivot_longer(-c(chrom:pos),names_to=c("pop1","pop2",".value"),names_pattern = "^([^:]+):(.+)\\.(fst)$") +# } else select(.,chrom,pos,pop1,pop2,fst) +# } %>% +# filter(!is.na(fst)) + +# pivot frequency table longer + + # fst_wide <- melt(fst_wide, measure = measure(pop1,pop2,pattern = r'[^([^:]+):(.+)\.fst$]'),value.name = 'fst') +freq_snps[,c(grep('^TOTAL',names(freq_snps))) := NULL ] +freq_snps <- melt(freq_snps, measure = measure(pool,measure,pattern = r'[^([^.]+)\.(.+)$]'),value.name = 'count') +freq_snps <- dcast(freq_snps,CHROM+POS+pool ~ measure,value.var = "count")[order(CHROM,POS,pool)] +setnames(freq_snps,1:2,new=tolower) + +# freq_snps2 <- freq_snps %>% +# pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% +# select(CHROM,POS,pool,measure,count) %>% +# pivot_wider(names_from="measure",values_from = "count") %>% +# rename_with(str_to_lower,.cols = c(1:2)) %>% +# arrange(chrom,pos,pool) + +combined 
<- merge(fst,freq_snps,by.x = c('chrom','pos','pop1'),by.y = c('chrom','pos','pool')) +combined <- merge(combined,freq_snps,by.x = c('chrom','pos','pop2'),by.y = c('chrom','pos','pool')) +setnames( + combined, + c('REF_CNT.x','REF_CNT.y','ALT_CNT.x','ALT_CNT.y','DEPTH.x','DEPTH.y'), + c('REF_CNT.pop1','REF_CNT.pop2','ALT_CNT.pop1','ALT_CNT.pop2','DEPTH.pop1','DEPTH.pop2') +) +combined[,`:=`( + TOTAL.REF_CNT = REF_CNT.pop1 + REF_CNT.pop2, + TOTAL.ALT_CNT = ALT_CNT.pop1 + ALT_CNT.pop2, + TOTAL.DEPTH = DEPTH.pop1 + DEPTH.pop2 +)] +combined <- combined[ + DEPTH.pop1 >= min_depth & DEPTH.pop2 >= min_depth & + TOTAL.ALT_CNT >= min_count & TOTAL.REF_CNT >= min_count & + TOTAL.REF_CNT != TOTAL.DEPTH & + TOTAL.DEPTH >= min_depth +][order(chrom,pos,pop1,pop2)] +combined[,`:=`( + avg_min_cov = do.call(pmin,.SD), + method = calc_method +), .SDcols = patterns('^DEPTH\\.')] +setnames(combined,new=tolower) +setcolorder( + combined, + c('chrom','pos','pop1','pop2','fst','avg_min_cov','method','alt_cnt.pop1','alt_cnt.pop2', + 'ref_cnt.pop1','ref_cnt.pop2','depth.pop1','depth.pop2','total.alt_cnt','total.ref_cnt','total.depth') +) outfile <- sprintf("%s/%s_fst_frequency.tsv",outdir,file_prefix) -write_tsv(combined,outfile) \ No newline at end of file +fwrite(combined,outfile,sep="\t") + +# # continue filtering +# combined <- fst %>% +# left_join(freq_snps,by=c("chrom","pos","pop1" = "pool")) %>% +# left_join(freq_snps,by=c("chrom","pos","pop2" = "pool"),suffix=c(".pop1",".pop2")) %>% +# mutate( +# # summarize read depths +# TOTAL.REF_CNT = REF_CNT.pop1 + REF_CNT.pop2, +# TOTAL.ALT_CNT = ALT_CNT.pop1 + ALT_CNT.pop2, +# TOTAL.DEPTH = DEPTH.pop1 + DEPTH.pop2 +# ) %>% +# filter( +# # filter by minimum minimum depth and others +# DEPTH.pop1 >= min_depth & DEPTH.pop2 >= min_depth, +# TOTAL.ALT_CNT >= min_count & TOTAL.REF_CNT >= min_count, +# TOTAL.REF_CNT != TOTAL.DEPTH, +# TOTAL.DEPTH >= min_depth +# ) %>% +# mutate( +# avg_min_cov = pmin(DEPTH.pop1, DEPTH.pop2), +# method = calc_method 
+# ) %>% +# arrange(chrom,pos,pop1,pop2) %>% +# rename_with(str_to_lower) + +# outfile <- sprintf("%s/%s_fst_frequency.tsv",outdir,file_prefix) +# write_tsv(combined,outfile) \ No newline at end of file From 84c179729f76b44e75755e24bf15f70d03b1d641 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Sat, 2 Aug 2025 12:54:15 -1000 Subject: [PATCH 08/39] update containers and conda environments for data.table conversions --- .../local/extractsequences/environment.yml | 16 +++----------- modules/local/extractsequences/main.nf | 7 +++--- .../resources/usr/bin/extractsequences.R | 9 ++++---- modules/local/fishertest/environment.yml | 7 +----- modules/local/fishertest/main.nf | 7 +++--- .../fishertest/resources/usr/bin/fisher.R | 22 +++++++++++-------- modules/local/joinfreq/environment.yml | 6 +---- modules/local/joinfreq/main.nf | 7 +++--- modules/local/poolfstat/fst/environment.yml | 5 +---- modules/local/poolfstat/fst/main.nf | 7 +++--- 10 files changed, 39 insertions(+), 54 deletions(-) diff --git a/modules/local/extractsequences/environment.yml b/modules/local/extractsequences/environment.yml index d360ea3..78c0fb4 100644 --- a/modules/local/extractsequences/environment.yml +++ b/modules/local/extractsequences/environment.yml @@ -4,16 +4,6 @@ channels: - conda-forge - bioconda dependencies: - - conda-forge::gmp=6.2.1 - - conda-forge::pandoc=2.19.2 - - conda-forge::r-optparse=1.7.3 - - conda-forge::r-dplyr=1.0.10 - - conda-forge::r-plyr=1.8.7 - - conda-forge::r-stringr=1.4.1 - - conda-forge::r-data.table=1.14.4 - - conda-forge::r-plotly=4.10.0 - - bioconda::r-seqinr=4.2_16 - - bioconda::bioconductor-rsamtools=2.10.0 - - bioconda::bioconductor-shortread=1.52.0 - - bioconda::bioconductor-genomicalignments=1.30.0 - - bioconda::bioconductor-biomart=2.50.0 + - conda-forge::r-optparse=1.7.5 + - conda-forge::r-data.table=1.17.8 + - bioconda::bioconductor-rsamtools=2.22.0 diff --git a/modules/local/extractsequences/main.nf b/modules/local/extractsequences/main.nf index 
8b3af59..c4a4d54 100644 --- a/modules/local/extractsequences/main.nf +++ b/modules/local/extractsequences/main.nf @@ -3,9 +3,10 @@ process EXTRACT_SEQUENCES { label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-61c59287265e27f2c4589cfc90013ef6c2c6acf1:fb3e48060a8c0e5108b1b60a2ad7e090cfb9eee5-0': - 'biocontainers/mulled-v2-61c59287265e27f2c4589cfc90013ef6c2c6acf1:fb3e48060a8c0e5108b1b60a2ad7e090cfb9eee5-0' }" + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/mulled-v2-61c59287265e27f2c4589cfc90013ef6c2c6acf1:fb3e48060a8c0e5108b1b60a2ad7e090cfb9eee5-0': + // 'biocontainers/mulled-v2-61c59287265e27f2c4589cfc90013ef6c2c6acf1:fb3e48060a8c0e5108b1b60a2ad7e090cfb9eee5-0' }" + container 'fishbotherer/extractr:1.0.0' input: tuple val(meta), path(fasta), path(fai) diff --git a/modules/local/extractsequences/resources/usr/bin/extractsequences.R b/modules/local/extractsequences/resources/usr/bin/extractsequences.R index 71b08dd..d0b2de6 100755 --- a/modules/local/extractsequences/resources/usr/bin/extractsequences.R +++ b/modules/local/extractsequences/resources/usr/bin/extractsequences.R @@ -3,7 +3,6 @@ library(optparse) library(Rsamtools) library(data.table) -library(stringr) # help message formatter nice_formatter <- function(object) { @@ -80,13 +79,13 @@ lengths <- seqlengths(fasta) # get sites with "strong" fst and create genomic ranges from them fst <- fread(opt$args[2])[fst > fst_cutoff, ] # group by contig and position, and string together calculation methods that got these high Fst values -fst <- fst[,.(methods = str_c(unique(method),collapse=',')), by=list(chrom,pos)][order(chrom,pos)] +fst <- fst[,.(methods = paste0(unique(method),collapse=',')), by=list(chrom,pos)][order(chrom,pos)] # 
group by just contig and make a string that looks like pos1[];pos2[];etc. -fst <- fst[,.(positions = str_c(pos,"[",methods,"]",collapse=";")), by = chrom] +fst <- fst[,.(positions = paste0(pos,"[",methods,"]",collapse=";")), by = chrom] # pull in sequence lengths and make a 'name' column that's positions= -fst[, `:=`(len = lengths[chrom],name=str_c(chrom,' positions=',positions))] +fst[, `:=`(len = lengths[chrom],name=paste0(chrom,' positions=',positions))] # create range column -fst[,range := str_c(chrom,':1-',len)] +fst[,range := paste0(chrom,':1-',len)] # pull sequences from fasta using genomic ranges ranges <- GRanges(fst$range) diff --git a/modules/local/fishertest/environment.yml b/modules/local/fishertest/environment.yml index f101652..bbc8b6e 100644 --- a/modules/local/fishertest/environment.yml +++ b/modules/local/fishertest/environment.yml @@ -2,10 +2,5 @@ channels: - conda-forge - bioconda dependencies: - - conda-forge::r-dplyr=1.1.4 - - conda-forge::r-tidyr=1.3.1 - - conda-forge::r-readr=2.1.5 - - conda-forge::r-janitor=2.2.1 - - conda-forge::r-purrr=1.0.4 - conda-forge::r-optparse=1.7.5 - - conda-forge::r-matrixstats=1.5.0 + - conda-forge::r-data.table=1.17.8 diff --git a/modules/local/fishertest/main.nf b/modules/local/fishertest/main.nf index ca3d5cb..89cb735 100644 --- a/modules/local/fishertest/main.nf +++ b/modules/local/fishertest/main.nf @@ -3,9 +3,10 @@ process FISHERTEST { label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-dcb5179d6045c4df41d4319c99d446681271222e:4856497379063722ac7efef973f14ae775bd88ca-0': - 'biocontainers/mulled-v2-dcb5179d6045c4df41d4319c99d446681271222e:4856497379063722ac7efef973f14ae775bd88ca-0' }" + container 'fishbotherer/fishertest:1.0.2' + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ // 'https://depot.galaxyproject.org/singularity/mulled-v2-dd9299acb4f2a6af9c57792a648c9b309383d16a:006ebbae55f43556724857309229bb9ee9b5f648-0': + // 'biocontainers/mulled-v2-dd9299acb4f2a6af9c57792a648c9b309383d16a:006ebbae55f43556724857309229bb9ee9b5f648-0' }" input: tuple val(meta), val(pools), path(frequency) diff --git a/modules/local/fishertest/resources/usr/bin/fisher.R b/modules/local/fishertest/resources/usr/bin/fisher.R index 85e5f18..29959d7 100755 --- a/modules/local/fishertest/resources/usr/bin/fisher.R +++ b/modules/local/fishertest/resources/usr/bin/fisher.R @@ -7,7 +7,7 @@ # suppressPackageStartupMessages(library(purrr)) suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(optparse)) -suppressPackageStartupMessages(library(matrixStats)) +# suppressPackageStartupMessages(library(matrixStats)) # options(dplyr.summarise.inform = FALSE) @@ -60,12 +60,12 @@ gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ pval_callback <- function(opt, flag, val, parser, ...) { - val <- char.expand(val,c("multiply","geometric-mean")) - switch( - val, - 'multiply' = prod, - 'geometric-mean' = gm_mean - ) + char.expand(val,c("multiply","geometric-mean")) + # switch( + # val, + # 'multiply' = prod, + # 'geometric-mean' = gm_mean + # ) } expand_callback <- function(opt, flag, val, parser, args, ...) 
{ @@ -92,7 +92,7 @@ option_list <- list( make_option(c("-C", "--min-count"), action="store", default=2, type='double', help="Minimum Minimal allowed read count per base"), make_option(c("-c", "--min-coverage"), action="store", default=0, type='double', help="Minimum coverage per pool"), make_option(c("-x", "--max-coverage"), action="store", default=1e03, type='double', help="Maximum coverage per pool"), - make_option(c("-a", "--pval-aggregate"), action="callback", default=prod, type='character', help="P-value aggregation method",callback=pval_callback), + make_option(c("-a", "--pval-aggregate"), action="callback", default='multiply', type='character', help="P-value aggregation method",callback=pval_callback), make_option(c("-A", "--all-snps"), action="store_true", default=FALSE, type='logical', help="Save fisher test results for all SNPs, regardless of window size"), make_option(c("-j", "--adjust-pval"), action="store_true", default=FALSE, type='logical', help="Adjust p-value for multiple comparisons"), make_option(c("-J", "--adjust-method"), action="store", default="bonferroni", type='character', help="P-value adjustment method"), @@ -137,7 +137,11 @@ threads <- opt$options$threads thread_lines <- opt$options$lines_per_thread file_prefix <- opt$options$prefix outdir <- opt$options$output_dir -p_combine <- opt$options$pval_aggregate +p_combine <- switch( + opt$options$pval_aggregate, + 'multiply' = prod, + 'geometric-mean' = gm_mean +) save_all <- opt$options$all_snps adjust_p <- opt$options$adjust_pval adjust_method <- opt$options$adjust_method diff --git a/modules/local/joinfreq/environment.yml b/modules/local/joinfreq/environment.yml index 7ff4f81..6a14ca3 100644 --- a/modules/local/joinfreq/environment.yml +++ b/modules/local/joinfreq/environment.yml @@ -2,10 +2,6 @@ channels: - conda-forge - bioconda dependencies: - - conda-forge::r-dplyr=1.1.4 - - conda-forge::r-tidyr=1.3.1 - - conda-forge::r-readr=2.1.5 - - conda-forge::r-purrr=1.0.4 - - 
conda=forge::r-stringr=1.5.1 - conda-forge::r-optparse=1.7.5 + - conda-forge::r-data.table=1.17.8 diff --git a/modules/local/joinfreq/main.nf b/modules/local/joinfreq/main.nf index 050dd97..b2f8e42 100644 --- a/modules/local/joinfreq/main.nf +++ b/modules/local/joinfreq/main.nf @@ -3,9 +3,10 @@ process JOINFREQ { label 'process_single_mem' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0': - 'biocontainers/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0' }" + container 'fishbotherer/fishertest:1.0.2' + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0': + // 'biocontainers/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0' }" input: tuple val(meta), path(frequency), path(fst), val(method) diff --git a/modules/local/poolfstat/fst/environment.yml b/modules/local/poolfstat/fst/environment.yml index f2ca38b..6f1a026 100644 --- a/modules/local/poolfstat/fst/environment.yml +++ b/modules/local/poolfstat/fst/environment.yml @@ -4,7 +4,4 @@ channels: dependencies: - conda=forge::r-poolfstat=3.0.0 - conda=forge::r-optparse=1.7.5 - - conda=forge::r-tibble=3.2.1 - - conda=forge::r-stringr=1.5.1 - - conda=forge::r-readr=2.1.5 - - conda=forge::r-dplyr=1.1.4 + - conda-forge::r-data.table=1.17.8 diff --git a/modules/local/poolfstat/fst/main.nf b/modules/local/poolfstat/fst/main.nf index 79835ab..2e6f158 100644 --- a/modules/local/poolfstat/fst/main.nf +++ b/modules/local/poolfstat/fst/main.nf @@ -3,9 +3,10 @@ process POOLFSTAT_FST { label 'process_medium_low' 
conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-5d425a6e26ab032a4dcba5bc997f43ab9299830a:a1958d454debf3df6f397d04d5f094df68682ef9-0': - 'biocontainers/mulled-v2-5d425a6e26ab032a4dcba5bc997f43ab9299830a:a1958d454debf3df6f397d04d5f094df68682ef9-0' }" + container 'fishbotherer/poolfstat:1.0.2' + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/mulled-v2-5d425a6e26ab032a4dcba5bc997f43ab9299830a:a1958d454debf3df6f397d04d5f094df68682ef9-0': + // 'biocontainers/mulled-v2-5d425a6e26ab032a4dcba5bc997f43ab9299830a:a1958d454debf3df6f397d04d5f094df68682ef9-0' }" input: tuple val(meta), val(pool_map), path(sync) From fb8616e0c7fb149bbe0d1e2c733255ce14287cdd Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 10:56:49 -1000 Subject: [PATCH 09/39] small fixes to fisher & poolfstat --- .../fishertest/resources/usr/bin/fisher.R | 2 +- .../fst/resources/usr/bin/poolfstat.R | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/local/fishertest/resources/usr/bin/fisher.R b/modules/local/fishertest/resources/usr/bin/fisher.R index 29959d7..97346b8 100755 --- a/modules/local/fishertest/resources/usr/bin/fisher.R +++ b/modules/local/fishertest/resources/usr/bin/fisher.R @@ -332,7 +332,7 @@ if (window_type != 'single') { window_size = 1, pop1 = pools[1], pop2 = pools[2], - fisher = pval + fisher = -log10(pval) )] } diff --git a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R index e84129a..4adc96d 100755 --- a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R +++ b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R @@ -114,9 +114,9 @@ if (!is.null(fst$sliding.windows.fvalues)) { # sliding <- 
as_tibble(fst$sliding.windows.fvalues) %>% # rename(chrom=1,start=2,end=3,mid=4,cum_mid=5,fst=6) # write_tsv(sliding,str_glue("{opt$options$prefix}_sliding-{opt$options$window_size}_{fn}.tsv")) - sliding <- as.data.table(fst$sliding.windows.fvalues) - setnames(sliding,new=c('chrom','start','end','mid','cum_mid','fst')) - fwrite(sliding,sprintf("%s_sliding-%d_%s.tsv",opt$options$prefix,opt$options$window_size,fn),sep="\t") + setDT(fst$sliding.windows.fvalues) + setnames(fst$sliding.windows.fvalues,new=c('chrom','start','end','mid','cum_mid','fst')) + fwrite(fst$sliding.windows.fvalues,sprintf("%s_sliding-%d_%s.tsv",opt$options$prefix,opt$options$window_size,fn),sep="\t") } # pools <- str_c(str_c(pooldata@poolnames,collapse=':'),".fst") @@ -138,11 +138,11 @@ pools <- paste0(paste0(pooldata@poolnames,collapse=':'),".fst") # select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fst) # write_tsv(snp_fst,str_glue("{opt$options$prefix}_{fn}.fst"),col_names=opt$options$headers) -snp_fst <- as.data.table(pooldata@snp.info) -cols <- names(snp_fst) -snp_fst[,cols[-c(1:2)] := NULL] -setnames(snp_fst,c('chrom','pos')) -snp_fst[,`:=`( +setDT(pooldata@snp.info) +cols <- names(pooldata@snp.info) +pooldata@snp.info[,cols[-c(1:2)] := NULL] +setnames(pooldata@snp.info,c('chrom','pos')) +pooldata@snp.info[,`:=`( window_size = opt$options$window_size, covered_fraction = 1, avg_min_cov = do.call(pmin,as.data.table(pooldata@readcoverage)), @@ -150,4 +150,4 @@ snp_fst[,`:=`( pop2 = opt$options$pool_names[2], fst = fst$snp.Fstats$Fst )] -fwrite(snp_fst,sprintf("%s_%s.fst",opt$options$prefix,fn),col.names = opt$options$headers) +fwrite(pooldata@snp.info,sprintf("%s_%s.fst",opt$options$prefix,fn),col.names = opt$options$headers) From b110cd935425e09b9b36ba71ca2738dffaf60103 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 10:56:57 -1000 Subject: [PATCH 10/39] convert report to data.table --- assets/assesspool_report.Rmd | 1477 ++++++++++++++++------------------ 
1 file changed, 714 insertions(+), 763 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 8b38d63..1289ae0 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -118,16 +118,13 @@ if (!is.null(params$tz)) { # load libraries library(tidyr) +library(data.table) library(knitr) -library(yaml) library(plotly) library(DT) -library(readr) -library(dplyr) library(stringr) library(purrr) library(paletteer) -library(fs) library(htmltools) if (params$debug) { @@ -136,7 +133,6 @@ if (params$debug) { # set some global options knitr::opts_chunk$set(echo = FALSE, warning = FALSE) -options(dplyr.summarise.inform = FALSE) ``` ```{r init, include=FALSE} @@ -170,8 +166,9 @@ cutoffs <- seq(params$nf$min_coverage_cutoff,params$nf$max_coverage_cutoff,param # start with fst show_fst <- !is.null(params$fst) if (show_fst) { - pool_data <- read_tsv(params$fst) %>% - mutate(pair = str_glue("{pop1} - {pop2}")) + pool_data <- fread(params$fst) + pool_data[,pair := paste0(pop1," - ",pop2)] + setorder(pool_data,method,chrom,pos) } else { pool_data <- FALSE } @@ -181,14 +178,17 @@ fisher_data <- NULL # load fisher data if applicable show_fisher <- !is.null(params$fisher) if (show_fisher) { - fisher_data <- read_tsv(params$fisher) %>% - select(chrom,pos,pop1,pop2,avg_min_cov,log_fisher=fisher,method) + fisher_data <- fread(params$fisher) + fisher_data[,c(names(fisher_data)[!names(fisher_data) %in% c('chrom','pos','pop1','pop2','avg_min_cov','fisher','method')]) := NULL] + setnames(fisher_data,'fisher','log_fisher') + setcolorder(fisher_data,c('chrom','pos','pop1','pop2','avg_min_cov','log_fisher','method')) + setorder(fisher_data,method,chrom,pos) } # whether to show VCF filtering show_filter <- params$nf$visualize_filters & !is.null(params$filter) if (show_filter) { - filters <- read_tsv(params$filter,col_names = c('filter','count')) + filters <- fread(params$filter,col.names = c('filter','count')) } else { filters <- NULL } @@ 
-196,7 +196,7 @@ if (show_filter) { # whether to show cumulative filtering show_final_filter <- params$nf$visualize_filters & !is.null(params$final_filter) if (show_final_filter) { - final_filters <- read_tsv(params$final_filter,col_names = c('filter','count')) + final_filters <- fread(params$final_filter,col.names = c('filter','count')) } else { final_filters <- NULL } @@ -213,6 +213,7 @@ pools_shared <- c('All SNPs'=FALSE,'Shared loci (all pools)'=TRUE) This report summarizes the output of population genetic analyses of a pool-seq RAD library. + ```{r filtering, results='asis', eval=any_filter} # TODO: show filter option values on plot/table cat("# Filtering {.tabset}\n") @@ -239,7 +240,7 @@ c(show_filter,show_final_filter) %>% cat(filter_headers[i],"\n\n") # get relevant filtering data - filter_data <- report_data[[i]] + filter_data <- report_data[[i]][order(-count)] # show some header info and notes print( @@ -251,30 +252,18 @@ c(show_filter,show_final_filter) %>% # rearrange filter data a bit # make sure 'Unfiltered' is in bold and comes first - filter_data <- filter_data %>% - arrange(desc(count)) %>% - mutate( - filter = replace(filter,which(filter == "before"),"Unfiltered"), - filter = replace(filter,which(filter == "cumulative"),"All filters"), - filter = forcats::fct_reorder(filter,-count), - filter = forcats::fct_relevel(filter,"Unfiltered"), - filtered = count[filter == "Unfiltered"] - count, - filtered = case_when( - filter == "Unfiltered" ~ count, - .default = filtered - ), - changed = filtered > 0 - ) + filter_data[,filter := replace(filter,which(filter == "before"),"Unfiltered")] + filter_data[,filter := replace(filter,which(filter == "cumulative"),"All filters")] + filter_data[,filter := forcats::fct_reorder(filter,-count)] + filter_data[,filter := forcats::fct_relevel(filter,"Unfiltered")] + filter_data[,filtered := count[filter == "Unfiltered"] - count] + filter_data[,filtered := ifelse(filter == "Unfiltered",count,filtered)] + 
filter_data[,changed := filtered > 0] # figure out which options did nothing - changed <- filter_data %>% - filter(changed) - unchanged <- filter_data %>% - filter(!changed) %>% - pull(filter) %>% - as.character() - unchanged <- str_glue("{unchanged}") - unchanged <- str_c(unchanged,collapse=", ") + changed <- filter_data[changed == TRUE] + unchanged <- as.character(filter_data[changed == FALSE]$filter) + unchanged <- str_flatten_comma(unchanged) # show options that didn't do anything if (unchanged != "") { print(h5(tags$small(HTML(str_glue( @@ -323,264 +312,252 @@ c(show_filter,show_final_filter) %>% cat("# High-level summaries {.tabset}\n\n") # walk through data by method -pool_data %>% - group_by(method) %>% - group_walk(\(fst_data,method) { - # output tab header - cat(str_glue("## {method$method} {{.tabset}}\n\n")) - - # walk through whether or not we're looking only at snps shared across all pools - pools_shared %>% - iwalk(\(shared,hdr) { - # output tab header - cat(str_glue("### {hdr} {{.tabset}}\n\n")) - - # filter data for all pools if necessary - if (shared) { - fst_data <- fst_data %>% - mutate(pop1=factor(pop1),pop2=factor(pop2)) %>% - group_by(chrom,pos) %>% - filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% - ungroup() %>% - arrange(chrom,pos,pop1,pop2) - } - - # repeatedly filter the dataset for different coverage levels - # and bind those together into a new dataset - fst_cov <- cutoffs %>% - map(\(cov) { - fst_data %>% - # filter by average minimum coverage - filter(avg_min_cov >= cov) %>% - # retain the cutoff level in the data - mutate(cutoff=cov) - }) %>% - # and bind resulting subsets together into a superset - list_rbind() - - # summarize the data by cutoff level - fst_grp <- fst_cov %>% - group_by(cutoff) %>% - summarise( - fst_sd = sd(fst), # std. dev. of fst - fst = mean(fst), # mean fst - fst_se = fst_sd / sqrt(n()), # std. err. 
of fst - cov = mean(avg_min_cov), # mean coverage - snps = n_distinct(chrom,pos,na.rm = TRUE), # snp count - contigs = n_distinct(chrom,na.rm=TRUE), # contig count - snps_per_contig = snps / contigs, # mean snps per contig - ci_lower= fst - qt(0.975, n() -1) * fst_se, - ci_upper= fst + qt(0.975, n() -1) * fst_se - ) %>% - arrange(cutoff) %>% - ungroup() %>% - # give the columns friendly names - select( - `Coverage cutoff`=cutoff, - `Mean Fst`=fst, - `Fst (SD)`=fst_sd, - `Fst (SEM)`=fst_se, - `Mean coverage`=cov, - SNPs=snps, - Contigs=contigs, - `SNPs per contig`=snps_per_contig, - `CI (lower)`=ci_lower, - `CI (upper)`=ci_upper - ) +nothing <- pool_data[,{ + # output tab header + fst_data <- .SD + cat(str_glue("## {method} {{.tabset}}\n\n")) + + # walk through whether or not we're looking only at snps shared across all pools + pools_shared %>% + iwalk(\(shared,hdr) { + # output tab header + cat(str_glue("### {hdr} {{.tabset}}\n\n")) + + # filter data for all pools if necessary + if (shared) { + fst_data <- fst_data[ fst_data[, .I[all( unique(c(levels(fst_data$pop1),levels(pop2))) %in% c(pop1,pop2) )], by=c('chrom','pos')]$V1 ] + setorder(fst_data,chrom,pos,pop1,pop2) + } - # construct csv download filename - shared <- ifelse(shared,"all","shared") - filename <- str_glue("{method$method}_{shared}_summary") - - # a javascript callback that rounds numbers to the - # nearest thousand. 
otherwise the table is full of stuff like 0.3412341234123412341234 - js <- DT::JS( - "function ( data, type, row, meta ) {", - "return type === 'display' ?", - "''+(Math.round((data+Number.EPSILON)*1000)/1000).toLocaleString()+'' :", - "data;", - "}" + # repeatedly filter the dataset for different coverage levels + # and bind those together into a new dataset + fst_cov <- cutoffs %>% + map(\(cov) { + fst_data[avg_min_cov >= cov][,cutoff := cov] + }) %>% + # and bind resulting subsets together into a superset + rbindlist() + + fst_cov <- fst_cov[,{ + fst_sd <- sd(fst) + fst_se <- fst_sd / sqrt(.N) + n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) + n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) + fst_mean <- mean(fst) + .( + fst_sd = fst_sd, + fst = fst_mean, + fst_se = fst_se, + cov = mean(avg_min_cov), + snps = n_snps, # snp count + contigs = n_contigs, + snps_per_contig = n_snps / n_contigs, + ci_lower= fst_mean - qt(0.975, .N -1) * fst_se, + ci_upper= fst_mean + qt(0.975, .N -1) * fst_se ) + },by=cutoff] + setorder(fst_cov,cutoff) + setcolorder(fst_cov,c('cutoff','fst','fst_sd','fst_se','cov','snps','snps_per_contig','ci_lower','ci_upper')) + setnames( fst_cov, + c('cutoff','fst','fst_sd','fst_se','cov','snps','contigs','snps_per_contig','ci_lower','ci_upper'), + c('Coverage cutoff', 'Mean Fst', 'Fst (SD)', 'Fst (SEM)', + 'Mean coverage', 'SNPs', 'Contigs', 'SNPs per contig', 'CI (lower)', 'CI (upper)') + ) - # show plot tab - cat(str_glue("#### Plot\n\n")) - - print(h5("Global summary statistics by coverage cutoff")) - - # create interactive plot and add traces - fig <- fst_grp %>% - plot_ly() %>% - add_trace( - y = ~ `Mean Fst`, - x = ~`Coverage cutoff`, - name = "Mean Fst", - visible = T, - mode = "lines+markers", - type = "scatter", - error_y = ~list( - type='data', - array = `Fst (SD)` - ) - ) %>% - add_trace( - y = ~SNPs, - x = ~`Coverage cutoff`, - name = "SNPs", - mode = "lines+markers", - type = "scatter", - visible=F - ) %>% - add_trace( - y = ~`SNPs per 
contig`, - x = ~`Coverage cutoff`, - name = "Mean SNPs per contig", - visible = F, - mode = "lines+markers", - type = "scatter" - ) %>% - add_trace( - y = ~Contigs, - x = ~`Coverage cutoff`, - name = "Contigs", - visible = F, - mode = "lines+markers", - type = "scatter" + # show plot tab + cat(str_glue("#### Plot\n\n")) + + print(h5("Global summary statistics by coverage cutoff")) + + # create interactive plot and add traces + fig <- fst_cov %>% + plot_ly() %>% + add_trace( + y = ~ `Mean Fst`, + x = ~`Coverage cutoff`, + name = "Mean Fst", + visible = T, + mode = "lines+markers", + type = "scatter", + error_y = ~list( + type='data', + array = `Fst (SD)` ) + ) %>% + add_trace( + y = ~SNPs, + x = ~`Coverage cutoff`, + name = "SNPs", + mode = "lines+markers", + type = "scatter", + visible=F + ) %>% + add_trace( + y = ~`SNPs per contig`, + x = ~`Coverage cutoff`, + name = "Mean SNPs per contig", + visible = F, + mode = "lines+markers", + type = "scatter" + ) %>% + add_trace( + y = ~Contigs, + x = ~`Coverage cutoff`, + name = "Contigs", + visible = F, + mode = "lines+markers", + type = "scatter" + ) - # configure button layout and behavior - button_layout <- list( - type = "buttons", - direction = "right", - xanchor = 'center', - yanchor = "top", - pad = list('r' = 0, 't' = 10, 'b' = 10), - x = 0.5, - y = 1.27, - buttons = list( - list( - method = "update", - args = list( - list( - visible = list( - T, F, F, F - ) - ), - list( - yaxis = list(title = "Mean Fst") + # configure button layout and behavior + button_layout <- list( + type = "buttons", + direction = "right", + xanchor = 'center', + yanchor = "top", + pad = list('r' = 0, 't' = 10, 'b' = 10), + x = 0.5, + y = 1.27, + buttons = list( + list( + method = "update", + args = list( + list( + visible = list( + T, F, F, F ) ), - label = "Fst" + list( + yaxis = list(title = "Mean Fst") + ) ), - list( - method = "update", - args = list( - list( - visible = list( - F, T, F, F - ) - ), - list(yaxis = list(title = "SNPs") + 
label = "Fst" + ), + list( + method = "update", + args = list( + list( + visible = list( + F, T, F, F ) ), - label = "SNPs" + list(yaxis = list(title = "SNPs") + ) ), - list( - method = "update", - args = list( - list( - visible = list( - F, F, T, F - ) - ), - list( - yaxis = list(title = "Mean SNPs per contig") + label = "SNPs" + ), + list( + method = "update", + args = list( + list( + visible = list( + F, F, T, F ) ), - label = "SNPs per contig" + list( + yaxis = list(title = "Mean SNPs per contig") + ) ), - list( - method = "update", - args = list( - list( - visible = list( - F, F, F, T - ) - ), - list( - yaxis = list(title = "Number of contigs") + label = "SNPs per contig" + ), + list( + method = "update", + args = list( + list( + visible = list( + F, F, F, T ) ), - label = "Contigs" - ) + list( + yaxis = list(title = "Number of contigs") + ) + ), + label = "Contigs" ) ) + ) - # configure annotation properties - annotation <- list( - list( - text = "Statistic", - x = .15, - y = 1.22, - xref = 'paper', - yref = 'paper', - showarrow = FALSE - ) + # configure annotation properties + annotation <- list( + list( + text = "Statistic", + x = .15, + y = 1.22, + xref = 'paper', + yref = 'paper', + showarrow = FALSE ) + ) - # add layout and annotation to figure - fig <- fig %>% - layout( - yaxis = list(title = "Mean Fst",fixedrange=TRUE), - xaxis = list(title = "Coverage cutoff",fixedrange=TRUE), - title = list(text=str_glue("Summary statistics by coverage cutoff - {method$method}"), yref= 'paper', y=1, font=list(size=12)), - showlegend = F, - updatemenus = list(button_layout), - annotations = annotation - ) %>% - config(displayModeBar = FALSE) - - # display figure - print(tagList(fig)) - - # show data table tab - cat(str_glue("#### Table\n\n")) - - print(h5("Global summary statistics by coverage cutoff")) - - # output the datatable - print( - tagList( - datatable( - fst_grp, - escape = FALSE, - rownames = FALSE, - filter = "none", - autoHideNavigation = TRUE, - 
extensions = 'Buttons', - options = list( - dom = 'tB', - buttons = list( - list( - extend='csv', - title=filename, - exportOptions = list( modifier = list(page = "all"), orthogonal = "export" - ) - ) - ), - scrollX = TRUE, - columnDefs = list( - list( - targets = c(2:5,8:10)-1, - render = js + # add layout and annotation to figure + fig <- fig %>% + layout( + yaxis = list(title = "Mean Fst",fixedrange=TRUE), + xaxis = list(title = "Coverage cutoff",fixedrange=TRUE), + title = list(text=str_glue("Summary statistics by coverage cutoff - {method}"), yref= 'paper', y=1, font=list(size=12)), + showlegend = F, + updatemenus = list(button_layout), + annotations = annotation + ) %>% + config(displayModeBar = FALSE) + + # display figure + print(tagList(fig)) + + # show data table tab + cat(str_glue("#### Table\n\n")) + + # construct csv download filename + shared <- ifelse(shared,"all","shared") + filename <- str_glue("{method}_{shared}_summary") + + # a javascript callback that rounds numbers to the + # nearest thousand. 
otherwise the table is full of stuff like 0.3412341234123412341234 + js <- DT::JS( + "function ( data, type, row, meta ) {", + "return type === 'display' ?", + "''+(Math.round((data+Number.EPSILON)*1000)/1000).toLocaleString()+'' :", + "data;", + "}" + ) + + print(h5("Global summary statistics by coverage cutoff")) + + # output the datatable + print( + tagList( + datatable( + fst_cov, + escape = FALSE, + rownames = FALSE, + filter = "none", + autoHideNavigation = TRUE, + extensions = 'Buttons', + options = list( + dom = 'tB', + buttons = list( + list( + extend='csv', + title=filename, + exportOptions = list( modifier = list(page = "all"), orthogonal = "export" ) ) + ), + scrollX = TRUE, + columnDefs = list( + list( + targets = c(2:5,8:10)-1, + render = js + ) ) - ) %>% - formatRound(c(1,6,7),digits=0) - ) + ) + ) %>% + formatRound(c(1,6,7),digits=0) ) - }) - }) + ) + }) +},by=method] + ``` @@ -588,210 +565,199 @@ pool_data %>% ```{r fst-cov-all, results='asis', eval=show_fst} cat("# Pairwise Fst by minimum coverage {.tabset}\n\n") # walk through calculation methods -pool_data %>% - group_by(method) %>% - group_walk(\(fst_data,method) { - # output tab header - cat(str_glue("## {method$method} {{.tabset}}\n\n")) - - # walk through whether or not we're looking only at snps shared across all pools - pools_shared %>% - iwalk(\(shared,hdr) { - # output tab header - cat(str_glue("### {hdr} {{.tabset}}\n\n")) - shr <- ifelse(shared,"shared","all") - - # filter data for all pools if necessary - if (shared) { - fst_data <- fst_data %>% - mutate(pop1=factor(pop1),pop2=factor(pop2)) %>% - group_by(chrom,pos) %>% - filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% - ungroup() %>% - arrange(chrom,pos,pop1,pop2) - } - - # calculate plot x-axis range - # basically, get the min and max mean fst across all coverage cutoffs - # we're doing this so all the different versions of the plot have the - # same x-axis. 
(it might be a bit data intensive but worthwhile) - xrange <- reduce(cutoffs,\(range,cutoff) { - r <- fst_data %>% - filter(avg_min_cov >= cutoff) %>% - group_by(pop1,pop2) %>% - summarise(fst = mean(fst)) %>% - pull(fst) %>% - range() - c(min(range[1],r[1]),max(range[2],r[2])) - },.init = c(0,0) ) - xrange <- c(min(xrange[1]-abs(0.1*xrange[1]),0),min(xrange[2]+abs(0.1*xrange[2]),1)) - - # get initial figures and output json script tags - fst_figs <- cutoffs %>% - reduce(\(figs,cutoff) { - # summarize the data by comparison and cutoff level - # precompute box plot stats because the datasets are too big - # and pandoc runs out of memory - fst_grp <- fst_data %>% - filter(avg_min_cov >= cutoff) %>% - group_by(pop1,pop2,pair) %>% - summarise( - fst_sd = sd(fst), # std. dev. of fst - q1 = as.numeric(quantile(fst)[2]), - q3 = as.numeric(quantile(fst)[4]), - lwr = boxplot.stats(fst)$stats[1], - upr = boxplot.stats(fst)$stats[5], - med = median(fst), - min = min(fst), - max = max(fst), - fst = mean(fst), # mean fst - fst_se = fst_sd / sqrt(n()), # std. err. of fst - cov = mean(avg_min_cov), # mean coverage - snps = n_distinct(chrom,pos,na.rm=TRUE), # snp count - contigs = n_distinct(chrom,na.rm=TRUE), # contig count - snps_per_contig = snps / contigs # mean snps per contig - ) %>% - # sort by cutoff and comparison - arrange(cutoff,pair) %>% - ungroup() - - # plot the scatterplot - fig <- fst_grp %>% - plot_ly( - x = ~ fst, - y = ~ pair, - color = ~snps, - # frame = ~str_glue("≥{cutoff}"), - # hover text templates - text = ~str_c(snps," SNPs
",contigs," contigs"), - hovertemplate = "Fst (%{y}): %{x}
%{text}", - type = 'scatter', - mode = 'markers', - marker = list(size = 12) - ) %>% - colorbar(title = "SNPs") %>% - layout( - margin=list(l = 100, r = 20, b = 10, t = 30), - yaxis = list(title="",tickangle=-35,tickfont=list(size=12)), - xaxis = list(title="Fst",tickfont=list(size=12), range=as.list(xrange)), - title = list(text=str_glue("Fst - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) - ) %>% - config(toImageButtonOptions = list( - format = "svg", - width = 900, - height = 600 - )) - - # output the figure to html - if (is.null(figs$scatter_fig)) { - # assign div id to figure - fig_id <- str_glue("fst-fig-{method$method}-{shr}") - fig$elementId <- fig_id - - figs$scatter_fig <- tagList( - div( - `data-fig`=fig_id, - `data-json-prefix`=str_glue("scatter-json-{method$method}-{shr}"), - class="fig-container", - tagList(fig) - ) +nothing <- pool_data[,{ + # output tab header + cat(str_glue("## {method} {{.tabset}}\n\n")) + + fst_data <- .SD + + # walk through whether or not we're looking only at snps shared across all pools + pools_shared %>% + iwalk(\(shared,hdr) { + # output tab header + cat(str_glue("### {hdr} {{.tabset}}\n\n")) + shr <- ifelse(shared,"shared","all") + + # filter data for all pools if necessary + if (shared) { + fst_data <- fst_data[ fst_data[, .I[all( unique(c(levels(fst_data$pop1),levels(pop2))) %in% c(pop1,pop2) )], by=c('chrom','pos')]$V1 ] + setorder(fst_data,chrom,pos,pop1,pop2) + } + + # calculate plot x-axis range + # basically, get the min and max mean fst across all coverage cutoffs + # we're doing this so all the different versions of the plot have the + # same x-axis. 
(it might be a bit data intensive but worthwhile) + xrange <- reduce(cutoffs,\(range,cutoff) { + r <- range(fst_data[avg_min_cov >= cutoff][,.(fst = mean(fst,na.rm=TRUE)),by=.(pop1,pop2)]$fst) + c(min(range[1],r[1]),max(range[2],r[2])) + },.init = c(0,0) ) + xrange <- c(min(xrange[1]-abs(0.1*xrange[1]),0),min(xrange[2]+abs(0.1*xrange[2]),1)) + + # get initial figures and output json script tags + fst_figs <- cutoffs %>% + reduce(\(figs,cutoff) { + # summarize the data by comparison and cutoff level + # precompute box plot stats because the datasets are too big + # and pandoc runs out of memory + fst_grp <- fst_data[avg_min_cov >= cutoff][,{ + fst_sd <- sd(fst) + n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) + n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) + .( + fst_sd = fst_sd, # std. dev. of fst + q1 = as.numeric(quantile(fst)[2]), + q3 = as.numeric(quantile(fst)[4]), + lwr = boxplot.stats(fst)$stats[1], + upr = boxplot.stats(fst)$stats[5], + med = median(fst), + min = min(fst), + max = max(fst), + fst_se = fst_sd / sqrt(.N), # std. err. of fst + fst = mean(fst), # mean fst + cov = mean(avg_min_cov), # mean coverage + snps = n_snps, # snp count + contigs = n_contigs, # contig count + snps_per_contig = n_snps / n_contigs # mean snps per contig + ) + },by=.(pop1,pop2,pair)] + setorder(fst_grp,pair) + + # plot the scatterplot + fig <- fst_grp %>% + plot_ly( + x = ~ fst, + y = ~ pair, + color = ~snps, + # frame = ~str_glue("≥{cutoff}"), + # hover text templates + text = ~str_c(snps," SNPs
",contigs," contigs"), + hovertemplate = "Fst (%{y}): %{x}
%{text}", + type = 'scatter', + mode = 'markers', + marker = list(size = 12) + ) %>% + colorbar(title = "SNPs") %>% + layout( + margin=list(l = 100, r = 20, b = 10, t = 30), + yaxis = list(title="",tickangle=-35,tickfont=list(size=12)), + xaxis = list(title="Fst",tickfont=list(size=12), range=as.list(xrange)), + title = list(text=str_glue("Fst - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) + ) %>% + config(toImageButtonOptions = list( + format = "svg", + width = 900, + height = 600 + )) + + # output the figure to html + if (is.null(figs$scatter_fig)) { + # assign div id to figure + fig_id <- str_glue("fst-fig-{method}-{shr}") + fig$elementId <- fig_id + + figs$scatter_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("scatter-json-{method}-{shr}"), + class="fig-container", + tagList(fig) ) - } - - # get and print the json for the figure - fig_json <- fig %>% - plotly_json(jsonedit = FALSE) %>% - jsonlite::minify() - json_tag <- tags$script( - id = str_glue("scatter-json-{method$method}-{shr}-{cutoff}"), - type = 'application/json', - HTML(fig_json) ) - print(json_tag) - - # get outlier points, limit to a subsample of 30 per pair because - # too many makes the thing crash. 
- outs <- fst_data %>% - filter(avg_min_cov >= cutoff) %>% - group_by(pop1,pop2,pair) %>% - summarise(outs = boxplot.stats(fst)$out) %>% - slice_sample(n=30) %>% - ungroup() - - # generate boxplot figure - fig <- fst_grp %>% - plot_ly( - type = "box", - y = ~pair, - q1 = ~q1, - q3 = ~q3, - median = ~med, - lowerfence = ~lwr, - upperfence = ~upr - ) %>% - add_trace( - data = outs, - inherit = FALSE, - y = ~pair, - x = ~outs, - type="scatter", - mode="markers", - frame = ~str_glue("≥{cutoff}") - ) %>% - layout(showlegend = FALSE) %>% - layout( - yaxis = list(tickangle=-35,tickfont=list(size=12),title=""), - xaxis = list(tickfont=list(size=12),title="Fst"), - title = list(text=str_glue("Fst distribution - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) - ) %>% - config(toImageButtonOptions = list( - format = "svg", - width = 900, - height = 600 - )) - - # save figure for output - if (is.null(figs$box_fig)) { - fig_id <- str_glue("box-fig-{method$method}-{shr}") - fig$elementId <- fig_id - figs$box_fig <- tagList( - div( - `data-fig`=fig_id, - `data-json-prefix`=str_glue("box-json-{method$method}-{shr}"), - class="fig-container", - tagList(fig) - ) + } + + # get and print the json for the figure + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + json_tag <- tags$script( + id = str_glue("scatter-json-{method}-{shr}-{cutoff}"), + type = 'application/json', + HTML(fig_json) + ) + print(json_tag) + + # get outlier points, limit to a subsample of 30 per pair because + # too many makes the thing crash. 
+ outs <- fst_data[avg_min_cov >= cutoff][ ,{ + outliers <- boxplot.stats(fst)$out + .( outs = sample(outliers,min(length(outliers),30)) ) + },by=.(pop1,pop2,pair)] + + # generate boxplot figure + fig <- fst_grp %>% + plot_ly( + type = "box", + y = ~pair, + q1 = ~q1, + q3 = ~q3, + median = ~med, + lowerfence = ~lwr, + upperfence = ~upr + ) %>% + add_trace( + data = outs, + inherit = FALSE, + y = ~pair, + x = ~outs, + type="scatter", + mode="markers", + frame = ~str_glue("≥{cutoff}") + ) %>% + layout(showlegend = FALSE) %>% + layout( + yaxis = list(tickangle=-35,tickfont=list(size=12),title=""), + xaxis = list(tickfont=list(size=12),title="Fst"), + title = list(text=str_glue("Fst distribution - {hdr} (coverage ≥{cutoff})"), y=0.95, font=list(size=12)) + ) %>% + config(toImageButtonOptions = list( + format = "svg", + width = 900, + height = 600 + )) + + # save figure for output + if (is.null(figs$box_fig)) { + fig_id <- str_glue("box-fig-{method}-{shr}") + fig$elementId <- fig_id + figs$box_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("box-json-{method}-{shr}"), + class="fig-container", + tagList(fig) ) - } - - # get/print figure json - fig_json <- fig %>% - plotly_json(jsonedit = FALSE) %>% - jsonlite::minify() - json_tag <- tags$script( - id = str_glue("box-json-{method$method}-{shr}-{cutoff}"), - type = 'application/json', - HTML(fig_json) ) - print(json_tag) - return(figs) - },.init = list(scatter_fig = NULL, box_fig = NULL)) - - # show scatter plot tab - cat("#### Scatter plot (means)\n\n") - - print(fst_figs$scatter_fig) - - # show box plot tab - cat("#### Box plot (distributions)\n\n") - print(h5(tags$small(HTML( - "Note: subsampled to 30 outliers per pair." 
- )))) - - print(fst_figs$box_fig) - - }) - }) + } + + # get/print figure json + fig_json <- fig %>% + plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + json_tag <- tags$script( + id = str_glue("box-json-{method}-{shr}-{cutoff}"), + type = 'application/json', + HTML(fig_json) + ) + print(json_tag) + return(figs) + },.init = list(scatter_fig = NULL, box_fig = NULL)) + + # show scatter plot tab + cat("#### Scatter plot (means)\n\n") + + print(fst_figs$scatter_fig) + + # show box plot tab + cat("#### Box plot (distributions)\n\n") + print(h5(tags$small(HTML( + "Note: subsampled to 30 outliers per pair." + )))) + + print(fst_figs$box_fig) + + }) +},by=method] ``` @@ -859,220 +825,216 @@ cscale <- function(pal,n=10,na="darkgrey",min=0.00001) { fix_na <- function(x,na="n/a") replace(x,which(x == "NA"),na) # walk through calculation methods -pool_data %>% - group_by(method) %>% - group_walk(\(fst_data,method) { - # show tab header - cat(str_glue("## {method$method} {{.tabset}}\n\n")) - - # walk through shared status - pools_shared %>% - iwalk(\(shared,hdr) { - # show tab header - cat(str_glue("### {hdr} {{.tabset}}\n\n")) - shr <- ifelse(shared,"shared","all") - - # filter data for all pools if necessary - if (shared) { - fst_data <- fst_data %>% - mutate(pop1=factor(pop1),pop2=factor(pop2)) %>% - group_by(chrom,pos) %>% - filter( all( unique(c(levels(pop1),levels(pop2))) %in% c(pop1,pop2) ) ) %>% - ungroup() %>% - arrange(chrom,pos,pop1,pop2) - } - - # walk through variable pairs - var_pairs %>% - walk(\(trace_vars) { - # show yet another tab header - cat(str_glue("#### {trace_vars$title}\n\n")) - - print(h5(HTML(trace_vars$description))) - print(h5(tags$small(HTML( - "Note: Grey cells indicate missing data." 
- )))) - - # generate figure and output - # all necessary figure json - heatmap_fig <- cutoffs %>% - reduce(\(plot_fig,cutoff) { - # summarize by pair and cutoff value - fst_grp <- fst_data %>% - filter(avg_min_cov >= cutoff) %>% - group_by(pop1,pop2) %>% - summarise( - fst_sd = sd(fst,na.rm=TRUE), - fst_se = fst_sd / sqrt(n()), - cov = mean(avg_min_cov,na.rm=TRUE), - cov_sd = sd(avg_min_cov,na.rm = TRUE), - cov_se = cov_sd / sqrt(n()), - snps = n_distinct(chrom,pos,na.rm=TRUE), - contigs = n_distinct(chrom,na.rm = TRUE), - snps_per_contig = snps / contigs, - fst = mean(fst,na.rm=TRUE) - ) %>% - ungroup() %>% - # retain the cutoff level in the data - mutate(cutoff=cutoff) - - # all this next bit is so that the heatmaps look good - # with all relevant rows of both the upper and lower triangles - - # get all possible pool names - all_pops <- sort(union(unique(fst_grp$pop1),unique(fst_grp$pop2))) - - # create a tibble of all possible combinations (sorted) - full_table <- all_pops %>% - combn(2) %>% - apply(MARGIN = 2,FUN=\(x) sort(x)) %>% - t() %>% - as_tibble() %>% - rename(pop1=1,pop2=2) - - # convert pool names to factors with all possible levels - # make sure pop2 has levels in reverse so the figure looks right - # (i.e., a nice triangle) - # we left join from the table of all combos so we have all combos, even - # if some have missing data - hmd <- full_table %>% - left_join(fst_grp,by=c("pop1","pop2")) %>% - arrange(pop1,desc(pop2)) %>% - mutate( - pop1 = factor(pop1,levels=all_pops), - pop2 = factor(pop2,levels=rev(all_pops)) - ) %>% - mutate( - # record whether value was NA - across(where(is.numeric),\(x) is.na(x),.names = "{.col}_na"), - # create columns for the tooltip text - across(where(is.numeric),\(x) x,.names = "{.col}_val") - ) %>% - arrange(pop1,desc(pop2)) - # get the factor level that would be the top row of the plot - top_level <- last(levels(hmd$pop2)) - # now we make a dummy table for just that top row - # if we don't do this, the top triangle is 
missing its top row - top_row <- hmd %>% - filter(pop1 == top_level) %>% - # switch pop1 and pop2 - mutate(temp=pop1,pop1=pop2,pop2=temp) %>% - # drop the temp column - select(-temp) %>% - # set all numeric values to NA - mutate(across(-c(pop1,pop2),~NA)) - # bind dummy values to plot data - # it seems to matter that we bind these - # to the end, rather than the other way around - hmd <- hmd %>% - bind_rows(top_row) - - # start making the figure - fig <- plot_ly(hoverongaps=FALSE) - - # finish making the figure - fig <- trace_vars$pairs %>% - reduce2(c(1,2),\(plot_fig,v,i) { - if (!all(is.na(v))) { - # add the trace as either the upper or lower triangle - # depending on whether its #1 or #2 - plot_fig %>% - { - if (sum(hmd[[str_glue("{v[3]}_na")]],na.rm = T) > 0 ) { - add_trace( - ., - type="heatmap", - hoverinfo = 'none', - data = hmd %>% - filter( across( all_of(str_glue("{v[3]}_na")), \(na) na | is.na(na) ) ) %>% - mutate( "{v[3]}" := if_else(.data[[str_glue("{v[3]}_na")]] == TRUE,0,NA) ), - x = ~.data[[v[1]]], - y = ~.data[[v[2]]], - z = ~.data[[v[3]]], - colorscale = 'Greys', - showscale = FALSE - ) - } else . 
- } %>% - add_trace( - type = "heatmap", - data = hmd, - x = ~.data[[v[1]]], # x variable - y = ~.data[[v[2]]], # y variable - z = ~.data[[v[3]]], # z (color) variable - # this is what gets shown in the tooltip - customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), - colorscale = cscale(v[4],n = 10,na = NA), # color palette, based on 100 gradations - # style the colorbar - colorbar = list( - # show the appropriate display name - title=list(text=value_map[v[3]]), - # determine the position of the colorbar - # bottom if it's for the bottom triangle, top if it's for the top - y = ifelse(i == 1,0,1), - # which side of the colorbar we reckon the y-anchor from - yanchor = ifelse(i == 1,"bottom","top"), - len = 0.5, - lenmode = "fraction" - ), - # text template - text = ~str_glue("{pop1} - {pop2}"), - # hover text template (hide the trace name with ) - hovertemplate=str_glue("%{{text}}
{value_map[v[3]]}: %{{customdata}}") - ) - } else return(plot_fig) - },.init = fig) %>% - layout( - # name the figure axes and don't let the user zoom around - xaxis = list(title = "Pool 1", fixedrange = TRUE), - yaxis = list(title = "Pool 2", fixedrange = TRUE), - title = list( - text = str_glue("{trace_vars$title} (coverage cutoff: ≥{cutoff})"), - font = list( - size = 12 +nothing <- pool_data[,{ + # show tab header + fst_data <- .SD + cat(str_glue("## {method} {{.tabset}}\n\n")) + + # walk through shared status + pools_shared %>% + iwalk(\(shared,hdr) { + # show tab header + cat(str_glue("### {hdr} {{.tabset}}\n\n")) + shr <- ifelse(shared,"shared","all") + + # filter data for all pools if necessary + if (shared) { + fst_data <- fst_data[ fst_data[, .I[all( unique(c(levels(fst_data$pop1),levels(pop2))) %in% c(pop1,pop2) )], by=c('chrom','pos')]$V1 ] + setorder(fst_data,chrom,pos,pop1,pop2) + } + + # walk through variable pairs + var_pairs %>% + walk(\(trace_vars) { + # show yet another tab header + cat(str_glue("#### {trace_vars$title}\n\n")) + + print(h5(HTML(trace_vars$description))) + print(h5(tags$small(HTML( + "Note: Grey cells indicate missing data." + )))) + + # generate figure and output + # all necessary figure json + heatmap_fig <- cutoffs %>% + reduce(\(plot_fig,cutoff) { + # summarize by pair and cutoff value + fst_grp <- fst_data[avg_min_cov >= cutoff][,{ + fst_sd <- sd(fst,na.rm=TRUE) + cov_sd <- sd(avg_min_cov,na.rm=TRUE) + n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) + n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) + .( + fst_sd = fst_sd, # std. dev. of fst + fst_se = fst_sd / sqrt(.N), # std. err. 
of fst + fst = mean(fst,na.rm=TRUE), # mean fst + cov = mean(avg_min_cov), # mean coverage + cov_sd = cov_sd, + cov_se = cov_sd / sqrt(.N), + snps = n_snps, # snp count + contigs = n_contigs, # contig count + snps_per_contig = n_snps / n_contigs # mean snps per contig + ) + },by=.(pop1,pop2)] + fst_grp[,cutoff := cutoff] + + # all this next bit is so that the heatmaps look good + # with all relevant rows of both the upper and lower triangles + + # get all possible pool names + all_pops <- sort(union(unique(fst_grp$pop1),unique(fst_grp$pop2))) + + # create a tibble of all possible combinations (sorted) + full_table <- all_pops %>% + combn(2) %>% + apply(MARGIN = 2,FUN=\(x) sort(x)) %>% + t() %>% + as.data.table() + setnames(full_table,c('pop1','pop2')) + + # convert pool names to factors with all possible levels + # make sure pop2 has levels in reverse so the figure looks right + # (i.e., a nice triangle) + # we left join from the table of all combos so we have all combos, even + # if some have missing data + hmd <- merge(full_table,fst_grp,by = c('pop1','pop2'),all=TRUE)[order(pop1,-pop2)] + hmd[,`:=`( + pop1 = factor(pop1,levels=all_pops), + pop2 = factor(pop2,levels=rev(all_pops)) + )] + hmd[, c(paste0(names(.SD),'_na')) := lapply(.SD,is.na), .SDcols = is.numeric] + hmd[, c(paste0(names(.SD),'_val')) := .SD, .SDcols = is.numeric ] + setorder(hmd,pop1,-pop2) + + # get the factor level that would be the top row of the plot + top_level <- last(levels(hmd$pop2)) + + # now we make a dummy table for just that top row + # if we don't do this, the top triangle is missing its top row + + # switch pop1 and pop2 + top_row <- hmd[pop1 == top_level] + top_row[, temp := pop1 ] + top_row[,`:=`( pop1=pop2, pop2=temp, temp=NULL )] + + # set all numeric values to NA + # cols = names(top_row)[!names(top_row) %in% c('pop1','pop2')] + # top_row[, c(cols) := NA ] + top_row[, (names(top_row)[sapply(top_row, is.numeric)]) := lapply(.SD, \(x) NA), .SDcols = is.numeric] + + # bind dummy 
values to plot data + # it seems to matter that we bind these + # to the end, rather than the other way around + hmd <- rbind(hmd,top_row) + + # start making the figure + fig <- plot_ly(hoverongaps=FALSE) + + # finish making the figure + fig <- trace_vars$pairs %>% + reduce2(c(1,2),\(plot_fig,v,i) { + if (!all(is.na(v))) { + # add the trace as either the upper or lower triangle + # depending on whether its #1 or #2 + col <- str_glue("{v[3]}_na") + hmdd <- hmd[get(col) | is.na(get(col))] + hmdd[,c(v[3]) := fifelse(.SD[[col]] == TRUE,0,NA) ] + + plot_fig %>% + { + if (sum(hmd[[str_glue("{v[3]}_na")]],na.rm = T) > 0 ) { + add_trace( + ., + type="heatmap", + hoverinfo = 'none', + data = hmdd, + x = ~.data[[v[1]]], + y = ~.data[[v[2]]], + z = ~.data[[v[3]]], + colorscale = 'Greys', + showscale = FALSE + ) + } else . + } %>% + add_trace( + type = "heatmap", + data = hmd, + x = ~.data[[v[1]]], # x variable + y = ~.data[[v[2]]], # y variable + z = ~.data[[v[3]]], # z (color) variable + # this is what gets shown in the tooltip + customdata = ~fix_na( sprintf( str_c("%",v[5]), .data[[ str_glue("{v[3]}_val") ]] ) ), + colorscale = cscale(v[4],n = 10,na = NA), # color palette, based on 100 gradations + # style the colorbar + colorbar = list( + # show the appropriate display name + title=list(text=value_map[v[3]]), + # determine the position of the colorbar + # bottom if it's for the bottom triangle, top if it's for the top + y = ifelse(i == 1,0,1), + # which side of the colorbar we reckon the y-anchor from + yanchor = ifelse(i == 1,"bottom","top"), + len = 0.5, + lenmode = "fraction" + ), + # text template + text = ~str_glue("{pop1} - {pop2}"), + # hover text template (hide the trace name with ) + hovertemplate=str_glue("%{{text}}
{value_map[v[3]]}: %{{customdata}}") ) - ) - ) %>% - # hide the mode bar - config(displayModeBar = FALSE) - - # get the figure for output - if (is.null(plot_fig)) { - fig_id <- str_glue("heatmap-fig-{method$method}-{shr}-{trace_vars$name}") - fig$elementId <- fig_id - # we stick it in a div with class 'fig-container' so - # they're easy to find in the DOM using jquery - plot_fig <- tagList( - div( - `data-fig`=fig_id, - `data-json-prefix`=str_glue("heatmap-json-{method$method}-{shr}-{trace_vars$name}"), - class="fig-container", - tagList(fig) + } else return(plot_fig) + },.init = fig) %>% + layout( + # name the figure axes and don't let the user zoom around + xaxis = list(title = "Pool 1", fixedrange = TRUE), + yaxis = list(title = "Pool 2", fixedrange = TRUE), + title = list( + text = str_glue("{trace_vars$title} (coverage cutoff: ≥{cutoff})"), + font = list( + size = 12 ) ) - } - - # get the figure json and output it - fig_json <- fig %>% - plotly_json(jsonedit = FALSE) %>% - jsonlite::minify() - # create and print script tag with figure data json - # we use this to draw the other cutoff level figures when the slider is dragged - json_tag <- tags$script( - id = str_glue("heatmap-json-{method$method}-{shr}-{trace_vars$name}-{cutoff}"), - type = 'application/json', - HTML(fig_json) + ) %>% + # hide the mode bar + config(displayModeBar = FALSE) + + # get the figure for output + if (is.null(plot_fig)) { + fig_id <- str_glue("heatmap-fig-{method}-{shr}-{trace_vars$name}") + fig$elementId <- fig_id + # we stick it in a div with class 'fig-container' so + # they're easy to find in the DOM using jquery + plot_fig <- tagList( + div( + `data-fig`=fig_id, + `data-json-prefix`=str_glue("heatmap-json-{method}-{shr}-{trace_vars$name}"), + class="fig-container", + tagList(fig) + ) ) - print(json_tag) - - return(plot_fig) - },.init = NULL) - - # output the heatmap figure - print(heatmap_fig) - }) - }) - }) + } + + # get the figure json and output it + fig_json <- fig %>% + 
plotly_json(jsonedit = FALSE) %>% + jsonlite::minify() + # create and print script tag with figure data json + # we use this to draw the other cutoff level figures when the slider is dragged + json_tag <- tags$script( + id = str_glue("heatmap-json-{method}-{shr}-{trace_vars$name}-{cutoff}"), + type = 'application/json', + HTML(fig_json) + ) + print(json_tag) + + return(plot_fig) + },.init = NULL) + + # output the heatmap figure + print(heatmap_fig) + }) + }) +},by=method] ``` @@ -1083,33 +1045,29 @@ cat("# Fst method correlations {.tabset}\n\n") # get calculation methods methods <- unique(pool_data$method) # split the dataset into subsets by calculation method -splot <- pool_data %>% - split(.$method) methods %>% combn(2) %>% array_branch(2) %>% - walk(\(pair) { - cat(str_glue("## {pair[1]} vs {pair[2]}\n\n")) + walk(\(methods) { + cat(str_glue("## {methods[1]} vs {methods[2]}\n\n")) print(h5(tags$small(HTML("Note: Due to potentially large datasets, only a subsample of 1,000 Fst values is shown here.")))) - - # make a joined table with both methods and subsample to 1000 rows - corr <- splot[[pair[1]]] %>% - slice_sample(n=1000) %>% - inner_join(splot[[pair[2]]],by=c("chrom","pos","pop1","pop2")) + + # make a joined table with both methods and subsample to max 1000 rows + corr <- merge( + pool_data[ method == methods[1] ][sample(.N,1000),], + pool_data[ method == methods[2] ], + by = c('chrom','pos','pop1','pop2') + ) # do a linear regression lmm <- lm(fst.y ~ fst.x, data=corr) # get predicted data for the trend line - corr <- corr %>% - mutate( - predicted = lmm %>% - predict(corr %>% select(fst.x)) - ) + corr[,predicted := predict(lmm,corr[,.(fst.x)])] # get method names - xtitle <- pair[1] - ytitle <- pair[2] + xtitle <- methods[1] + ytitle <- methods[2] # plot the figure fig <- corr %>% @@ -1156,89 +1114,82 @@ print(h5(tags$small( "Fisher test results are presented as static plots since these datasets can get huge, making plotly go slow or crash." 
))) -fisher_data %>% - group_by(method) %>% - group_walk(\(fishr,method) { - cat(str_glue("## {method$method}\n\n")) - - # TODO: fix cutoff slider for fisher plots - # make slider for coverage cutoff - cutoff_slider <- div( - class="coverage-slider", - tagList( - tags$label( - id=str_glue("fisher-cutoff-label-{method$method}"), - `for`=str_glue("fisher-cutoff-sel-{method$method}"), - str_glue('Minimum coverage: {params$nf$min_coverage_cutoff}') - ), - div( - id=str_glue("fisher-cutoff-sel-{method$method}") - ) +nothing <- fisher_data[,{ + fishr <- .SD + cat(str_glue("## {method}\n\n")) + + # TODO: fix cutoff slider for fisher plots + # make slider for coverage cutoff + cutoff_slider <- div( + class="coverage-slider", + tagList( + tags$label( + id=str_glue("fisher-cutoff-label-{method}"), + `for`=str_glue("fisher-cutoff-sel-{method}"), + str_glue('Minimum coverage: {params$nf$min_coverage_cutoff}') + ), + div( + id=str_glue("fisher-cutoff-sel-{method}") ) ) - print(cutoff_slider) - - cutoff_script <- tags$script(HTML(str_glue( + ) + print(cutoff_slider) + + cutoff_script <- tags$script(HTML(str_glue( ' $(function() {{ - $("#fisher-cutoff-sel-{method$method}").labeledslider({{ + $("#fisher-cutoff-sel-{method}").labeledslider({{ min: {params$nf$min_coverage_cutoff}, max: {params$nf$max_coverage_cutoff}, step: {params$nf$coverage_cutoff_step}, tickInterval: {params$nf$coverage_cutoff_step}, change: function (e, ui) {{ - $(".fisher-plotz-{method$method}").hide(); - $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); - $("#fisher-cutoff-label-{method$method}").text(`Minimum coverage: ${{ui.value}}`); + $(".fisher-plotz-{method}").hide(); + $(`#fisher-plot-{method}-${{ui.value}}`).show(); + $("#fisher-cutoff-label-{method}").text(`Minimum coverage: ${{ui.value}}`); }}, slide: function (e, ui) {{ - $(".fisher-plotz-{method$method}").hide(); - $(`#fisher-plot-{method$method}-${{ui.value}}`).show(); - $("#fisher-cutoff-label-{method$method}").text(`Minimum coverage: 
${{ui.value}}`); + $(".fisher-plotz-{method}").hide(); + $(`#fisher-plot-{method}-${{ui.value}}`).show(); + $("#fisher-cutoff-label-{method}").text(`Minimum coverage: ${{ui.value}}`); }} }}); }}); ' - ))) - print(cutoff_script) - - print(h6(HTML("Points above red line indicate p < 0.05."))) - - figs <- cutoffs %>% - # set_names() %>% - map(\(cov) { - # cat(str_glue("### Coverage ≥{cov}\n\n")) - fig <- fishr %>% - # filter by average minimum coverage - filter(avg_min_cov >= cov) %>% - # retain the cutoff level in the data - # mutate(cutoff=cov) %>% - group_by(chrom,pos) %>% - summarise( log_fisher = mean(log_fisher) ) %>% - ungroup() %>% - arrange(chrom,pos) %>% - mutate(snp = row_number()) %>% - ggplot() + - geom_point(aes(x=snp,y=log_fisher),size=1) + - geom_hline(yintercept = -log10(0.05), color="red") + - theme_bw() + - labs(x = 'SNP', y=expression(paste(-log[10]~"(p-value)"))) - - disp <- if(cov == first(cutoffs)) "" else "display: none" - plotTag( - fig, - alt=str_glue("Fisher's exast test p-values for coverage ≥{cov}"), - width = 672, - attribs = list( - id = str_glue("fisher-plot-{method$method}-{cov}"), - class = str_glue('fisher-plotz-{method$method}'), - style = disp - ) - ) - }) - print(div( - id='fisher-container', - tagList( figs ) - )) - }) + ))) + print(cutoff_script) + + print(h6(HTML("Points above red line indicate p < 0.05."))) + + figs <- cutoffs %>% + # set_names() %>% + map(\(cov) { + + # filter by coverage cutoff and get mean fisher p-value by snp + figdata <- fishr[avg_min_cov >= cov][,.(log_fisher = mean(log_fisher)), by=.(chrom,pos)][order(chrom,pos)] + figdata[,snp := .I] + + fig <- ggplot(figdata) + + geom_point(aes(x=snp,y=log_fisher),size=1) + + geom_hline(yintercept = -log10(0.05), color="red") + + theme_bw() + + labs(x = 'SNP', y=expression(paste(-log[10]~"(p-value)"))) + + disp <- if(cov == first(cutoffs)) "" else "display: none" + plotTag( + fig, + alt=str_glue("Fisher's exast test p-values for coverage ≥{cov}"), + width = 672, + 
attribs = list( + id = str_glue("fisher-plot-{method}-{cov}"), + class = str_glue('fisher-plotz-{method}'), + style = disp + ) + ) + }) + print(div( + id='fisher-container', + tagList( figs ) + )) +},by=method] ``` From bd58cfabaf9adbef80946eb44abf493970e9cf7e Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 11:03:25 -1000 Subject: [PATCH 11/39] remove commented-out dplyr code from R scripts --- .../fishertest/resources/usr/bin/fisher.R | 106 +----------------- .../joinfreq/resources/usr/bin/joinfreq.R | 77 +------------ .../fst/resources/usr/bin/poolfstat.R | 27 ----- 3 files changed, 8 insertions(+), 202 deletions(-) diff --git a/modules/local/fishertest/resources/usr/bin/fisher.R b/modules/local/fishertest/resources/usr/bin/fisher.R index 97346b8..b07bbc9 100755 --- a/modules/local/fishertest/resources/usr/bin/fisher.R +++ b/modules/local/fishertest/resources/usr/bin/fisher.R @@ -1,16 +1,7 @@ #!/usr/bin/env Rscript -# suppressPackageStartupMessages(library(dplyr)) -# suppressPackageStartupMessages(library(tidyr)) -# suppressPackageStartupMessages(library(readr)) -# suppressPackageStartupMessages(library(janitor)) -# suppressPackageStartupMessages(library(purrr)) suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(optparse)) -# suppressPackageStartupMessages(library(matrixStats)) - -# options(dplyr.summarise.inform = FALSE) - # help message formatter nice_formatter <- function(object) { @@ -61,11 +52,6 @@ gm_mean = function(x, na.rm=TRUE, zero.propagate = FALSE){ pval_callback <- function(opt, flag, val, parser, ...) { char.expand(val,c("multiply","geometric-mean")) - # switch( - # val, - # 'multiply' = prod, - # 'geometric-mean' = gm_mean - # ) } expand_callback <- function(opt, flag, val, parser, args, ...) 
{ @@ -163,24 +149,12 @@ if (!file.exists(opt$args[1])) { stop(sprintf("Frequency file %s does not exist.",opt$args[1])) } -# filter frequency table down to just what we'd consider snps -# this filtering should match popoolation/poolfst - -# split(ceiling(seq_along(.)/50)) %>% - -# freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) freq_snps <- fread(opt$args[1]) if (all(is.na(pools))) { pools <- melt(freq_snps[1], measure = measure(pool,measure,pattern = r'[^(.+)\.(REF_CNT)$]'),value.name = 'count')[ pool != 'TOTAL' ][order(pool)]$pool - # pools <- freq_snps %>% - # slice(1) %>% - # pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% - # pull(pool) %>% - # sort() %>% - # unique() } # make sure we're dealing with two pools @@ -194,29 +168,6 @@ cols <- names(freq_snps) pr <- paste0('^(',paste0(pools,collapse='|'),')') # continue filtering -# freq_snps <- freq_snps %>% -# select(CHROM:ALT,starts_with(paste0(pools,"."),ignore.case = FALSE)) %>% -# mutate( -# `TOTAL.REF_CNT` = rowSums(pick(ends_with(".REF_CNT",ignore.case = FALSE))), -# `TOTAL.ALT_CNT` = rowSums(pick(ends_with(".ALT_CNT",ignore.case = FALSE))), -# `TOTAL.DEPTH` = rowSums(pick(ends_with(".DEPTH",ignore.case = FALSE))), -# ) %>% -# mutate( -# lwr = floor((POS-1)/window_step)*window_step+1, -# upr = lwr+window_size-1, -# middle=floor((upr+lwr)/2) -# ) %>% -# filter( if_all(ends_with(".DEPTH"),~.x >= min_depth) ) %>% -# add_count(CHROM,middle,name="snp_count") %>% -# rename_with(make_clean_names,.cols = starts_with("TOTAL.",ignore.case = FALSE)) %>% -# filter( -# if_all(starts_with("total_",ignore.case = FALSE) & ends_with("_cnt",ignore.case = FALSE),~.x >= min_count), -# total_ref_cnt != total_depth, -# total_depth >= min_depth -# ) %>% -# rename_with(CHROM:ALT,.fn=make_clean_names) %>% -# 
select(chrom,pos,middle,ref:alt,ends_with(".REF_CNT"),ends_with(".ALT_CNT"),starts_with("total_"),everything()) - # try to do as much data.table stuff in-place (using `:=`) as possible to increase memory efficiency # first, delete columns we don't care about @@ -248,25 +199,6 @@ cols <- names(freq_snps) mv <- c(cols[1:3],'snp_count','start','end',grep(r'[\.DEPTH$]',cols,value=TRUE),grep(r'[_CNT$]',cols,value=TRUE)) setcolorder(freq_snps,mv) -# fisher_results <- freq_snps %>% -# select(order(colnames(.))) %>% -# select( -# chrom, pos, middle, snp_count,start=lwr,end=upr, -# ends_with(".DEPTH",ignore.case = FALSE), -# ends_with(".REF_CNT",ignore.case = FALSE), -# ends_with(".ALT_CNT",ignore.case = FALSE) -# ) %>% -# pivot_longer( -# -c(chrom,pos,middle,snp_count,start,end,ends_with(".DEPTH",ignore.case = FALSE)), -# names_to = c("pop","measure"), -# values_to = "count", -# names_pattern = "^(.+)\\.([^.]+)$" -# ) %>% -# group_by(chrom,pos,middle,start,end,snp_count,across(ends_with(".DEPTH",ignore.case = FALSE))) %>% -# summarise(pval = fisher.test(matrix(count,ncol=2))$p.value) %>% -# ungroup() %>% -# mutate(min_cov = matrixStats::rowMins(as.matrix(pick(ends_with(".DEPTH",ignore.case = FALSE))))) - # calculate fisher results # reshape frequency data and order ref and alt counts appropriately @@ -280,8 +212,6 @@ fisher_results[,avg_min_cov := do.call(pmin,.SD),.SDcols = patterns(r'[\.DEPTH$] if (adjust_p) { fisher_results[, pval := p.adjust(pval,method=adjust_method)] - # fisher_results <- fisher_results %>% - # mutate(p_adj = p.adjust(pval,method=adjust_method)) } @@ -295,25 +225,13 @@ if (save_all & window_size > 1) { covered_fraction = 1, start=pos, end=pos - )] - + )] + fwrite( fisher_results[,c('chrom','pos','covered_fraction','avg_min_cov','pop1','pop2','fisher'),with=FALSE], file = ss, sep="\t" ) - # fisher_results %>% - # mutate( - # fisher = -log10(pval), - # pop1 = pools[1], - # pop2 = pools[2], - # window_size = 1, - # covered_fraction = 1, - # start=pos, - 
# end=pos, - # ) %>% - # select(chrom,pos,snps,covered_fraction,avg_min_cov = min_cov,pop1,pop2,fisher) %>% - # write_tsv(ss) } if (window_type != 'single') { @@ -343,23 +261,5 @@ fisher_results[,cols[-cn] := NULL] setcolorder(fisher_results,c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','fisher','method')) -# fisher_results <- fisher_results %>% -# group_by(chrom,middle,start,end,snp_count) %>% -# summarise( -# fisher = -log10(p_combine(pval)), -# pop1 = pools[1], -# pop2 = pools[2], -# window_size = n(), -# covered_fraction = unique(snp_count)/window_size, -# avg_min_cov = mean(min_cov) -# ) %>% -# ungroup() %>% -# mutate( -# pos = floor((start+end)/2), -# method = 'assesspool' -# ) %>% -# select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fisher,method) - ss <- sprintf("%s/%s_%s_window_%d_%d_fisher.tsv",outdir,file_prefix,paste0(pools,collapse="-"),window_size,window_step) -fwrite(fisher_results,ss,sep="\t") -# write_tsv(fisher_results,ss) \ No newline at end of file +fwrite(fisher_results,ss,sep="\t") \ No newline at end of file diff --git a/modules/local/joinfreq/resources/usr/bin/joinfreq.R b/modules/local/joinfreq/resources/usr/bin/joinfreq.R index 7ccf000..22d28fe 100755 --- a/modules/local/joinfreq/resources/usr/bin/joinfreq.R +++ b/modules/local/joinfreq/resources/usr/bin/joinfreq.R @@ -1,18 +1,10 @@ #!/usr/bin/env Rscript suppressPackageStartupMessages({ - # library(dplyr) - # library(tidyr) - # library(readr) - # library(purrr) - # library(stringr) library(optparse) library(data.table) }) -# options(dplyr.summarise.inform = FALSE) - - # help message formatter nice_formatter <- function(object) { cat(object@usage, fill = TRUE) @@ -109,17 +101,10 @@ if (!file.exists(opt$args[2])) { freq_snps <- fread(opt$args[1]) fst <- fread(opt$args[2]) -# freq_snps <- read_tsv(opt$args[1],col_types = cols(),progress = FALSE) -# fst_wide <- read_tsv(opt$args[2],col_types = cols(),progress = FALSE) if (all(c("start","end") 
%in% names(fst))) { fst[, pos := floor((start+end)/2) ] fst[, c('start','end') := NULL ] setcolorder(fst,c('chrom','pos')) - - # fst_wide <- fst_wide %>% - # mutate(pos = floor((start+end)/2)) %>% - # select(-c(start,end)) %>% - # select(chrom,pos,everything()) } # load fst file, pivot long, and filter out NAs @@ -127,10 +112,10 @@ if (calc_method == "grenedalf") { # delete unused columns cols <- names(fst) fst[,c(cols[grep(r'[\.(missing|numeric|passed|masked|empty|invariant)$]',cols)]) := NULL] - + # make sure the pool name pairs are sorted, so that they # look like a:b.fst and not like b:a.fst - + # first, pull out fst column names and strip off '.fst' old_cols <- grep(r'[\.fst$]',names(fst),value=TRUE) fst_cols <- gsub(r'[\.fst$]','',old_cols) @@ -138,45 +123,20 @@ if (calc_method == "grenedalf") { fst_cols <- paste0(sapply(strsplit(fst_cols,':'),\(x) paste0(sort(x),collapse=':')),'.fst') # rename the original columns to the sorted version setnames(fst,old = old_cols, new=fst_cols) - - # now convert to long format + + # now convert to long format fst <- melt(fst, measure = measure(pop1,pop2,pattern = r'[^([^:]+):(.+)\.fst$]'),value.name = 'fst') } else { setcolorder(fst,c('chrom','pos','pop1','pop2','fst')) fst[,c(names(fst)[-c(1:5)]) := NULL] } -# fst <- fst_wide %>% -# { -# if (calc_method == "grenedalf") { -# select(.,chrom:pos,ends_with(".fst",ignore.case = FALSE)) %>% -# rename_with(\(n) { -# str_match(n,"^(.+):(.+)\\.(fst)$") %>% -# array_branch(1) %>% -# map_chr(\(x) { -# f <- sort(x[2:3]) -# str_c(f[1],":",f[2],".",x[4]) -# }) -# },.cols = -c(chrom:pos)) %>% -# pivot_longer(-c(chrom:pos),names_to=c("pop1","pop2",".value"),names_pattern = "^([^:]+):(.+)\\.(fst)$") -# } else select(.,chrom,pos,pop1,pop2,fst) -# } %>% -# filter(!is.na(fst)) # pivot frequency table longer - - # fst_wide <- melt(fst_wide, measure = measure(pop1,pop2,pattern = r'[^([^:]+):(.+)\.fst$]'),value.name = 'fst') freq_snps[,c(grep('^TOTAL',names(freq_snps))) := NULL ] freq_snps <- 
melt(freq_snps, measure = measure(pool,measure,pattern = r'[^([^.]+)\.(.+)$]'),value.name = 'count') freq_snps <- dcast(freq_snps,CHROM+POS+pool ~ measure,value.var = "count")[order(CHROM,POS,pool)] setnames(freq_snps,1:2,new=tolower) -# freq_snps2 <- freq_snps %>% -# pivot_longer(-c(CHROM:ALT,starts_with("TOTAL",ignore.case = FALSE)),names_to = c("pool","measure"),values_to="count",names_pattern = "^(.+)\\.([^.]+)$") %>% -# select(CHROM,POS,pool,measure,count) %>% -# pivot_wider(names_from="measure",values_from = "count") %>% -# rename_with(str_to_lower,.cols = c(1:2)) %>% -# arrange(chrom,pos,pool) - combined <- merge(fst,freq_snps,by.x = c('chrom','pos','pop1'),by.y = c('chrom','pos','pool')) combined <- merge(combined,freq_snps,by.x = c('chrom','pos','pop2'),by.y = c('chrom','pos','pool')) setnames( @@ -207,31 +167,4 @@ setcolorder( ) outfile <- sprintf("%s/%s_fst_frequency.tsv",outdir,file_prefix) -fwrite(combined,outfile,sep="\t") - -# # continue filtering -# combined <- fst %>% -# left_join(freq_snps,by=c("chrom","pos","pop1" = "pool")) %>% -# left_join(freq_snps,by=c("chrom","pos","pop2" = "pool"),suffix=c(".pop1",".pop2")) %>% -# mutate( -# # summarize read depths -# TOTAL.REF_CNT = REF_CNT.pop1 + REF_CNT.pop2, -# TOTAL.ALT_CNT = ALT_CNT.pop1 + ALT_CNT.pop2, -# TOTAL.DEPTH = DEPTH.pop1 + DEPTH.pop2 -# ) %>% -# filter( -# # filter by minimum minimum depth and others -# DEPTH.pop1 >= min_depth & DEPTH.pop2 >= min_depth, -# TOTAL.ALT_CNT >= min_count & TOTAL.REF_CNT >= min_count, -# TOTAL.REF_CNT != TOTAL.DEPTH, -# TOTAL.DEPTH >= min_depth -# ) %>% -# mutate( -# avg_min_cov = pmin(DEPTH.pop1, DEPTH.pop2), -# method = calc_method -# ) %>% -# arrange(chrom,pos,pop1,pop2) %>% -# rename_with(str_to_lower) - -# outfile <- sprintf("%s/%s_fst_frequency.tsv",outdir,file_prefix) -# write_tsv(combined,outfile) \ No newline at end of file +fwrite(combined,outfile,sep="\t") \ No newline at end of file diff --git a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R 
b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R index 4adc96d..e484d62 100755 --- a/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R +++ b/modules/local/poolfstat/fst/resources/usr/bin/poolfstat.R @@ -3,10 +3,6 @@ suppressPackageStartupMessages({ library(poolfstat) library(optparse) - # library(tibble) - # library(stringr) - # library(readr) - # library(dplyr) library(data.table) }) @@ -100,44 +96,21 @@ pooldata <- popsync2pooldata( fst <- computeFST(pooldata,sliding.window.size=opt$options$window_size) # get population combo string -# fn <- str_c(pooldata@poolnames,collapse='-') fn <- paste0(pooldata@poolnames,collapse='-') # save global Fst -# global_fst <- tibble(pop1=pooldata@poolnames[1],pop2=pooldata@poolnames[2],fst=fst$Fst[1]) -# write_tsv(global_fst,str_glue("{opt$options$prefix}_global_{fn}.tsv")) global_fst <- data.table(pop1=pooldata@poolnames[1],pop2=pooldata@poolnames[2],fst=fst$Fst[1]) fwrite(global_fst,sprintf("%s_global_%s.tsv",opt$options$prefix,fn),sep="\t") # save sliding window Fst, if they exist if (!is.null(fst$sliding.windows.fvalues)) { - # sliding <- as_tibble(fst$sliding.windows.fvalues) %>% - # rename(chrom=1,start=2,end=3,mid=4,cum_mid=5,fst=6) - # write_tsv(sliding,str_glue("{opt$options$prefix}_sliding-{opt$options$window_size}_{fn}.tsv")) setDT(fst$sliding.windows.fvalues) setnames(fst$sliding.windows.fvalues,new=c('chrom','start','end','mid','cum_mid','fst')) fwrite(fst$sliding.windows.fvalues,sprintf("%s_sliding-%d_%s.tsv",opt$options$prefix,opt$options$window_size,fn),sep="\t") } -# pools <- str_c(str_c(pooldata@poolnames,collapse=':'),".fst") pools <- paste0(paste0(pooldata@poolnames,collapse=':'),".fst") -# save per-snp Fst, match popoolation output -# snp_fst <- pooldata@snp.info %>% -# as_tibble() %>% -# select(chrom=1,pos=2) %>% -# mutate( -# window_size = opt$options$window_size, -# covered_fraction = 1, -# # depth = rowSums(pooldata@readcoverage), -# avg_min_cov = 
do.call(pmin,as.data.frame(pooldata@readcoverage)), -# pop1 = opt$options$pool_names[1], -# pop2 = opt$options$pool_names[2], -# fst = fst$snp.Fstats$Fst -# ) %>% -# select(chrom,pos,window_size,covered_fraction,avg_min_cov,pop1,pop2,fst) -# write_tsv(snp_fst,str_glue("{opt$options$prefix}_{fn}.fst"),col_names=opt$options$headers) - setDT(pooldata@snp.info) cols <- names(pooldata@snp.info) pooldata@snp.info[,cols[-c(1:2)] := NULL] From 541c2699ed1f293734b01c0ca75846a0bf98b7b5 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 11:41:07 -1000 Subject: [PATCH 12/39] small tweak to report --- assets/assesspool_report.Rmd | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 1289ae0..69b79a6 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -205,7 +205,10 @@ if (show_final_filter) { any_filter <- show_filter | show_final_filter # used for tab selection -pools_shared <- c('All SNPs'=FALSE,'Shared loci (all pools)'=TRUE) +pools_shared <- c( + 'All SNPs'=FALSE, + 'SNPs shared by all pools'=TRUE +) ``` From c856189c3e9fc482bdc096010a8d11f16410810f Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 12:58:50 -1000 Subject: [PATCH 13/39] add na.rm to quantile --- assets/assesspool_report.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 69b79a6..01213d0 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -609,8 +609,8 @@ nothing <- pool_data[,{ n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) .( fst_sd = fst_sd, # std. dev. 
of fst - q1 = as.numeric(quantile(fst)[2]), - q3 = as.numeric(quantile(fst)[4]), + q1 = as.numeric(quantile(fst,na.rm=TRUE)[2]), + q3 = as.numeric(quantile(fst,na.rm=TRUE)[4]), lwr = boxplot.stats(fst)$stats[1], upr = boxplot.stats(fst)$stats[5], med = median(fst), From cd0486f48d20d01a4db24e7044d77fbe51c97d7d Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 13:10:54 -1000 Subject: [PATCH 14/39] add na.rm where necessary --- assets/assesspool_report.Rmd | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 01213d0..9986468 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -342,16 +342,16 @@ nothing <- pool_data[,{ rbindlist() fst_cov <- fst_cov[,{ - fst_sd <- sd(fst) + fst_sd <- sd(fst,na.rm=TRUE) fst_se <- fst_sd / sqrt(.N) n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) - fst_mean <- mean(fst) + fst_mean <- mean(fst,na.rm=TRUE) .( fst_sd = fst_sd, fst = fst_mean, fst_se = fst_se, - cov = mean(avg_min_cov), + cov = mean(avg_min_cov,na.rm=TRUE), snps = n_snps, # snp count contigs = n_contigs, snps_per_contig = n_snps / n_contigs, @@ -604,7 +604,7 @@ nothing <- pool_data[,{ # precompute box plot stats because the datasets are too big # and pandoc runs out of memory fst_grp <- fst_data[avg_min_cov >= cutoff][,{ - fst_sd <- sd(fst) + fst_sd <- sd(fst,na.rm=TRUE) n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) .( @@ -613,12 +613,12 @@ nothing <- pool_data[,{ q3 = as.numeric(quantile(fst,na.rm=TRUE)[4]), lwr = boxplot.stats(fst)$stats[1], upr = boxplot.stats(fst)$stats[5], - med = median(fst), - min = min(fst), - max = max(fst), + med = median(fst,na.rm=TRUE), + min = min(fst,na.rm=TRUE), + max = max(fst,na.rm=TRUE), fst_se = fst_sd / sqrt(.N), # std. err. 
of fst - fst = mean(fst), # mean fst - cov = mean(avg_min_cov), # mean coverage + fst = mean(fst,na.rm=TRUE), # mean fst + cov = mean(avg_min_cov,na.rm=TRUE), # mean coverage snps = n_snps, # snp count contigs = n_contigs, # contig count snps_per_contig = n_snps / n_contigs # mean snps per contig @@ -871,7 +871,7 @@ nothing <- pool_data[,{ fst_sd = fst_sd, # std. dev. of fst fst_se = fst_sd / sqrt(.N), # std. err. of fst fst = mean(fst,na.rm=TRUE), # mean fst - cov = mean(avg_min_cov), # mean coverage + cov = mean(avg_min_cov,na.rm=TRUE), # mean coverage cov_sd = cov_sd, cov_se = cov_sd / sqrt(.N), snps = n_snps, # snp count @@ -1169,7 +1169,7 @@ nothing <- fisher_data[,{ map(\(cov) { # filter by coverage cutoff and get mean fisher p-value by snp - figdata <- fishr[avg_min_cov >= cov][,.(log_fisher = mean(log_fisher)), by=.(chrom,pos)][order(chrom,pos)] + figdata <- fishr[avg_min_cov >= cov][,.(log_fisher = mean(log_fisher,na.rm=TRUE)), by=.(chrom,pos)][order(chrom,pos)] figdata[,snp := .I] fig <- ggplot(figdata) + From b917e3d08cf8d279d386652223dd0bc7235d89da Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 16:26:52 -1000 Subject: [PATCH 15/39] use args2 for gawk processes, as theyre supposed to --- conf/modules.config | 95 +++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7150351..f37baf1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -53,23 +53,24 @@ process { .keySet() .sort() .withIndex() - .collect{ k,i -> "-v pop${i+1}='${k}'" } + - [ - """' - BEGIN{ - FS=OFS="\\t" - print "chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tfst\\tmethod" - } { - gsub(/^.+=/,"",\$NF) - fst=\$NF; nf=NF - \$(nf+3)="popoolation" - \$(nf+2)=fst - \$(nf+1)=pop2 - \$nf=pop1 - print - } - '""" - ] ).join(' ').trim() + .collect{ k,i -> "-v pop${i+1}='${k}'" } + ).join(' ').trim() + } + ext.args2 = { + """' + BEGIN{ + 
FS=OFS="\\t" + print "chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tfst\\tmethod" + } { + gsub(/^.+=/,"",\$NF) + fst=\$NF; nf=NF + \$(nf+3)="popoolation" + \$(nf+2)=fst + \$(nf+1)=pop2 + \$nf=pop1 + print + } + '""" } } @@ -81,23 +82,24 @@ process { .keySet() .sort() .withIndex() - .collect{ k,i -> "-v pop${i+1}='${k}'" } + - [ - """' - BEGIN{ - FS=OFS="\\t" - print "chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tfisher\\tmethod" - } { - gsub(/^.+=/,"",\$NF) - fisher=\$NF; nf=NF - \$(nf+3)="popoolation" - \$(nf+2)=fisher - \$(nf+1)=pop2 - \$nf=pop1 - print - } - '""" - ] ).join(' ').trim() + .collect{ k,i -> "-v pop${i+1}='${k}'" } + ).join(' ').trim() + } + ext.args2 = { + """' + BEGIN{ + FS=OFS="\\t" + print "chrom\\tpos\\twindow_size\\tcovered_fraction\\tavg_min_cov\\tpop1\\tpop2\\tlog_fisher\\tmethod" + } { + gsub(/^.+=/,"",\$NF) + fisher=\$NF; nf=NF + \$(nf+3)="popoolation" + \$(nf+2)=fisher + \$(nf+1)=pop2 + \$nf=pop1 + print + } + '""" } } @@ -109,19 +111,18 @@ process { .keySet() .sort() .withIndex() - .collect{ k,i -> "-v col${i+1}='${k}'" } + - [ - "-F \$'\\t'", - "-v OFS='\\t'", - """' - NR == 1 { - for (i=1; i <= NF; i++) { cols[\$i]=i } - # print \$1, \$2, \$3, \$cols[col1], \$cols[col2] - next - } - NR > 1 { print \$1, \$2, \$3, \$cols[col1], \$cols[col2] } - '""" - ] ).join(' ').trim() + .collect{ k,i -> "-v col${i+1}='${k}'" } + ).join(' ').trim() + } + ext.args2 = { + """' + BEGIN { FS=OFS="\\t" } + NR == 1 { + for (i=1; i <= NF; i++) { cols[\$i]=i } + next + } + NR > 1 { print \$1, \$2, \$3, \$cols[col1], \$cols[col2] } + '""" } } From 3a3e5bdb935ea91456973b8822ea643d153d87bc Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 16:27:08 -1000 Subject: [PATCH 16/39] change column to log_fisher --- modules/local/fishertest/resources/usr/bin/fisher.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/local/fishertest/resources/usr/bin/fisher.R 
b/modules/local/fishertest/resources/usr/bin/fisher.R index b07bbc9..70106dd 100755 --- a/modules/local/fishertest/resources/usr/bin/fisher.R +++ b/modules/local/fishertest/resources/usr/bin/fisher.R @@ -218,7 +218,7 @@ if (adjust_p) { if (save_all & window_size > 1) { ss <- sprintf("%s/%s_%s_all_snps_fisher.tsv",outdir,file_prefix,paste0(pools,collapse="-")) fisher_results[,`:=`( - fisher = -log10(pval), + log_fisher = -log10(pval), pop1 = pools[1], pop2 = pools[2], window_size = 1, @@ -236,7 +236,7 @@ if (save_all & window_size > 1) { if (window_type != 'single') { fisher_results <- fisher_results[,.( - fisher = -log10(p_combine(pval)), + log_fisher = -log10(p_combine(pval)), pop1 = pools[1], pop2 = pools[2], window_size = .N, @@ -250,15 +250,15 @@ if (window_type != 'single') { window_size = 1, pop1 = pools[1], pop2 = pools[2], - fisher = -log10(pval) + log_fisher = -log10(pval) )] } fisher_results[,method := 'assesspool'] cols <- names(fisher_results) -cn <- which(cols %in% c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','fisher','method')) +cn <- which(cols %in% c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','log_fisher','method')) fisher_results[,cols[-cn] := NULL] -setcolorder(fisher_results,c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','fisher','method')) +setcolorder(fisher_results,c('chrom','pos','window_size','covered_fraction','avg_min_cov','pop1','pop2','log_fisher','method')) ss <- sprintf("%s/%s_%s_window_%d_%d_fisher.tsv",outdir,file_prefix,paste0(pools,collapse="-"),window_size,window_step) From acf355694231ae0f0e33e5457858cd5c27dca032 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Mon, 4 Aug 2025 16:27:33 -1000 Subject: [PATCH 17/39] change pops to factors in fst table (maybe causes memory error in pandoc?) 
--- assets/assesspool_report.Rmd | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 9986468..508e9b4 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -168,6 +168,10 @@ show_fst <- !is.null(params$fst) if (show_fst) { pool_data <- fread(params$fst) pool_data[,pair := paste0(pop1," - ",pop2)] + pool_data[,`:=`( + pop1 = factor(pop1), + pop2 = factor(pop2) + )] setorder(pool_data,method,chrom,pos) } else { pool_data <- FALSE @@ -179,8 +183,8 @@ fisher_data <- NULL show_fisher <- !is.null(params$fisher) if (show_fisher) { fisher_data <- fread(params$fisher) - fisher_data[,c(names(fisher_data)[!names(fisher_data) %in% c('chrom','pos','pop1','pop2','avg_min_cov','fisher','method')]) := NULL] - setnames(fisher_data,'fisher','log_fisher') + fisher_data[,c(names(fisher_data)[!names(fisher_data) %in% c('chrom','pos','pop1','pop2','avg_min_cov','log_fisher','method')]) := NULL] + # setnames(fisher_data,'fisher','log_fisher') setcolorder(fisher_data,c('chrom','pos','pop1','pop2','avg_min_cov','log_fisher','method')) setorder(fisher_data,method,chrom,pos) } From f2f0e094fb23fb13eb71ad2fa2896b65cd62ec9c Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Wed, 6 Aug 2025 16:17:20 -1000 Subject: [PATCH 18/39] add sync input option, tweak report, etc. 
--- assets/assesspool_report.Rmd | 150 +++--- assets/schema_input.json | 8 +- modules/local/extractsequences/main.nf | 12 +- modules/local/fishertest/main.nf | 18 +- modules/local/grenedalf/sync.nf | 10 +- modules/local/joinfreq/main.nf | 12 +- .../joinfreq/resources/usr/bin/joinfreq.R | 4 +- modules/local/poolfstat/fst/main.nf | 12 +- .../nf-core/bcftools/reheader/environment.yml | 5 - modules/nf-core/bcftools/reheader/main.nf | 79 --- modules/nf-core/bcftools/reheader/meta.yml | 76 --- .../bcftools/reheader/tests/bcf.config | 4 - .../bcftools/reheader/tests/main.nf.test | 394 --------------- .../bcftools/reheader/tests/main.nf.test.snap | 469 ------------------ .../nf-core/bcftools/reheader/tests/tags.yml | 2 - .../bcftools/reheader/tests/vcf.config | 4 - .../bcftools/reheader/tests/vcf.gz.config | 4 - .../reheader/tests/vcf_gz_index.config | 4 - .../reheader/tests/vcf_gz_index_csi.config | 4 - .../reheader/tests/vcf_gz_index_tbi.config | 5 - subworkflows/local/filter.nf | 37 +- .../{filter_sim => filter_sim_vcf}/main.nf | 2 +- .../{filter_sim => filter_sim_vcf}/meta.yml | 0 .../tests/main.nf.test | 4 +- subworkflows/local/poolstats.nf | 87 ++-- subworkflows/local/postprocess.nf | 32 +- subworkflows/local/preprocess.nf | 108 ++-- .../utils_nfcore_assesspool_pipeline/main.nf | 6 +- workflows/assesspool.nf | 25 +- 29 files changed, 281 insertions(+), 1296 deletions(-) delete mode 100644 modules/nf-core/bcftools/reheader/environment.yml delete mode 100644 modules/nf-core/bcftools/reheader/main.nf delete mode 100644 modules/nf-core/bcftools/reheader/meta.yml delete mode 100644 modules/nf-core/bcftools/reheader/tests/bcf.config delete mode 100644 modules/nf-core/bcftools/reheader/tests/main.nf.test delete mode 100644 modules/nf-core/bcftools/reheader/tests/main.nf.test.snap delete mode 100644 modules/nf-core/bcftools/reheader/tests/tags.yml delete mode 100644 modules/nf-core/bcftools/reheader/tests/vcf.config delete mode 100644 
modules/nf-core/bcftools/reheader/tests/vcf.gz.config delete mode 100644 modules/nf-core/bcftools/reheader/tests/vcf_gz_index.config delete mode 100644 modules/nf-core/bcftools/reheader/tests/vcf_gz_index_csi.config delete mode 100644 modules/nf-core/bcftools/reheader/tests/vcf_gz_index_tbi.config rename subworkflows/local/{filter_sim => filter_sim_vcf}/main.nf (99%) rename subworkflows/local/{filter_sim => filter_sim_vcf}/meta.yml (100%) rename subworkflows/local/{filter_sim => filter_sim_vcf}/tests/main.nf.test (95%) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 508e9b4..71a53e1 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -11,8 +11,8 @@ output: df_print: paged # tables are printed as an html table with support for pagination over rows and columns highlight: pygments pdf_document: true - pdf_document: - toc: yes + self_contained: false + lib_dir: artifacts date: "`r Sys.Date()`" params: nf: null @@ -315,19 +315,16 @@ c(show_filter,show_final_filter) %>% }) ``` -```{r summaries, summaries, results='asis', eval=show_fst} + +```{r summaries, results='asis', eval=show_fst} cat("# High-level summaries {.tabset}\n\n") -# walk through data by method nothing <- pool_data[,{ - # output tab header fst_data <- .SD cat(str_glue("## {method} {{.tabset}}\n\n")) - # walk through whether or not we're looking only at snps shared across all pools pools_shared %>% iwalk(\(shared,hdr) { - # output tab header cat(str_glue("### {hdr} {{.tabset}}\n\n")) # filter data for all pools if necessary @@ -336,87 +333,73 @@ nothing <- pool_data[,{ setorder(fst_data,chrom,pos,pop1,pop2) } - # repeatedly filter the dataset for different coverage levels - # and bind those together into a new dataset fst_cov <- cutoffs %>% - map(\(cov) { - fst_data[avg_min_cov >= cov][,cutoff := cov] + map(\(cutoff) { + fst_data[avg_min_cov >= cutoff][,cutoff := cutoff][,{ + fst_sd <- sd(fst,na.rm=TRUE) + fst_se <- fst_sd / sqrt(.N) + n_snps <- 
uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) + n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) + fst_mean <- mean(fst,na.rm=TRUE) + .( + fst = fst_mean, + fst_sd = fst_sd, + snps = n_snps, # snp count + contigs = n_contigs, + snps_per_contig = n_snps / n_contigs + ) + },by=cutoff] }) %>% - # and bind resulting subsets together into a superset rbindlist() - - fst_cov <- fst_cov[,{ - fst_sd <- sd(fst,na.rm=TRUE) - fst_se <- fst_sd / sqrt(.N) - n_snps <- uniqueN(.SD[,.(chrom,pos)],na.rm=TRUE) - n_contigs <- uniqueN(.SD$chrom,na.rm=TRUE) - fst_mean <- mean(fst,na.rm=TRUE) - .( - fst_sd = fst_sd, - fst = fst_mean, - fst_se = fst_se, - cov = mean(avg_min_cov,na.rm=TRUE), - snps = n_snps, # snp count - contigs = n_contigs, - snps_per_contig = n_snps / n_contigs, - ci_lower= fst_mean - qt(0.975, .N -1) * fst_se, - ci_upper= fst_mean + qt(0.975, .N -1) * fst_se - ) - },by=cutoff] setorder(fst_cov,cutoff) - setcolorder(fst_cov,c('cutoff','fst','fst_sd','fst_se','cov','snps','snps_per_contig','ci_lower','ci_upper')) + setcolorder(fst_cov,c('cutoff','fst','fst_sd','snps','contigs','snps_per_contig')) setnames( fst_cov, - c('cutoff','fst','fst_sd','fst_se','cov','snps','contigs','snps_per_contig','ci_lower','ci_upper'), - c('Coverage cutoff', 'Mean Fst', 'Fst (SD)', 'Fst (SEM)', - 'Mean coverage', 'SNPs', 'Contigs', 'SNPs per contig', 'CI (lower)', 'CI (upper)') + c('cutoff','fst','fst_sd','snps','contigs','snps_per_contig'), + c('Coverage cutoff', 'Mean Fst', 'Fst (SD)', 'SNPs', 'Contigs', 'SNPs per contig') ) - - # show plot tab + cat(str_glue("#### Plot\n\n")) - print(h5("Global summary statistics by coverage cutoff")) - - # create interactive plot and add traces + fig <- fst_cov %>% plot_ly() %>% add_trace( - y = ~ `Mean Fst`, + name = 'Mean Fst', + type = 'scatter', + mode = 'lines+markers', + visible=TRUE, x = ~`Coverage cutoff`, - name = "Mean Fst", - visible = T, - mode = "lines+markers", - type = "scatter", + y = ~`Mean Fst`, error_y = ~list( type='data', array = `Fst (SD)` - ) + 
) ) %>% add_trace( - y = ~SNPs, + name = 'SNPs', + type = 'scatter', + mode = 'lines+markers', + visible=FALSE, x = ~`Coverage cutoff`, - name = "SNPs", - mode = "lines+markers", - type = "scatter", - visible=F + y = ~`SNPs` ) %>% add_trace( - y = ~`SNPs per contig`, + name = 'SNPs per contig', + type = 'scatter', + mode = 'lines+markers', + visible=FALSE, x = ~`Coverage cutoff`, - name = "Mean SNPs per contig", - visible = F, - mode = "lines+markers", - type = "scatter" + y = ~`SNPs per contig` ) %>% add_trace( - y = ~Contigs, + name = 'Contigs', + type = 'scatter', + mode = 'lines+markers', + visible=FALSE, x = ~`Coverage cutoff`, - name = "Contigs", - visible = F, - mode = "lines+markers", - type = "scatter" - ) - - # configure button layout and behavior + y = ~`Contigs` + ) + button_layout <- list( type = "buttons", direction = "right", @@ -483,7 +466,7 @@ nothing <- pool_data[,{ ) ) ) - + # configure annotation properties annotation <- list( list( @@ -495,7 +478,7 @@ nothing <- pool_data[,{ showarrow = FALSE ) ) - + # add layout and annotation to figure fig <- fig %>% layout( @@ -508,16 +491,16 @@ nothing <- pool_data[,{ ) %>% config(displayModeBar = FALSE) - # display figure + # display figure print(tagList(fig)) - + # show data table tab cat(str_glue("#### Table\n\n")) # construct csv download filename shared <- ifelse(shared,"all","shared") filename <- str_glue("{method}_{shared}_summary") - + # a javascript callback that rounds numbers to the # nearest thousand. 
otherwise the table is full of stuff like 0.3412341234123412341234 js <- DT::JS( @@ -527,9 +510,9 @@ nothing <- pool_data[,{ "data;", "}" ) - + print(h5("Global summary statistics by coverage cutoff")) - + # output the datatable print( tagList( @@ -553,22 +536,23 @@ nothing <- pool_data[,{ scrollX = TRUE, columnDefs = list( list( - targets = c(2:5,8:10)-1, + targets = c(2:3)-1, render = js ) ) ) ) %>% - formatRound(c(1,6,7),digits=0) + formatRound(c(1,4:6),digits=0) ) ) + + }) + },by=method] - ``` - ```{r fst-cov-all, results='asis', eval=show_fst} cat("# Pairwise Fst by minimum coverage {.tabset}\n\n") # walk through calculation methods @@ -959,13 +943,13 @@ nothing <- pool_data[,{ y = ~.data[[v[2]]], z = ~.data[[v[3]]], colorscale = 'Greys', - showscale = FALSE + showscale = FALSE ) } else . } %>% add_trace( type = "heatmap", - data = hmd, + data = hmd, x = ~.data[[v[1]]], # x variable y = ~.data[[v[2]]], # y variable z = ~.data[[v[3]]], # z (color) variable @@ -1065,17 +1049,17 @@ methods %>% pool_data[ method == methods[2] ], by = c('chrom','pos','pop1','pop2') ) - + # do a linear regression lmm <- lm(fst.y ~ fst.x, data=corr) - + # get predicted data for the trend line corr[,predicted := predict(lmm,corr[,.(fst.x)])] - + # get method names xtitle <- methods[1] ytitle <- methods[2] - + # plot the figure fig <- corr %>% plot_ly() %>% @@ -1110,7 +1094,7 @@ methods %>% ) %>% # disappear the legends hide_guides() - + print(tagList(fig)) }) ``` @@ -1173,7 +1157,7 @@ nothing <- fisher_data[,{ map(\(cov) { # filter by coverage cutoff and get mean fisher p-value by snp - figdata <- fishr[avg_min_cov >= cov][,.(log_fisher = mean(log_fisher,na.rm=TRUE)), by=.(chrom,pos)][order(chrom,pos)] + figdata <- fishr[avg_min_cov >= cov][,.(log_fisher = mean(log_fisher,na.rm=TRUE)), by=.(chrom,pos)][order(chrom,pos)] figdata[,snp := .I] fig <- ggplot(figdata) + diff --git a/assets/schema_input.json b/assets/schema_input.json index 8acf4a5..4685348 100644 --- 
a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,12 +13,12 @@ "errorMessage": "Project name must be provided and cannot contain spaces", "meta": ["id"] }, - "vcf": { + "input": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.vcf(\\.gz)?$", - "errorMessage": "VCF file must be provided in the `vcf` column, cannot contain spaces, and must have extension '.vcf' or '.vcf.gz'" + "pattern": "^\\S+\\.(vcf|sync)(\\.gz)?$", + "errorMessage": "Input file must be provided in the `input` column, cannot contain spaces, and must have extension '.vcf', '.sync', 'vcf.gz', or '.sync.gz'" }, "vcf_index": { "type": "string", @@ -42,6 +42,6 @@ "errorMessage": "Must provide pool size(s), either a single number or a comma-separated list" } }, - "required": ["project","vcf", "reference","pool_sizes"] + "required": ["project","input", "reference","pool_sizes"] } } diff --git a/modules/local/extractsequences/main.nf b/modules/local/extractsequences/main.nf index c4a4d54..6a7fe95 100644 --- a/modules/local/extractsequences/main.nf +++ b/modules/local/extractsequences/main.nf @@ -1,6 +1,6 @@ process EXTRACT_SEQUENCES { tag "$meta.id" - label 'process_single' + label 'process_single_mem' conda "${moduleDir}/environment.yml" // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -34,7 +34,10 @@ process EXTRACT_SEQUENCES { cat <<-END_VERSIONS > versions.yml "${task.process}": - extractsequences: \$(extractsequences --version) + R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + Rsamtools: \$(Rscript -e "cat(paste(packageVersion('Rsamtools')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ @@ -47,7 +50,10 @@ process EXTRACT_SEQUENCES { cat <<-END_VERSIONS > versions.yml "${task.process}": - extractsequences: \$(extractsequences --version) + R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + Rsamtools: \$(Rscript -e "cat(paste(packageVersion('Rsamtools')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ } diff --git a/modules/local/fishertest/main.nf b/modules/local/fishertest/main.nf index 89cb735..835f7b7 100644 --- a/modules/local/fishertest/main.nf +++ b/modules/local/fishertest/main.nf @@ -33,13 +33,8 @@ process FISHERTEST { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - optparse: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - janitor: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('janitor')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('matrixstats')),sep='.')") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ 
@@ -52,13 +47,8 @@ process FISHERTEST { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - optparse: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - janitor: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('janitor')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('matrixstats')),sep='.')") + optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ } diff --git a/modules/local/grenedalf/sync.nf b/modules/local/grenedalf/sync.nf index b558d42..739f222 100644 --- a/modules/local/grenedalf/sync.nf +++ b/modules/local/grenedalf/sync.nf @@ -9,8 +9,10 @@ process GRENEDALF_SYNC { input: tuple val(meta), path(vcf), path(index) + tuple val(meta), path(sync) tuple val(meta), path(fasta) tuple val(meta), path(fai) + tuple val(meta), path(sample_map) output: tuple val(meta), path("*.sync"), emit: sync @@ -24,14 +26,18 @@ process GRENEDALF_SYNC { def prefix = task.ext.prefix ?: "${meta.id}" def fasta_arg = fasta ? "--reference-genome-fasta ${fasta}" : "" def fai_arg = fai ? "--reference-genome-fai ${fai}" : "" + def input_opt = vcf ? '--vcf-path' : (sync ? '--sync-path' : '') + def input_arg = vcf ?: (sync ?: '') + def remap_arg = sample_map ? 
"--rename-samples-list ${sample_map}" : "" """ grenedalf sync \\ --threads ${task.cpus} \\ ${args} \\ ${fasta_arg} \\ ${fai_arg} \\ - --file-prefix "${prefix}_" \\ - --vcf-path ${vcf} + ${remap_arg} \\ + ${input_opt} ${input_arg} \\ + --file-prefix "${prefix}_" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/joinfreq/main.nf b/modules/local/joinfreq/main.nf index b2f8e42..ac79efd 100644 --- a/modules/local/joinfreq/main.nf +++ b/modules/local/joinfreq/main.nf @@ -33,12 +33,8 @@ process JOINFREQ { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tidyr: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ @@ -51,12 +47,8 @@ process JOINFREQ { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") - tidyr: \$(Rscript -e "cat(paste(packageVersion('tidyr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ } diff --git a/modules/local/joinfreq/resources/usr/bin/joinfreq.R 
b/modules/local/joinfreq/resources/usr/bin/joinfreq.R index 22d28fe..014398c 100755 --- a/modules/local/joinfreq/resources/usr/bin/joinfreq.R +++ b/modules/local/joinfreq/resources/usr/bin/joinfreq.R @@ -133,7 +133,7 @@ if (calc_method == "grenedalf") { # pivot frequency table longer freq_snps[,c(grep('^TOTAL',names(freq_snps))) := NULL ] -freq_snps <- melt(freq_snps, measure = measure(pool,measure,pattern = r'[^([^.]+)\.(.+)$]'),value.name = 'count') +freq_snps <- melt(freq_snps, measure = measure(pool,measure,pattern = r'[^(.+)\.([A-Z_]+)$]'),value.name = 'count') freq_snps <- dcast(freq_snps,CHROM+POS+pool ~ measure,value.var = "count")[order(CHROM,POS,pool)] setnames(freq_snps,1:2,new=tolower) @@ -165,6 +165,6 @@ setcolorder( c('chrom','pos','pop1','pop2','fst','avg_min_cov','method','alt_cnt.pop1','alt_cnt.pop2', 'ref_cnt.pop1','ref_cnt.pop2','depth.pop1','depth.pop2','total.alt_cnt','total.ref_cnt','total.depth') ) - + outfile <- sprintf("%s/%s_fst_frequency.tsv",outdir,file_prefix) fwrite(combined,outfile,sep="\t") \ No newline at end of file diff --git a/modules/local/poolfstat/fst/main.nf b/modules/local/poolfstat/fst/main.nf index 2e6f158..1b61887 100644 --- a/modules/local/poolfstat/fst/main.nf +++ b/modules/local/poolfstat/fst/main.nf @@ -40,11 +40,7 @@ process POOLFSTAT_FST { R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") poolfstat: \$(Rscript -e "cat(paste(packageVersion('poolfstat')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tibble')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ @@ -59,11 
+55,7 @@ process POOLFSTAT_FST { R: \$(Rscript -e "cat(paste(R.version[c('major','minor')],collapse='.'))") poolfstat: \$(Rscript -e "cat(paste(packageVersion('poolfstat')),sep='.')") optparse: \$(Rscript -e "cat(paste(packageVersion('optparse')),sep='.')") - tibble: \$(Rscript -e "cat(paste(packageVersion('tibble')),sep='.')") - stringr: \$(Rscript -e "cat(paste(packageVersion('stringr')),sep='.')") - readr: \$(Rscript -e "cat(paste(packageVersion('readr')),sep='.')") - purrr: \$(Rscript -e "cat(paste(packageVersion('purrr')),sep='.')") - dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr')),sep='.')") + data.table: \$(Rscript -e "cat(paste(packageVersion('data.table')),sep='.')") END_VERSIONS """ } diff --git a/modules/nf-core/bcftools/reheader/environment.yml b/modules/nf-core/bcftools/reheader/environment.yml deleted file mode 100644 index 5c00b11..0000000 --- a/modules/nf-core/bcftools/reheader/environment.yml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - bioconda::bcftools=1.20 diff --git a/modules/nf-core/bcftools/reheader/main.nf b/modules/nf-core/bcftools/reheader/main.nf deleted file mode 100644 index 9cf6d0d..0000000 --- a/modules/nf-core/bcftools/reheader/main.nf +++ /dev/null @@ -1,79 +0,0 @@ -process BCFTOOLS_REHEADER { - tag "$meta.id" - label 'process_low' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0': - 'biocontainers/bcftools:1.20--h8b25389_0' }" - - input: - tuple val(meta), path(vcf), path(header), path(samples) - tuple val(meta2), path(fai) - - output: - tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf - tuple val(meta), path("*.{csi,tbi}") , emit: index, optional: true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def fai_argument = fai ? "--fai $fai" : "" - def header_argument = header ? "--header $header" : "" - def samples_argument = samples ? "--samples $samples" : "" - - def args2 = task.ext.args2 ?: '--output-type z' - def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : - args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : - args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : - args2.contains("--output-type v") || args2.contains("-Ov") ? "vcf" : - "vcf" - """ - bcftools \\ - reheader \\ - $fai_argument \\ - $header_argument \\ - $samples_argument \\ - $args \\ - --threads $task.cpus \\ - $vcf \\ - | bcftools view \\ - $args2 \\ - --output ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') - END_VERSIONS - """ - - stub: - def args2 = task.ext.args2 ?: '--output-type z' - def prefix = task.ext.prefix ?: "${meta.id}" - - def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : - args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : - args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : - args2.contains("--output-type v") || args2.contains("-Ov") ? "vcf" : - "vcf" - def index = args2.contains("--write-index=tbi") || args2.contains("-W=tbi") ? 
"tbi" : - args2.contains("--write-index=csi") || args2.contains("-W=csi") ? "csi" : - args2.contains("--write-index") || args2.contains("-W") ? "csi" : - "" - def create_cmd = extension.endsWith(".gz") ? "echo '' | gzip >" : "touch" - def create_index = extension.endsWith(".gz") && index.matches("csi|tbi") ? "touch ${prefix}.${extension}.${index}" : "" - - """ - ${create_cmd} ${prefix}.${extension} - ${create_index} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/bcftools/reheader/meta.yml b/modules/nf-core/bcftools/reheader/meta.yml deleted file mode 100644 index 47e5344..0000000 --- a/modules/nf-core/bcftools/reheader/meta.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: bcftools_reheader -description: Reheader a VCF file -keywords: - - reheader - - vcf - - update header -tools: - - reheader: - description: | - Modify header of VCF/BCF files, change sample names. - homepage: http://samtools.github.io/bcftools/bcftools.html - documentation: http://samtools.github.io/bcftools/bcftools.html#reheader - doi: 10.1093/gigascience/giab008 - licence: ["MIT"] - identifier: biotools:bcftools -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - vcf: - type: file - description: VCF/BCF file - pattern: "*.{vcf.gz,vcf,bcf}" - - header: - type: file - description: New header to add to the VCF - pattern: "*.{header.txt}" - - samples: - type: file - description: File containing sample names to update (one sample per line) - pattern: "*.{samples.txt}" - - - meta2: - type: map - description: | - Groovy Map containing reference information - e.g. 
[ id:'genome' ] - - fai: - type: file - description: Fasta index to update header sequences with - pattern: "*.{fai}" -output: - - vcf: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - "*.{vcf,vcf.gz,bcf,bcf.gz}": - type: file - description: VCF with updated header, bgzipped per default - pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}" - - index: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - "*.{csi,tbi}": - type: file - description: Index of VCF with updated header - pattern: "*.{csi,tbi}" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@bjohnnyd" - - "@jemten" - - "@ramprasadn" -maintainers: - - "@bjohnnyd" - - "@jemten" - - "@ramprasadn" diff --git a/modules/nf-core/bcftools/reheader/tests/bcf.config b/modules/nf-core/bcftools/reheader/tests/bcf.config deleted file mode 100644 index 2b7dff5..0000000 --- a/modules/nf-core/bcftools/reheader/tests/bcf.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.args2 = { "--no-version --output-type b" } - ext.prefix = "tested" -} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/main.nf.test b/modules/nf-core/bcftools/reheader/tests/main.nf.test deleted file mode 100644 index 96c1b7b..0000000 --- a/modules/nf-core/bcftools/reheader/tests/main.nf.test +++ /dev/null @@ -1,394 +0,0 @@ -nextflow_process { - - name "Test Process BCFTOOLS_REHEADER" - script "../main.nf" - process "BCFTOOLS_REHEADER" - tag "modules" - tag "modules_nfcore" - tag "bcftools" - tag "bcftools/reheader" - - test("sarscov2 - [vcf, [], []], fai - vcf output") { - - config "./vcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - 
input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output") { - - config "./vcf.gz.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - index") { - - config "./vcf_gz_index.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.vcf, - process.out.index.collect { it.collect { it instanceof Map ? 
it : file(it).name } }, - process.out.versions - ).match() }, - { assert process.out.index[0][1].endsWith(".csi") } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index") { - - config "./vcf_gz_index_csi.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.vcf, - process.out.index.collect { it.collect { it instanceof Map ? it : file(it).name } }, - process.out.versions - ).match() }, - { assert process.out.index[0][1].endsWith(".csi") } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index") { - - config "./vcf_gz_index_tbi.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.vcf, - process.out.index.collect { it.collect { it instanceof Map ? 
it : file(it).name } }, - process.out.versions - ).match() }, - { assert process.out.index[0][1].endsWith(".tbi") } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - bcf output") { - - config "./bcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, header, []], []") { - - config "./vcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true), - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - [] - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], samples], fai") { - - config "./vcf.config" - when { - - process { - """ - ch_no_samples = Channel.of([ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [] - ]) - ch_samples = Channel.of(["samples.txt", "new_name"]) - .collectFile(newLine:true) - input[0] = ch_no_samples.combine(ch_samples) - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], 
[]], fai - stub") { - - options "-stub" - config "./vcf.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - file(process.out.vcf[0][1]).name, - process.out.versions, - ).match() } - ) - } - - } - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - index -stub") { - - options "-stub" - config "./vcf_gz_index.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index -stub") { - - options "-stub" - config "./vcf_gz_index_csi.config" - when { - - process { - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index -stub") { - - options "-stub" - config "./vcf_gz_index_tbi.config" - when { - - process 
{ - """ - input[0] = [ - [ id:'test', single_end:false ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), - [], - [] - ] - input[1] = [ - [ id:'genome' ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - -} diff --git a/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap b/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap deleted file mode 100644 index 87a3654..0000000 --- a/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap +++ /dev/null @@ -1,469 +0,0 @@ -{ - "sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index": { - "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.tbi" - ] - ], - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T10:09:05.955833763" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - index -stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - 
"nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:52:41.444952182" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - tbi index -stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:53:04.314827944" - }, - "sarscov2 - [vcf, [], []], fai - vcf output": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:50:41.983008108" - }, - "sarscov2 - [vcf, [], []], fai - bcf output": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.bcf.gz:md5,c8a304c8d2892039201154153c8cd536" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - 
"tested.bcf.gz:md5,c8a304c8d2892039201154153c8cd536" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:51:43.072513252" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:50:53.055630152" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - index": { - "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi" - ] - ], - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T10:08:37.999924355" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index -stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - 
"test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:52:52.512269206" - }, - "sarscov2 - [vcf, [], []], fai - stub": { - "content": [ - "tested.vcf", - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-05-31T15:16:36.337112514" - }, - "sarscov2 - [vcf, [], []], fai - vcf.gz output - csi index": { - "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz:md5,8e722884ffb75155212a3fc053918766" - ] - ], - [ - [ - { - "id": "test", - "single_end": false - }, - "test_vcf.vcf.gz.csi" - ] - ], - [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T10:08:55.434831174" - }, - "sarscov2 - [vcf, [], samples], fai": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,c64c373c10b0be24b29d6f18708ec1e8" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,c64c373c10b0be24b29d6f18708ec1e8" - ] - ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:52:12.216002665" - }, - "sarscov2 - [vcf, header, []], []": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,3189bc9a720d5d5d3006bf72d91300cb" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ], - "index": [ - - ], - "vcf": [ - [ - { - "id": "test", - "single_end": false - }, - "tested.vcf:md5,3189bc9a720d5d5d3006bf72d91300cb" - ] 
- ], - "versions": [ - "versions.yml:md5,486e3d4ebc1dbf5c0a4dfaebae12ea34" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-09-03T09:51:54.062386022" - } -} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/tags.yml b/modules/nf-core/bcftools/reheader/tests/tags.yml deleted file mode 100644 index c252941..0000000 --- a/modules/nf-core/bcftools/reheader/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -bcftools/reheader: - - modules/nf-core/bcftools/reheader/** diff --git a/modules/nf-core/bcftools/reheader/tests/vcf.config b/modules/nf-core/bcftools/reheader/tests/vcf.config deleted file mode 100644 index 820f2ae..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.args2 = { "--no-version" } - ext.prefix = "tested" -} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/vcf.gz.config b/modules/nf-core/bcftools/reheader/tests/vcf.gz.config deleted file mode 100644 index c3031c3..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf.gz.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.args2 = { "--no-version --output-type z" } - ext.prefix = "tested" -} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index.config b/modules/nf-core/bcftools/reheader/tests/vcf_gz_index.config deleted file mode 100644 index 1e050ec..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.prefix = { "${meta.id}_vcf" } - ext.args2 = "--output-type z --write-index --no-version" -} diff --git a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_csi.config b/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_csi.config deleted file mode 100644 index 536e4b4..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_csi.config +++ /dev/null @@ -1,4 +0,0 @@ -process { - ext.prefix = { "${meta.id}_vcf" } - 
ext.args2 = "--output-type z --write-index=csi --no-version" -} diff --git a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_tbi.config b/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_tbi.config deleted file mode 100644 index 91a80db..0000000 --- a/modules/nf-core/bcftools/reheader/tests/vcf_gz_index_tbi.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - ext.prefix = { "${meta.id}_vcf" } - ext.args2 = "--output-type z --write-index=tbi --no-version" - -} diff --git a/subworkflows/local/filter.nf b/subworkflows/local/filter.nf index 3e010da..896dfc3 100644 --- a/subworkflows/local/filter.nf +++ b/subworkflows/local/filter.nf @@ -6,12 +6,20 @@ include { BCFTOOLS_VIEW as BCFTOOLS_COMPRESS_INDEX_FILTERED } from '../../module workflow FILTER { take: - ch_vcf // channel: [ val(meta), [ vcf ] ] + ch_input // channel: [ val(meta), [ vcf ] ] main: ch_versions = Channel.empty() + ch_input = ch_input + .branch { meta, file, index -> + vcf: meta.format == 'vcf' + sync: meta.format == 'sync' + } + + ch_vcf = ch_input.vcf + // decide whether to run bcftools filter def bcff = params.match_allele_count || ( params.filter && ( params.min_mapping_quality || params.min_mapping_quality_ref || @@ -26,10 +34,8 @@ workflow FILTER { .map{ meta, vcf -> [ meta.id, meta, vcf ] } .join( BCFTOOLS_FILTER.out.tbi.map{ meta, index -> [ meta.id, index ] } ) .map{ id, meta, vcf, index -> [ meta, vcf, index ] } - .set{ ch_vcf2 } + .set{ ch_vcf } ch_versions = ch_versions.mix(BCFTOOLS_FILTER.out.versions.first()) - } else { - ch_vcf2 = ch_vcf } // decide whether to run vcftools filter @@ -37,42 +43,39 @@ workflow FILTER { params.max_mean_depth || params.hwe_cutoff ) if (vcft) { - VCFTOOLS_FILTER ( ch_vcf2.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) + VCFTOOLS_FILTER ( ch_vcf.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) ch_versions = ch_versions.mix(VCFTOOLS_FILTER.out.versions.first()) // TODO: re-compress and index output VCFTOOLS_FILTER.out.vcf .map{ meta, vcf -> [ meta, 
vcf, [] ] } - .set{ ch_vcf3 } - } else { - ch_vcf3 = ch_vcf2 + .set{ ch_vcf } } if (params.thin_snps) { - THIN_SNPS( ch_vcf3.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) + THIN_SNPS( ch_vcf.map{ meta, vcf, index -> [ meta, vcf ] }, [], [] ) ch_versions = ch_versions.mix(THIN_SNPS.out.versions.first()) THIN_SNPS.out.vcf .map{ meta, vcf -> [ meta, vcf, [] ] } - .set{ ch_vcf4 } - } else { - ch_vcf4 = ch_vcf3 + .set{ ch_vcf } } if (vcft || params.thin_snps) { // re-index and compress if vcftools was run - BCFTOOLS_COMPRESS_INDEX_FILTERED( ch_vcf4, [], [], [] ).vcf + BCFTOOLS_COMPRESS_INDEX_FILTERED( ch_vcf, [], [], [] ).vcf .map{ meta,vcf -> [ meta.id, meta, vcf ] } .join( BCFTOOLS_COMPRESS_INDEX_FILTERED.out.tbi.map{ meta,index -> [ meta.id, index ] } ) .map{ id, meta, vcf, index -> [ meta, vcf, index ] } - .set{ ch_vcf_final } + .set{ ch_vcf } ch_versions = ch_versions.mix(BCFTOOLS_COMPRESS_INDEX_FILTERED.out.versions.first()) - } else { - ch_vcf_final = ch_vcf4 } + ch_vcf.mix( ch_input.sync ).set{ ch_output } + + // TODO: visualize filter results? 
(rmd section 'filter_visualization') emit: - vcf = ch_vcf_final + output = ch_output versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/filter_sim/main.nf b/subworkflows/local/filter_sim_vcf/main.nf similarity index 99% rename from subworkflows/local/filter_sim/main.nf rename to subworkflows/local/filter_sim_vcf/main.nf index 31b4bed..72f8a7b 100644 --- a/subworkflows/local/filter_sim/main.nf +++ b/subworkflows/local/filter_sim_vcf/main.nf @@ -19,7 +19,7 @@ include { RIPGREP as COUNT_SNPS } from '../../../modules/nf include { BCFTOOLS_VIEW as BCFTOOLS_COMPRESS_INDEX_FILTERED } from '../../../modules/nf-core/bcftools/view/main' -workflow FILTER_SIM { +workflow FILTER_SIM_VCF { take: ch_vcf diff --git a/subworkflows/local/filter_sim/meta.yml b/subworkflows/local/filter_sim_vcf/meta.yml similarity index 100% rename from subworkflows/local/filter_sim/meta.yml rename to subworkflows/local/filter_sim_vcf/meta.yml diff --git a/subworkflows/local/filter_sim/tests/main.nf.test b/subworkflows/local/filter_sim_vcf/tests/main.nf.test similarity index 95% rename from subworkflows/local/filter_sim/tests/main.nf.test rename to subworkflows/local/filter_sim_vcf/tests/main.nf.test index e5956fb..5f41de6 100644 --- a/subworkflows/local/filter_sim/tests/main.nf.test +++ b/subworkflows/local/filter_sim_vcf/tests/main.nf.test @@ -2,9 +2,9 @@ // nf-core subworkflows test filter_sim nextflow_workflow { - name "Test Subworkflow FILTER_SIM" + name "Test Subworkflow FILTER_SIM_VCF" script "../main.nf" - workflow "FILTER_SIM" + workflow "FILTER_SIM_VCF" tag "subworkflows" tag "subworkflows_" diff --git a/subworkflows/local/poolstats.nf b/subworkflows/local/poolstats.nf index 1fc2062..8e159a6 100644 --- a/subworkflows/local/poolstats.nf +++ b/subworkflows/local/poolstats.nf @@ -13,7 +13,7 @@ include { combn } from "../../modules/local/popoolation2/combn workflow POOLSTATS { take: - ch_vcf // channel: [ val(meta), path(vcf), path(index) ] + ch_input // 
channel: [ val(meta), path(vcf), path(index) ] ch_ref // channel: path(ref) main: @@ -25,34 +25,61 @@ workflow POOLSTATS { // prepare vcf channel for input to grenedalf sync // (join fasta reference index) - ch_vcf - .map{ meta, vcf, index -> [ meta.id, meta, vcf, index ] } + ch_input + .map{ meta, file, index -> [ meta.id, meta, file, index ] } .join( ch_ref.map{ meta, ref, fai -> [ meta.id, ref, fai ] } ) - .map{ id, meta, vcf, index, ref, fai -> [ meta, vcf, index, ref, fai ] } + .map{ id, meta, file, index, ref, fai -> [ meta, file, index, ref, fai ] } .set{ ch_prep } - // create sync file using grenedalf + ch_prep + .map{ meta, file, index, ref, fai -> [ meta, meta.rename ? meta.pool_map : [:] ] } + // .filter { it[0].rename } + .collectFile(sort: true) { meta, pool_map -> [ "${meta.id}.map", pool_map ? pool_map.collect{k,v -> "${k}\t${v}"}.join("\n") : "" ] } + .map{ [ it.baseName, it ]} + .set{ ch_samplemap } + + ch_prep + .map{ [ it[0].id ] + it } + .join( ch_samplemap ) + .map{ id, meta, file, index, ref, fai, pool_map -> + [meta, file, index, ref, fai, pool_map.size() ? pool_map : [] ] + } + .set{ ch_prep } + + + // create/modify sync files using grenedalf GRENEDALF_SYNC( - ch_prep.map{ meta, vcf, index, ref, fai -> [ meta, vcf, index ] }, - ch_prep.map{ meta, vcf, index, ref, fai -> [ meta, [] ] }, - ch_prep.map{ meta, vcf, index, ref, fai -> [ meta, fai ] } + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, meta.format == 'vcf' ? file : [], index ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, meta.format == 'sync' ? 
file : [] ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, [] ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, fai ] }, + ch_prep.map{ meta, file, index, ref, fai, map -> [ meta, map ] } ) ch_versions = ch_versions.mix(GRENEDALF_SYNC.out.versions.first()) - // prepare input channel to split sync file into pairwise comparisons - GRENEDALF_SYNC.out.sync - .map{ meta, sync -> [ meta, sync, combn(meta.pools.keySet() as String[],2) ] } - .transpose() - .map{ meta, sync, pair -> - [ [ id: meta.id, pools: meta.pools.subMap(pair).sort{ it.key } ], sync ] - } - .set{ ch_split_sync } + // prepare and split input channel into pairwise comparisons + if (params.popoolation2 || params.poolfstat || params.fisher_test_popoolation) { + GRENEDALF_SYNC.out.sync + .map{ meta, sync -> [ meta, sync, combn(meta.pools.keySet() as String[],2) ] } + .transpose() + .map{ meta, sync, pair -> + // println meta.pools.subMap(pair).sort{ it.key }.values().toArray().getClass() + [ + meta + [ + pools: meta.pools.subMap(pair).sort{ it.key }, + pool_map: meta.pool_map.findAll{ it.value as String in pair as String[] }.sort{ it.key } + ], + sync + ] + } + .set{ ch_split_sync } - // pairwise split sync file - SPLIT_SYNC( ch_split_sync, [] ).output - .map{ meta, sync -> [ meta, meta.pools, sync ] } - .set{ ch_split_sync } - ch_versions = ch_versions.mix(SPLIT_SYNC.out.versions.first()) + // pairwise split sync file + SPLIT_SYNC( ch_split_sync, [] ).output + .map{ meta, sync -> [ meta, meta.pools, sync ] } + .set{ ch_split_sync } + ch_versions = ch_versions.mix(SPLIT_SYNC.out.versions.first()) + } // prepare input for frequency calculation GRENEDALF_SYNC.out.sync @@ -76,8 +103,6 @@ workflow POOLSTATS { ch_versions = ch_versions.mix(POPOOLATION2_FST.out.versions.first()) ch_versions = ch_versions.mix(REHEADER_FST.out.versions.first()) - // mix fst results - // ch_fst = ch_fst.mix( POPOOLATION2_FST.out.fst.map { meta, fst -> [ meta, fst, "popoolation" ] } ) ch_fst = ch_fst.mix( 
REHEADER_FST.out.output.map { meta, fst -> [ meta, fst, "popoolation" ] } ) } @@ -92,7 +117,7 @@ workflow POOLSTATS { // run grenedalf fst if requested if (params.grenedalf) { - // create pool size map + // create pool size map file GRENEDALF_SYNC.out.sync .collectFile { meta, sync -> [ "${meta.id}.ps", meta.pools.collect{ k, v -> "${k}\t${v}"}.join("\n") ] @@ -127,21 +152,19 @@ workflow POOLSTATS { .map { id, meta, fst, method, freq -> [ meta, freq, fst, method ] } .set { ch_join } - // join pairwise fst results to snp frequencies and concatenate into one big file per input ch_join = JOINFREQ( ch_join ).fst_freq .collectFile( keepHeader: true, sort: true, storeDir: 'output/fst', cache: true ){ meta, joined -> [ "${meta.id}.fst", joined ] } .map{ [it.baseName, it ] } // rejoin meta tags - ch_join = ch_vcf + ch_join = ch_input .map{ it[0] } .unique() .map{ [ it.id, it ] } .join( ch_join ) .map{ id, meta, f -> [ meta, f ] } - ch_versions = ch_versions.mix(JOINFREQ.out.versions.first()) // run fisher tests if requested @@ -151,7 +174,14 @@ workflow POOLSTATS { .map{ meta, freq -> [ meta, freq, combn(meta.pools.keySet() as String[],2) ] } .transpose() .map{ meta, freq, pair -> - [ [ id: meta.id, pools: meta.pools.subMap(pair).sort{ it.key } ], pair.sort(), freq ] + [ + meta + [ + pools: meta.pools.subMap(pair).sort{ it.key }, + pool_map: meta.pool_map.findAll{ it.value as String in pair as String[] }.sort{ it.key } + ], + pair.sort(), + freq + ] } .set{ ch_split_freq } @@ -161,6 +191,7 @@ workflow POOLSTATS { // mix fisher results & versions ch_fisher = ch_fisher.mix( FISHERTEST.out.fisher ) ch_versions = ch_versions.mix(FISHERTEST.out.versions.first()) + } // run fisher tests with popoolation2 diff --git a/subworkflows/local/postprocess.nf b/subworkflows/local/postprocess.nf index c1e6176..3c88eaa 100644 --- a/subworkflows/local/postprocess.nf +++ b/subworkflows/local/postprocess.nf @@ -5,7 +5,7 @@ include { EXTRACT_SEQUENCES } from '../../modules/local/extrac 
workflow POSTPROCESS { take: - ch_vcf + ch_input ch_unfiltered ch_fst ch_ref @@ -32,16 +32,16 @@ workflow POSTPROCESS { .map{ [ it.baseName, it ] } // join them back to meta tags - ch_fisher_collapsed = ch_vcf + ch_fisher_collapsed = ch_input .map{ it[0] } .unique() .map{ [ it.id, it ] } .join( ch_fisher_collapsed ) .map{ id, meta, f -> [ meta, f ] } - ch_count = ch_vcf - .map{ meta, vcf, index -> [ meta + [filter: 'cumulative'], vcf ] } - .mix( ch_unfiltered.map{ meta, vcf, index -> [ meta + [filter: 'before'], vcf ] } ) + ch_count = ch_input + .map{ meta, input, index -> [ meta + [filter: 'cumulative'], input ] } + .mix( ch_unfiltered.map{ meta, input, index -> [ meta + [filter: 'before'], input ] } ) // count final filtered SNPs into map COUNT_SNPS_FINAL( ch_count, '^#', false ) @@ -53,7 +53,7 @@ workflow POSTPROCESS { .map{ [ it.baseName, it ] } // join them back to the meta tags - ch_filter_final = ch_vcf + ch_filter_final = ch_input .map{ it[0] } .unique() .map{ [ it.id, it ] } @@ -63,7 +63,7 @@ workflow POSTPROCESS { // build rmarkdown report input and params // get report file channel as [ meta, reportfile ] - ch_report = ch_vcf.map { [ it[0], file("${projectDir}/assets/assesspool_report.Rmd") ] } + ch_report = ch_input.map { [ it[0].id, it[0], file("${projectDir}/assets/assesspool_report.Rmd") ] } // generate input files channel ch_input_files = ch_fst.ifEmpty{ [] } @@ -71,6 +71,8 @@ workflow POSTPROCESS { .mix( ch_filter.ifEmpty{ [] } ) .mix( ch_filter_final.ifEmpty{ [] } ) .groupTuple() + .map{ [it[0].id] + it[1..-1] } + // subset the params object because there's at least one value that changes // every time, which invalidates the caching @@ -88,7 +90,21 @@ workflow POSTPROCESS { tz: TimeZone.getDefault().getID() ]]} - CREATE_REPORT( ch_report, ch_params.map{meta, p -> p}, ch_input_files.map{meta, f -> f} ) + // join everything together + ch_report + .join( ch_params ) + .join( ch_input_files ) + .map { it[1..-1] } + .set{ ch_report } + + // tuple 
val(meta), path(notebook) + // val parameters + // path input_files + CREATE_REPORT( + ch_report.map{ meta, report, params, files -> [meta, report] }, + ch_report.map{ meta, report, params, files -> params }, + ch_report.map{ meta, report, params, files -> files } + ) ch_versions = ch_versions.mix(CREATE_REPORT.out.versions.first()) emit: diff --git a/subworkflows/local/preprocess.nf b/subworkflows/local/preprocess.nf index bb19ca8..7f84061 100644 --- a/subworkflows/local/preprocess.nf +++ b/subworkflows/local/preprocess.nf @@ -1,9 +1,19 @@ include { BCFTOOLS_QUERY as VCF_SAMPLES } from '../../modules/nf-core/bcftools/query/main' -include { BCFTOOLS_REHEADER as VCF_RENAME } from '../../modules/nf-core/bcftools/reheader/main' include { BCFTOOLS_VIEW as COMPRESS_VCF } from '../../modules/nf-core/bcftools/view/main' include { TABIX_TABIX as INDEX_VCF } from '../../modules/nf-core/tabix/tabix/main' include { SAMTOOLS_FAIDX as INDEX_REFERENCE } from '../../modules/nf-core/samtools/faidx/main' +// read the first line from a gzip'd file +def gz_head(Path file) { + new java.io.BufferedReader( + new java.io.InputStreamReader( + new java.util.zip.GZIPInputStream(new java.io.FileInputStream(file.toFile())) + ) + ).withCloseable { reader -> + return reader.readLine() + } +} + workflow PREPROCESS { take: @@ -15,7 +25,7 @@ workflow PREPROCESS { // save ref genome channel ch_samplesheet - .map { meta, vcf, index, ref -> [ meta, ref ] } + .map { meta, input, index, ref -> [ meta, ref ] } .set { ch_ref } // generate fasta ref index from reference fasta INDEX_REFERENCE( ch_ref, ch_ref.map{ meta, ref -> [ meta, [] ] } ) @@ -26,68 +36,75 @@ workflow PREPROCESS { .map{ id, meta, fasta, fai -> [ meta, fasta, fai ] } .set{ ch_ref } - // save vcf channel + // save input channel ch_samplesheet - .map { meta, vcf, index, ref -> [ meta, vcf, index ] } - .set { ch_vcf } + .map { meta, input, index, ref -> [ meta, input, index ] } + .set { ch_input } + // branch inputs into sync and/or vcf + 
ch_input + .branch { meta, input, index -> + vcf: input.toString() =~ /(?i)\.vcf(\.gz)?$/ + sync: input.toString() =~ /(?i)\.sync(\.gz)?$/ + } + .set{ ch_input } + + // preprocess VCF input(s) // Get VCF sample names from header regardless of whether we're renaming them - VCF_SAMPLES(ch_vcf,[],[],[]).output + VCF_SAMPLES(ch_input.vcf,[],[],[]).output .splitText() .map { meta, pool -> [ meta.id, pool.trim() ] } .groupTuple() .set { ch_samplenames } - ch_versions = ch_versions.mix(VCF_SAMPLES.out.versions.first()) + ch_versions = ch_versions.mix(VCF_SAMPLES.out.versions.first()) // Extract VCF sample names and smash them into a pool map - ch_vcf + ch_input.vcf .map{ meta, vcf, index -> [ meta.id, meta, vcf, index ] } .join( ch_samplenames ) .map { id, meta, vcf, index, pools -> def pp = meta.pools ?: pools def ps = meta.pool_sizes.size() > 1 ? meta.pool_sizes : (1..pools.size()).collect{ meta.pool_sizes[0] } - [ [ id: meta.id, rename: meta.rename, pools: [pp,ps].transpose().collectEntries(), pool_map: [pools,pp].transpose().collectEntries() ], vcf, index ] + [ [ id: meta.id, format: 'vcf', rename: meta.rename, pools: [pp,ps].transpose().collectEntries(), pool_map: [pools,pp].transpose().collectEntries() ], vcf, index ] } .set{ ch_vcf } - // First, branch out which ones we need to rename, zip or index + // preprocess sync input(s) + // Extract/get sync sample names the same way, more or less + ch_input.sync + .map{ meta, sync, index -> + def pools = (sync.extension.toLowerCase() == 'gz' ? gz_head(sync) : sync.withReader { it.readLine() }).split(/\t/) + def sync_hdr = !!(pools[0] =~ /^#chr/) + pools = sync_hdr ? pools[3..-1] : (1..(pools.size()-3)).collect{ "${sync.baseName}.${it}"} + + def pp = meta.pools ?: pools + assert pp.size() == pools.size() + def ps = meta.pool_sizes.size() > 1 ? 
meta.pool_sizes : (1..pools.size()).collect{ meta.pool_sizes[0] } + [ + meta + [ + format: 'sync', + sync_hdr: sync_hdr, + pools: [pp,ps].transpose().collectEntries{k,v -> [ k.toString(), v ]}, + pool_map: [pools,pp].transpose().collectEntries{k,v -> [ k.toString(), v.toString() ]} + ], + sync, + [] + ] + } + .set { ch_sync } + + // Figure out how to process vcf samples ch_vcf .branch { meta, vcf, index -> - rename: meta.rename - zip: !meta.rename && vcf.extension != "gz" - index: !meta.rename && vcf.extension == "gz" && !index - keep_as_is: !meta.rename && vcf.extension == "gz" && index + // rename: meta.rename + zip: /*!meta.rename &&*/ vcf.extension != "gz" + index: /*!meta.rename &&*/ vcf.extension == "gz" && !index + keep_as_is: /*!meta.rename &&*/ vcf.extension == "gz" && index } .set{ ch_vcf } - // rename VCF samples ----------------------------------- - // Create a sample name map file for bcftools reheader - ch_vcf.rename - // mapfiles named for meta.id - .collectFile { meta, vcf, index -> - [ "${meta.id}.map", meta.pool_map.collect{ k, v -> "${k} ${v}"}.join("\n") ] - } - // use mapfile basename for meta.id to join below - .map { f -> [ f.baseName, f ] } - .set{ ch_remap } - - // Join sample map back to VCF channel - ch_vcf.rename - .map { meta, vcf, index -> [ meta.id, meta, vcf ] } - .join( ch_remap ) - .map { id, meta, vcf, mapfile -> [ meta, vcf, [], mapfile ] } - .set{ ch_remap } - - // Rename samples in the VCF header - VCF_RENAME( ch_remap,ch_remap.map{ [it[0],[]]} ).vcf - .map{ meta, vcf -> [meta.id, meta, vcf] } - .join( VCF_RENAME.out.index.map{ meta, index -> [ meta.id, index ] } ) - .map{ id, meta, vcf, index -> [ meta, vcf, index ] } - .set { ch_renamed } - ch_versions = ch_versions.mix(VCF_RENAME.out.versions.first()) - // rename VCF samples ----------------------------------- // zip unzipped VCF ------------------------------------- COMPRESS_VCF( ch_vcf.zip,[],[],[] ) @@ -97,7 +114,6 @@ workflow PREPROCESS { .map{ id, meta, vcf, index -> 
[meta,vcf,index] } .set{ ch_zipped } ch_versions = ch_versions.mix(COMPRESS_VCF.out.versions.first()) - // zip unzipped VCF ------------------------------------- // index unindexed VCF ---------------------------------- INDEX_VCF( ch_vcf.index.map{ meta, vcf, index -> [ meta,vcf ] } ) @@ -106,19 +122,17 @@ workflow PREPROCESS { .map{ id, meta, vcf, index -> [meta, vcf, index] } .set{ ch_indexed } ch_versions = ch_versions.mix(INDEX_VCF.out.versions.first()) - // index unindexed VCF ---------------------------------- - // // Mix back in any renamed bits + // Mix back in any renamed bits ch_vcf.keep_as_is - .mix( ch_renamed ) .mix( ch_zipped ) .mix( ch_indexed ) - .set{ ch_vcf } - + .mix( ch_sync ) + .set{ ch_input } emit: - vcf = ch_vcf + output = ch_input ref = ch_ref versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf b/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf index 82b585f..cfed707 100644 --- a/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_assesspool_pipeline/main.nf @@ -73,17 +73,17 @@ workflow PIPELINE_INITIALISATION { Channel .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) .map { - meta, vcf, index, ref, pools, pool_sizes -> + meta, input, index, ref, pools, pool_sizes -> def pp = pools ? pools.split(/,/) : [] def ps = (pool_sizes instanceof Number) ? 
[pool_sizes] : pool_sizes.split(/,/) if (pp.size() != ps.size()) { if (pp && ps.size() != 1) { error "Pool sizes must either be a single number or a list the same length as `pools`" } else if (pp && ps.size() == 1) { - ps = [1..pp.size()].collect{ ps[0] } + ps = (1..pp.size()).collect{ ps[0] } } } - return [ meta + [ pools: pp, pool_sizes: ps, rename: !(!pools) ], vcf, index, ref ] + return [ meta + [ pools: pp, pool_sizes: ps, rename: !(!pools) ], input, index, ref ] } .set { ch_samplesheet } diff --git a/workflows/assesspool.nf b/workflows/assesspool.nf index 8980583..281bf69 100644 --- a/workflows/assesspool.nf +++ b/workflows/assesspool.nf @@ -7,10 +7,10 @@ include { paramsSummaryMap } from 'plugin/nf-schema' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_assesspool_pipeline' include { FILTER } from '../subworkflows/local/filter.nf' -include { FILTER_SIM } from '../subworkflows/local/filter_sim/main.nf' +include { FILTER_SIM_VCF } from '../subworkflows/local/filter_sim_vcf/main.nf' include { PREPROCESS } from '../subworkflows/local/preprocess.nf' include { POOLSTATS } from '../subworkflows/local/poolstats.nf' -include { POSTPROCESS } from '../subworkflows/local/postprocess.nf' +include { POSTPROCESS } from '../subworkflows/local/postprocess.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -24,11 +24,12 @@ workflow ASSESSPOOL { ch_samplesheet // channel: samplesheet read in from --input main: + ch_versions = Channel.empty() // run VCF preprocessing PREPROCESS( ch_samplesheet ) - ch_vcf = PREPROCESS.out.vcf + ch_input = PREPROCESS.out.output ch_ref = PREPROCESS.out.ref ch_versions = ch_versions.mix(PREPROCESS.out.versions.first()) @@ -41,21 +42,21 @@ workflow ASSESSPOOL { ch_fisher = Channel.empty() ch_filtered = Channel.empty() - // perform stepwise filtration - // for visualization and evaluation 
- - + // // perform stepwise filtration + // // for visualization and evaluation if (params.visualize_filters) { - FILTER_SIM( ch_vcf ) - ch_filter_sim = FILTER_SIM.out.filter_summary + FILTER_SIM_VCF( ch_input.filter { meta, f, index -> meta.format == 'vcf' } ) + ch_filter_sim = FILTER_SIM_VCF.out.filter_summary } + // TODO: add cumulative filtering to the filter_only workflow + // run downstream processing unless we're only // visualizing filters if (!params.filter_only) { // run filtering if desired - FILTER( ch_vcf ) - ch_filtered = FILTER.out.vcf + FILTER( ch_input ) + ch_filtered = FILTER.out.output ch_versions = ch_versions.mix(FILTER.out.versions.first()) // calculate pool statistics (fst, fisher, etc.) @@ -72,7 +73,7 @@ workflow ASSESSPOOL { // run post-processing steps POSTPROCESS( ch_filtered, - ch_vcf, + ch_input, ch_fst, ch_ref, ch_sync, From c0cd93647cf3c3ca7f83d61ef246012ce2079b18 Mon Sep 17 00:00:00 2001 From: mykle hoban Date: Thu, 7 Aug 2025 12:23:04 -1000 Subject: [PATCH 19/39] update fisher plot to use global slider --- assets/assesspool_report.Rmd | 89 ++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 50 deletions(-) diff --git a/assets/assesspool_report.Rmd b/assets/assesspool_report.Rmd index 71a53e1..86d0c72 100644 --- a/assets/assesspool_report.Rmd +++ b/assets/assesspool_report.Rmd @@ -42,8 +42,9 @@ we're not doing multiple large data-access processes