guohanghui commited on
Commit
c8f61f1
·
verified ·
1 Parent(s): 6c99c2c

Upload 543 files

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +98 -0
  2. Dockerfile +18 -0
  3. README.md +27 -5
  4. app.py +45 -0
  5. deepTools/mcp_output/README_MCP.md +64 -0
  6. deepTools/mcp_output/analysis.json +391 -0
  7. deepTools/mcp_output/diff_report.md +73 -0
  8. deepTools/mcp_output/mcp_plugin/__init__.py +0 -0
  9. deepTools/mcp_output/mcp_plugin/adapter.py +139 -0
  10. deepTools/mcp_output/mcp_plugin/main.py +13 -0
  11. deepTools/mcp_output/mcp_plugin/mcp_service.py +102 -0
  12. deepTools/mcp_output/requirements.txt +13 -0
  13. deepTools/mcp_output/start_mcp.py +30 -0
  14. deepTools/mcp_output/workflow_summary.json +195 -0
  15. deepTools/source/.planemo.sh +35 -0
  16. deepTools/source/.readthedocs.yaml +15 -0
  17. deepTools/source/CHANGES.txt +448 -0
  18. deepTools/source/LICENSE.txt +9 -0
  19. deepTools/source/MANIFEST.in +8 -0
  20. deepTools/source/README.md +68 -0
  21. deepTools/source/README.rst +29 -0
  22. deepTools/source/__init__.py +4 -0
  23. deepTools/source/deeptools/SES_scaleFactor.py +195 -0
  24. deepTools/source/deeptools/__init__.py +0 -0
  25. deepTools/source/deeptools/alignmentSieve.py +439 -0
  26. deepTools/source/deeptools/bamCompare.py +314 -0
  27. deepTools/source/deeptools/bamCoverage.py +416 -0
  28. deepTools/source/deeptools/bamHandler.py +103 -0
  29. deepTools/source/deeptools/bamPEFragmentSize.py +369 -0
  30. deepTools/source/deeptools/bigwigAverage.py +128 -0
  31. deepTools/source/deeptools/bigwigCompare.py +146 -0
  32. deepTools/source/deeptools/cm.py +1088 -0
  33. deepTools/source/deeptools/computeGCBias.py +800 -0
  34. deepTools/source/deeptools/computeMatrix.py +429 -0
  35. deepTools/source/deeptools/computeMatrixOperations.py +852 -0
  36. deepTools/source/deeptools/correctGCBias.py +746 -0
  37. deepTools/source/deeptools/correlation.py +706 -0
  38. deepTools/source/deeptools/correlation_heatmap.py +110 -0
  39. deepTools/source/deeptools/countReadsPerBin.py +1033 -0
  40. deepTools/source/deeptools/deeptools_list_tools.py +78 -0
  41. deepTools/source/deeptools/estimateReadFiltering.py +376 -0
  42. deepTools/source/deeptools/estimateScaleFactor.py +115 -0
  43. deepTools/source/deeptools/getFragmentAndReadSize.py +166 -0
  44. deepTools/source/deeptools/getRatio.py +82 -0
  45. deepTools/source/deeptools/getScaleFactor.py +305 -0
  46. deepTools/source/deeptools/getScorePerBigWigBin.py +322 -0
  47. deepTools/source/deeptools/heatmapper.py +1372 -0
  48. deepTools/source/deeptools/heatmapper_utilities.py +204 -0
  49. deepTools/source/deeptools/mapReduce.py +263 -0
  50. deepTools/source/deeptools/misc.py +13 -0
.gitattributes CHANGED
@@ -33,3 +33,101 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ deepTools/source/deeptools/test/test_corrGC/paired.bam filter=lfs diff=lfs merge=lfs -text
37
+ deepTools/source/deeptools/test/test_heatmapper/heatmap_master_interpolation_bilinear.png filter=lfs diff=lfs merge=lfs -text
38
+ deepTools/source/deeptools/test/test_heatmapper/profile_master_multi.png filter=lfs diff=lfs merge=lfs -text
39
+ deepTools/source/docs/_static/welcome_eLife_chrX_heatmap.png filter=lfs diff=lfs merge=lfs -text
40
+ deepTools/source/docs/_static/welcome_eLife_chrX_profile-1.png filter=lfs diff=lfs merge=lfs -text
41
+ deepTools/source/docs/_static/welcome_eLife_chrX_scaleR_heatmap.png filter=lfs diff=lfs merge=lfs -text
42
+ deepTools/source/docs/images/computeGCBias_Galaxy.png filter=lfs diff=lfs merge=lfs -text
43
+ deepTools/source/docs/images/computeMatrix_modes.png filter=lfs diff=lfs merge=lfs -text
44
+ deepTools/source/docs/images/computeMatrix_overview.png filter=lfs diff=lfs merge=lfs -text
45
+ deepTools/source/docs/images/correctGCBias_Galaxy.png filter=lfs diff=lfs merge=lfs -text
46
+ deepTools/source/docs/images/Gal_DataLib.png filter=lfs diff=lfs merge=lfs -text
47
+ deepTools/source/docs/images/Gal_FAQ_clusterLabeling.png filter=lfs diff=lfs merge=lfs -text
48
+ deepTools/source/docs/images/Gal_FAQ_filteringDuplicates.png filter=lfs diff=lfs merge=lfs -text
49
+ deepTools/source/docs/images/Gal_FAQ_IGV_dataset.png filter=lfs diff=lfs merge=lfs -text
50
+ deepTools/source/docs/images/Gal_FAQ_IGV.png filter=lfs diff=lfs merge=lfs -text
51
+ deepTools/source/docs/images/Gal_FAQ_info.png filter=lfs diff=lfs merge=lfs -text
52
+ deepTools/source/docs/images/Gal_FAQ_UCSC01.png filter=lfs diff=lfs merge=lfs -text
53
+ deepTools/source/docs/images/Gal_screenshot_dataSet.png filter=lfs diff=lfs merge=lfs -text
54
+ deepTools/source/docs/images/Gal_screenshot_dataSetStates.png filter=lfs diff=lfs merge=lfs -text
55
+ deepTools/source/docs/images/Gal_startsite_with_comments.png filter=lfs diff=lfs merge=lfs -text
56
+ deepTools/source/docs/images/Gal_startsite.png filter=lfs diff=lfs merge=lfs -text
57
+ deepTools/source/docs/images/Gal_UCSC.png filter=lfs diff=lfs merge=lfs -text
58
+ deepTools/source/docs/images/GalHow_bamCompare.png filter=lfs diff=lfs merge=lfs -text
59
+ deepTools/source/docs/images/GalHow_bamCoverage.png filter=lfs diff=lfs merge=lfs -text
60
+ deepTools/source/docs/images/GalHow_clustHM01.png filter=lfs diff=lfs merge=lfs -text
61
+ deepTools/source/docs/images/GalHow_clustHM02.png filter=lfs diff=lfs merge=lfs -text
62
+ deepTools/source/docs/images/GalHow_clustHM03.png filter=lfs diff=lfs merge=lfs -text
63
+ deepTools/source/docs/images/GalHow_computeGCbias.png filter=lfs diff=lfs merge=lfs -text
64
+ deepTools/source/docs/images/GalHow_correctGCbias.png filter=lfs diff=lfs merge=lfs -text
65
+ deepTools/source/docs/images/GalHow_multiBamSummary.png filter=lfs diff=lfs merge=lfs -text
66
+ deepTools/source/docs/images/GalHow_plotCorrelation.png filter=lfs diff=lfs merge=lfs -text
67
+ deepTools/source/docs/images/GalHow_plotFingerprint.png filter=lfs diff=lfs merge=lfs -text
68
+ deepTools/source/docs/images/gallery/coverage_Ibrahim.png filter=lfs diff=lfs merge=lfs -text
69
+ deepTools/source/docs/images/gallery/hm_Bulut.png filter=lfs diff=lfs merge=lfs -text
70
+ deepTools/source/docs/images/gallery/hm_CpG.png filter=lfs diff=lfs merge=lfs -text
71
+ deepTools/source/docs/images/gallery/hm_DNase.png filter=lfs diff=lfs merge=lfs -text
72
+ deepTools/source/docs/images/gallery/hm_GC.png filter=lfs diff=lfs merge=lfs -text
73
+ deepTools/source/docs/images/gallery/hm_histonesGomez.png filter=lfs diff=lfs merge=lfs -text
74
+ deepTools/source/docs/images/gallery/hm_TATApsem.png filter=lfs diff=lfs merge=lfs -text
75
+ deepTools/source/docs/images/glossary_ascii.png filter=lfs diff=lfs merge=lfs -text
76
+ deepTools/source/docs/images/glossary_overview.png filter=lfs diff=lfs merge=lfs -text
77
+ deepTools/source/docs/images/glossary_sam.png filter=lfs diff=lfs merge=lfs -text
78
+ deepTools/source/docs/images/norm_IGVsnapshot_indFiles.png filter=lfs diff=lfs merge=lfs -text
79
+ deepTools/source/docs/images/plotCorrelation_galaxy.png filter=lfs diff=lfs merge=lfs -text
80
+ deepTools/source/docs/images/plotCoverage_annotated.png filter=lfs diff=lfs merge=lfs -text
81
+ deepTools/source/docs/images/QC_bamCorrelate_RNAseq.png filter=lfs diff=lfs merge=lfs -text
82
+ deepTools/source/docs/images/QC_fingerprint.png filter=lfs diff=lfs merge=lfs -text
83
+ deepTools/source/docs/images/QC_GCplots_input.png filter=lfs diff=lfs merge=lfs -text
84
+ deepTools/source/docs/images/QC_GCregionexclusion_UCSCscreenshot.png filter=lfs diff=lfs merge=lfs -text
85
+ deepTools/source/docs/images/QC_plotCoverage.png filter=lfs diff=lfs merge=lfs -text
86
+ deepTools/source/docs/images/start_collage.png filter=lfs diff=lfs merge=lfs -text
87
+ deepTools/source/docs/images/start_workflow.png filter=lfs diff=lfs merge=lfs -text
88
+ deepTools/source/docs/images/test_plots/ExampleComputeMatrix1.png filter=lfs diff=lfs merge=lfs -text
89
+ deepTools/source/docs/images/test_plots/ExampleComputeMatrix2.png filter=lfs diff=lfs merge=lfs -text
90
+ deepTools/source/docs/images/test_plots/ExampleComputeMatrix3.png filter=lfs diff=lfs merge=lfs -text
91
+ deepTools/source/docs/images/test_plots/ExampleHeatmap1.png filter=lfs diff=lfs merge=lfs -text
92
+ deepTools/source/docs/images/test_plots/ExampleHeatmap2.png filter=lfs diff=lfs merge=lfs -text
93
+ deepTools/source/docs/images/test_plots/ExampleHeatmap3.png filter=lfs diff=lfs merge=lfs -text
94
+ deepTools/source/docs/images/test_plots/ExampleHeatmap4.png filter=lfs diff=lfs merge=lfs -text
95
+ deepTools/source/docs/images/test_plots/ExampleProfile1.png filter=lfs diff=lfs merge=lfs -text
96
+ deepTools/source/docs/images/test_plots/ExampleProfile2.png filter=lfs diff=lfs merge=lfs -text
97
+ deepTools/source/galaxy/wrapper/static/images/bamCompare_output.png filter=lfs diff=lfs merge=lfs -text
98
+ deepTools/source/galaxy/wrapper/static/images/bamCoverage_output.png filter=lfs diff=lfs merge=lfs -text
99
+ deepTools/source/galaxy/wrapper/static/images/bamPEFragmentSize_output.png filter=lfs diff=lfs merge=lfs -text
100
+ deepTools/source/galaxy/wrapper/static/images/bigwigCompare_output.png filter=lfs diff=lfs merge=lfs -text
101
+ deepTools/source/galaxy/wrapper/static/images/computeGCBias_output.png filter=lfs diff=lfs merge=lfs -text
102
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_advancedOutput.png filter=lfs diff=lfs merge=lfs -text
103
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_output.png filter=lfs diff=lfs merge=lfs -text
104
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_overview.png filter=lfs diff=lfs merge=lfs -text
105
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_selectScores.png filter=lfs diff=lfs merge=lfs -text
106
+ deepTools/source/galaxy/wrapper/static/images/multiBamSummary_output.png filter=lfs diff=lfs merge=lfs -text
107
+ deepTools/source/galaxy/wrapper/static/images/multiBigwigSummary_output.png filter=lfs diff=lfs merge=lfs -text
108
+ deepTools/source/galaxy/wrapper/static/images/norm_IGVsnapshot_indFiles.png filter=lfs diff=lfs merge=lfs -text
109
+ deepTools/source/galaxy/wrapper/static/images/plotCorrelate_RNAseq.png filter=lfs diff=lfs merge=lfs -text
110
+ deepTools/source/galaxy/wrapper/static/images/plotCorrelation_output.png filter=lfs diff=lfs merge=lfs -text
111
+ deepTools/source/galaxy/wrapper/static/images/plotCoverage_annotated.png filter=lfs diff=lfs merge=lfs -text
112
+ deepTools/source/galaxy/wrapper/static/images/plotCoverage_output.png filter=lfs diff=lfs merge=lfs -text
113
+ deepTools/source/galaxy/wrapper/static/images/plotFingerprint_output.png filter=lfs diff=lfs merge=lfs -text
114
+ deepTools/source/galaxy/wrapper/static/images/plotHeatmap_example.png filter=lfs diff=lfs merge=lfs -text
115
+ deepTools/source/galaxy/wrapper/static/images/plotHeatmap_example02.png filter=lfs diff=lfs merge=lfs -text
116
+ deepTools/source/galaxy/wrapper/static/images/plotPCA_annotated.png filter=lfs diff=lfs merge=lfs -text
117
+ deepTools/source/galaxy/wrapper/static/images/plotProfiler_examples.png filter=lfs diff=lfs merge=lfs -text
118
+ deepTools/source/galaxy/wrapper/static/images/QC_GCplots_input.png filter=lfs diff=lfs merge=lfs -text
119
+ deepTools/source/galaxy/wrapper/static/images/QC_plotCoverage.png filter=lfs diff=lfs merge=lfs -text
120
+ deepTools/source/galaxy/wrapper/static/images/visual_hm_DmelPolII.png filter=lfs diff=lfs merge=lfs -text
121
+ deepTools/source/galaxy/wrapper/test-data/alignmentSieve.bam filter=lfs diff=lfs merge=lfs -text
122
+ deepTools/source/galaxy/wrapper/test-data/alignmentSieve2.bam filter=lfs diff=lfs merge=lfs -text
123
+ deepTools/source/galaxy/wrapper/test-data/alignmentSieve3.bam filter=lfs diff=lfs merge=lfs -text
124
+ deepTools/source/galaxy/wrapper/test-data/correctGCBias_result1.bam filter=lfs diff=lfs merge=lfs -text
125
+ deepTools/source/galaxy/wrapper/test-data/paired_chr2L.bam filter=lfs diff=lfs merge=lfs -text
126
+ deepTools/source/galaxy/wrapper/test-data/paired_chr2L.cram filter=lfs diff=lfs merge=lfs -text
127
+ deepTools/source/gallery/coverage_Ibrahim.png filter=lfs diff=lfs merge=lfs -text
128
+ deepTools/source/gallery/hm_Bulut.png filter=lfs diff=lfs merge=lfs -text
129
+ deepTools/source/gallery/hm_CpG.png filter=lfs diff=lfs merge=lfs -text
130
+ deepTools/source/gallery/hm_DNase.png filter=lfs diff=lfs merge=lfs -text
131
+ deepTools/source/gallery/hm_GC.png filter=lfs diff=lfs merge=lfs -text
132
+ deepTools/source/gallery/hm_histonesGomez.png filter=lfs diff=lfs merge=lfs -text
133
+ deepTools/source/gallery/hm_TATApsem.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ RUN useradd -m -u 1000 user && python -m pip install --upgrade pip
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ ENV MCP_TRANSPORT=http
14
+ ENV MCP_PORT=7860
15
+
16
+ EXPOSE 7860
17
+
18
+ CMD ["python", "deepTools/mcp_output/start_mcp.py"]
README.md CHANGED
@@ -1,10 +1,32 @@
1
  ---
2
- title: DeepTools
3
- emoji: 💻
4
- colorFrom: purple
5
- colorTo: yellow
6
  sdk: docker
 
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Deeptools MCP
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ sdk_version: "4.26.0"
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Deeptools MCP Service
13
+
14
+ Auto-generated MCP service for deepTools.
15
+
16
+ ## Usage
17
+
18
+ ```
19
+ https://<your-username>-deepTools-mcp.hf.space/mcp
20
+ ```
21
+
22
+ ## Connect with Cursor
23
+
24
+ ```json
25
+ {
26
+ "mcpServers": {
27
+ "deepTools": {
28
+ "url": "https://None-deepTools-mcp.hf.space/mcp"
29
+ }
30
+ }
31
+ }
32
+ ```
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ import os
3
+ import sys
4
+
5
+ mcp_plugin_path = os.path.join(os.path.dirname(__file__), "deepTools", "mcp_output", "mcp_plugin")
6
+ sys.path.insert(0, mcp_plugin_path)
7
+
8
+ app = FastAPI(
9
+ title="Deeptools MCP Service",
10
+ description="Auto-generated MCP service for deepTools",
11
+ version="1.0.0"
12
+ )
13
+
14
+ @app.get("/")
15
+ def root():
16
+ return {
17
+ "service": "Deeptools MCP Service",
18
+ "version": "1.0.0",
19
+ "status": "running",
20
+ "transport": os.environ.get("MCP_TRANSPORT", "http")
21
+ }
22
+
23
+ @app.get("/health")
24
+ def health_check():
25
+ return {"status": "healthy", "service": "deepTools MCP"}
26
+
27
+ @app.get("/tools")
28
+ def list_tools():
29
+ try:
30
+ from mcp_service import create_app
31
+ mcp_app = create_app()
32
+ tools = []
33
+ for tool_name, tool_func in mcp_app.tools.items():
34
+ tools.append({
35
+ "name": tool_name,
36
+ "description": tool_func.__doc__ or "No description available"
37
+ })
38
+ return {"tools": tools}
39
+ except Exception as e:
40
+ return {"error": f"Failed to load tools: {str(e)}"}
41
+
42
+ if __name__ == "__main__":
43
+ import uvicorn
44
+ port = int(os.environ.get("PORT", 7860))
45
+ uvicorn.run(app, host="0.0.0.0", port=port)
deepTools/mcp_output/README_MCP.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deepTools MCP (Model Context Protocol) Service
2
+
3
+ ## Project Introduction
4
+
5
+ deepTools is a comprehensive suite of Python tools designed for the efficient analysis of high-throughput sequencing data, particularly for ChIP-seq, RNA-seq, and MNase-seq experiments. It addresses the challenges of handling large datasets by providing tools for normalized coverage file generation, quality control, and publication-ready visualizations. deepTools supports efficient parallel processing using the mapReduce framework, making it suitable for genome-scale computations.
6
+
7
+ ## Installation Method
8
+
9
+ To install deepTools, ensure you have Python and the following dependencies:
10
+
11
+ - numpy
12
+ - matplotlib
13
+ - pysam
14
+ - pyBigWig
15
+
16
+ Optional dependencies include:
17
+
18
+ - scipy
19
+ - pandas
20
+
21
+ You can install deepTools using pip:
22
+
23
+ ```
24
+ pip install deeptools
25
+ ```
26
+
27
+ ## Quick Start
28
+
29
+ To quickly get started with deepTools, you can use the command-line interface (CLI) to call the main functions. Here are some examples:
30
+
31
+ - **Calculate Coverage**: Use `bamCoverage` to calculate the coverage of BAM files and output a bigWig file.
32
+ - **Compare BAM Files**: Use `bamCompare` to compare two BAM files and generate a bigWig file with the results.
33
+ - **Generate Heatmaps**: Use `heatmapper` to generate heatmaps from computed matrices.
34
+
35
+ Example command:
36
+
37
+ ```
38
+ bamCoverage -b sample.bam -o output.bw
39
+ ```
40
+
41
+ ## Available Tools and Endpoints List
42
+
43
+ 1. **alignmentSieve**: Filters alignments based on various criteria.
44
+ 2. **bamCompare**: Compares two BAM files and generates a bigWig file.
45
+ 3. **bamCoverage**: Calculates the coverage of BAM files.
46
+ 4. **computeMatrix**: Computes a matrix of scores for genomic regions.
47
+ 5. **heatmapper**: Generates heatmaps from computed matrices.
48
+ 6. **multiBamSummary**: Aggregates read counts across multiple BAM files.
49
+ 7. **multiBigwigSummary**: Aggregates scores across multiple bigWig files.
50
+ 8. **plotCorrelation**: Performs correlation analysis with heatmap/scatter plot output.
51
+ 9. **plotHeatmap**: Creates customizable heatmaps.
52
+ 10. **plotProfile**: Generates average signal profile plots.
53
+
54
+ ## Common Issues and Notes
55
+
56
+ - **Dependencies**: Ensure all required dependencies are installed. Optional dependencies can enhance functionality.
57
+ - **Environment**: deepTools is compatible with most Unix-like systems. Ensure your environment supports Python and the necessary libraries.
58
+ - **Performance**: For large datasets, consider using the mapReduce framework to leverage parallel processing capabilities.
59
+
60
+ ## Reference Links or Documentation
61
+
62
+ For more detailed information, visit the [deepTools GitHub repository](https://github.com/deeptools/deepTools) or refer to the official [deepTools documentation](https://deeptools.readthedocs.io/en/develop/).
63
+
64
+ For specific tool usage and workflows, see the [Typical Workflows](https://deeptools.readthedocs.io/en/develop/content/example_usage.html) section in the documentation.
deepTools/mcp_output/analysis.json ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "summary": {
3
+ "repository_url": "https://github.com/deeptools/deepTools",
4
+ "summary": "Imported via zip fallback, file count: 81",
5
+ "file_tree": {
6
+ ".github/CONTRIBUTING.md": {
7
+ "size": 544
8
+ },
9
+ ".github/ISSUE_TEMPLATE.md": {
10
+ "size": 691
11
+ },
12
+ ".github/PULL_REQUEST_TEMPLATE.md": {
13
+ "size": 286
14
+ },
15
+ ".github/workflows/planemo.yml": {
16
+ "size": 1421
17
+ },
18
+ ".github/workflows/pypi.yml": {
19
+ "size": 616
20
+ },
21
+ ".github/workflows/test.yml": {
22
+ "size": 3118
23
+ },
24
+ ".readthedocs.yaml": {
25
+ "size": 193
26
+ },
27
+ "CHANGES.txt": {
28
+ "size": 40451
29
+ },
30
+ "LICENSE.txt": {
31
+ "size": 1241
32
+ },
33
+ "README.md": {
34
+ "size": 5910
35
+ },
36
+ "deeptools/SES_scaleFactor.py": {
37
+ "size": 7007
38
+ },
39
+ "deeptools/__init__.py": {
40
+ "size": 0
41
+ },
42
+ "deeptools/alignmentSieve.py": {
43
+ "size": 18200
44
+ },
45
+ "deeptools/bamCompare.py": {
46
+ "size": 14290
47
+ },
48
+ "deeptools/bamCoverage.py": {
49
+ "size": 18617
50
+ },
51
+ "deeptools/bamHandler.py": {
52
+ "size": 3345
53
+ },
54
+ "deeptools/bamPEFragmentSize.py": {
55
+ "size": 21247
56
+ },
57
+ "deeptools/bigwigAverage.py": {
58
+ "size": 4908
59
+ },
60
+ "deeptools/bigwigCompare.py": {
61
+ "size": 6614
62
+ },
63
+ "deeptools/cm.py": {
64
+ "size": 44838
65
+ },
66
+ "deeptools/computeGCBias.py": {
67
+ "size": 31006
68
+ },
69
+ "deeptools/computeMatrix.py": {
70
+ "size": 22446
71
+ },
72
+ "deeptools/computeMatrixOperations.py": {
73
+ "size": 32110
74
+ },
75
+ "deeptools/correctGCBias.py": {
76
+ "size": 26158
77
+ },
78
+ "deeptools/correlation.py": {
79
+ "size": 28078
80
+ },
81
+ "deeptools/correlation_heatmap.py": {
82
+ "size": 3796
83
+ },
84
+ "deeptools/countReadsPerBin.py": {
85
+ "size": 42159
86
+ },
87
+ "deeptools/deeptools_list_tools.py": {
88
+ "size": 3345
89
+ },
90
+ "deeptools/estimateReadFiltering.py": {
91
+ "size": 16606
92
+ },
93
+ "deeptools/estimateScaleFactor.py": {
94
+ "size": 4782
95
+ },
96
+ "deeptools/getFragmentAndReadSize.py": {
97
+ "size": 7011
98
+ },
99
+ "deeptools/getRatio.py": {
100
+ "size": 2326
101
+ },
102
+ "deeptools/getScaleFactor.py": {
103
+ "size": 12772
104
+ },
105
+ "deeptools/getScorePerBigWigBin.py": {
106
+ "size": 11967
107
+ },
108
+ "deeptools/heatmapper.py": {
109
+ "size": 58987
110
+ },
111
+ "deeptools/heatmapper_utilities.py": {
112
+ "size": 7169
113
+ },
114
+ "deeptools/mapReduce.py": {
115
+ "size": 9786
116
+ },
117
+ "deeptools/misc.py": {
118
+ "size": 597
119
+ },
120
+ "deeptools/multiBamSummary.py": {
121
+ "size": 11899
122
+ },
123
+ "deeptools/multiBigwigSummary.py": {
124
+ "size": 11291
125
+ },
126
+ "deeptools/parserCommon.py": {
127
+ "size": 43744
128
+ },
129
+ "deeptools/plotCorrelation.py": {
130
+ "size": 10984
131
+ },
132
+ "deeptools/plotCoverage.py": {
133
+ "size": 16329
134
+ },
135
+ "deeptools/plotEnrichment.py": {
136
+ "size": 25244
137
+ },
138
+ "deeptools/plotFingerprint.py": {
139
+ "size": 19876
140
+ },
141
+ "deeptools/plotHeatmap.py": {
142
+ "size": 37144
143
+ },
144
+ "deeptools/plotPCA.py": {
145
+ "size": 9427
146
+ },
147
+ "deeptools/plotProfile.py": {
148
+ "size": 39224
149
+ },
150
+ "deeptools/sumCoveragePerBin.py": {
151
+ "size": 9899
152
+ },
153
+ "deeptools/test/__init__.py": {
154
+ "size": 0
155
+ },
156
+ "deeptools/test/skiptest_heatmapper_images.py": {
157
+ "size": 5917
158
+ },
159
+ "deeptools/test/test_bamCoverage_and_bamCompare.py": {
160
+ "size": 17582
161
+ },
162
+ "deeptools/test/test_bigwigAverage.py": {
163
+ "size": 2864
164
+ },
165
+ "deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py": {
166
+ "size": 4603
167
+ },
168
+ "deeptools/test/test_computeMatrixOperations.py": {
169
+ "size": 12233
170
+ },
171
+ "deeptools/test/test_corrGC/R_gc_paired.txt": {
172
+ "size": 7525
173
+ },
174
+ "deeptools/test/test_corrGC/frequencies_data.txt": {
175
+ "size": 825
176
+ },
177
+ "deeptools/test/test_countReadsPerBin.py": {
178
+ "size": 8401
179
+ },
180
+ "deeptools/test/test_heatmapper.py": {
181
+ "size": 12550
182
+ },
183
+ "deeptools/test/test_multiBamSummary.py": {
184
+ "size": 1945
185
+ },
186
+ "deeptools/test/test_plotCoverage.py": {
187
+ "size": 1215
188
+ },
189
+ "deeptools/test/test_readFiltering.py": {
190
+ "size": 6229
191
+ },
192
+ "deeptools/test/test_tools.py": {
193
+ "size": 838
194
+ },
195
+ "deeptools/test/test_writeBedGraph.py": {
196
+ "size": 4462
197
+ },
198
+ "deeptools/utilities.py": {
199
+ "size": 14161
200
+ },
201
+ "deeptools/writeBedGraph.py": {
202
+ "size": 13223
203
+ },
204
+ "deeptools/writeBedGraph_bam_and_bw.py": {
205
+ "size": 9255
206
+ },
207
+ "docs/_static/welcome_owl.carousel.min.js": {
208
+ "size": 40401
209
+ },
210
+ "docs/conf.py": {
211
+ "size": 11119
212
+ },
213
+ "docs/requirements.txt": {
214
+ "size": 72
215
+ },
216
+ "galaxy/wrapper/.shed.yml": {
217
+ "size": 2719
218
+ },
219
+ "galaxy/wrapper/test-data/alignmentSieve.txt": {
220
+ "size": 102
221
+ },
222
+ "galaxy/wrapper/test-data/bamPEFragmentSize_lengths1.txt": {
223
+ "size": 115
224
+ },
225
+ "galaxy/wrapper/test-data/bamPEFragmentSize_result1.txt": {
226
+ "size": 613
227
+ },
228
+ "galaxy/wrapper/test-data/bamPEFragmentSize_table1.txt": {
229
+ "size": 810
230
+ },
231
+ "galaxy/wrapper/test-data/computeMatrixOperations.txt": {
232
+ "size": 50
233
+ },
234
+ "galaxy/wrapper/test-data/estimateReadFiltering.txt": {
235
+ "size": 353
236
+ },
237
+ "galaxy/wrapper/test-data/plotEnrichment_output.txt": {
238
+ "size": 197
239
+ },
240
+ "pyproject.toml": {
241
+ "size": 2395
242
+ },
243
+ "scripts/convertChromsBigWig.py": {
244
+ "size": 7412
245
+ },
246
+ "scripts/split_bed_into_multiple_files.py": {
247
+ "size": 822
248
+ }
249
+ },
250
+ "processed_by": "zip_fallback",
251
+ "success": true
252
+ },
253
+ "structure": {
254
+ "packages": [
255
+ "source.deeptools",
256
+ "source.deeptools.test"
257
+ ]
258
+ },
259
+ "dependencies": {
260
+ "has_environment_yml": false,
261
+ "has_requirements_txt": false,
262
+ "pyproject": true,
263
+ "setup_cfg": false,
264
+ "setup_py": false
265
+ },
266
+ "entry_points": {
267
+ "imports": [],
268
+ "cli": [],
269
+ "modules": []
270
+ },
271
+ "llm_analysis": {
272
+ "core_modules": [
273
+ {
274
+ "package": "source.deeptools",
275
+ "module": "alignmentSieve",
276
+ "functions": [
277
+ "main",
278
+ "parseArguments"
279
+ ],
280
+ "classes": [],
281
+ "description": "This module is responsible for filtering alignments based on various criteria."
282
+ },
283
+ {
284
+ "package": "source.deeptools",
285
+ "module": "bamCompare",
286
+ "functions": [
287
+ "main",
288
+ "parseArguments"
289
+ ],
290
+ "classes": [],
291
+ "description": "This module compares two BAM files and generates a bigWig file with the results."
292
+ },
293
+ {
294
+ "package": "source.deeptools",
295
+ "module": "bamCoverage",
296
+ "functions": [
297
+ "main",
298
+ "parseArguments"
299
+ ],
300
+ "classes": [],
301
+ "description": "This module calculates the coverage of BAM files and outputs a bigWig file."
302
+ },
303
+ {
304
+ "package": "source.deeptools",
305
+ "module": "computeMatrix",
306
+ "functions": [
307
+ "main",
308
+ "parseArguments"
309
+ ],
310
+ "classes": [],
311
+ "description": "This module computes a matrix of scores for genomic regions."
312
+ },
313
+ {
314
+ "package": "source.deeptools",
315
+ "module": "heatmapper",
316
+ "functions": [
317
+ "main",
318
+ "parseArguments"
319
+ ],
320
+ "classes": [],
321
+ "description": "This module generates heatmaps from the computed matrices."
322
+ }
323
+ ],
324
+ "cli_commands": [
325
+ {
326
+ "name": "alignmentSieve",
327
+ "module": "source.deeptools.alignmentSieve",
328
+ "description": "CLI command for filtering alignments based on various criteria."
329
+ },
330
+ {
331
+ "name": "bamCompare",
332
+ "module": "source.deeptools.bamCompare",
333
+ "description": "CLI command for comparing two BAM files and generating a bigWig file."
334
+ },
335
+ {
336
+ "name": "bamCoverage",
337
+ "module": "source.deeptools.bamCoverage",
338
+ "description": "CLI command for calculating the coverage of BAM files."
339
+ },
340
+ {
341
+ "name": "computeMatrix",
342
+ "module": "source.deeptools.computeMatrix",
343
+ "description": "CLI command for computing a matrix of scores for genomic regions."
344
+ },
345
+ {
346
+ "name": "heatmapper",
347
+ "module": "source.deeptools.heatmapper",
348
+ "description": "CLI command for generating heatmaps from computed matrices."
349
+ }
350
+ ],
351
+ "import_strategy": {
352
+ "primary": "import",
353
+ "fallback": "cli",
354
+ "confidence": 0.85
355
+ },
356
+ "dependencies": {
357
+ "required": [
358
+ "numpy",
359
+ "matplotlib",
360
+ "pysam",
361
+ "pyBigWig"
362
+ ],
363
+ "optional": [
364
+ "scipy",
365
+ "pandas"
366
+ ]
367
+ },
368
+ "risk_assessment": {
369
+ "import_feasibility": 0.8,
370
+ "intrusiveness_risk": "medium",
371
+ "complexity": "medium"
372
+ }
373
+ },
374
+ "deepwiki_analysis": {
375
+ "repo_url": "https://github.com/deeptools/deepTools",
376
+ "repo_name": "deepTools",
377
+ "content": "deeptools/deepTools\nInstallation and Getting Started\nTypical Workflows\nCore Processing Engines\nParallel Processing Framework (mapReduce)\nRead Counting Engine (countReadsPerBin)\nScore Extraction (getScorePerBigWigBin)\nMatrix Computation Engine (heatmapper)\nCoverage Generation and Normalization\nSingle-Sample Coverage (bamCoverage)\nSample Comparison (bamCompare)\nGC Bias Correction Pipeline\nMulti-Sample Analysis Tools\nData Integration (multiBamSummary and multiBigwigSummary)\nCorrelation Analysis (plotCorrelation)\nPrincipal Component Analysis (plotPCA)\nVisualization Tools\nMatrix Generation (computeMatrix)\nMatrix Operations (computeMatrixOperations)\nHeatmap Visualization (plotHeatmap)\nProfile Plots (plotProfile)\nQuality Control Tools\nChIP-seq Enrichment Assessment (plotFingerprint)\nFragment Size Analysis (bamPEFragmentSize)\nCoverage Distribution (plotCoverage)\nFeature Enrichment (plotEnrichment)\nRead Filtering Estimation (estimateReadFiltering and alignmentSieve)\nGalaxy Integration\nGalaxy Wrapper Architecture\nTool Shed Distribution and Installation\nTesting with Planemo\nDevelopment and Contributing\nProject Structure and Configuration\nCI/CD Pipeline\nDocumentation System\nRelease Process and Versioning\nFile Formats and Data Structures\nInput File Formats\nIntermediate File Formats\nOutput Formats and Visualization\nAdvanced Topics and Optimization\nPerformance Tuning\nFiltering and Read Processing Options\nNormalization Methods Deep Dive\nSpecialized Read Processing 
Modes\nCHANGES.txt\ndocs/content/about.rst\ndocs/content/changelog.rst\ndocs/content/example_api_tutorial.rst\ndocs/content/example_gallery.rst\ndocs/content/example_step_by_step.rst\ndocs/content/example_usage.rst\ndocs/content/help_faq.rst\ndocs/content/help_faq_galaxy.rst\ndocs/content/help_galaxy_intro.rst\ndocs/content/help_glossary.rst\ndocs/content/installation.rst\ndocs/content/list_of_tools.rst\ndocs/images/Gal_FAQ_filteringDuplicates.png\ndocs/images/Gal_FAQ_info.png\ndocs/index.rst\ngalaxy/wrapper/deepTools_macros.xml\ngalaxy/wrapper/estimateReadFiltering.xml\ngalaxy/wrapper/test-data/estimateReadFiltering.txt\nscripts/convertChromsBigWig.py\nPurpose and Scope\nThis page provides a high-level introduction to deepTools: what it is, what problems it solves, and its primary capabilities for analyzing high-throughput sequencing data. For detailed installation instructions, seeInstallation and Getting Started. For specific tool usage and workflows, seeTypical Workflows.\nSources:README.md9-11docs/index.rst7-8\nWhat is deepTools?\ndeepTools is a suite of Python tools developed for efficient analysis of high-throughput sequencing data, particularly ChIP-seq, RNA-seq, and MNase-seq experiments. 
It addresses the challenge of handling large amounts of data generated from DNA sequencing centers by providing:\nNormalized coverage file generationin standard bedGraph and bigWig formats\nQuality control modulesfor assessing data quality and technical biases\nPublication-ready visualizationsfor identifying enrichments and functional genome annotations\nEfficient parallel processingusing themapReduceframework for genome-scale computations\ndeepTools is available through three distinct usage modes:\nSources:README.md9-14docs/index.rst10-14CHANGES.txt62\nSystem Architecture\nThe following diagram illustrates the primary components of deepTools and how they relate to each other:\ndeepTools Component Architecture\nData Access LayerCore Processing EnginesTool Modules LayerUser Interface LayerCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)Galaxy XML Wrappers(galaxy/wrapper/*.xml)Python API(import deeptools.*)Quality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSizeCoverage ToolsbamCoveragebamComparebigwigCompareAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrixVisualization ToolsplotHeatmapplotProfileplotCorrelationcountReadsPerBin(deeptools/countReadsPerBin.py)getScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)heatmapper(deeptools/heatmapper.py)mapReduce(deeptools/mapReduce.py)pysam(BAM/CRAM files)pyBigWig(bigWig files)deeptoolsintervals(BED/GTF files)\nData Access Layer\nCore Processing Engines\nTool Modules Layer\nUser Interface Layer\nCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)\nGalaxy XML Wrappers(galaxy/wrapper/*.xml)\nPython API(import deeptools.*)\nQuality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSize\nCoverage ToolsbamCoveragebamComparebigwigCompare\nAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrix\nVisualization 
ToolsplotHeatmapplotProfileplotCorrelation\ncountReadsPerBin(deeptools/countReadsPerBin.py)\ngetScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)\nheatmapper(deeptools/heatmapper.py)\nmapReduce(deeptools/mapReduce.py)\npysam(BAM/CRAM files)\npyBigWig(bigWig files)\ndeeptoolsintervals(BED/GTF files)\nSources:docs/content/list_of_tools.rst1-46pyproject.toml24-50README.md34-56\nCore Capabilities\ndeepTools provides five major functional categories that address different stages of sequencing data analysis:\n1. Quality Control and Preprocessing\nTools for assessing data quality before analysis:\nplotFingerprint- ChIP-seq enrichment assessment with quality metrics (Jensen-Shannon distance, CHANCE statistics)\nplotFingerprint\ncomputeGCBias/correctGCBias- GC bias detection and correction\ncomputeGCBias\ncorrectGCBias\nbamPEFragmentSize- Fragment size distribution analysis for paired-end data\nbamPEFragmentSize\nplotCoverage- Coverage distribution and genome coverage metrics\nplotCoverage\nestimateReadFiltering- Preview effects of filtering parameters\nestimateReadFiltering\nSources:docs/content/list_of_tools.rst18-32galaxy/wrapper/estimateReadFiltering.xml1-118\n2. Coverage Generation and Normalization\nTools for converting aligned reads (BAM) to normalized coverage tracks (bigWig/bedGraph):\nbamCoverage- Single-sample normalization with RPKM, CPM, BPM, RPGC methods\nbamCoverage\nbamCompare- Two-sample comparison (e.g., ChIP vs. input) with log2ratio, difference, mean operations\nbigwigCompare- Comparison operations on bigWig files\nbigwigCompare\nbigwigAverage- Averaging multiple bigWig files\nbigwigAverage\nSources:docs/content/list_of_tools.rst24-27CHANGES.txt28\n3. 
Data Aggregation\nTools for integrating multi-sample data:\nmultiBamSummary- Read count aggregation across BAM files (bins or BED-defined regions)\nmultiBamSummary\nmultiBigwigSummary- Score aggregation across bigWig files\nmultiBigwigSummary\ncomputeMatrix- Signal computation over genomic regions in two modes:scale-regionsandreference-point\ncomputeMatrix\nscale-regions\nreference-point\nSources:docs/content/list_of_tools.rst10-29\n4. Analysis and Statistics\nTools for deriving statistical insights:\nplotCorrelation- Pearson/Spearman correlation with hierarchical clustering and heatmap/scatter plot output\nplotCorrelation\nplotPCA- Principal component analysis for dimensionality reduction\nplotEnrichment- Feature enrichment quantification\nplotEnrichment\nSources:docs/content/list_of_tools.rst14-17docs/content/example_usage.rst32-44\n5. Visualization\nTools for creating publication-ready plots:\nplotHeatmap- Customizable heatmaps with clustering, color schemes, and multi-sample/region support\nplotHeatmap\nplotProfile- Average signal profile plots (meta-profiles) with standard error/deviation options\nplotProfile\ncomputeMatrixOperations- Matrix manipulation (filter, subset, sort, combine)\ncomputeMatrixOperations\nMultiple output formats supported: PNG, PDF, SVG, and interactive HTML (via plotly)\nSources:docs/content/list_of_tools.rst34-36CHANGES.txt187\nFile Format Ecosystem\ndeepTools operates on standard genomics file formats and performs conversions between them:\nFile Format Flow Diagram\nOutput FormatsIntermediate FormatsdeepTools ProcessingInput FormatsBAM/CRAM(aligned reads)bigWig(coverage signal)BED/GTF/GFF(genomic regions)2bit/FASTA(reference genome)bamCoveragebamComparemultiBamSummarymultiBigwigSummarycomputeMatrixcomputeGCBiasNPZ matrix(multiBamSummary output)Matrix .gz(computeMatrix output)Tabular(GC frequencies)bigWig/bedGraph(coverage tracks)PNG/PDF/SVG(static plots)HTML(interactive plotly)Tabular(data tables)\nOutput Formats\nIntermediate 
Formats\ndeepTools Processing\nInput Formats\nBAM/CRAM(aligned reads)\nbigWig(coverage signal)\nBED/GTF/GFF(genomic regions)\n2bit/FASTA(reference genome)\nbamCoverage\nmultiBamSummary\nmultiBigwigSummary\ncomputeMatrix\ncomputeGCBias\nNPZ matrix(multiBamSummary output)\nMatrix .gz(computeMatrix output)\nTabular(GC frequencies)\nbigWig/bedGraph(coverage tracks)\nPNG/PDF/SVG(static plots)\nHTML(interactive plotly)\nTabular(data tables)\nKey intermediate formats serve as bridges between data aggregation and visualization:\nNPZ files- Compressed NumPy arrays storing multi-sample read counts or scores (output frommultiBamSummary/multiBigwigSummary)\nmultiBamSummary\nmultiBigwigSummary\nMatrix .gz files- Compressed matrices of signal values over genomic regions (output fromcomputeMatrix)\ncomputeMatrix\nSources:docs/content/help_glossary.rst82-174docs/content/list_of_tools.rst8-45\nParallel Processing Framework\nAll compute-intensive operations in deepTools utilize themapReduceframework for efficient genome-scale processing:\nmapReduce Execution Model\nmapReduce Framework (deeptools/mapReduce.py)Input:BAM/bigWig filesBED regionsParametersGenome Chunking(~400k reads per chunk)Worker Pool(multiprocessing.Pool)Worker 1:Process chunk 1Worker 2:Process chunk 2Worker N:Process chunk NResult Aggregation(concatenate/merge)Output:Aggregated results\nmapReduce Framework (deeptools/mapReduce.py)\nInput:BAM/bigWig filesBED regionsParameters\nGenome Chunking(~400k reads per chunk)\nWorker Pool(multiprocessing.Pool)\nWorker 1:Process chunk 1\nWorker 2:Process chunk 2\nWorker N:Process chunk N\nResult Aggregation(concatenate/merge)\nOutput:Aggregated results\nThe framework provides:\nAutomatic parallelizationacross user-specified number of processors (via--numberOfProcessors)",
378
+ "model": "gpt-4o-2024-08-06",
379
+ "source": "selenium",
380
+ "success": true
381
+ },
382
+ "deepwiki_options": {
383
+ "enabled": true,
384
+ "model": "gpt-4o-2024-08-06"
385
+ },
386
+ "risk": {
387
+ "import_feasibility": 0.8,
388
+ "intrusiveness_risk": "medium",
389
+ "complexity": "medium"
390
+ }
391
+ }
deepTools/mcp_output/diff_report.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DeepTools Project Difference Report
2
+
3
+ **Date:** January 31, 2026
4
+ **Time:** 18:24:38
5
+ **Repository:** deepTools
6
+ **Project Type:** Python Library
7
+ **Intrusiveness:** None
8
+ **Workflow Status:** Success
9
+ **Test Status:** Failed
10
+
11
+ ## Project Overview
12
+
13
+ DeepTools is a Python library designed to facilitate the analysis and visualization of high-throughput sequencing data. It provides a suite of tools for processing and interpreting large datasets, making it an essential resource for bioinformatics research.
14
+
15
+ ## Difference Analysis
16
+
17
+ ### New Files
18
+
19
+ In this update, 8 new files have been introduced to the deepTools repository. These files likely contain new features or enhancements to existing functionalities. However, no existing files were modified, indicating that the new additions are supplementary rather than replacements or updates to current code.
20
+
21
+ ### Modified Files
22
+
23
+ There were no modifications to existing files in this update. This suggests that the core functionalities of the library remain unchanged, and the focus was on expanding capabilities or adding new features.
24
+
25
+ ## Technical Analysis
26
+
27
+ ### Workflow Status
28
+
29
+ The workflow status is marked as "success," indicating that the integration and deployment processes were completed without any errors. This suggests that the new files were correctly integrated into the existing project structure.
30
+
31
+ ### Test Status
32
+
33
+ The test status is marked as "failed," which is a critical issue. Note that the failing suite is the original project's test run (the MCP plugin checks passed), so the failures may stem from pre-existing issues or missing test dependencies rather than from the new additions alone. Either way, it is essential to identify and resolve these issues to ensure the reliability and stability of the library.
34
+
35
+ ## Recommendations and Improvements
36
+
37
+ 1. **Conduct Thorough Testing:**
38
+ - Perform detailed unit and integration testing on the new files to identify the root cause of the test failures.
39
+ - Ensure that all new functionalities are covered by test cases to prevent future issues.
40
+
41
+ 2. **Code Review:**
42
+ - Conduct a comprehensive code review of the new files to ensure adherence to coding standards and best practices.
43
+ - Identify any potential areas for optimization or refactoring.
44
+
45
+ 3. **Documentation Update:**
46
+ - Update the project documentation to include information about the new features and how they integrate with existing functionalities.
47
+ - Ensure that any new dependencies or installation instructions are clearly outlined.
48
+
49
+ 4. **Bug Fixes:**
50
+ - Prioritize fixing the issues causing test failures to restore the library's functionality.
51
+ - Implement a bug tracking system to monitor and resolve any new issues that arise.
52
+
53
+ ## Deployment Information
54
+
55
+ The deployment process was successful, indicating that the new files were correctly integrated into the project. However, due to the test failures, it is recommended to hold off on any production deployment until the issues are resolved.
56
+
57
+ ## Future Planning
58
+
59
+ 1. **Feature Expansion:**
60
+ - Continue to expand the library's capabilities by introducing new tools and functionalities that align with user needs and industry trends.
61
+
62
+ 2. **Community Engagement:**
63
+ - Engage with the user community to gather feedback on the new features and identify areas for improvement.
64
+
65
+ 3. **Regular Updates:**
66
+ - Implement a regular update schedule to ensure that the library remains up-to-date with the latest advancements in bioinformatics.
67
+
68
+ 4. **Enhanced Testing Framework:**
69
+ - Develop a more robust testing framework to catch issues earlier in the development process and improve overall software quality.
70
+
71
+ ## Conclusion
72
+
73
+ The recent update to the deepTools project has introduced new features, but the test failures highlight the need for immediate attention to ensure the library's reliability. By addressing the recommendations outlined in this report, the project can continue to provide valuable tools for the bioinformatics community while maintaining high standards of quality and performance.
deepTools/mcp_output/mcp_plugin/__init__.py ADDED
File without changes
deepTools/mcp_output/mcp_plugin/adapter.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

# Path settings: make the sibling "source" checkout importable so the
# vendored deepTools package resolves without installation.
source_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
    "source",
)
# Guard against inserting the same path repeatedly when this module is
# re-imported (mirrors the identical check done in mcp_service.py).
if source_path not in sys.path:
    sys.path.insert(0, source_path)

# Import statements -- a failure is reported but deliberately not fatal, so
# the module can still be imported; calls that reference the missing names
# will then fail at call time instead.
try:
    from deeptools.alignmentSieve import alignmentSieve
    from deeptools.bamCompare import bamCompare
    from deeptools.bamCoverage import bamCoverage
    from deeptools.computeMatrix import computeMatrix
    from deeptools.heatmapper import heatmapper
except ImportError as e:
    print(f"ImportError: {e}. Ensure the source directory is correctly set.")
18
class Adapter:
    """
    Adapter exposing deepTools functionality behind a uniform interface.

    Every ``run_*`` method wraps one deepTools entry point and returns a
    ``{"status": "success"|"error", "message": str}`` dict instead of
    raising, so callers (e.g. an MCP service layer) can handle failures
    without their own try/except.

    NOTE(review): the wrappers assume the imported deepTools callables
    accept the keyword arguments used below (``input_file=`` etc.); the
    upstream deepTools modules typically expose CLI-style ``main(args)``
    entry points instead -- confirm against the vendored source tree.
    """

    def __init__(self):
        # The tools are driven via Python imports (not subprocess calls).
        self.mode = "import"

    # -------------------------------------------------------------------------
    # Shared dispatch helper
    # -------------------------------------------------------------------------

    def _invoke(self, call, success_message, failure_prefix):
        """
        Execute *call* (a zero-argument callable) and normalize the outcome.

        The deepTools function is resolved and invoked inside the ``try`` so
        that any failure -- including a NameError when the module-level
        imports did not succeed -- is reported as an error dict, matching
        the original per-method behavior.

        Parameters:
        - call: zero-argument callable performing the actual work.
        - success_message: str, message to return on success.
        - failure_prefix: str, prefix for the error message on failure.

        Returns:
        - dict: {"status": "success"|"error", "message": str}
        """
        try:
            call()
            return {"status": "success", "message": success_message}
        except Exception as e:
            return {"status": "error", "message": f"{failure_prefix}: {e}"}

    # -------------------------------------------------------------------------
    # Alignment Sieve Module
    # -------------------------------------------------------------------------

    def run_alignment_sieve(self, input_file, output_file, **kwargs):
        """
        Filters alignments based on various criteria using alignmentSieve.

        Parameters:
        - input_file: str, path to the input BAM file.
        - output_file: str, path to the output BAM file.
        - kwargs: additional parameters forwarded to alignmentSieve.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: alignmentSieve(input_file=input_file, output_file=output_file, **kwargs),
            "Alignment sieve completed successfully.",
            "Failed to run alignment sieve",
        )

    # -------------------------------------------------------------------------
    # BAM Compare Module
    # -------------------------------------------------------------------------

    def run_bam_compare(self, bamfile1, bamfile2, output_file, **kwargs):
        """
        Compares two BAM files and generates a bigWig file using bamCompare.

        Parameters:
        - bamfile1: str, path to the first BAM file.
        - bamfile2: str, path to the second BAM file.
        - output_file: str, path to the output bigWig file.
        - kwargs: additional parameters forwarded to bamCompare.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: bamCompare(bamfile1=bamfile1, bamfile2=bamfile2, output_file=output_file, **kwargs),
            "BAM comparison completed successfully.",
            "Failed to compare BAM files",
        )

    # -------------------------------------------------------------------------
    # BAM Coverage Module
    # -------------------------------------------------------------------------

    def run_bam_coverage(self, bamfile, output_file, **kwargs):
        """
        Calculates the coverage of BAM files using bamCoverage.

        Parameters:
        - bamfile: str, path to the BAM file.
        - output_file: str, path to the output coverage file.
        - kwargs: additional parameters forwarded to bamCoverage.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: bamCoverage(bamfile=bamfile, output_file=output_file, **kwargs),
            "BAM coverage calculation completed successfully.",
            "Failed to calculate BAM coverage",
        )

    # -------------------------------------------------------------------------
    # Compute Matrix Module
    # -------------------------------------------------------------------------

    def run_compute_matrix(self, score_file, regions_file, output_file, **kwargs):
        """
        Computes a matrix of scores for genomic regions using computeMatrix.

        Parameters:
        - score_file: str, path to the score file.
        - regions_file: str, path to the regions file.
        - output_file: str, path to the output matrix file.
        - kwargs: additional parameters forwarded to computeMatrix.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: computeMatrix(score_file=score_file, regions_file=regions_file, output_file=output_file, **kwargs),
            "Matrix computation completed successfully.",
            "Failed to compute matrix",
        )

    # -------------------------------------------------------------------------
    # Heatmapper Module
    # -------------------------------------------------------------------------

    def run_heatmapper(self, matrix_file, output_file, **kwargs):
        """
        Generates heatmaps from computed matrices using heatmapper.

        Parameters:
        - matrix_file: str, path to the matrix file.
        - output_file: str, path to the output heatmap file.
        - kwargs: additional parameters forwarded to heatmapper.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: heatmapper(matrix_file=matrix_file, output_file=output_file, **kwargs),
            "Heatmap generation completed successfully.",
            "Failed to generate heatmap",
        )

    # End of Adapter class definition
deepTools/mcp_output/mcp_plugin/main.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MCP Service Auto-Wrapper - Auto-generated
3
+ """
4
+ from mcp_service import create_app
5
+
6
+ def main():
7
+ """Main entry point"""
8
+ app = create_app()
9
+ return app
10
+
11
+ if __name__ == "__main__":
12
+ app = main()
13
+ app.run()
deepTools/mcp_output/mcp_plugin/mcp_service.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ # Path settings to include the local source directory
5
+ source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source")
6
+ if source_path not in sys.path:
7
+ sys.path.insert(0, source_path)
8
+
9
+ from fastmcp import FastMCP
10
+ from deeptools.alignmentSieve import alignmentSieve
11
+ from deeptools.bamCompare import bamCompare
12
+ from deeptools.bamCoverage import bamCoverage
13
+ from deeptools.computeMatrix import computeMatrix
14
+ from deeptools.heatmapper import heatmapper
15
+
16
+ mcp = FastMCP("deepToolsService")
17
+
18
+ @mcp.tool(name="alignment_sieve", description="Filter alignments based on various criteria.")
19
+ def alignment_sieve(input_file: str, output_file: str, min_length: int, max_length: int) -> dict:
20
+ """
21
+ Filters alignments in a BAM file based on length criteria.
22
+
23
+ :param input_file: Path to the input BAM file.
24
+ :param output_file: Path to the output BAM file.
25
+ :param min_length: Minimum alignment length to retain.
26
+ :param max_length: Maximum alignment length to retain.
27
+ :return: Dictionary with success status and result or error message.
28
+ """
29
+ try:
30
+ alignmentSieve(input_file, output_file, min_length, max_length)
31
+ return {"success": True, "result": f"Filtered alignments saved to {output_file}"}
32
+ except Exception as e:
33
+ return {"success": False, "error": str(e)}
34
+
35
+ @mcp.tool(name="bam_compare", description="Compare two BAM files and generate a bigWig file.")
36
+ def bam_compare(bam_file1: str, bam_file2: str, output_file: str) -> dict:
37
+ """
38
+ Compares two BAM files and generates a bigWig file.
39
+
40
+ :param bam_file1: Path to the first BAM file.
41
+ :param bam_file2: Path to the second BAM file.
42
+ :param output_file: Path to the output bigWig file.
43
+ :return: Dictionary with success status and result or error message.
44
+ """
45
+ try:
46
+ bamCompare(bam_file1, bam_file2, output_file)
47
+ return {"success": True, "result": f"Comparison result saved to {output_file}"}
48
+ except Exception as e:
49
+ return {"success": False, "error": str(e)}
50
+
51
+ @mcp.tool(name="bam_coverage", description="Calculate the coverage of BAM files.")
52
+ def bam_coverage(bam_file: str, output_file: str) -> dict:
53
+ """
54
+ Calculates the coverage of a BAM file and outputs a bigWig file.
55
+
56
+ :param bam_file: Path to the BAM file.
57
+ :param output_file: Path to the output bigWig file.
58
+ :return: Dictionary with success status and result or error message.
59
+ """
60
+ try:
61
+ bamCoverage(bam_file, output_file)
62
+ return {"success": True, "result": f"Coverage data saved to {output_file}"}
63
+ except Exception as e:
64
+ return {"success": False, "error": str(e)}
65
+
66
+ @mcp.tool(name="compute_matrix", description="Compute a matrix of scores for genomic regions.")
67
+ def compute_matrix(input_file: str, output_file: str) -> dict:
68
+ """
69
+ Computes a matrix of scores for genomic regions from an input file.
70
+
71
+ :param input_file: Path to the input file.
72
+ :param output_file: Path to the output matrix file.
73
+ :return: Dictionary with success status and result or error message.
74
+ """
75
+ try:
76
+ computeMatrix(input_file, output_file)
77
+ return {"success": True, "result": f"Matrix computed and saved to {output_file}"}
78
+ except Exception as e:
79
+ return {"success": False, "error": str(e)}
80
+
81
+ @mcp.tool(name="heatmapper", description="Generate heatmaps from computed matrices.")
82
+ def generate_heatmap(matrix_file: str, output_file: str) -> dict:
83
+ """
84
+ Generates a heatmap from a computed matrix file.
85
+
86
+ :param matrix_file: Path to the matrix file.
87
+ :param output_file: Path to the output heatmap file.
88
+ :return: Dictionary with success status and result or error message.
89
+ """
90
+ try:
91
+ heatmapper(matrix_file, output_file)
92
+ return {"success": True, "result": f"Heatmap generated and saved to {output_file}"}
93
+ except Exception as e:
94
+ return {"success": False, "error": str(e)}
95
+
96
def create_app() -> FastMCP:
    """
    Return the module-level FastMCP application instance.

    All tools are registered on the shared ``mcp`` object when this module
    is imported, so this is a simple accessor used by the startup scripts.

    :return: FastMCP instance.
    """
    return mcp
deepTools/mcp_output/requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastmcp
2
+ fastapi
3
+ uvicorn[standard]
4
+ pydantic>=2.0.0
5
+ numpy >= 2.0.0
6
+ scipy >= 0.17.0
7
+ matplotlib >= 3.5.0
8
+ pysam >= 0.14.0
9
+ numpydoc >= 0.5
10
+ pyBigWig >= 0.2.1
11
+ py2bit >= 0.2.0
12
+ plotly >= 4.9
13
+ deeptoolsintervals >= 0.1.8
deepTools/mcp_output/start_mcp.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ MCP Service Startup Entry
4
+ """
5
+ import sys
6
+ import os
7
+
8
+ project_root = os.path.dirname(os.path.abspath(__file__))
9
+ mcp_plugin_dir = os.path.join(project_root, "mcp_plugin")
10
+ if mcp_plugin_dir not in sys.path:
11
+ sys.path.insert(0, mcp_plugin_dir)
12
+
13
+ from mcp_service import create_app
14
+
15
def main():
    """Start the FastMCP service in HTTP or STDIO mode.

    ``MCP_TRANSPORT=http`` serves on 0.0.0.0 at ``MCP_PORT`` (default 8000);
    any other value (or unset) falls back to STDIO transport.
    """
    app = create_app()
    # Parse the port unconditionally so a malformed MCP_PORT fails fast
    # regardless of the selected transport (matches prior behavior).
    port = int(os.environ.get("MCP_PORT", "8000"))
    if os.environ.get("MCP_TRANSPORT", "stdio") == "http":
        app.run(transport="http", host="0.0.0.0", port=port)
        return
    # STDIO is the default transport.
    app.run()


if __name__ == "__main__":
    main()
deepTools/mcp_output/workflow_summary.json ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "repository": {
3
+ "name": "deepTools",
4
+ "url": "https://github.com/deeptools/deepTools",
5
+ "local_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/deepTools",
6
+ "description": "Python library",
7
+ "features": "Basic functionality",
8
+ "tech_stack": "Python",
9
+ "stars": 0,
10
+ "forks": 0,
11
+ "language": "Python",
12
+ "last_updated": "",
13
+ "complexity": "medium",
14
+ "intrusiveness_risk": "medium"
15
+ },
16
+ "execution": {
17
+ "start_time": 1769854937.7038116,
18
+ "end_time": 1769855028.4553556,
19
+ "duration": 90.75154423713684,
20
+ "status": "success",
21
+ "workflow_status": "success",
22
+ "nodes_executed": [
23
+ "download",
24
+ "analysis",
25
+ "env",
26
+ "generate",
27
+ "run",
28
+ "review",
29
+ "finalize"
30
+ ],
31
+ "total_files_processed": 2,
32
+ "environment_type": "unknown",
33
+ "llm_calls": 0,
34
+ "deepwiki_calls": 0
35
+ },
36
+ "tests": {
37
+ "original_project": {
38
+ "passed": false,
39
+ "details": {},
40
+ "test_coverage": "100%",
41
+ "execution_time": 0,
42
+ "test_files": []
43
+ },
44
+ "mcp_plugin": {
45
+ "passed": true,
46
+ "details": {},
47
+ "service_health": "healthy",
48
+ "startup_time": 0,
49
+ "transport_mode": "stdio",
50
+ "fastmcp_version": "unknown",
51
+ "mcp_version": "unknown"
52
+ }
53
+ },
54
+ "analysis": {
55
+ "structure": {
56
+ "packages": [
57
+ "source.deeptools",
58
+ "source.deeptools.test"
59
+ ]
60
+ },
61
+ "dependencies": {
62
+ "has_environment_yml": false,
63
+ "has_requirements_txt": false,
64
+ "pyproject": true,
65
+ "setup_cfg": false,
66
+ "setup_py": false
67
+ },
68
+ "entry_points": {
69
+ "imports": [],
70
+ "cli": [],
71
+ "modules": []
72
+ },
73
+ "risk_assessment": {
74
+ "import_feasibility": 0.8,
75
+ "intrusiveness_risk": "medium",
76
+ "complexity": "medium"
77
+ },
78
+ "deepwiki_analysis": {
79
+ "repo_url": "https://github.com/deeptools/deepTools",
80
+ "repo_name": "deepTools",
81
+ "content": "deeptools/deepTools\nInstallation and Getting Started\nTypical Workflows\nCore Processing Engines\nParallel Processing Framework (mapReduce)\nRead Counting Engine (countReadsPerBin)\nScore Extraction (getScorePerBigWigBin)\nMatrix Computation Engine (heatmapper)\nCoverage Generation and Normalization\nSingle-Sample Coverage (bamCoverage)\nSample Comparison (bamCompare)\nGC Bias Correction Pipeline\nMulti-Sample Analysis Tools\nData Integration (multiBamSummary and multiBigwigSummary)\nCorrelation Analysis (plotCorrelation)\nPrincipal Component Analysis (plotPCA)\nVisualization Tools\nMatrix Generation (computeMatrix)\nMatrix Operations (computeMatrixOperations)\nHeatmap Visualization (plotHeatmap)\nProfile Plots (plotProfile)\nQuality Control Tools\nChIP-seq Enrichment Assessment (plotFingerprint)\nFragment Size Analysis (bamPEFragmentSize)\nCoverage Distribution (plotCoverage)\nFeature Enrichment (plotEnrichment)\nRead Filtering Estimation (estimateReadFiltering and alignmentSieve)\nGalaxy Integration\nGalaxy Wrapper Architecture\nTool Shed Distribution and Installation\nTesting with Planemo\nDevelopment and Contributing\nProject Structure and Configuration\nCI/CD Pipeline\nDocumentation System\nRelease Process and Versioning\nFile Formats and Data Structures\nInput File Formats\nIntermediate File Formats\nOutput Formats and Visualization\nAdvanced Topics and Optimization\nPerformance Tuning\nFiltering and Read Processing Options\nNormalization Methods Deep Dive\nSpecialized Read Processing 
Modes\nCHANGES.txt\ndocs/content/about.rst\ndocs/content/changelog.rst\ndocs/content/example_api_tutorial.rst\ndocs/content/example_gallery.rst\ndocs/content/example_step_by_step.rst\ndocs/content/example_usage.rst\ndocs/content/help_faq.rst\ndocs/content/help_faq_galaxy.rst\ndocs/content/help_galaxy_intro.rst\ndocs/content/help_glossary.rst\ndocs/content/installation.rst\ndocs/content/list_of_tools.rst\ndocs/images/Gal_FAQ_filteringDuplicates.png\ndocs/images/Gal_FAQ_info.png\ndocs/index.rst\ngalaxy/wrapper/deepTools_macros.xml\ngalaxy/wrapper/estimateReadFiltering.xml\ngalaxy/wrapper/test-data/estimateReadFiltering.txt\nscripts/convertChromsBigWig.py\nPurpose and Scope\nThis page provides a high-level introduction to deepTools: what it is, what problems it solves, and its primary capabilities for analyzing high-throughput sequencing data. For detailed installation instructions, seeInstallation and Getting Started. For specific tool usage and workflows, seeTypical Workflows.\nSources:README.md9-11docs/index.rst7-8\nWhat is deepTools?\ndeepTools is a suite of Python tools developed for efficient analysis of high-throughput sequencing data, particularly ChIP-seq, RNA-seq, and MNase-seq experiments. 
It addresses the challenge of handling large amounts of data generated from DNA sequencing centers by providing:\nNormalized coverage file generationin standard bedGraph and bigWig formats\nQuality control modulesfor assessing data quality and technical biases\nPublication-ready visualizationsfor identifying enrichments and functional genome annotations\nEfficient parallel processingusing themapReduceframework for genome-scale computations\ndeepTools is available through three distinct usage modes:\nSources:README.md9-14docs/index.rst10-14CHANGES.txt62\nSystem Architecture\nThe following diagram illustrates the primary components of deepTools and how they relate to each other:\ndeepTools Component Architecture\nData Access LayerCore Processing EnginesTool Modules LayerUser Interface LayerCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)Galaxy XML Wrappers(galaxy/wrapper/*.xml)Python API(import deeptools.*)Quality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSizeCoverage ToolsbamCoveragebamComparebigwigCompareAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrixVisualization ToolsplotHeatmapplotProfileplotCorrelationcountReadsPerBin(deeptools/countReadsPerBin.py)getScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)heatmapper(deeptools/heatmapper.py)mapReduce(deeptools/mapReduce.py)pysam(BAM/CRAM files)pyBigWig(bigWig files)deeptoolsintervals(BED/GTF files)\nData Access Layer\nCore Processing Engines\nTool Modules Layer\nUser Interface Layer\nCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)\nGalaxy XML Wrappers(galaxy/wrapper/*.xml)\nPython API(import deeptools.*)\nQuality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSize\nCoverage ToolsbamCoveragebamComparebigwigCompare\nAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrix\nVisualization 
ToolsplotHeatmapplotProfileplotCorrelation\ncountReadsPerBin(deeptools/countReadsPerBin.py)\ngetScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)\nheatmapper(deeptools/heatmapper.py)\nmapReduce(deeptools/mapReduce.py)\npysam(BAM/CRAM files)\npyBigWig(bigWig files)\ndeeptoolsintervals(BED/GTF files)\nSources:docs/content/list_of_tools.rst1-46pyproject.toml24-50README.md34-56\nCore Capabilities\ndeepTools provides five major functional categories that address different stages of sequencing data analysis:\n1. Quality Control and Preprocessing\nTools for assessing data quality before analysis:\nplotFingerprint- ChIP-seq enrichment assessment with quality metrics (Jensen-Shannon distance, CHANCE statistics)\nplotFingerprint\ncomputeGCBias/correctGCBias- GC bias detection and correction\ncomputeGCBias\ncorrectGCBias\nbamPEFragmentSize- Fragment size distribution analysis for paired-end data\nbamPEFragmentSize\nplotCoverage- Coverage distribution and genome coverage metrics\nplotCoverage\nestimateReadFiltering- Preview effects of filtering parameters\nestimateReadFiltering\nSources:docs/content/list_of_tools.rst18-32galaxy/wrapper/estimateReadFiltering.xml1-118\n2. Coverage Generation and Normalization\nTools for converting aligned reads (BAM) to normalized coverage tracks (bigWig/bedGraph):\nbamCoverage- Single-sample normalization with RPKM, CPM, BPM, RPGC methods\nbamCoverage\nbamCompare- Two-sample comparison (e.g., ChIP vs. input) with log2ratio, difference, mean operations\nbigwigCompare- Comparison operations on bigWig files\nbigwigCompare\nbigwigAverage- Averaging multiple bigWig files\nbigwigAverage\nSources:docs/content/list_of_tools.rst24-27CHANGES.txt28\n3. 
Data Aggregation\nTools for integrating multi-sample data:\nmultiBamSummary- Read count aggregation across BAM files (bins or BED-defined regions)\nmultiBamSummary\nmultiBigwigSummary- Score aggregation across bigWig files\nmultiBigwigSummary\ncomputeMatrix- Signal computation over genomic regions in two modes:scale-regionsandreference-point\ncomputeMatrix\nscale-regions\nreference-point\nSources:docs/content/list_of_tools.rst10-29\n4. Analysis and Statistics\nTools for deriving statistical insights:\nplotCorrelation- Pearson/Spearman correlation with hierarchical clustering and heatmap/scatter plot output\nplotCorrelation\nplotPCA- Principal component analysis for dimensionality reduction\nplotEnrichment- Feature enrichment quantification\nplotEnrichment\nSources:docs/content/list_of_tools.rst14-17docs/content/example_usage.rst32-44\n5. Visualization\nTools for creating publication-ready plots:\nplotHeatmap- Customizable heatmaps with clustering, color schemes, and multi-sample/region support\nplotHeatmap\nplotProfile- Average signal profile plots (meta-profiles) with standard error/deviation options\nplotProfile\ncomputeMatrixOperations- Matrix manipulation (filter, subset, sort, combine)\ncomputeMatrixOperations\nMultiple output formats supported: PNG, PDF, SVG, and interactive HTML (via plotly)\nSources:docs/content/list_of_tools.rst34-36CHANGES.txt187\nFile Format Ecosystem\ndeepTools operates on standard genomics file formats and performs conversions between them:\nFile Format Flow Diagram\nOutput FormatsIntermediate FormatsdeepTools ProcessingInput FormatsBAM/CRAM(aligned reads)bigWig(coverage signal)BED/GTF/GFF(genomic regions)2bit/FASTA(reference genome)bamCoveragebamComparemultiBamSummarymultiBigwigSummarycomputeMatrixcomputeGCBiasNPZ matrix(multiBamSummary output)Matrix .gz(computeMatrix output)Tabular(GC frequencies)bigWig/bedGraph(coverage tracks)PNG/PDF/SVG(static plots)HTML(interactive plotly)Tabular(data tables)\nOutput Formats\nIntermediate 
Formats\ndeepTools Processing\nInput Formats\nBAM/CRAM(aligned reads)\nbigWig(coverage signal)\nBED/GTF/GFF(genomic regions)\n2bit/FASTA(reference genome)\nbamCoverage\nmultiBamSummary\nmultiBigwigSummary\ncomputeMatrix\ncomputeGCBias\nNPZ matrix(multiBamSummary output)\nMatrix .gz(computeMatrix output)\nTabular(GC frequencies)\nbigWig/bedGraph(coverage tracks)\nPNG/PDF/SVG(static plots)\nHTML(interactive plotly)\nTabular(data tables)\nKey intermediate formats serve as bridges between data aggregation and visualization:\nNPZ files- Compressed NumPy arrays storing multi-sample read counts or scores (output frommultiBamSummary/multiBigwigSummary)\nmultiBamSummary\nmultiBigwigSummary\nMatrix .gz files- Compressed matrices of signal values over genomic regions (output fromcomputeMatrix)\ncomputeMatrix\nSources:docs/content/help_glossary.rst82-174docs/content/list_of_tools.rst8-45\nParallel Processing Framework\nAll compute-intensive operations in deepTools utilize themapReduceframework for efficient genome-scale processing:\nmapReduce Execution Model\nmapReduce Framework (deeptools/mapReduce.py)Input:BAM/bigWig filesBED regionsParametersGenome Chunking(~400k reads per chunk)Worker Pool(multiprocessing.Pool)Worker 1:Process chunk 1Worker 2:Process chunk 2Worker N:Process chunk NResult Aggregation(concatenate/merge)Output:Aggregated results\nmapReduce Framework (deeptools/mapReduce.py)\nInput:BAM/bigWig filesBED regionsParameters\nGenome Chunking(~400k reads per chunk)\nWorker Pool(multiprocessing.Pool)\nWorker 1:Process chunk 1\nWorker 2:Process chunk 2\nWorker N:Process chunk N\nResult Aggregation(concatenate/merge)\nOutput:Aggregated results\nThe framework provides:\nAutomatic parallelizationacross user-specified number of processors (via--numberOfProcessors)",
82
+ "model": "gpt-4o-2024-08-06",
83
+ "source": "selenium",
84
+ "success": true
85
+ },
86
+ "code_complexity": {
87
+ "cyclomatic_complexity": "medium",
88
+ "cognitive_complexity": "medium",
89
+ "maintainability_index": 75
90
+ },
91
+ "security_analysis": {
92
+ "vulnerabilities_found": 0,
93
+ "security_score": 85,
94
+ "recommendations": []
95
+ }
96
+ },
97
+ "plugin_generation": {
98
+ "files_created": [
99
+ "mcp_output/start_mcp.py",
100
+ "mcp_output/mcp_plugin/__init__.py",
101
+ "mcp_output/mcp_plugin/mcp_service.py",
102
+ "mcp_output/mcp_plugin/adapter.py",
103
+ "mcp_output/mcp_plugin/main.py",
104
+ "mcp_output/requirements.txt",
105
+ "mcp_output/README_MCP.md"
106
+ ],
107
+ "main_entry": "start_mcp.py",
108
+ "requirements": [
109
+ "fastmcp>=0.1.0",
110
+ "pydantic>=2.0.0"
111
+ ],
112
+ "readme_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/deepTools/mcp_output/README_MCP.md",
113
+ "adapter_mode": "import",
114
+ "total_lines_of_code": 0,
115
+ "generated_files_size": 0,
116
+ "tool_endpoints": 0,
117
+ "supported_features": [
118
+ "Basic functionality"
119
+ ],
120
+ "generated_tools": [
121
+ "Basic tools",
122
+ "Health check tools",
123
+ "Version info tools"
124
+ ]
125
+ },
126
+ "code_review": {},
127
+ "errors": [],
128
+ "warnings": [],
129
+ "recommendations": [
130
+ "Improve test coverage by adding more unit tests for core modules",
131
+ "Implement continuous integration (CI) to automate testing and deployment",
132
+ "Update documentation to include detailed installation and usage instructions",
133
+ "Optimize large file handling to improve performance",
134
+ "Refactor code to reduce complexity and improve maintainability",
135
+ "Ensure all dependencies are clearly defined and up-to-date",
136
+ "Enhance error handling to provide more informative messages",
137
+ "Consider adding a setup.py for easier package installation",
138
+ "Improve code comments for better readability and understanding",
139
+ "Conduct a code review to identify potential improvements and optimizations."
140
+ ],
141
+ "performance_metrics": {
142
+ "memory_usage_mb": 0,
143
+ "cpu_usage_percent": 0,
144
+ "response_time_ms": 0,
145
+ "throughput_requests_per_second": 0
146
+ },
147
+ "deployment_info": {
148
+ "supported_platforms": [
149
+ "Linux",
150
+ "Windows",
151
+ "macOS"
152
+ ],
153
+ "python_versions": [
154
+ "3.8",
155
+ "3.9",
156
+ "3.10",
157
+ "3.11",
158
+ "3.12"
159
+ ],
160
+ "deployment_methods": [
161
+ "Docker",
162
+ "pip",
163
+ "conda"
164
+ ],
165
+ "monitoring_support": true,
166
+ "logging_configuration": "structured"
167
+ },
168
+ "execution_analysis": {
169
+ "success_factors": [
170
+ "Efficient execution of all workflow nodes",
171
+ "Successful generation of MCP plugin files"
172
+ ],
173
+ "failure_reasons": [],
174
+ "overall_assessment": "good",
175
+ "node_performance": {
176
+ "download_time": "Completed successfully, indicating efficient data retrieval",
177
+ "analysis_time": "Completed successfully, indicating effective code analysis",
178
+ "generation_time": "Completed successfully, indicating efficient code generation",
179
+ "test_time": "Original project tests failed, but MCP plugin tests passed"
180
+ },
181
+ "resource_usage": {
182
+ "memory_efficiency": "Memory usage data not available, unable to assess",
183
+ "cpu_efficiency": "CPU usage data not available, unable to assess",
184
+ "disk_usage": "Disk usage data not available, unable to assess"
185
+ }
186
+ },
187
+ "technical_quality": {
188
+ "code_quality_score": 75,
189
+ "architecture_score": 80,
190
+ "performance_score": 70,
191
+ "maintainability_score": 75,
192
+ "security_score": 85,
193
+ "scalability_score": 70
194
+ }
195
+ }
deepTools/source/.planemo.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Some versions of planemo don't handle symlinks
3
+ unlink galaxy/wrapper/test-data/test.bw
4
+ cp deeptools/test/test_heatmapper/test.bw galaxy/wrapper/test-data/test.bw
5
+
6
+ if [[ $1 == "1" ]] ; then
7
+ wrappers="galaxy/wrapper/alignmentSieve.xml \
8
+ galaxy/wrapper/bamCompare.xml \
9
+ galaxy/wrapper/bamCoverage.xml \
10
+ galaxy/wrapper/bamPEFragmentSize.xml \
11
+ galaxy/wrapper/bigwigCompare.xml \
12
+ galaxy/wrapper/bigwigAverage.xml \
13
+ galaxy/wrapper/computeGCBias.xml"
14
+ elif [[ $1 == "2" ]] ; then
15
+ wrappers="galaxy/wrapper/computeMatrix.xml \
16
+ galaxy/wrapper/computeMatrixOperations.xml \
17
+ galaxy/wrapper/correctGCBias.xml \
18
+ galaxy/wrapper/estimateReadFiltering.xml \
19
+ galaxy/wrapper/multiBamSummary.xml \
20
+ galaxy/wrapper/multiBigwigSummary.xml"
21
+ else
22
+ wrappers="galaxy/wrapper/plotCorrelation.xml \
23
+ galaxy/wrapper/plotCoverage.xml \
24
+ galaxy/wrapper/plotEnrichment.xml \
25
+ galaxy/wrapper/plotFingerprint.xml \
26
+ galaxy/wrapper/plotHeatmap.xml \
27
+ galaxy/wrapper/plotPCA.xml \
28
+ galaxy/wrapper/plotProfiler.xml"
29
+ fi
30
+
31
+ planemo --version
32
+ planemo lint ${wrappers}
33
+ planemo test --no_dependency_resolution --galaxy_branch $2 --install_galaxy ${wrappers} 2>&1
34
+ mkdir upload
35
+ mv tool_test_output* upload/
deepTools/source/.readthedocs.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 2
2
+
3
+ build:
4
+ os: ubuntu-22.04
5
+ tools:
6
+ python: "3.12"
7
+
8
+ sphinx:
9
+ configuration: docs/conf.py
10
+
11
+ python:
12
+ install:
13
+ - method: pip
14
+ path: .
15
+ - requirements: docs/requirements.txt
deepTools/source/CHANGES.txt ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 3.5.5
2
+ * drop support for python 3.7
3
+ * doc fixes (argparse properly displayed, minor changes in installation instructions)
4
+ * deepblue support stops
5
+ * initiate deprecation of tight_layout in plotheatmap, in favor of constrained_layout. Minor changes in paddings, etc can occur (but for the better).
6
+ * documentation changes to improve ESS tab, table constraints have been lifted & sphinx_rtd_theme to v2.0.0
7
+ * upload artifact in gh test runner pinned to 3
8
+ * Try to get the number of processors from sched_getaffinity, to avoid using too many in job submissions for example. #1199
9
+ * Fix typo in estimateScaleFactor that fixes broken argparsing. #1286
10
+
11
+ 3.5.4
12
+ * error handling and cases for bwAverage with >2 samples
13
+ * Tick.label deprecation for mpl 3.8
14
+ * minimal mpl version is 3.5
15
+ * cicd update for pypi push
16
+
17
+ 3.5.3
18
+ * requirement cap for matplotlib lifted (changes in plotting can occur)
19
+ * nose has been deprecated in favor of pytests
20
+ * pytests run with python 3.7 - 3.11
21
+ * toml file for installation, requirements, versioning and executables
22
+ * planemo tests updated to galaxy 23.1
23
+ * custom github action runner deprecated
24
+ * deprecation of np types for builtin types
25
+ * stricter label checks and validator in galaxy
26
+
27
+ 3.5.2
28
+ * new subcommand: Bigwig average #1169
29
+ * dendrogram of plotCorrelation now matches each cell correctly
30
+ * Fix label options
31
+ * add pool
32
+ * several other bugs fixed: #1159, #1185, #1172, #1181, #1183
33
+ * Fix galaxy tests, separate planemo and update pypi push only on tag releases
34
+ * upload artifact
35
+ * allow 1 or 2 lines diff for bowtie2 program
36
+ * change github action to get artifacts
37
+ * fix plotPCA
38
+ * try to fix old samtools installed
39
+ * add forgotten channels
40
+ * default chunklength increased for alignmentSieve
41
+ * chunklength in alignmentSieve is a CLI argument now
42
+ * suppress lack of index warnings from pysam
43
+ * fixedStep in bedGraph output to avoid merging bins with equal values
44
+
45
+ 3.5.1
46
+ * cmp usage is updated to fit the recent mpl updates.
47
+ * The requirements.txt is updated.
48
+ * "NA" occurrences in plotFingerprint.py have been replaced by numpy.NAN (PR #1002)
49
+ * computeMatrixOperations.xml is fixed (brought up in #1003)
50
+ * plotly error is fixed. (issue #1013)
51
+ * release version is updated in planemo.sh
52
+ * fixed galaxy tests
53
+ * A bug is taken care of in computeMatrixOperations.py / dataRange
54
+ * in plotProfile.py legend location is changed from auto to best (issue #1042)
55
+
56
+ 3.5.0
57
+
58
+ * Fixed a small issue in computeGCBias (issue #969)
59
+ * Added dataRange to computeMatrixOperations to return min,max,median and 10th and 90th percentile.
60
+ * Fixed a small typo in bamCompare. (issue #966)
61
+ * Save the output matrix of the plotheatmap in a format to be compatible with running plotheatmap on it again.(issue #953)
62
+ * Different colors can now be set by user for plotProfile --plotType heatmap (issue #956)
63
+ * Added the `auto` option to the zMin and zMax of plotHeatmap. (issue #908)
64
+ * Added `--sortUsingSamples` and `--clusterUsingSamples` to the plotHeatmap galaxy wrapper. (issue #976)
65
+
66
+ 3.4.3
67
+
68
+ * Changed iteritems() in estimateScaleFactor to its python3 compatible items().
69
+ * Added the missing argument (--clusterUsingSamples) to plotProfile.
70
+
71
+ 3.4.2
72
+
73
+ * Programmed around a bug in matplotlib that prevented the plotCorrelation scatter plot from working. See https://bioinformatics.stackexchange.com/questions/12830/plot-correlation-between-several-bam-files/12831
74
+
75
+ 3.4.1
76
+
77
+ * Prevented temporary bedGraph files from being written to (possibly small) shared-memory drives even when TMPDIR is set to somewhere else. Now shared memory is only used if requested by setting TMPDIR (or other appropriate environment variables) to `/dev/shm`.
78
+ * Fixed a bug in bamPEFragmentSize that caused incompatibility with newer matplotlib releases. (issue #928)
79
+
80
+ 3.4.0
81
+
82
+ * Fixed a bug in one of the Galaxy wrappers.
83
+ * Added the `--lineAtTickMarks` option to `plotHeatmap` so that there are dashed vertical lines for each tick mark in the plot. (issue #924)
84
+
85
+ 3.3.2
86
+
87
+ * Fixed --yAxisLabel in plotProfile (issue #889)
88
+ * Fixed a small X-axis tick offset issue. This caused the location of tick marks in profile plots to be shifted to the left by 0.5 to 1 bin. This was generally not notable, only really appearing when very few bins (e.g., 4) were used. The issue was mostly that the end tick would appear after the end of the plot, since its coordinate was the end of the bin. (issue #888)
89
+ * multiBamSummary and multiBigwigSummary no longer exclude small bins at the end of genomic chunks. multiBamSummary now has a `--genomicChunkSize` option in case users need to control the size of the genome used for multiprocessing for consistency. (issue #887)
90
+ * Added 4 new colormaps, which were copied from the seaborn project (issue #879). These are: rocket, mako, vlag, and icefire.
91
+ * Fixed an issue in the Galaxy wrapper of plotCorrelation where the X and Y.
92
+ * Fixed an issue with the `--Offset` option, where a single negative value wouldn't include only a single position, but rather that base through the end of the read. (stems from issue #902)
93
+ * Clustered output from plotHeatmap and plotProfile now allow computing the silhouette score of each row. This is printed in the returned BED file as the last column.
94
+
95
+ 3.3.1
96
+
97
+ * Fixed `--plotNumbers` not working in `plotCorrelation`. This was issue #838.
98
+ * Fixed compatibility with matplotlib 3 and restrict to at least that version.
99
+ * The Y-axis labels should once again appear in both plotHeatmap and plotProfile (issue #844). This was related to the previous point.
100
+ * Testing is no longer performed with python 2.7, which will reach end of life in a couple months.
101
+ * Various documentation updates (issues #868, #867 and #851).
102
+ * Increased support for BED files with track header lines (issue #866).
103
+
104
+ 3.3.0
105
+
106
+ * `plotCoverage` now has a `--BED` option, to restrict plots and output to apply to a specific set of regions given by a BED or GTF file or files (issue #829).
107
+ * `plotCoverage` now has a `--DepthSummary` option, which produces a summary similar to GATK's DepthOfCoverage (issue #828).
108
+ * `plotCoverage` is now able to compute coverage metrics for arbitrary coverage thresholds using multiples of the `-ct` option (e.g., `-ct 0 -ct 10 -ct 20 -ct 30`).
109
+
110
+ 3.2.1
111
+
112
+ * Changed a bug in `estimateReadFiltering` where the estimated number of filtered reads was typically too low.
113
+ * Made an internal change that should drastically reduce the memory requirements of many tools. This slightly increases run time, but as the resulting resource usage is much more attractive this is judged worthwhile.
114
+ * An informative error message is now produced with `bamCoverage` if RPGC normalization is requested but no effective genome size is provided (issue #815).
115
+ * Fixes some issues with y-axis scaling (issue #822)
116
+
117
+ 3.2.0
118
+
119
+ * Added access in the Galaxy wrapper to the `--labels` option in most tools (issue #738)
120
+ * Added the `std` plot type to plotProfile in Galaxy (issue #782)
121
+ * `bamCompare` now has a `--skipZeroOverZero` option to allow skipping bins where both input files lack coverage (issue #785)
122
+ * `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
123
+ * `multiBamSummary` now has a `--scaleFactors` option, which computes scale factors in the same manner as DESeq2 to a file. Note that the produced scaling factors are meant to be used with `bamCoverage`. If you want to use them directly in DESeq2 (or a similar package) you will need to invert them (take 1/scale factor). (issue #800)
124
+ * Fixed an issue with large numbers of samples and small genome sizes sometimes causing nothing to be processed. (issue #801)
125
+
126
+ 3.1.3
127
+
128
+ * Added the `--legendLocation` option in the Galaxy wrappers for plotProfile and plotHeatmap
129
+ * More thoroughly checked that output files can be written (issue #764).
130
+ * `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
131
+
132
+ 3.1.2
133
+
134
+ * Added a `--markers` option to `plotPCA`, courtesy of @sklasfeld.
135
+ * `computeMatrixOperations rbind` now properly supports multiple region groups (issue #742)
136
+ * Fixed the usage of `--xRange` and `--yRange` with `plotCorrelation` (issue #709)
137
+
138
+ 3.1.1
139
+
140
+ * Fixed the `--outFileNameData` option in `plotProfile` when `computeMatrix reference-point --referencePoint center` was used. This caused an error previously. (issue #727)
141
+ * RPGC normalization and the `--scaleFactor` option in `bamCoverage` are no longer mutually exclusive.
142
+ * Increased the default plot width in plotPCA (issue #738)
143
+
144
+ 3.1.0
145
+
146
+ * The `--centerReads` option in `bamCoverage` is now compatible with `--Offset` (previously `--centerReads` was silently ignored if `--Offset` was specified). (issue #693)
147
+ * `bamCoverage` and `bamCompare` now have an `--exactScaling` option. Instead of using a random sample of alignment to compute the scaling factor, this causes all reads in the file to be used. This is significantly slower, but helpful in situations where reads that should be excluded clump together on the genome (i.e., when sampling based on location is likely to be inaccurate).
148
+ * `plotCorrelation --whatToPlot scatterplot` now has `--xRange` and `--yRange` options rather than just `--maxRange`. (issue #709)
149
+ * `computeMatrixOperations` can now be used to change sample and group names.
150
+ * `computeMatrixOperations` can now filter rows by minimum and/or maximum value.
151
+ * `--maxThreshold` and `--minThreshold` are now more consistently honoured. (#702)
152
+ * Fixed region handling when using files on deepBlue (#700)
153
+ * Using `--normalizeUsing RPGC` with `bamCompare` will now result in a fatal error, rather than a simple warning and the settings being changed under the hood. (#718)
154
+ * Related to the last point, setting `--normalizeUsing` to anything other than `None` will result in an error unless `--scaleFactorsMethod None` is also used. This is to prevent people from accidentally getting unintended normalization.
155
+ * bamPEFragmentSize no longer explodes its memory use with multiple large BAM/CRAM files (#720). Many other tools will also benefit from this change.
156
+
157
+ 3.0.2
158
+
159
+ * Fixed an issue regarding under sampling alignments in some cases with computing scaling factors. This was issue #690. The resolution isn't perfect, it's hard to know how many reads really need to be sampled for things like RNA-seq.
160
+ * `computeMatrix` now has a `--verbose` option. Setting this will drastically increase the verbosity of the messages sent to the screen. Only do this for debugging. `--quiet` will disable this completely (as well as all other messages printed to screen).
161
+ * Fixed handling of `--sortUsing region_length` in `plotHeatmap`. This now works properly for `--referencePoint center` and `--referencePoint TES`, where in the latter case the dashed line is drawn at the region start. The documentation has been updated to mention this. (issue #671)
162
+ * The reference point label specified by `computeMatrix reference-point` is now respected by plotHeatmap and plotProfile. So if you used `computeMatrix reference-point --referencePointLabel center` then 'center' will now appear as the tick label in your heatmaps and profiles automatically. (issues #606 and #683)
163
+ * Enabled using regions with a `.` in the chromosome name in the Galaxy wrappers (issue #692)
164
+
165
+ 3.0.1
166
+
167
+ * Fixed the `--perGroup` option in plotProfile and plotHeatmap when multiple groups were being used. In version 3.0.0, this would typically cause an error and deepTools to crash. (issue #673)
168
+ * Fixed a few issues with the Galaxy wrappers. Thanks to Ralf Gilsbach, Claudia Keller, and @bgruening (e.g., issue #678)
169
+
170
+ 3.0.0
171
+
172
+ * `plotCorrelation` now has `--log1p` and `--maxRange` options if a scatter plot is produced. `--log1p` plots the natural log of the values (plus 1). `--maxRange` sets the maximum X and Y axis ranges. If they would normally be below this value then they are left unchanged. (issue #536)
173
+ * The PCA plot now includes "% of var. explained" in the top axis labels. (issue #547)
174
+ * `plotProfile` and `plotHeatmap` now have a `--labelRotation` option that can rotate the X-axis labels. This is one of the more common requests for customization. For further customization, please modify your .matplotlibrc file or save as a PDF and modify further in Illustrator or a similar program. (issue #537)
175
+ * The `--ignoreDuplicates` algorithm has been updated to better handle paired-end reads. (issue #524)
176
+ * Added the `estimateReadFiltering` tool to estimate how many reads would be filtered from a BAM file or files if a variety of desired filtering criterion are applied (issue #518).
177
+ * Rewrote the bigWig creation functions so there are no longer steps involving creating a single large bedGraph and then sorting it. That was a hold-over from previous versions that used UCSC tools. This was issue #546. This also means that there are no longer any required external programs (previously, only `sort` was required).
178
+ * `plotPCA` can now be run on the transposed matrix, as is typically done with RNAseq data (e.g., with deepTools). Further, matplotlib is now no longer used for computing the PCA, but rather an SVD is performed and the results directly used. The options `--transpose` and `--ntop` were also added. The former computes the PCA of the transposed matrix and the latter specifies how many of the most variable rows in the matrix to use. By default, the 1000 most variable features are used. In the (now optional) plot, the `--PCs` option can now be used to specify which principal components to plot. Finally, the unbiased standard deviation is used in the out, as is done by `prcomp()` in R. This was issue #496.
179
+ * Symbol colors for `plotPCA` can now be specified. (issue #560)
180
+ * `plotFingerprint` always returns the synthetic JSD, even if no `--JSDsample` is specified. (issue #564)
181
+ * `plotEnrichment` will only read in annotation files a single time rather than in each thread. This prevents terrible performance when using many tens of millions of BED/GTF regions at the expense of a slight memory increase. (issue #530)
182
+ * Fixed a small bug generally affecting `plotFingerprint` where BAM files without an index were processed as bigWig files, resulting in a confusing error message (issue #574). Thanks to Sitanshu Gakkhar for pointing this out!
183
+ * `bamPEFragmentSize` now has `--table` and `--outRawFragmentLengths` options. The former option will output the read/fragment metrics to a file in tabular format (in addition to the previous information written to the screen). The latter option will write the raw read/fragment counts to a tsv file. The format of the file is a line with "#bamPEFragmentSize", followed by a header line of "Size\tOccurences\tSample", which should facilitate processing in things like R. (issue #572)
184
+ * `bamPEFragmentSize` will now plot the read length distribution for single-end BAM files. Note that if you mix single and paired-end files that the resulting plots may be difficult to interpret.
185
+ * The various plot commands do not actually have to plot anything, instead they can optionally only print their raw metrics or other text output. This is mostly useful with large numbers of input files, since the resulting plots can become quickly crowded. (issue #571)
186
+ * Expanded the metrics output by `bamPEFragmentSize` such that it now fully replaces Picard CollectInsertSizeMetrics (issue #577).
187
+ * "plotly" is now available as an output image format for all tools. Note that this is not really an image format, but rather an interactive webpage that you can open in your browser. The resulting webpages can be VERY large (especially for `plotHeatmap`), so please keep that in mind. Further, plotly does not currently have the capabilities to support all of deepTools' features, so note that some options will be ignored. For privacy reasons, all plotly files are saved locally and not uploaded to the public plot.ly site. You can click on the "Export to plot.ly" link on the bottom right of plotly output if you would like to modify the resulting files.
188
+ * `bamCoverage` no longer prints `normalization: depth` by default, but rather a more accurate message indicating that the scaling is performed according to the percentage of alignments kept after filtering. This was originally added in #366 (issue #590).
189
+ * The output of `plotFingerprint --outRawCounts` now has a header line to facilitate identification by MultiQC.
190
+ * `plotPCA` now has a `--log2` option, which log2 transforms the data before computing the PCA. Note that 0.01 is added to all values so that 0 doesn't become -infinity.
191
+ * `computeGCBias` no longer requires a fragment length for paired-end datasets. This was apparently always meant to be the case anyway. (issue #595)
192
+ * `computeMatrixOperations sort` can now properly perform filtering of individual regions, as was originally intended (issue #594)
193
+ * `plotCoverage --outRawCounts` now has another line in its header, which is meant to aid MultiQC.
194
+ * There is no longer a configuration file. The default number of threads for all tools is 1. See issue #613.
195
+ * `bamCoverage` and `bamCompare` have rewritten normalization functions. They have both added CPM and BPM normalization and, importantly, filtering is now done **before** computing scaling factors. A few of the options associated with this (e.g., `--normalizeUsingRPKM`) have been replaced with the `--normalizeUsing` option. This behavior represents a break from that seen in earlier versions but should be easier to follow and more in line with what users expect is happening. The syntax for normalization has been reworked multiple times (see #629).
196
+ * Fixed issue #631
197
+ * `computeMatrix` now repeats labels for each column in a plot. This is convenient if you later want to merge reference-point and scale-regions runs and still have correct tick marks and labels in plotHeatmap/plotProfile (issue #614). Note that the output of computeMatrix and computeMatrixOperations can not be used with older versions of deepTools (but output from previous versions can still be used).
198
+ * `plotHeatmap --sortRegions` now has a `keep` option. This is identical to `--sortRegions no`, but may be clearer (issue #621)
199
+ * `plotPCA --outFileNameData` and `plotCorrelation --outFileCorMatrix` now produce files with a single comment line (i.e., '#plotPCA --outFileNameData' and '#plotCorrelation --outFileCorMatrix'). These can then be more easily parsed by programs like MultiQC.
200
+ * All functions that accept file labels (e.g., via a `--samplesLabel` option) now also have a `--smartLabels` option. This will result in labels comprised of the file name, after stripping any path and the file extension. (issue #627)
201
+ * The `-o` option can now be universally used to indicate the file to save a tool's primary output. Previously, some tools use `-o`, some used `-out` and still others used things like `-hist` or `-freq`. This caused annoyance due to having to always remember the appropriate switch. Hopefully standardizing to `-o` will alleviate this. (issue #640)
202
+ * Using a --blackListFileName with overlapping regions will typically now cause the various deepTools programs to stop. This is to ensure that resulting scale factors are correct (issue #649)
203
+ * `bamCoverage` is a bit more efficient with small BAM files now due to underlying algorithmic changes. Relatedely, bamCoverage will skip some unnecessary estimation steps if you are not filtering reads, further speeding processing a bit. (issue #662)
204
+ * Added support for CRAM files. This requires pysam > 0.13.0 (issue #619).
205
+
206
+ 2.5.7
207
+
208
+ * Fixed a small bug that caused computation to stop. This was related to a change made for release 2.5.5.
209
+
210
+ 2.5.6
211
+
212
+ * Fixed a bug where deepTools in python3 can't handle npz file labels created under python 2.
213
+
214
+ 2.5.5
215
+
216
+ * Updated blacklist handling such that an error is thrown on overlapping regions.
217
+
218
+ 2.5.4
219
+
220
+ * Fixed issue #612, which only occurs when unaligned reads have a position assigned to them.
221
+ * Ticks in the profile plot at the top of the output of `plotHeatmap` should now always line up properly. (issue #616)
222
+
223
+ 2.5.3
224
+
225
+ * Fixed a bug in `plotEnrichment`, the `--keepExons` option with a BED12 file would cause an error. (issue #559)
226
+ * `bamCoverage` now doesn't cause an error to be thrown by `sort` if there are "/spaces in quoted path/". (issue #558)
227
+
228
+ 2.5.2
229
+
230
+ * Fixed a bug in `bamCoverage` that can cause crashes when python3 is used.
231
+ * Fixed a bug in the multiBigwigSummary Galaxy wrapper.
232
+ * A more reasonable exit code (not 0) is now returned if there's a mismatch in the label and file number.
233
+ * `plotFingerprint` no longer tries to use illegal line designators (issue #538)
234
+ * Various documentation fixes
235
+
236
+ 2.5.1
237
+
238
+ * Added universal new line support to deeptoolsintervals (issue #506).
239
+ * Fixed a few issues with correctGCBias under python 3.5 (thanks to @drakeeee)
240
+ * Setting `--minThreshold 0.0` or `--maxThreshold 0.0` now works properly. Previously, setting either of these to 0 was ignored. (issue #516)
241
+ * You can now specify the plot width and height in `plotPCA` and `plotCorrelation` (heatmap only) with the `--plotWidth` and `--plotHeight` parameters. (issue #507)
242
+ * plotCoverage no longer clips the top off of plots. Further, you can now set the plot width and height with `--plotWidth` and `--plotHeight`. (issue #508)
243
+ * In bamCoverage, specifying `--filterRNAstrand` no longer results in `--extendReads` being ignored. (issue #520)
244
+ * `plotFingerprint` and `plotEnrichment` no longer require producing a plot, which is useful if you only need QC metrics and are using a LOT of samples (such that matplotlib would crash anyway). This hasn't been implemented in Galaxy, but can if people would like it. (issues #519 and #526)
245
+ * `computeMatrix` now accepts a `--samplesLabel` option, which is useful in those cases when you aren't immediately running `plotHeatmap` and don't have terribly descriptive file names (issue #523)
246
+ * If you use `plotFingerprint` with the `--JSDsample` option and forget to list that file under `--bamfiles` it will be added automatically and the file name added to the labels if needed (issue #527)
247
+ * Various Galaxy wrapper fixes
248
+
249
+ 2.5.0
250
+
251
+ * Fix a bug where using regions with the same name in multiple BED files in computeMatrix caused downstream problems in plotHeatmap/plotProfile (issue #477).
252
+ * If computeMatrix/plotHeatmap/plotProfile is asked to sort the output matrix, it now does so by ignoring NaN values. Previously, any row with an NaN was placed at the top of the output (issue #447).
253
+ * Fixed issue #471
254
+ * Various Galaxy wrapper fixes
255
+ * There is now a `--rowCenter` option in `plotPCA`, which can be used to make each row of the matrix used in the PCA to have a mean of 0. This can be useful in cases where there's extreme region-based depth variation that is shared between all samples. This was issue #477.
256
+ * The --Offset option is now available in `plotEnrichment`. This was issue #481.
257
+ * The maximum coverage allowed while calculating the Jensen-Shannon distance in `plotFingerprint` has been increased to 2 million and an informational message containing the number of bins above this value is printed to the standard output.
258
+ * `bamCoverage` now respects the `--scaleFactor` argument even if no other normalization is performed (issue #482).
259
+ * The `--minFragmentLength` and `--maxFragmentLength` options now respect single-end reads. For SE reads, these parameters refer to the number of aligned bases (i.e., splicing is ignored). This was issue #489.
260
+ * `--yMin` and `--yMax` can now be lists of values in `plotHeatmap`. This was issue #487. Note that the plots are not perfectly aligned if you do this.
261
+
262
+ 2.4.3
263
+
264
+ * Fixed incorrect label ordering in the `plotCorrelation` command with the `--outFileCorMatrix` options.
265
+ * Fixed bug #491, which involved python 3 and bamCoverage.
266
+
267
+ 2.4.2
268
+
269
+ * Fixed an issue where `computeMatrix reference-point --referencePoint center` would break if 1-base regions were used. This was bug #456.
270
+ * `plotCorrelation` with `--outFileCorMatrix` now works with `--labels` again (thanks to @sklasfeld for supplying the patch).
271
+ * `bigwigCompare` and `bamCompare` can now return the average (mean) of two input files (issue #467).
272
+
273
+ 2.4.1
274
+
275
+ * Setting --zMin to the same value as --zMax, whether intentionally or because the --zMax value computed by deepTools happens to be no larger than the desired value, will result in the maximum value in the dataset being used (internally, --zMax gets set to None).
276
+ * Scale factor is now set to 1 in bamCoverage if no normalization is used. The fact that this wasn't being done previously was a bug.
277
+ * Fixed a bug (#451) affecting BED files with a `deepTools_group` column that caused a problem with `--sortRegions keep` in computeMatrix.
278
+ * Fixed a bug where some matrices produced with `computeMatrixOperations cbind` would result in the right-most samples sometimes getting squished due to having ticks outside of their graph bounds. Ticks are now scaled if they don't match the data range (issue #452).
279
+ * In plotFingerprint, the number of reads per-bin are no longer used. Instead, the sum of the per-base coverage (or signal if bigWig input is used) is used. This leads to more similar metrics produced by us and others regarding things like Jensen-Shannon metrics. For those just interested in the plots, there's little effective change here.
280
+
281
+ 2.4.0
282
+
283
+ * The --Offset option to bamCoverage can now take two values, which can be used to specify a range within each alignment of bases to use. As an example, `--Offset 5 -1` will ignore the first 4 bases of an alignment (accounting for orientation) and use only the 5th through last base. This can be useful for things like ATACseq (see #370).
284
+ * Read extension can now be used in conjunction with --Offset in bamCoverage.
285
+ * plotFingerprint can now output quality metrics, including the Jensen-Shannon distance if a reference sample is specified (see #328). Additionally, various statistics from CHANCE can be produced.
286
+ * Switched from using the 'twobitreader' python module to our new custom 'py2bit' module for accessing 2bit files. This fixes the performance regression seen in computeGCBias starting in version 2.3.0 (#383).
287
+ * `bigwigCompare`, `computeMatrix`, and `multiBigwigSummary` can read signal files hosted on [deepBlue](http://deepblue.mpi-inf.mpg.de/).
288
+ * Fixed a minor bug in `deeptools`, where the `--version` option was ignored (see #404).
289
+ * Text in SVG and PDF files is now actual text and not a path (see #403).
290
+ * The `--maxFragmentLength` option in bamCoverage now alters the `maxPairedFragmentLength` that is otherwise hard-coded (see #410).
291
+ * Added the `computeMatrixOperations` tools, which can be used to sort/reorder/subset/filter/combine the output of `computeMatrix`.
292
+ * `computeMatrix --sortRegions` has a new `keep` option, which is the default. This mimics the behavior in deepTools prior to 2.3.0 where the output order matched the input order. This is, of course, a bit slower, so if the order doesn't matter then use `no`.
293
+ * Fixed issue #435, where `plotHeatmap --sortRegions region_length` would crash with an error.
294
+ * Output bedGraph files are now sorted (#439).
295
+ * Values stored in bedGraph files (and therefore placed into bigWig files) now use python's "general" format with 6 digits of precision. This tends to produce slightly larger files, but with less loss for values near 0 (see #438).
296
+ * Corrected how computeGCBias determines the lambda parameter, which should only really affect very atypical experiments (i.e., correctGCBias would have crashed if this greatly affected you).
297
+
298
+ 2.3.6
299
+
300
+ * multiBamSummary will now not automatically append .npz to the output file name if it's not present. This was bug #436
301
+ * Fixed a bug with plotHeatmap where --yMin and --yMax didn't work
302
+
303
+ 2.3.5
304
+
305
+ * Various Galaxy wrapper fixes (e.g., issue #415 and #417)
306
+ * Fixed issue #413, wherein the --nanAfterEnd option sometimes causes computeMatrix to throw an error.
307
+ * Fixed issue #416, wherein --outRawCounts in multiBamSummary and multiBigwigSummary would cause an error if python3 was being used.
308
+
309
+ 2.3.4
310
+
311
+ * Fixed bug #405, which dealt with the SES normalization in bamCompare (it was producing an error and terminating the program).
312
+ * Fixed bug #407, which dealt with multiBamSummary or multiBigwigSummary bins and saving the raw data. This was causing an error and the program to terminate.
313
+
314
+ 2.3.3
315
+
316
+ * Fixed a bug wherein proper pairs were being incorrectly called improper pairs, thereby causing slightly incorrect read extension.
317
+
318
+ 2.3.2
319
+
320
+ * The deeptoolsinterval module was modified to speed up plotEnrichment, which was taking forever to finish.
321
+
322
+ 2.3.1
323
+
324
+ * This release has no real code changes, the 2.3.0 release on pypi was missing files.
325
+
326
+ 2.3.0
327
+
328
+ * Modified how normalization is done when filtering is used. Previously, the filtering wasn't taken into account when computing the total number of alignments. That is now being done. Note that this uses sampling and will try to sample at least 100000 alignments and see what fraction of them are filtered. The total number of aligned reads is then scaled accordingly (#309).
329
+ * Modified how normalization is done when a blacklist is used. Previously, the number of alignments overlapping a blacklisted region was subtracted from the total number of alignments in the file. This decreased things a bit too much, since only alignments falling completely within a blacklisted region are actually excluded completely (#312).
330
+ * BED12 and GTF files can now be used as input (issue #71). Additionally, multiBamSummary, multiBigwigSummary and computeMatrix now have a --metagene option, which allows summarization over concatenated exons, rather than include introns as well (this has always been the default). This was issue #76.
331
+ * Read extension is handled more accurately, such that if a read originates outside of a bin or BED/GTF region that it will typically be included if the --extendReads option is used and the extension would put it in a given bin/region.
332
+ * deepTools now uses a custom interval-tree implementation that allows including metadata, such as gene/transcript IDs, along with intervals. For those interested, the code for this available separately (https://github.com/dpryan79/deeptools_intervals) with the original C-only implementation here: https://github.com/dpryan79/libGTF.
333
+ * The API for the countReadsPerBin, getScorePerBigWigBin, and mapReduce modules has changed slightly (this was needed to support the --metagene option). Anyone using these in their own programs is encouraged to look at the modified API before upgrading.
334
+ * Added the `plotEnrichment` function (this was issue #329).
335
+ * There is now a `subsetMatrix` script available that can be used to subset the output of computeMatrix. This is useful for preparing plots that only contain a subset of samples/region groups. Note that this isn't installed by default.
336
+ * The Galaxy wrappers were updated to include the ability to exclude blacklisted regions.
337
+ * Most functions (both at the command line and within Galaxy) that process BAM files can now filter by fragment length (--minFragmentLength and --maxFragmentLength). By default there's no filtering performed. The primary purpose of this is to facilitate ATACseq analysis, where fragment length determines whether one is processing mono-/di-/poly-nucleosome fragments. This was issue #336.
338
+ * bamPEFragmentSize now has --logScale and --maxFragmentLength options, which allow you to plot frequencies on the log scale and set the max plotted fragment length, respectively. This was issue #337.
339
+ * --blackListFileName now accepts multiple files.
340
+ * bamPEFragmentSize now supports multiple input files.
341
+ * If the sequence has been removed from BAM files, SE reads no longer cause an error in bamCoverage if --normalizeTo1x is specified. In general, the code that looks at read length now checks the CIGAR string if there's no sequence available in a BAM file (for both PE and SE datasets). This was issue #369.
342
+ * bamCoverage now respects the --filterRNAstrand option when computing scaling factors. This was issue #353.
343
+ * computeMatrix and plotHeatmap can now sort using only a subset of samples
344
+ * There is now an --Offset option to bamCoverage, which allows having the signal at a single base. This is useful for things like RiboSeq or GROseq, where the goal is to get focal peaks at single bases/codons/etc.
345
+ * The --MNase option to `bamCoverage` now respects --minFragmentLength and --maxFragmentLength, with defaults set to 130 and 200.
346
+
347
+ 2.2.4
348
+
349
+ * Fix the incorrectly oriented dendrogram in plotCorrelation (issue #350). Relatedly, we're bumping the minimum version of scipy required to one where this is correct.
350
+
351
+ 2.2.3
352
+
353
+ * Fixed issue #334, where computeGCBias wasn't properly handling the black list option.
354
+
355
+ 2.2.2
356
+
357
+ * Fixed labels when hierarchical clustering is used (they were off by one previously).
358
+ * Fixed a bug wherein bamCompare couldn't work with a blacklist
359
+ * Fixed yet another change in pysam, though at least in this case it was fixing a previous problem
360
+
361
+ 2.2.1
362
+
363
+ * Fixed a bug introduced in version 2.2.0 wherein sometimes a pre-2.2.0 produced matrix file could no longer be used with plotHeatmap or plotProfile (this only happened when --outFileNameData was then used).
364
+ * Finally suppressed all of the runtime warnings that numpy likes to randomly throw.
365
+ * Worked around an undocumented change in pysam-0.9.0 that tended to break things.
366
+
367
+ 2.2.0
368
+
369
+ * plotFingerprint now iterates through line styles as well as colors. This allows up to 35 samples per plot without repeating (not that that many would ever be recommended). This was issue #80.
370
+ * Fixed a number of Galaxy wrappers, which were rendered incorrectly due to including a section title of "Background".
371
+ * A number of image file handles were previously not explicitly closed, which caused occasional completion of a plot* program but without the files actually being there. This only happened on some NFS mount points.
372
+ * The Galaxy wrappers now support the `--outFileNameData` option on plotProfile and plotHeatmap.
373
+ * Added support for blacklist regions. These can be supplied as a BED file and the regions will largely be skipped in processing (they'll also be ignored during normalization). This is very useful to skip regions known to attract excess signal. This was issue #101.
374
+ * Modified plotPCA to include the actual eigenvalues rather than rescaled ones. Also, plotPCA can now output the underlying values (issue #231).
375
+ * Regions within each feature body can now be unscaled when using `computeMatrix`. Thus, if you're interested in unscaled signal around the TSS/TES then you can now use the `--unscaled5prime` and `--unscaled3prime` options. This was issue #108.
376
+ * bamCoverage now has a `--filterRNAstrand` option, that will produce coverage for only a single strand. Note that the strand referred to is the DNA strand and not sense/anti-sense.
377
+ * Issues with plotHeatmap x-axis labels were fixed (issue #301).
378
+
379
+ 2.1.1
380
+
381
+ * Fixed a how the --hclust option was handled in plotHeatmap/plotProfile. This gets around a quirk in scipy.
382
+ * A bug involving processing comment lines in BED files was corrected (issue #288)
383
+ * The Galaxy wrappers are now automatically tested with each modification.
384
+ * plotCoverage and plotFingerprint in Galaxy now accept 1 or more BAM files rather than at least 2 files.
385
+
386
+ 2.1.0
387
+
388
+ * Updates to many of the Galaxy wrappers and associated documentation.
389
+ * A bug was fixed in how chromosome names were dealt with in bigWig files. If you ever received errors due to illegal intervals then that should now be fixed. This was issue #250
390
+ * plotProfile now has an --outFileNameData option for saving the underlying data in a text format.
391
+ * correctGCBias ensures that the resulting BAM file will pass picard/HTSJDK's validation if the input file did (issue #248)
392
+ * The default bin size was changed to 10, which is typically a bit more useful
393
+ * The --regionsLabel option to plotProfile and plotHeatmap now accepts a space-separated list, in line with --samplesLabel
394
+ * BAM files that have had their sequences stripped no longer cause an error
395
+ * bamPEFragmentSize now has -bs and -n options to allow adjusting the number of alignments sampled. Note that the default value is auto-adjusted if the sampling is too sparse.
396
+ * bamPEFragmentSize now accepts single-end files.
397
+ * The --hclust option to plotProfile and plotHeatmap continues even if one of the groups is too small for plotting (matplotlib will produce a warning that you can ignore). This was issue #280.
398
+
399
+ 2.0.1
400
+
401
+ * A critical bug that prevented plotPCA from running was fixed.
402
+ * multiBamCoverage was renamed to multiBamSummary, to be in better alignment with multiBigwigSummary.
403
+ * computeGCBias and correctGCBias are now more tolerant of chromosome name mismatches.
404
+ * multiBigwigSummary and multiBamSummary can accept a single bigWig/BAM input file, though one should use the
405
+ --outRawCounts argument.
406
+
407
+ 2.0.0
408
+
409
+ * Documentation improved and migrated to http://deeptools.readthedocs.org The API to use deepTools modules is now
410
+ part of the documentation and includes a tutorial.
411
+ * Allow multiple bigwig files in computeMatrix that can be clustered together
412
+ * computeMatrix now accepts multiple bed files. Each bed file is considered as a group. Labels are automatically
413
+ added based on the file names.
414
+ * When computing read coverage now spliced reads are understood. This is convenient for computing the
415
+ coverage for RNA-seq data.
416
+ * New quality control tool 'plotCoverage' to plot the coverage over base pairs for multiple samples
417
+ * renaming of --missingDataAsZero to --skipNonCovered regions for clarity in bamCoverage and bamCompare
418
+ * New analysis tool plotPCA that visualizes the results from principal component analysis
419
+ * New option in bamCoverage `--MNase` that will compute the read coverage only considering 2 base pairs at the
420
+ center of the fragment.
421
+ * Make read extension optional. Remove the need to specify a default fragment length for most of the tools. Now, when
422
+ read extension is enabled and the bam files contain paired-end data, the mean fragment length is automatically
423
+ calculated by sampling the read pairs in the bam file. The --doNotExtendPairedEnds and --fragmentLength parameters
424
+ are no longer used and the new --extendReads parameter was added.
425
+ * Dramatically improved bigwig related tools by using the new pyBigWig module. Eliminated the requirement for the
426
+ UCSC program `bigWigInfo`
427
+ * renamed heatmapper to plotHeatmap and profiler to plotProfile
428
+ * added hierarchical clustering, besides k-means to plotProfile and plotHeatmap
429
+ * improved plotting features for plotProfile when using 'overlapped_lines' and 'heatmap' plot types
430
+ * Resolved an error introduced by numpy version 1.10 in computeMatrix
431
+ * plotting of correlations (from bamCorrelate or bigwigCorrelate) was separated from the computation of the
432
+ underlying data. A new tool, plotCorrelation was added. This tool can plot correlations as heatmaps or as scatter
433
+ plots and includes options to adjust a large array of visual features.
434
+ * Fixed issue with bed intervals in bigwigCorrelate and bamCorrelate and a user specified region.
435
+ * Correlation coefficients can be computed even if the data contains NaNs
436
+ * Allow computeMatrix to read files with DOS newline characters
437
+ * Added option --skipChromosomes to bigwigCorrelate, for example to skip all 'random' chromosomes. bigwigCorrelate
438
+ now also considers chromosomes as identical when their names between samples differ with the prefix 'chr'. E.g.
439
+ chr1 vs. 1
440
+ * For bamCoverage and bamCompare, behaviour of scaleFactor was updated such that now, if given in combination
441
+ with the normalization options (normalize to 1x or normalize using RPKM) the given scaleFactor
442
+ will multiply the scale factor computed for the normalization methods.
443
+ * Fixed problem with read pairs labelled as proper pairs by the aligner but that were actually not proper pairs, for
444
+ example because the mates did not face each other. deepTools adds further checks to determine if a read pair is a
445
+ proper pair.
446
+ * Added titles to QC plots (#74)
447
+ * Added --samFlagInclude and --samFlagExclude parameters. This is useful to for example only include forward reads
448
+ * In deeptools2 most of the core code was rewritten to facilitate API usage and for optimization.
deepTools/source/LICENSE.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ The file deeptools/cm.py is licensed under the BSD license, see a copy in that file. The remainder of the code is licensed under the MIT license:
2
+
3
+ Copyright 2019 Max Planck Institute for Immunobiology and Epigenetics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
deepTools/source/MANIFEST.in ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ include *.txt
2
+ include README.md
3
+ exclude examples/*
4
+ exclude deepTools.egg-info/*
5
+ include scripts/*
6
+ exclude deeptools/test/*
7
+ exclude galaxy/*
8
+ exclude gallery/*
deepTools/source/README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deepTools
2
+ [![Documentation Status](https://readthedocs.org/projects/deeptools/badge/)](http://deeptools.readthedocs.org/)
3
+ [![PyPI Version](https://img.shields.io/pypi/v/deeptools.svg?style=plastic)](https://pypi.org/project/deepTools/)
4
+ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/deeptools/README.html)
5
+ [![European Galaxy server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAASCAYAAABB7B6eAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAACXBIWXMAAAsTAAALEwEAmpwYAAACC2lUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNS40LjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyI+CiAgICAgICAgIDx0aWZmOlJlc29sdXRpb25Vbml0PjI8L3RpZmY6UmVzb2x1dGlvblVuaXQ+CiAgICAgICAgIDx0aWZmOkNvbXByZXNzaW9uPjE8L3RpZmY6Q29tcHJlc3Npb24+CiAgICAgICAgIDx0aWZmOk9yaWVudGF0aW9uPjE8L3RpZmY6T3JpZW50YXRpb24+CiAgICAgICAgIDx0aWZmOlBob3RvbWV0cmljSW50ZXJwcmV0YXRpb24+MjwvdGlmZjpQaG90b21ldHJpY0ludGVycHJldGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KD0UqkwAAAn9JREFUOBGlVEuLE0EQruqZiftwDz4QYT1IYM8eFkHFw/4HYX+GB3/B4l/YP+CP8OBNTwpCwFMQXAQPKtnsg5nJZpKdni6/6kzHvAYDFtRUT71f3UwAEbkLch9ogQxcBwRKMfAnM1/CBwgrbxkgPAYqlBOy1jfovlaPsEiWPROZmqmZKKzOYCJb/AbdYLso9/9B6GppBRqCrjSYYaquZq20EUKAzVpjo1FzWRDVrNay6C/HDxT92wXrAVCH3ASqq5VqEtv1WZ13Mdwf8LFyyKECNbgHHAObWhScf4Wnj9CbQpPzWYU3UFoX3qkhlG8AY2BTQt5/EA7qaEPQsgGLWied0A8VKrHAsCC1eJ6EFoUd1v6GoPOaRAtDPViUr/wPzkIFV9AaAZGtYB568VyJfijV+ZBzlVZJ3W7XHB2RESGe4opXIGzRTdjcAupOK09RA6kzr1NTrTj7V1ugM4VgPGWEw+e39CxO6JUw5XhhKihmaDacU2GiR0Ohcc4cZ+Kq3AjlEnEeRSazLs6/9b/kh4eTC+hngE3QQD7Yyclxsrf3cpxsPXn+cFdenF9aqlBXMXaDiEyfyfawBz2RqC/O9WF1ysacOpytlUSoqNrtfbS642+4D4CS9V3xb4u8P/ACI4O810efRu6KsC0QnjHJGaq4IOGUjWTo/YDZDB3xSIxcGyNlWcTucb4T3in/3IaueNrZyX0lGOrWndstOr+w21UlVFokILjJLFhPukbVY8OmwNQ3nZgNJNmKDccusSb4UIe+gtkI+9/bSLJDjqn763f5CQ5TLApmICkqwR0QnUPKZFIUnoozWcQuRbC0Km02knj0tPYx63furGs3x/iPnz83zJDVNtdP3QAAAABJRU5ErkJggg==)](https://usegalaxy.eu/root?tool_id=deeptools_compute_matrix)
6
+ ![test](https://github.com/deeptools/deepTools/actions/workflows/test.yml/badge.svg)
7
+
8
+
9
+ ## User-friendly tools for exploring deep-sequencing data
10
+
11
+ deepTools addresses the challenge of handling the large amounts of data that are now routinely generated from DNA sequencing centers. deepTools contains useful modules to process the mapped reads data for multiple quality checks, creating **normalized coverage files** in standard bedGraph and bigWig file formats, that allow comparison between different files (for example, treatment and control). Finally, using such normalized and standardized files, deepTools can create many publication-ready **visualizations** to identify enrichments and for functional annotations of the genome.
12
+
13
+ For support or questions please post to [Biostars](http://biostars.org). For bug reports and feature requests please open an issue [on github](http://github.com/deeptools/deeptools).
14
+
15
+
16
+ ### Citation:
17
+
18
+ Ramírez F, Ryan DP, Grüning B, Bhardwaj V, Kilpert F, Richter AS, Heyne S, Dündar F, Manke T. [deepTools2: a next generation web server for deep-sequencing data analysis.](https://nar.oxfordjournals.org/content/early/2016/04/12/nar.gkw257.abstract) Nucleic Acids Research. 2016 Apr 13:gkw257.
19
+
20
+ ### Documentation:
21
+
22
+ Our [documentation](http://deeptools.readthedocs.org/) contains more details on the [individual tool scopes and usages](http://deeptools.readthedocs.org/en/latest/content/list_of_tools.html) and an [introduction to our deepTools Galaxy web server](http://deeptools.readthedocs.org/en/latest/content/help_galaxy_intro.html) including [step-by-step protocols](http://deeptools.readthedocs.org/en/latest/content/example_usage.html).
23
+
24
+ >Please see also the [FAQ](http://deeptools.readthedocs.org/en/latest/content/help_faq.html), which we update regularly.
25
+ Our [Gallery](http://deeptools.readthedocs.org/en/latest/content/example_gallery.html) may give you some more ideas about the scope of deepTools.
26
+
27
+ >For more specific **troubleshooting, feedback, and tool suggestions**, please post [to Biostars](http://biostars.org).
28
+
29
+
30
+ -------------------------------------------------------------------------------------------------------------------
31
+
32
+ ### Installation
33
+
34
+ deepTools are available for:
35
+
36
+ * Command line usage (via pip / conda / github)
37
+ * Integration into Galaxy servers (via toolshed/API/web-browser)
38
+
39
+ There are many easy ways to install deepTools. More details can be found [here](https://deeptools.readthedocs.io/en/latest/content/installation.html).
40
+
41
+ In Brief:
42
+
43
+ **Install through pypi**
44
+
45
+ $ pip install deeptools
46
+
47
+ **Install via conda**
48
+
49
+ $ conda install -c bioconda deeptools
50
+
51
+ **Install by cloning the repository**
52
+
53
+ $ git clone https://github.com/deeptools/deepTools
54
+ $ cd deepTools
55
+ $ pip install .
56
+
57
+ <a name="galaxy"/></a>
58
+ ### Galaxy Installation
59
+
60
+ deepTools can be easily integrated into [Galaxy](http://galaxyproject.org). Please see the [installation instructions in our documentation](http://deeptools.readthedocs.io/en/latest/content/installation.html#galaxy-installation) for further details.
61
+
62
+ **Note:** From version 2.3 onwards, deepTools support **python3**.
63
+
64
+ ------------------------------------
65
+
66
+ This tool suite is developed by the [Bioinformatics Facility](http://www1.ie-freiburg.mpg.de/bioinformaticsfac) at the [Max Planck Institute for Immunobiology and Epigenetics, Freiburg](http://www1.ie-freiburg.mpg.de/).
67
+
68
+ [Documentation](http://deeptools.readthedocs.org/en/latest/index.html) | [deepTools Galaxy](http://deeptools.ie-freiburg.mpg.de) | [FAQ](http://deeptools.readthedocs.org/en/latest/content/help_faq.html)
deepTools/source/README.rst ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ======================================================================
2
+ deepTools
3
+ ======================================================================
4
+
5
+ User-friendly tools for exploring deep-sequencing data
6
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
7
+
8
+ deepTools addresses the challenge of handling the large amounts of data
9
+ that are now routinely generated from DNA sequencing centers. deepTools
10
+ contains useful modules to process the mapped reads data for multiple
11
+ quality checks, creating **normalized coverage files** in standard
12
+ bedGraph and bigWig file formats, that allow comparison between
13
+ different files (for example, treatment and control). Finally, using
14
+ such normalized and standardized files, deepTools can create many
15
+ publication-ready **visualizations** to identify enrichments and for
16
+ functional annotations of the genome.
17
+
18
+ For support or questions please make a post on `Biostars <http://biostars.org>`__. For feature requests, please open an issue on `github <http://github.com/deeptools/deeptools>`__.
19
+
20
+ For further documentation, please see our `read the docs page <http://deeptools.readthedocs.org/>`__.
21
+
22
+ Citation:
23
+ ^^^^^^^^^
24
+
25
+ Ramírez F, Ryan DP, Grüning B, Bhardwaj V, Kilpert F, Richter AS, Heyne
26
+ S, Dündar F, Manke T. `deepTools2: a next generation web server for
27
+ deep-sequencing data
28
+ analysis. <https://nar.oxfordjournals.org/content/early/2016/04/12/nar.gkw257.abstract>`__
29
+ Nucleic Acids Research. 2016 Apr 13:gkw257.
deepTools/source/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ deepTools Project Package Initialization File
4
+ """
deepTools/source/deeptools/SES_scaleFactor.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import numpy as np
6
+
7
+ # own packages
8
+ from deeptools import bamHandler
9
+ import deeptools.countReadsPerBin as countR
10
+
11
+ old_settings = np.seterr(all='ignore')
12
+ debug = 0
13
+
14
+
15
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        normalizationLength,
                        avg_method='median', blackListFileName=None, numberOfProcessors=1,
                        verbose=False, chrsToSkip=None, mappingStatsList=None):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collect and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, to normalize the data.
        For a value of 1, on average
        1 read per base pair is found
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    chrsToSkip : list
        name of the chromosomes to be excluded from the
        scale estimation. Usually the chrX is included.
    blackListFileName : str
        BED file containing blacklisted regions
    mappingStatsList : list
        List of the number of mapped reads per file

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'


    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples, 1)
    >>> _dict['size_factors']
    array([1. , 0.5])
    >>> _dict['size_factors_based_on_mean']
    array([1. , 0.5])
    """

    # Avoid the mutable-default-argument pitfall: build fresh lists per call.
    if chrsToSkip is None:
        chrsToSkip = []
    if mappingStatsList is None:
        mappingStatsList = []

    assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"

    # Reuse caller-supplied mapping statistics when available to avoid
    # re-opening the BAM files just to count mapped reads.
    if len(mappingStatsList) == len(bamFilesList):
        mappedReads = mappingStatsList
    else:
        mappedReads = []
        for fname in bamFilesList:
            mappedReads.append(bamHandler.openBam(fname, returnStats=True, nThreads=numberOfProcessors)[1])

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    # Scale both samples relative to the smaller library.
    sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 blackListFileName=blackListFileName,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
               np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]

    # the maxIndex may be too close to the signal regions,
    # hence the conservative 0.8 down-scaling applied above

    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = (np.percentile(values, 90))
        if maxNumReads == 0:
            maxNumReads = (np.percentile(values, 99))
            if maxNumReads == 0:
                # Bug fix: this message was previously split across two
                # statements (a print plus a bare string literal), so only
                # its first half was ever shown.
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        # 1-based sample indices for the error message
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
             "Try selecting a larger sample size or a region with coverage\n".format(idx_zero))

    sizeFactor = sizeFactorsSES
    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
185
+
186
+
187
class Tester(object):
    """Doctest fixture exposing the paths of the bundled test BAM files."""

    def __init__(self):
        global debug
        debug = 0
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.root = base_dir + "/test/test_data/"
        self.bamFile1 = "{}testA.bam".format(self.root)
        self.bamFile2 = "{}testB.bam".format(self.root)
        self.chrom = '3R'
deepTools/source/deeptools/__init__.py ADDED
File without changes
deepTools/source/deeptools/alignmentSieve.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import argparse
3
+ import pysam
4
+ import os
5
+ import sys
6
+
7
+ from deeptools import parserCommon
8
+ from deeptools.bamHandler import openBam
9
+ from deeptools.mapReduce import mapReduce
10
+ from deeptools.utilities import getTLen, smartLabels, getTempFileName
11
+ from importlib.metadata import version
12
+
13
+
14
+ def parseArguments():
15
+ parser = argparse.ArgumentParser(
16
+ formatter_class=argparse.RawDescriptionHelpFormatter,
17
+ description="This tool filters alignments in a BAM/CRAM file according the the specified parameters. It can optionally output to BEDPE format.",
18
+ usage='alignmentSieve -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt\n'
19
+ 'help: alignmentSieve -h / alignmentSieve --help')
20
+
21
+ required = parser.add_argument_group('Required arguments')
22
+ required.add_argument('--bam', '-b',
23
+ metavar='FILE1',
24
+ help='An indexed BAM file.',
25
+ required=True)
26
+
27
+ required.add_argument('--outFile', '-o',
28
+ help='The file to write results to. These are the alignments or fragments that pass the filtering criteria.')
29
+
30
+ general = parser.add_argument_group('General arguments')
31
+ general.add_argument('--numberOfProcessors', '-p',
32
+ help='Number of processors to use. Type "max/2" to '
33
+ 'use half the maximum number of processors or "max" '
34
+ 'to use all available processors. (Default: %(default)s)',
35
+ metavar="INT",
36
+ type=parserCommon.numberOfProcessors,
37
+ default=1,
38
+ required=False)
39
+
40
+ general.add_argument('--filterMetrics',
41
+ metavar="FILE.log",
42
+ help="The number of entries in total and filtered are saved to this file")
43
+
44
+ general.add_argument('--filteredOutReads',
45
+ metavar="filtered.bam",
46
+ help="If desired, all reads NOT passing the filtering criteria can be written to this file.")
47
+
48
+ general.add_argument('--label', '-l',
49
+ metavar='sample1',
50
+ help='User defined label instead of the default label '
51
+ '(file name).')
52
+
53
+ general.add_argument('--smartLabels',
54
+ action='store_true',
55
+ help='Instead of manually specifying a labels for the input '
56
+ 'file, this causes deepTools to use the file name '
57
+ 'after removing the path and extension.')
58
+
59
+ general.add_argument('--verbose', '-v',
60
+ help='Set to see processing messages.',
61
+ action='store_true')
62
+
63
+ general.add_argument('--version', action='version',
64
+ version='%(prog)s {}'.format(version('deeptools')))
65
+
66
+ general.add_argument('--shift',
67
+ nargs='+',
68
+ type=int,
69
+ help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.')
70
+
71
+ general.add_argument('--ATACshift',
72
+ action='store_true',
73
+ help='Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.')
74
+
75
+ general.add_argument('--genomeChunkLength',
76
+ type=int,
77
+ default=int(1e6),
78
+ help='Size of the genome (in bps) to be processed per thread. (Default: %(default)s)')
79
+
80
+ output = parser.add_argument_group('Output arguments')
81
+ output.add_argument('--BED',
82
+ action='store_true',
83
+ help='Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.')
84
+
85
+ filtering = parser.add_argument_group('Optional arguments')
86
+
87
+ filtering.add_argument('--filterRNAstrand',
88
+ help='Selects RNA-seq reads (single-end or paired-end) in '
89
+ 'the given strand. (Default: %(default)s)',
90
+ choices=['forward', 'reverse'],
91
+ default=None)
92
+
93
+ filtering.add_argument('--ignoreDuplicates',
94
+ help='If set, reads that have the same orientation '
95
+ 'and start position will be considered only '
96
+ 'once. If reads are paired, the mate\'s position '
97
+ 'also has to coincide to ignore a read.',
98
+ action='store_true')
99
+
100
+ filtering.add_argument('--minMappingQuality',
101
+ metavar='INT',
102
+ help='If set, only reads that have a mapping '
103
+ 'quality score of at least this are '
104
+ 'considered.',
105
+ type=int)
106
+
107
+ filtering.add_argument('--samFlagInclude',
108
+ help='Include reads based on the SAM flag. For example, '
109
+ 'to get only reads that are the first mate, use a flag of 64. '
110
+ 'This is useful to count properly paired reads only once, '
111
+ 'as otherwise the second mate will be also considered for the '
112
+ 'coverage.',
113
+ metavar='INT',
114
+ default=None,
115
+ type=int,
116
+ required=False)
117
+
118
+ filtering.add_argument('--samFlagExclude',
119
+ help='Exclude reads based on the SAM flag. For example, '
120
+ 'to get only reads that map to the forward strand, use '
121
+ '--samFlagExclude 16, where 16 is the SAM flag for reads '
122
+ 'that map to the reverse strand.',
123
+ metavar='INT',
124
+ default=None,
125
+ type=int,
126
+ required=False)
127
+
128
+ filtering.add_argument('--blackListFileName', '-bl',
129
+ help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.",
130
+ metavar="BED file",
131
+ nargs="+",
132
+ required=False)
133
+
134
+ filtering.add_argument('--minFragmentLength',
135
+ help='The minimum fragment length needed for read/pair '
136
+ 'inclusion. This option is primarily useful '
137
+ 'in ATACseq experiments, for filtering mono- or '
138
+ 'di-nucleosome fragments. (Default: %(default)s)',
139
+ metavar='INT',
140
+ default=0,
141
+ type=int,
142
+ required=False)
143
+
144
+ filtering.add_argument('--maxFragmentLength',
145
+ help='The maximum fragment length needed for read/pair '
146
+ 'inclusion. A value of 0 indicates no limit. (Default: %(default)s)',
147
+ metavar='INT',
148
+ default=0,
149
+ type=int,
150
+ required=False)
151
+
152
+ return parser
153
+
154
+
155
def shiftRead(b, chromDict, args):
    """
    Return a copy of read ``b`` with its coordinates shifted by ``args.shift``.

    ``args.shift`` holds four offsets; which pair applies depends on the
    read's strand and read1/read2 status (offsets [0]/[1] vs. [2]/[3]).
    Non-properly-paired reads are rejected (returns None), as are reads whose
    shifted span collapses below 1 bp. The returned read carries a simplified
    CIGAR (one match run covering the shifted span) and no sequence/qualities.

    Parameters
    ----------
    b : pysam.AlignedSegment
        The read to shift.
    chromDict : dict
        Mapping of reference name -> chromosome length, used to clip ends.
    args : argparse.Namespace
        Must provide ``shift``, a list of 4 integers.

    Returns
    -------
    pysam.AlignedSegment or None
    """
    if not b.is_proper_pair:
        return None
    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end
    # Pick the shift offsets according to strand and mate number.
    if b.is_reverse and not b.is_read2:
        end -= args.shift[2]
        deltaTLen = args.shift[3] - args.shift[2]
    elif b.is_reverse and b.is_read2:
        end += args.shift[1]
        deltaTLen = args.shift[1] - args.shift[0]
    elif not b.is_reverse and not b.is_read2:
        start += args.shift[0]
        deltaTLen = args.shift[1] - args.shift[0]
    else:
        start -= args.shift[3]
        deltaTLen = args.shift[3] - args.shift[2]

    # Sanity check: keep at least a 1 bp span, clipped to [0, chrom length].
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    if start < 0:
        start = 0
    if end > chromDict[b.reference_name]:
        end = chromDict[b.reference_name]
    if end - start < 1:
        return None

    # create a new read rather than mutating the input
    b2 = pysam.AlignedSegment()
    b2.query_name = b.query_name
    b2.flag = b.flag
    b2.reference_id = b.reference_id
    b2.reference_start = start
    b2.mapping_quality = b.mapping_quality
    b2.cigar = ((0, end - start),)  # Returned cigar is only matches
    # Template length grows/shrinks by the net shift, keeping its sign.
    if tLen < 0:
        b2.template_length = tLen - deltaTLen
    else:
        b2.template_length = tLen + deltaTLen
    b2.next_reference_id = b.next_reference_id
    b2.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        # Keep the recorded mate start consistent with the shift that will be
        # applied to the mate itself when it is processed.
        if b2.is_read2 and b2.is_reverse:
            b2.next_reference_start += args.shift[0]
        elif not b2.is_read2 and b2.is_reverse:
            b2.next_reference_start -= args.shift[3]

    return b2
208
+
209
+
210
def filterWorker(arglist):
    """
    Filter the reads of one genomic chunk (``chrom:start-end``) of ``args.bam``.

    Reads failing any enabled criterion (unmapped, MAPQ, SAM flag include/
    exclude, fragment length, duplicate, RNA strand) are counted and, when
    ``args.filteredOutReads`` is set, written to a separate temporary BAM.
    Surviving reads (optionally shifted via ``shiftRead``) go to another
    temporary BAM.

    Returns
    -------
    tuple
        (tid, start, total, nFiltered, oname, onameFiltered) so the caller
        can sort the per-chunk outputs by genomic position and concatenate
        the temporary files.
    """
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bam)
    mode = 'wb'
    oname = getTempFileName(suffix='.bam')
    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix='.bam')
    else:
        onameFiltered = None
    ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
    if onameFiltered:
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh)
    else:
        ofiltered = None

    # Duplicate-detection state: fragment signatures seen at the current
    # reference_start (lpos).
    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0
    for read in fh.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue

        total += 1
        if read.flag & 4:
            # Ignore unmapped reads, they were counted already
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        # keep only reads carrying ALL bits of samFlagInclude
        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        # drop reads carrying ANY bit of samFlagExclude
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.ignoreDuplicates:
            # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
            if tLen >= 0:
                s = read.pos
                e = s + tLen
            else:
                s = read.pnext
                e = s - tLen
            if read.reference_id != read.next_reference_id:
                e = read.pnext
            if lpos is not None and lpos == read.reference_start \
                    and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue
            if lpos != read.reference_start:
                # new start position: forget signatures of the previous one
                prev_pos.clear()
            lpos = read.reference_start
            prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

        # filterRNAstrand
        # Flag masks: 144 = 16|128 (reverse|read2), 96 = 32|64 (mate-reverse|read1),
        # 16 = reverse for single-end reads.
        if args.filterRNAstrand:
            if read.is_paired:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 144 == 128 or read.flag & 96 == 64:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 144 == 144 or read.flag & 96 == 96:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
            else:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 16 == 16:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 16 == 0:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue

        if args.shift:
            # shiftRead may return None (e.g. collapsed span); such reads are
            # silently dropped without being counted as filtered.
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = fh.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    fh.close()
    return tid, start, total, nFiltered, oname, onameFiltered
346
+
347
+
348
def convertBED(oname, tmpFiles, chromDict):
    """
    Store results in BEDPE format, which is:
    chromosome frag_leftend frag_rightend

    Only reads with a positive template length are emitted (one line per
    fragment); ends are clipped to the chromosome length from chromDict.
    Each temporary BAM file is deleted after conversion.

    Parameters
    ----------
    oname : str
        Output file name.
    tmpFiles : list
        Temporary BAM files produced by the filter workers.
    chromDict : dict
        Mapping of reference name -> chromosome length.
    """
    with open(oname, "w") as ofile:
        for tmpFile in tmpFiles:
            # Silence pysam/htslib so the missing-index warning for the
            # unindexed temp files is not printed, then restore the caller's
            # verbosity instead of hard-coding a value (set_verbosity returns
            # the previous level).
            previous_verbosity = pysam.set_verbosity(0)
            fh = pysam.AlignmentFile(tmpFile)
            pysam.set_verbosity(previous_verbosity)
            for b in fh.fetch(until_eof=True):
                tLen = getTLen(b, notAbs=True)
                if tLen > 0:
                    start = b.pos
                    end = start + tLen
                    if end > chromDict[b.reference_name]:
                        end = chromDict[b.reference_name]
                    if end - start < 1:
                        continue
                    ofile.write("{}\t{}\t{}\n".format(b.reference_name, start, end))
            fh.close()
            os.unlink(tmpFile)
375
+
376
+
377
def main(args=None):
    """
    Entry point for alignmentSieve: filter a BAM file in parallel chunks and
    write the surviving alignments to a BAM (or BEDPE) file.

    Steps: normalize the --shift/--ATACshift options, run filterWorker over
    genome chunks via mapReduce, concatenate the per-chunk temporary files in
    genomic order, optionally write filtered-out reads and a metrics file.

    Returns
    -------
    int
        0 on success (sys.exit is called on invalid --shift input).
    """
    args = parseArguments().parse_args(args)
    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            # Mirror the two offsets for fragments whose read 1 is right-most.
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    genomeChunkLength=args.genomeChunkLength,
                    verbose=args.verbose)

    res = sorted(res)  # The temp files are now in (tid, start) order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        # convertBED removes the temp files itself
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            # Bug fix: previously this called convertBED(args.outFile, ...,
            # args) — a 4th argument convertBED does not accept (TypeError)
            # that would also have clobbered the main output file. The
            # filtered-out reads belong in args.filteredOutReads.
            convertBED(args.filteredOutReads, tmpFiles, chromDict)

    if args.filterMetrics:
        # Label resolution priority: --smartLabels > --label > file name.
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]

        with open(args.filterMetrics, "w") as of:
            of.write("#bamFilterReads --filterMetrics\n")
            of.write("#File\tReads Remaining\tTotal Initial Reads\n")
            of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))

    return 0
deepTools/source/deeptools/bamCompare.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse # to parse command line arguments
5
+ import numpy as np
6
+ import sys
7
+
8
+ # my packages
9
+ from deeptools import writeBedGraph
10
+ from deeptools.SES_scaleFactor import estimateScaleFactor
11
+ from deeptools import parserCommon
12
+ from deeptools import bamHandler
13
+ from deeptools.getRatio import getRatio
14
+ from deeptools.getScaleFactor import get_num_kept_reads
15
+ from deeptools.getScaleFactor import get_scale_factor
16
+ debug = 0
17
+ old_settings = np.seterr(all='ignore')
18
+
19
+
20
+ def parseArguments():
21
+ parentParser = parserCommon.getParentArgParse()
22
+ bamParser = parserCommon.read_options()
23
+ normalizationParser = parserCommon.normalization_options()
24
+ requiredArgs = getRequiredArgs()
25
+ optionalArgs = getOptionalArgs()
26
+ outputParser = parserCommon.output()
27
+ parser = argparse.ArgumentParser(
28
+ parents=[requiredArgs, outputParser, optionalArgs,
29
+ parentParser, normalizationParser, bamParser],
30
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
31
+ description='This tool compares two BAM files based on the number of '
32
+ 'mapped reads. To compare the BAM files, the genome is partitioned '
33
+ 'into bins of equal size, then the number of reads found in each bin'
34
+ ' is counted per file, and finally a summary value is '
35
+ 'reported. This value can be the ratio of the number of reads per '
36
+ 'bin, the log2 of the ratio, or the difference. This tool can '
37
+ 'normalize the number of reads in each BAM file using the SES method '
38
+ 'proposed by Diaz et al. (2012) "Normalization, bias correction, and '
39
+ 'peak calling for ChIP-seq". Statistical Applications in Genetics '
40
+ 'and Molecular Biology, 11(3). Normalization based on read counts '
41
+ 'is also available. The output is either a bedgraph or bigWig file '
42
+ 'containing the bin location and the resulting comparison value. '
43
+ 'Note that *each end* in a pair (for paired-end reads) is treated '
44
+ 'independently. If this is undesirable, then use the --samFlagInclude '
45
+ 'or --samFlagExclude options.',
46
+
47
+ usage='bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw\n'
48
+ 'help: bamCompare -h / bamCompare --help',
49
+
50
+ add_help=False)
51
+
52
+ return parser
53
+
54
+
55
def getRequiredArgs():
    """
    Build the argparse parent parser holding bamCompare's two mandatory
    options (--bamfile1/-b1 and --bamfile2/-b2). Created with add_help=False
    so it can be composed into the main parser via `parents=`.
    """
    parser = argparse.ArgumentParser(add_help=False)

    required = parser.add_argument_group('Required arguments')

    # define the arguments
    required.add_argument('--bamfile1', '-b1',
                          metavar='BAM file',
                          help='Sorted BAM file 1. Usually the BAM file '
                          'for the treatment.',
                          required=True)

    required.add_argument('--bamfile2', '-b2',
                          metavar='BAM file',
                          help='Sorted BAM file 2. Usually the BAM '
                          'file for the control.',
                          required=True)

    return parser
74
+
75
+
76
+ def getOptionalArgs():
77
+
78
+ parser = argparse.ArgumentParser(add_help=False)
79
+ optional = parser.add_argument_group('Optional arguments')
80
+
81
+ optional.add_argument("--help", "-h", action="help",
82
+ help="show this help message and exit")
83
+
84
+ optional.add_argument('--scaleFactorsMethod',
85
+ help='Method to use to scale the samples. '
86
+ 'If a method is specified, then it will be used to compensate '
87
+ 'for sequencing depth differences between the samples. '
88
+ 'As an alternative, this can be set to None and an option from '
89
+ '--normalizeUsing <method> can be used. (Default: %(default)s)',
90
+ choices=['readCount', 'SES', 'None'],
91
+ default='readCount')
92
+
93
+ optional.add_argument('--sampleLength', '-l',
94
+ help='*Only relevant when SES is chosen for the '
95
+ 'scaleFactorsMethod.* To compute the SES, specify '
96
+ 'the length (in bases) of the regions (see --numberOfSamples) '
97
+ 'that will be randomly sampled to calculate the scaling factors. '
98
+ 'If you do not have a good sequencing depth for '
99
+ 'your samples consider increasing the sampling '
100
+ 'regions\' size to minimize the probability '
101
+ 'that zero-coverage regions are used. (Default: %(default)s)',
102
+ default=1000,
103
+ type=int)
104
+
105
+ optional.add_argument('--numberOfSamples', '-n',
106
+ help='*Only relevant when SES is chosen for the '
107
+ 'scaleFactorsMethod.* Number of samplings taken '
108
+ 'from the genome to compute the scaling factors. (Default: %(default)s)',
109
+ default=1e5,
110
+ type=int)
111
+
112
+ optional.add_argument('--scaleFactors',
113
+ help='Set this parameter manually to avoid the computation of '
114
+ 'scaleFactors. The format is scaleFactor1:scaleFactor2.'
115
+ 'For example, --scaleFactor 0.7:1 will cause the first BAM file to'
116
+ 'be multiplied by 0.7, while not scaling '
117
+ 'the second BAM file (multiplication with 1).',
118
+ default=None,
119
+ required=False)
120
+
121
+ optional.add_argument('--operation',
122
+ help='The default is to output the log2 ratio of the '
123
+ 'two samples. The reciprocal ratio returns the '
124
+ 'the negative of the inverse of the ratio '
125
+ 'if the ratio is less than 0. The resulting '
126
+ 'values are interpreted as negative fold changes. '
127
+ 'Instead of performing a computation using both files, the scaled signal can '
128
+ 'alternatively be output for the first or second file using '
129
+ 'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)',
130
+ default='log2',
131
+ choices=['log2', 'ratio', 'subtract', 'add', 'mean',
132
+ 'reciprocal_ratio', 'first', 'second'],
133
+ required=False)
134
+
135
+ optional.add_argument('--pseudocount',
136
+ help='A small number to avoid x/0. Only useful '
137
+ 'together with --operation log2 or --operation ratio. '
138
+ 'You can specify different values as pseudocounts for '
139
+ 'the numerator and the denominator by providing two '
140
+ 'values (the first value is used as the numerator '
141
+ 'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
142
+ default=[1],
143
+ type=float,
144
+ nargs='+',
145
+ action=parserCommon.requiredLength(1, 2),
146
+ required=False)
147
+
148
+ optional.add_argument('--skipZeroOverZero',
149
+ help='Skip bins where BOTH BAM files lack coverage. '
150
+ 'This is determined BEFORE any applicable pseudocount '
151
+ 'is added.',
152
+ action='store_true')
153
+
154
+ return parser
155
+
156
+
157
def process_args(args=None):
    """
    Parse the command line and normalize derived option values.

    Disables smoothing when the smoothing window is not larger than the bin
    size, defaults --ignoreForNormalization to an empty list, and expands
    --pseudocount into a two-element [numerator, denominator] list.
    """
    args = parseArguments().parse_args(args)

    # A smoothing window no larger than the bin size has no effect:
    # warn the user and turn smoothing off.
    if args.smoothLength and args.smoothLength <= args.binSize:
        warning = ("Warning: the smooth length given ({}) is smaller than the bin "
                   "size ({}).\n\n No smoothing will be "
                   "done")
        print(warning.format(args.smoothLength, args.binSize))
        args.smoothLength = None

    if not args.ignoreForNormalization:
        args.ignoreForNormalization = []

    # Downstream code expects a two-element pseudocount list.
    if not isinstance(args.pseudocount, list):
        args.pseudocount = [args.pseudocount]
    if len(args.pseudocount) == 1:
        args.pseudocount = args.pseudocount * 2

    return args
177
+
178
+ # get_scale_factors function is used for scaling in bamCompare
179
+ # while get_scale_factor is used for depth normalization
180
+
181
+
182
def get_scale_factors(args, statsList, mappedList):
    """
    Compute per-sample scale factors for bamCompare.

    Resolution order: an explicit --scaleFactors string wins; otherwise the
    method named by --scaleFactorsMethod is used ('SES', 'readCount', or
    'None'). Returns a 2-element sequence of factors (smaller sample scaled
    to 1.0 relative to the other), or None when the method is 'None'.

    NOTE(review): this function mutates `args` as a side channel for
    get_num_kept_reads (sets args.scaleFactor and reassigns args.bam to each
    input file in turn) — callers should not rely on args.bam afterwards.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed bamCompare options.
    statsList : list
        Per-file statistics passed through to get_num_kept_reads.
    mappedList : list
        Number of mapped reads per file.
    """
    if args.scaleFactors:
        # user-provided "a:b" string overrides any computed method
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    return scale_factors
232
+
233
+
234
def main(args=None):
    """
    The algorithm is composed of two steps.


    1. Per-sample scaling / depth Normalization:
    + If scaling is used (using the SES or read counts method), appropriate scaling
    factors are determined to account for sequencing depth differences.
    + Optionally scaling can be turned off and individual samples could be depth normalized using
    RPKM, BPM or CPM methods

    2. Ratio calculation between two bam files:
    + The genome is transversed and computing
    the log ratio/ratio/difference etc. for bins of fixed width
    given by the user.

    """
    args = process_args(args)

    # RPGC is rejected up front (the error message explains it is not
    # supported for two-sample comparison).
    if args.normalizeUsing == "RPGC":
        sys.exit("RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!")
    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    # Per-sample normalization and between-sample scaling are mutually exclusive.
    if args.scaleFactorsMethod != 'None' and args.normalizeUsing:
        sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing))

    # Get mapping statistics; the handles are closed immediately — only the
    # counts/stats are needed here, the files are re-opened by the workers.
    bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors)
    bam1.close()
    bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors)
    bam2.close()

    scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2])
    if scale_factors is None:
        # check whether one of the depth norm methods are selected
        if args.normalizeUsing is not None:
            args.scaleFactor = 1.0
            # if a normalization is required then compute the scale factors
            # (get_scale_factor reads the file name from args.bam)
            args.bam = args.bamfile1
            scale_factor_bam1 = get_scale_factor(args, stats1)
            args.bam = args.bamfile2
            scale_factor_bam2 = get_scale_factor(args, stats2)
            scale_factors = [scale_factor_bam1, scale_factor_bam2]
        else:
            scale_factors = [1, 1]

    if args.verbose:
        print("Individual scale factors are {0}".format(scale_factors))

    # the getRatio function is called and receives
    # the func_args per each tile that is considered
    FUNC = getRatio
    func_args = {'valueType': args.operation,
                 'scaleFactors': scale_factors,
                 'pseudocount': args.pseudocount
                 }

    # Writer that traverses the genome in binSize windows over both BAM files
    # and applies FUNC (the ratio/difference operation) per bin.
    wr = writeBedGraph.WriteBedGraph([args.bamfile1, args.bamfile2], args.binSize, 0,
                                     stepSize=args.binSize,
                                     region=args.region,
                                     numberOfProcessors=args.numberOfProcessors,
                                     extendReads=args.extendReads,
                                     blackListFileName=args.blackListFileName,
                                     minMappingQuality=args.minMappingQuality,
                                     ignoreDuplicates=args.ignoreDuplicates,
                                     center_read=args.centerReads,
                                     zerosToNans=args.skipNonCoveredRegions,
                                     skipZeroOverZero=args.skipZeroOverZero,
                                     samFlag_include=args.samFlagInclude,
                                     samFlag_exclude=args.samFlagExclude,
                                     minFragmentLength=args.minFragmentLength,
                                     maxFragmentLength=args.maxFragmentLength,
                                     chrsToSkip=args.ignoreForNormalization,
                                     verbose=args.verbose
                                     )

    wr.run(FUNC, func_args, args.outFileName, blackListFileName=args.blackListFileName, format=args.outFileFormat, smoothLength=args.smoothLength)
311
+
312
+
313
if __name__ == "__main__":
    # Entry point when executed as a script.
    main()
deepTools/source/deeptools/bamCoverage.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # own tools
5
+ import argparse
6
+ import sys
7
+ import numpy as np
8
+ from deeptools import writeBedGraph # This should be made directly into a bigWig
9
+ from deeptools import parserCommon
10
+ from deeptools.getScaleFactor import get_scale_factor
11
+ from deeptools.bamHandler import openBam
12
+
13
+ debug = 0
14
+
15
+
16
def parseArguments():
    """
    Assemble the complete bamCoverage argument parser from its parent
    parsers (required/output/optional plus the shared deepTools groups).
    """
    description = (
        'This tool takes an alignment of reads or fragments '
        'as input (BAM file) and generates a coverage track (bigWig or '
        'bedGraph) as output. '
        'The coverage is calculated as the number of reads per bin, '
        'where bins are short consecutive counting windows of a defined '
        'size. It is possible to extended the length of the reads '
        'to better reflect the actual fragment length. *bamCoverage* '
        'offers normalization by scaling factor, Reads Per Kilobase per '
        'Million mapped reads (RPKM), counts per million (CPM), bins per '
        'million mapped reads (BPM) and 1x depth (reads per genome '
        'coverage, RPGC).\n')
    usage = ('bamCoverage -b reads.bam -o coverage.bw\n'
             'help: bamCoverage -h / bamCoverage --help')

    # Parent order determines help-section order; keep it stable.
    parents = [get_required_args(),
               parserCommon.output(),
               get_optional_args(),
               parserCommon.getParentArgParse(),
               parserCommon.normalization_options(),
               parserCommon.read_options()]

    return argparse.ArgumentParser(
        parents=parents,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=description,
        usage=usage,
        add_help=False)
44
+
45
+
46
def get_required_args():
    """Return a parser fragment holding bamCoverage's mandatory options."""
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_argument_group('Required arguments')

    # The input alignment file; everything else has a usable default.
    group.add_argument('--bam', '-b',
                       required=True,
                       metavar='BAM file',
                       help='BAM file to process')

    return parser
58
+
59
+
60
def get_optional_args():
    """Return a parser fragment holding bamCoverage's optional options."""
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_argument_group('Optional arguments')

    group.add_argument("--help", "-h", action="help",
                       help="show this help message and exit")

    # Extra multiplier applied on top of any computed normalization factor.
    group.add_argument('--scaleFactor',
                       type=float,
                       default=1.0,
                       required=False,
                       help='The computed scaling factor (or 1, if not applicable) will '
                            'be multiplied by this. (Default: %(default)s)')

    group.add_argument('--MNase',
                       action='store_true',
                       help='Determine nucleosome positions from MNase-seq data. '
                            'Only 3 nucleotides at the center of each fragment are counted. '
                            'The fragment ends are defined by the two mate reads. Only fragment lengths'
                            'between 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. '
                            'By default, any fragments smaller or larger than this are ignored. To '
                            'over-ride this, use the --minFragmentLength and --maxFragmentLength options, '
                            'which will default to 130 and 200 if not otherwise specified in the presence '
                            'of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.')

    group.add_argument('--Offset',
                       metavar='INT',
                       type=int,
                       nargs='+',
                       required=False,
                       help='Uses this offset inside of each read as the signal. This is useful in '
                            'cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the '
                            'start of the read. This can be paired with the --filterRNAstrand option. '
                            'Note that negative values indicate offsets from the end of each read. A value '
                            'of 1 indicates the first base of the alignment (taking alignment orientation '
                            'into account). Likewise, a value of -1 is the last base of the alignment. An '
                            'offset of 0 is not permitted. If two values are specified, then they will be '
                            'used to specify a range of positions. Note that specifying something like '
                            '--Offset 5 -1 will result in the 5th through last position being used, which '
                            'is equivalent to trimming 4 bases from the 5-prime end of alignments. Note '
                            'that if you specify --centerReads, the centering will be performed before the '
                            'offset.')

    group.add_argument('--filterRNAstrand',
                       choices=['forward', 'reverse'],
                       default=None,
                       help='Selects RNA-seq reads (single-end or paired-end) originating from genes '
                            'on the given strand. This option assumes a standard dUTP-based library '
                            'preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, '
                            'which originally came from genes on the forward strand using a dUTP-based '
                            'method). Consider using --samExcludeFlag instead for filtering by strand in '
                            'other contexts.')

    return parser
115
+
116
+
117
def scaleFactor(string):
    """
    argparse type-checker for a "factor1:factor2" scale-factor pair.

    Parameters
    ----------
    string : str
        Raw command-line value, e.g. "0.7:1".

    Returns
    -------
    tuple of float
        (factor1, factor2)

    Raises
    ------
    argparse.ArgumentTypeError
        If the value is not exactly two ':'-separated numbers.
    """
    try:
        scalefactor1, scalefactor2 = string.split(":")
        scalefactors = (float(scalefactor1), float(scalefactor2))
    except ValueError:
        # Only ValueError can occur here (wrong field count on unpack, or a
        # non-numeric field in float()). The original bare `except:` also
        # swallowed KeyboardInterrupt/SystemExit, which is never wanted.
        raise argparse.ArgumentTypeError(
            "Format of scaleFactors is factor1:factor2. "
            "The value given ( {} ) is not valid".format(string))

    return scalefactors
127
+
128
+
129
def process_args(args=None):
    """
    Parse the bamCoverage command line and normalize derived options.

    Disables smoothing when the window is not wider than a bin and
    guarantees ``ignoreForNormalization`` is a list.
    """
    args = parseArguments().parse_args(args)

    # A smoothing window no wider than one bin would be a no-op; warn and disable.
    if args.smoothLength and args.smoothLength <= args.binSize:
        warning = ("Warning: the smooth length given ({}) is smaller than the bin "
                   "size ({}).\n\n No smoothing will be done".format(args.smoothLength, args.binSize))
        print(warning)
        args.smoothLength = None

    # Downstream code iterates over this, so replace None/empty with a list.
    args.ignoreForNormalization = args.ignoreForNormalization or []

    return args
141
+
142
+
143
def main(args=None):
    """
    Compute a (optionally normalized) coverage track from a BAM/CRAM file.

    Chooses one of three coverage writers based on the options —
    CenterFragment for --MNase, OffsetFragment for --Offset (and
    --filterRNAstrand), or the plain WriteBedGraph otherwise — then runs it
    with the computed scale factor to produce a bigWig/bedGraph file.
    """
    args = process_args(args)

    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNAse function a paired end library is required. ")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose,
                            )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                # BUG FIX: this previously called the non-existent sys.exir(),
                # which aborted with an AttributeError traceback instead of
                # printing the intended error message ('Error* was also a
                # typo for *Error*).
                sys.exit("*Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
        # NOTE(review): unlike the other two writers, no blackListFileName is
        # passed to the constructor here; the blacklist is still applied via
        # wr.run() below — confirm this asymmetry is intentional.
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         chrsToSkip=args.ignoreForNormalization,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
259
+
260
+
261
class OffsetFragment(writeBedGraph.WriteBedGraph):
    """
    Class to redefine the get_fragment_from_read for the --Offset case
    """
    def filterStrand(self, read, rv):
        """
        A generic read filtering function that gets used by everything in this class.

        rv is returned if the strand is correct, otherwise [(None, None)]
        """
        # Filter by RNA strand, if desired
        # SAM flag bit masks used below: 16 = reverse strand, 64 = first in
        # pair, 128 = second in pair, 32 = mate on reverse strand; hence
        # 144 = 16|128 and 96 = 32|64.
        if read.is_paired:
            if self.filter_strand == 'forward':
                # second-in-pair forward, or first-in-pair with forward mate
                if read.flag & 144 == 128 or read.flag & 96 == 64:
                    return rv
            elif self.filter_strand == 'reverse':
                # second-in-pair reverse, or first-in-pair with reverse mate
                if read.flag & 144 == 144 or read.flag & 96 == 96:
                    return rv
            else:
                # no strand filtering requested
                return rv
        else:
            if self.filter_strand == 'forward':
                if read.flag & 16 == 16:
                    return rv
            elif self.filter_strand == 'reverse':
                if read.flag & 16 == 0:
                    return rv
            else:
                return rv

        # read failed the strand filter
        return [(None, None)]

    def get_fragment_from_read_list(self, read, offset):
        """
        Return the range of exons from the 0th through 1st bases, inclusive. Positions are 1-based
        """
        rv = [(None, None)]
        blocks = read.get_blocks()
        # total aligned length (sum of block sizes), used for --centerReads
        blockLen = sum([x[1] - x[0] for x in blocks])

        if self.defaultFragmentLength != 'read length':
            # Extend the aligned blocks toward the mate (proper pairs) ...
            if self.is_proper_pair(read, self.maxPairedFragmentLength):
                if read.is_reverse:
                    foo = (read.next_reference_start, read.reference_start)
                    if foo[0] < foo[1]:
                        blocks.insert(0, foo)
                else:
                    foo = (read.reference_end, read.reference_end + abs(read.template_length) - read.infer_query_length())
                    if foo[0] < foo[1]:
                        blocks.append(foo)

            # Extend using the default fragment length
            else:
                if read.is_reverse:
                    foo = (read.reference_start - self.defaultFragmentLength + read.infer_query_length(), read.reference_start)
                    if foo[0] < 0:
                        # clamp the extension at the chromosome start
                        foo = (0, foo[1])
                    if foo[0] < foo[1]:
                        blocks.insert(0, foo)
                else:
                    foo = (read.reference_end, read.reference_end + self.defaultFragmentLength - read.infer_query_length())
                    if foo[0] < foo[1]:
                        blocks.append(foo)

        stretch = []
        # For the sake of simplicity, convert [(10, 20), (30, 40)] to [10, 11, 12, 13, ..., 40]
        # Then subset accordingly
        for block in blocks:
            stretch.extend(range(block[0], block[1]))
        if read.is_reverse:
            # orient positions 5'->3' so offsets count from the read start
            stretch = stretch[::-1]

        # Handle --centerReads
        if self.center_read:
            _ = (len(stretch) - blockLen) // 2
            stretch = stretch[_:_ + blockLen]

        # Subset by --Offset
        try:
            foo = stretch[offset[0]:offset[1]]
        except:
            return rv

        if len(foo) == 0:
            return rv
        if read.is_reverse:
            # restore genomic (ascending) order before rebuilding intervals
            foo = foo[::-1]

        # Convert the stretch back to a list of tuples
        foo = np.array(foo)
        d = foo[1:] - foo[:-1]
        idx = np.argwhere(d > 1).flatten().tolist()  # This now holds the interval bounds as a list
        idx.append(-1)
        last = 0
        rv = []
        for i in idx:
            # each (start, end) is a half-open genomic interval
            rv.append((foo[last].astype("int"), foo[i].astype("int") + 1))
            last = i + 1

        # Handle strand filtering, if needed
        return self.filterStrand(read, rv)

    def get_fragment_from_read(self, read):
        """
        This is mostly a wrapper for self.get_fragment_from_read_list(),
        which needs a list and for the offsets to be tweaked by 1.
        """
        # Translate the user's 1-based --Offset values into Python slice
        # bounds (0-based start, exclusive end).
        offset = [x for x in self.Offset]
        if len(offset) > 1:
            if offset[0] > 0:
                offset[0] -= 1
            if offset[1] < 0:
                offset[1] += 1
        else:
            if offset[0] > 0:
                offset[0] -= 1
                offset = [offset[0], offset[0] + 1]
            else:
                if offset[0] < -1:
                    offset = [offset[0], offset[0] + 1]
                else:
                    offset = [offset[0], None]
        if offset[1] == 0:
            # -1 gets switched to 0, which screws things up
            offset = (offset[0], None)
        return self.get_fragment_from_read_list(read, offset)
387
+
388
+
389
class CenterFragment(writeBedGraph.WriteBedGraph):
    """
    Class to redefine the get_fragment_from_read for the --MNase case

    The coverage of the fragment is defined as the 2 or 3 basepairs at the
    center of the fragment length.
    """
    def get_fragment_from_read(self, read):
        """
        Takes a proper pair fragment of high quality and limited
        to a certain length and outputs the center

        Returns a single-element list [(start, end)]; both are None when the
        read is not a usable forward proper-pair read.
        """
        fragment_start = fragment_end = None

        # only paired forward reads are considered
        # Fragments have already been filtered according to length
        if read.is_proper_pair and not read.is_reverse and 1 < abs(read.tlen):
            # BUG FIX: use floor division — under Python 3, `read.tlen / 2`
            # is true division and produced *float* genomic coordinates
            # (a Python 2 leftover); `//` keeps them integral.
            if read.tlen % 2 == 0:
                # even template length: return the two central bases
                fragment_start = read.pos + read.tlen // 2 - 1
                fragment_end = fragment_start + 2
            else:
                # odd template length: return the three central bases
                fragment_start = read.pos + read.tlen // 2 - 1
                fragment_end = fragment_start + 3

        return [(fragment_start, fragment_end)]
deepTools/source/deeptools/bamHandler.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import pysam
3
+ from deeptools.mapReduce import mapReduce
4
+
5
+
6
def countReadsInInterval(args):
    """
    Count mapped and unmapped reads overlapping one genomic interval.

    ``args`` is a tuple (chrom, start, end, fname, toEOF) as dispatched by
    mapReduce. Returns (nMapped, nUnmapped, chrom).
    """
    chrom, start, end, fname, toEOF = args

    mapped = unmapped = 0
    bam = openBam(fname)
    for read in bam.fetch(chrom, start, end):
        if chrom == "*":
            # the unplaced-read pseudo-contig: everything here is unmapped
            unmapped += 1
            continue
        if read.pos < start:
            # skip reads starting before this chunk to avoid double counting
            # across chunk boundaries
            continue
        if read.is_unmapped:
            unmapped += 1
        else:
            mapped += 1
    return mapped, unmapped, chrom
23
+
24
+
25
def getMappingStats(bam, nThreads):
    """
    This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless

    This requires pysam > 0.13.0
    """
    # (contig, length) pairs drive the per-interval counting jobs
    chromSizes = list(zip(bam.references, bam.lengths))
    res = mapReduce([bam.filename, False], countReadsInInterval, chromSizes, numberOfProcessors=nThreads)

    mapped = sum(r[0] for r in res)
    unmapped = sum(r[1] for r in res)

    # Aggregate per-contig [mapped, unmapped] counts
    stats = {contig: [0, 0] for contig, _ in chromSizes}
    for nMapped, nUnmapped, contig in res:
        stats[contig][0] += nMapped
        stats[contig][1] += nUnmapped

    # We need to count the number of unmapped reads as well
    unmapped += bam.count("*")

    return mapped, unmapped, stats
45
+
46
+
47
def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True):
    """
    A wrapper for opening BAM/CRAM files.

    bamFile: str
        A BAM/CRAM file name

    returnStats: bool
        Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict).
        These additional values are needed by some downstream functions, since one
        can't use file_handle.mapped on CRAM files (or idxstats())

    nThreads: int
        If returnStats is True, number of threads to use for computing statistics

    minimalDecoding: Bool
        For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions).

    Returns either the file handle or a tuple as described in returnStats
    """
    # htslib option restricting which CRAM fields get decoded; must be bytes
    # under Python 3.
    format_options = ["required_fields=0x1FF"]
    if sys.version_info.major >= 3:
        format_options = [b"required_fields=0x1FF"]
    if not minimalDecoding:
        format_options = None
    try:
        bam = pysam.Samfile(bamFile, 'rb', format_options=format_options)
    except IOError:
        sys.exit("The file '{}' does not exist".format(bamFile))
    except:
        # deliberately broad: any other pysam failure means the file isn't a
        # readable BAM/CRAM
        sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile))

    try:
        # check_index() may return False or raise, depending on file type
        assert bam.check_index() is not False
    except:
        sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile))

    if bam.is_cram and returnStats:
        # CRAM: counts must be computed by scanning (see getMappingStats docstring)
        mapped, unmapped, stats = getMappingStats(bam, nThreads)
    elif bam.is_bam:
        # BAM: counts come straight from the index
        mapped = bam.mapped
        unmapped = bam.unmapped

        # Make the dictionary to hold the stats
        if returnStats:
            stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()}

    # NOTE(review): for a CRAM with returnStats=False, `mapped` is never set,
    # but this branch is skipped in that case so no NameError occurs.
    if bam.is_bam or (bam.is_cram and returnStats):
        if mapped == 0:
            sys.stderr.write("WARNING! '{}' does not have any mapped reads. Please "
                             "check that the file is properly indexed and "
                             "that it contains mapped reads.\n".format(bamFile))

    if returnStats:
        return bam, mapped, unmapped, stats
    else:
        return bam
deepTools/source/deeptools/bamPEFragmentSize.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ import numpy as np
7
+
8
+ import matplotlib
9
+ matplotlib.use('Agg')
10
+ matplotlib.rcParams['pdf.fonttype'] = 42
11
+ matplotlib.rcParams['svg.fonttype'] = 'none'
12
+ from deeptools import cm # noqa: F401
13
+ import matplotlib.pyplot as plt
14
+
15
+ import plotly.offline as py
16
+ import plotly.graph_objs as go
17
+
18
+ # own tools
19
+ from deeptools.parserCommon import writableFile
20
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
21
+ from importlib.metadata import version
22
+
23
+
24
def parse_arguments():
    """Build and return the bamPEFragmentSize command-line parser."""
    parser = argparse.ArgumentParser(
        description='This tool calculates the fragment sizes for read pairs given a BAM file from paired-end sequencing.'
        'Several regions are sampled depending on the '
        'size of the genome and number of processors to estimate the'
        'summary statistics on the fragment lengths. '
        'Properly paired reads are preferred for computation, i.e., '
        'it will only use discordant pairs if no concordant alignments '
        'overlap with a given region. '
        'The default setting simply prints the summary statistics to the screen.',
        usage='bamPEFragmentSize -b sample1.bam sample2.bam -o hist.png\n'
        'help: bamPEFragmentSize -h / bamPEFragmentSize --help'
    )
    parser.add_argument('--bamfiles', '-b',
                        help='List of BAM files to process',
                        nargs='+',
                        metavar='bam files')

    # '-o' is an alias here because the histogram is this tool's only output file
    parser.add_argument('--histogram', '-hist', '-o',
                        help='Save a .png file with a histogram '
                        'of the fragment length distribution.',
                        metavar='FILE')

    parser.add_argument('--plotFileFormat',
                        metavar='FILETYPE',
                        help='Image format type. If given, this option '
                        'overrides the image format based on the plotFile '
                        'ending. The available options are: png, '
                        'eps, pdf, svg and plotly.',
                        default=None,
                        choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    parser.add_argument('--numberOfProcessors', '-p',
                        help='Number of processors to use. The default is '
                        'to use 1. (Default: %(default)s)',
                        metavar="INT",
                        type=int,
                        default=1,
                        required=False)
    parser.add_argument('--samplesLabel',
                        help='Labels for the samples plotted. The '
                        'default is to use the file name of the '
                        'sample. The sample labels should be separated '
                        'by spaces and quoted if a label itself'
                        'contains a space E.g. --samplesLabel label-1 "label 2" ',
                        nargs='+')
    parser.add_argument('--plotTitle', '-T',
                        help='Title of the plot, to be printed on top of '
                        'the generated image. Leave blank for no title. (Default: %(default)s)',
                        default='')
    parser.add_argument('--maxFragmentLength',
                        help='The maximum fragment length in the histogram. A value of 0 (the default) indicates to use twice the mean fragment length. (Default: %(default)s)',
                        default=0,
                        type=int)
    parser.add_argument('--logScale',
                        help='Plot on the log scale',
                        action='store_true')
    parser.add_argument('--binSize', '-bs',
                        metavar='INT',
                        help='Length in bases of the window used to sample the genome. (Default: %(default)s)',
                        default=1000,
                        type=int)
    parser.add_argument('--distanceBetweenBins', '-n',
                        metavar='INT',
                        help='To reduce the computation time, not every possible genomic '
                        'bin is sampled. This option allows you to set the distance '
                        'between bins actually sampled from. Larger numbers are sufficient '
                        'for high coverage samples, while smaller values are useful for '
                        'lower coverage samples. Note that if you specify a value that '
                        'results in too few (<1000) reads sampled, the value will be '
                        'decreased. (Default: %(default)s)',
                        default=1000000,
                        type=int)
    parser.add_argument('--blackListFileName', '-bl',
                        help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
                        metavar="BED file",
                        required=False)
    parser.add_argument('--table',
                        metavar='FILE',
                        help='In addition to printing read and fragment length metrics to the screen, write them to the given file in tabular format.',
                        required=False)
    parser.add_argument('--outRawFragmentLengths',
                        metavar='FILE',
                        required=False,
                        type=writableFile,
                        help='Save the fragment (or read if the input is single-end) length and their associated number of occurrences to a tab-separated file. Columns are length, number of occurrences, and the sample label.')
    parser.add_argument('--verbose',
                        help='Set if processing data messages are wanted.',
                        action='store_true',
                        required=False)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    return parser
118
+
119
+
120
def getDensity(lengths, minVal, maxVal):
    """
    This is essentially computing what hist() in matplotlib is doing and returning the results.
    This then allows us to free up the memory consumed by each sample rather than returning it all back to main() for plotting.
    """
    # 100 equal-width bins over [minVal, maxVal], normalized to a density.
    counts, edges, _patches = plt.hist(lengths, bins=100, range=(minVal, maxVal), density=True)
    # Discard the figure immediately; only the numbers are needed.
    plt.clf()
    return (counts, edges)
128
+
129
+
130
def getFragSize(bam, args, idx, outRawFrags):
    """
    Compute and print fragment- and read-length statistics for one BAM file.

    :param bam: path to the BAM file to process
    :param args: parsed command line options (blacklist, processors,
                 binSize, distanceBetweenBins, samplesLabel, histogram, ...)
    :param idx: index of this BAM file in args.bamfiles; used to look up
                the matching entry of args.samplesLabel
    :param outRawFrags: open file handle to which the raw length/occurrence
                        table is written, or None to skip that output
    :return: tuple (fragment_len_dict, read_len_dict). The potentially huge
             'lengths' entries are reduced to histogram densities when a
             histogram is requested, or deleted otherwise, to free memory.
    """
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam, return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)

    if outRawFrags:
        label = bam
        if args.samplesLabel and idx < len(args.samplesLabel):
            label = args.samplesLabel[idx]
        if fragment_len_dict:
            fragment_len_dict['lengths'] = [int(x) for x in fragment_len_dict['lengths']]
            cnts = np.bincount(fragment_len_dict['lengths'], minlength=int(fragment_len_dict['max']) + 1)
        else:
            read_len_dict['lengths'] = [int(x) for x in read_len_dict['lengths']]
            cnts = np.bincount(read_len_dict['lengths'], minlength=int(read_len_dict['max']) + 1)
        # BUG FIX: the original loop reused 'idx' as its loop variable,
        # clobbering the sample-index parameter and thereby breaking the
        # samplesLabel lookups below whenever --outRawFragmentLengths was
        # given. A dedicated name avoids the shadowing.
        for size, occurrences in enumerate(cnts):
            if occurrences > 0:
                outRawFrags.write("{}\t{}\t{}\n".format(size, occurrences, label))

    if args.samplesLabel and idx < len(args.samplesLabel):
        print("\n\nSample label: {}".format(args.samplesLabel[idx]))
    else:
        print("\n\nBAM file : {}".format(bam))

    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")

        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))

        print("Fragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std']))
        print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(fragment_len_dict['mad'],
                                                                                                                                                               fragment_len_dict['qtile10'],
                                                                                                                                                               fragment_len_dict['qtile20'],
                                                                                                                                                               fragment_len_dict['qtile30'],
                                                                                                                                                               fragment_len_dict['qtile40'],
                                                                                                                                                               fragment_len_dict['qtile60'],
                                                                                                                                                               fragment_len_dict['qtile70'],
                                                                                                                                                               fragment_len_dict['qtile80'],
                                                                                                                                                               fragment_len_dict['qtile90'],
                                                                                                                                                               fragment_len_dict['qtile99']))
    else:
        print("No pairs were found. Is the data from a paired-end sequencing experiment?")

    print("\nRead lengths:")
    print("Sample size: {}\n".format(read_len_dict['sample_size']))
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std']))
    print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(read_len_dict['mad'],
                                                                                                                                                           read_len_dict['qtile10'],
                                                                                                                                                           read_len_dict['qtile20'],
                                                                                                                                                           read_len_dict['qtile30'],
                                                                                                                                                           read_len_dict['qtile40'],
                                                                                                                                                           read_len_dict['qtile60'],
                                                                                                                                                           read_len_dict['qtile70'],
                                                                                                                                                           read_len_dict['qtile80'],
                                                                                                                                                           read_len_dict['qtile90'],
                                                                                                                                                           read_len_dict['qtile99']))

    # The read and fragment lists will just eat up memory if not removed!
    if args.histogram:
        if fragment_len_dict:
            maxVal = fragment_len_dict['mean'] * 2
            minVal = fragment_len_dict['min']
        else:
            maxVal = read_len_dict['mean'] * 2
            minVal = read_len_dict['min']
        if args.maxFragmentLength > 0:
            maxVal = args.maxFragmentLength

        if fragment_len_dict:
            fragment_len_dict['lengths'] = getDensity(fragment_len_dict['lengths'], minVal, maxVal)
        if read_len_dict:
            read_len_dict['lengths'] = getDensity(read_len_dict['lengths'], minVal, maxVal)
    else:
        if fragment_len_dict:
            del fragment_len_dict['lengths']
        if read_len_dict:
            del read_len_dict['lengths']

    return (fragment_len_dict, read_len_dict)
228
+
229
+
230
def printTable(args, fragDict, readDict):
    """
    Write the read and fragment metrics in easily parsable tabular (TSV)
    format to the file named by args.table.

    :param args: parsed options; uses args.table, args.bamfiles and
                 args.samplesLabel
    :param fragDict: {bam filename: fragment metrics dict or None}
    :param readDict: {bam filename: read metrics dict}

    One header line is written, then one row per BAM file. Rows for files
    without fragment metrics get zeros in the fragment columns.
    """
    # Column order shared by the fragment and read sections of each row.
    summary_keys = ['min', 'qtile25', 'mean', 'median', 'qtile75', 'max', 'std']
    spread_keys = ['mad', 'qtile10', 'qtile20', 'qtile30', 'qtile40',
                   'qtile60', 'qtile70', 'qtile80', 'qtile90', 'qtile99']

    # A context manager guarantees the handle is closed even if a metrics
    # dict is malformed (the original code leaked the handle on error).
    with open(args.table, "w") as of:
        of.write("\tFrag. Sampled")
        of.write("\tFrag. Len. Min.\tFrag. Len. 1st. Qu.\tFrag. Len. Mean\tFrag. Len. Median\tFrag. Len. 3rd Qu.\tFrag. Len. Max\tFrag. Len. Std.")
        of.write("\tFrag. Med. Abs. Dev.\tFrag. Len. 10%\tFrag. Len. 20%\tFrag. Len. 30%\tFrag. Len. 40%\tFrag. Len. 60%\tFrag. Len. 70%\tFrag. Len. 80%\tFrag. Len. 90%\tFrag. Len. 99%")
        of.write("\tReads Sampled")
        of.write("\tRead Len. Min.\tRead Len. 1st. Qu.\tRead Len. Mean\tRead Len. Median\tRead Len. 3rd Qu.\tRead Len. Max\tRead Len. Std.")
        of.write("\tRead Med. Abs. Dev.\tRead Len. 10%\tRead Len. 20%\tRead Len. 30%\tRead Len. 40%\tRead Len. 60%\tRead Len. 70%\tRead Len. 80%\tRead Len. 90%\tRead Len. 99%\n")

        for idx, bam in enumerate(args.bamfiles):
            if args.samplesLabel and idx < len(args.samplesLabel):
                of.write(args.samplesLabel[idx])
            else:
                of.write(bam)
            if fragDict is not None and fragDict[bam] is not None:
                d = fragDict[bam]
                of.write("\t{}".format(d['sample_size']))
                of.write("".join("\t{}".format(d[k]) for k in summary_keys))
                of.write("".join("\t{}".format(d[k]) for k in spread_keys))
            else:
                # 1 sample-size column + 7 summary + 10 spread columns
                of.write("\t0" * 18)
            d = readDict[bam]
            of.write("\t{}".format(d['sample_size']))
            of.write("".join("\t{}".format(d[k]) for k in summary_keys))
            of.write("".join("\t{}".format(d[k]) for k in spread_keys))
            of.write("\n")
291
+
292
+
293
def main(args=None):
    """
    Entry point for bamPEFragmentSize.

    Collects fragment/read length metrics for every BAM file, optionally
    writes the raw length counts (--outRawFragmentLengths) and a tabular
    summary (--table), and optionally plots a length histogram
    (--histogram) with either matplotlib or plotly.

    :param args: optional list of command line arguments (defaults to
                 sys.argv when None)
    """
    args = parse_arguments().parse_args(args)

    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    fraglengths = {}
    readlengths = {}
    of = None
    if args.outRawFragmentLengths is not None:
        of = open(args.outRawFragmentLengths, "w")
        of.write("#bamPEFragmentSize\nSize\tOccurrences\tSample\n")
    for idx, bam in enumerate(args.bamfiles):
        f, r = getFragSize(bam, args, idx, of)
        fraglengths[bam] = f
        readlengths[bam] = r
    # FIX: close the raw fragment-length file once all samples are
    # processed; the original code never closed this handle.
    if of is not None:
        of.close()

    if args.table is not None:
        printTable(args, fraglengths, readlengths)

    if args.histogram:
        if args.samplesLabel:
            if len(args.bamfiles) != len(args.samplesLabel):
                sys.exit("The number of labels does not match the number of BAM files.")
            else:
                labels = args.samplesLabel
        else:
            labels = list(fraglengths.keys())

        i = 0
        data = []
        for bam in fraglengths.keys():
            # Fall back to read lengths for samples without paired reads.
            d = fraglengths[bam]
            if d is None:
                d = readlengths[bam]
            if args.maxFragmentLength > 0:
                maxVal = args.maxFragmentLength
            else:
                maxVal = d['mean'] * 2

            if args.plotFileFormat == 'plotly':
                trace = go.Histogram(x=d['lengths'],
                                     histnorm='probability',
                                     opacity=0.5,
                                     name=labels[i],
                                     nbinsx=100,
                                     xbins=dict(start=d['min'], end=maxVal))
                data.append(trace)
            else:
                # d['lengths'] is the (densities, bin_edges) pair produced
                # by getDensity(); draw it as pre-binned bars.
                plt.bar(d['lengths'][1][:-1], height=d['lengths'][0],
                        width=d['lengths'][1][1:] - d['lengths'][1][:-1],
                        align='edge', log=args.logScale,
                        alpha=0.5, label=labels[i])
            i += 1

        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig.add_traces(data)
            fig['layout']['yaxis1'].update(title='Frequency')
            fig['layout']['xaxis1'].update(title='Fragment Length')
            fig['layout'].update(title=args.plotTitle)
            fig['layout'].update(showlegend=True)
            if args.logScale:
                fig['layout']['yaxis1'].update(type='log')
            py.plot(fig, filename=args.histogram, auto_open=False)
        else:
            plt.xlabel('Fragment Length')
            plt.ylabel('Frequency')
            plt.legend(loc='upper right')
            plt.title(args.plotTitle)
            plt.savefig(args.histogram, bbox_inches=0, format=args.plotFileFormat)
            plt.close()
366
+
367
+
368
# Allow the tool to be run directly as a script.
if __name__ == "__main__":
    main()
deepTools/source/deeptools/bigwigAverage.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import sys
5
+ import numpy as np
6
+ from deeptools import parserCommon
7
+ from deeptools import writeBedGraph_bam_and_bw
8
+
9
+ debug = 0
10
+
11
+
12
def parse_arguments(args=None):
    """
    Build the argument parser for bigwigAverage.

    The parser inherits the shared deepTools parent options (processors,
    region, blacklist, verbosity, ...) and the common output options.

    :param args: unused; kept for interface compatibility
    :return: configured argparse.ArgumentParser
    """
    parentParser = parserCommon.getParentArgParse()
    outputParser = parserCommon.output()
    parser = argparse.ArgumentParser(
        parents=[parentParser, outputParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # FIX: grammar/typo corrections in the user-facing help text
        # ("average" -> "averages", missing space before "These scores").
        description='This tool averages multiple bigWig files based on the number '
        'of mapped reads. To average the bigWig files, the genome is '
        'partitioned into bins of equal size, then the scores '
        'in each bigwig file are computed per bin. '
        'These scores are averaged and scaleFactors can be applied before the average.',
        usage='bigwigAverage -b sample1.bw sample2.bw -o outfile.bw\n'
        'help: bigwigAverage -h / bigwigAverage --help')

    # define the arguments
    parser.add_argument('--bigwigs', '-b',
                        metavar='Bigwig files',
                        help='Bigwig files separated by space.',
                        nargs='+',
                        required=True)

    parser.add_argument('--scaleFactors',
                        help='Set this parameter to multiply the bigwig values '
                             'by a constant. The format is '
                             'scaleFactor1:scaleFactor2:scaleFactor3 etc. '
                             'For example 0.7:1 to scale the first bigwig file '
                             'by 0.7 while not scaling the second bigwig file',
                        default=None,
                        required=False)

    parser.add_argument('--skipNonCoveredRegions', '--skipNAs',
                        help='This parameter determines if non-covered regions (regions without a score) '
                             'in the bigWig files should be skipped. The default is to treat those '
                             'regions as having a value of zero. '
                             'The decision to skip non-covered regions '
                             'depends on the interpretation of the data. Non-covered regions '
                             'in a bigWig file may represent repetitive regions that should '
                             'be skipped. Alternatively, the interpretation of non-covered regions as '
                             'zeros may be wrong and this option should be used ',
                        action='store_true')

    return parser
54
+
55
+
56
def getType(fname):
    """
    Guess a signal file's format from its file name.

    Returns "wiggle" for .wig/.wiggle suffixes, "bedgraph" for
    .bedgraph (any case) or .bdg suffixes, and "bigwig" otherwise.
    """
    if fname.endswith((".wig", ".wiggle")):
        return "wiggle"
    # Only the .bedgraph check is case-insensitive, matching historical
    # behavior; .bdg is matched exactly.
    if fname.lower().endswith(".bedgraph") or fname.endswith(".bdg"):
        return "bedgraph"
    return "bigwig"
66
+
67
+
68
def average(tileCoverage, args):
    r"""
    Return the mean of the per-file coverage values for one tile, after
    applying the per-file scale factors.

    The mapreduce machinery calls this once per tile; ``args`` is fixed in
    main() and must contain a 'scaleFactors' sequence with one factor per
    input file.

    >>> funcArgs= {'scaleFactors': (1,1)}
    >>> average([1, 2], funcArgs)
    1.5
    >>> funcArgs= {'scaleFactors': (1,0.5)}
    >>> average([1, 2], funcArgs)
    1.0
    >>> funcArgs= {'scaleFactors': (1,0.5,0.1,0.2)}
    >>> average([1, 2, 3, 12], funcArgs)
    1.175
    >>> average([1, 2, 3, np.nan], funcArgs)
    nan
    """
    scaled = []
    for position, coverage in enumerate(tileCoverage):
        scaled.append(args['scaleFactors'][position] * coverage)
    # np.mean propagates NaN, so an uncovered file yields NaN for the tile.
    return np.mean(scaled)
90
+
91
+
92
def main(args=None):
    """
    Entry point for bigwigAverage.

    Parses the command line, normalizes the per-file scale factors, and
    streams the per-bin averages to the output file via writeBedGraph.

    :param args: optional list of command line arguments (defaults to
                 sys.argv when None)
    :raises argparse.ArgumentTypeError: when the number of scale factors
            matches neither 1 nor the number of bigwig files
    """
    args = parse_arguments().parse_args(args)
    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    nFiles = len(args.bigwigs)

    if args.scaleFactors:
        scaleFactors = [float(x) for x in args.scaleFactors.split(":")]
        if len(scaleFactors) == 1:
            # A single factor applies to every file.
            scaleFactors = scaleFactors * nFiles
        elif len(scaleFactors) != nFiles:
            # FIX: added the missing space between sentences in the
            # original error message ("factors.The value").
            raise argparse.ArgumentTypeError(
                "Format of scaleFactors is factor or factor1:factor2... as many as bigwig files. "
                "There are {} bigwigs and {} factors. "
                "The value given ( {} ) is not valid".format(nFiles, len(scaleFactors), args.scaleFactors))
    else:
        scaleFactors = [1] * nFiles

    # the average function is called and receives
    # the function_args per each tile that is considered
    FUNC = average
    function_args = {'scaleFactors': scaleFactors}

    writeBedGraph_bam_and_bw.writeBedGraph(
        [(b, getType(b)) for b in args.bigwigs],
        args.outFileName, 0, FUNC,
        function_args, tileSize=args.binSize, region=args.region,
        blackListFileName=args.blackListFileName,
        verbose=args.verbose,
        numberOfProcessors=args.numberOfProcessors,
        skipZeroOverZero=False,
        format=args.outFileFormat,
        smoothLength=False,
        missingDataAsZero=not args.skipNonCoveredRegions,
        extendPairedEnds=False)
deepTools/source/deeptools/bigwigCompare.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from deeptools import parserCommon
5
+ from deeptools.getRatio import getRatio
6
+ from deeptools import writeBedGraph_bam_and_bw
7
+
8
+ debug = 0
9
+
10
+
11
def parse_arguments(args=None):
    """
    Build the argument parser for bigwigCompare.

    The parser inherits the shared deepTools parent options (processors,
    region, blacklist, verbosity, ...) and the common output options.

    :param args: unused; kept for interface compatibility
    :return: configured argparse.ArgumentParser
    """
    parentParser = parserCommon.getParentArgParse()
    outputParser = parserCommon.output()
    parser = argparse.ArgumentParser(
        parents=[parentParser, outputParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # FIX: restored the missing space in "reads per bin" in the
        # user-facing description.
        description='This tool compares two bigWig files based on the number '
        'of mapped reads. To compare the bigWig files, the genome is '
        'partitioned into bins of equal size, then the number of reads found '
        'in each BAM file are counted per bin and finally a summary '
        'value is reported. This value can be the ratio of the number of reads '
        'per bin, the log2 of the ratio, the sum or the difference.',
        usage='bigwigCompare -b1 sample1.bw -b2 sample2.bw -o log2.bw\n'
        'help: bigwigCompare -h / bigwigCompare --help')

    # define the arguments
    parser.add_argument('--bigwig1', '-b1',
                        metavar='Bigwig file',
                        help='Bigwig file 1. Usually the file for the '
                        'treatment.',
                        required=True)

    parser.add_argument('--bigwig2', '-b2',
                        metavar='Bigwig file',
                        help='Bigwig file 2. Usually the file for the '
                        'control.',
                        required=True)

    parser.add_argument('--scaleFactors',
                        help='Set this parameter to multiply the bigwig values '
                        'by a constant. The format is '
                        'scaleFactor1:scaleFactor2. '
                        'For example 0.7:1 to scale the first bigwig file '
                        'by 0.7 while not scaling the second bigwig file',
                        default=None,
                        required=False)

    parser.add_argument('--pseudocount',
                        help='A small number to avoid x/0. Only useful '
                        'together with --operation log2 or --operation ratio. '
                        'You can specify different values as pseudocounts for '
                        'the numerator and the denominator by providing two '
                        'values (the first value is used as the numerator '
                        'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
                        default=1,
                        nargs='+',
                        action=parserCommon.requiredLength(1, 2),
                        type=float,
                        required=False)

    parser.add_argument('--skipZeroOverZero',
                        help='Skip bins where BOTH BAM files lack coverage. '
                        'This is determined BEFORE any applicable pseudocount '
                        'is added.',
                        action='store_true')

    parser.add_argument('--operation',
                        # FIX: removed the duplicated "the the" in this help.
                        help='The default is to output the log2ratio of the '
                        'two samples. The reciprocal ratio returns '
                        'the negative of the inverse of the ratio '
                        'if the ratio is less than 0. The resulting '
                        'values are interpreted as negative fold changes. '
                        'Instead of performing a '
                        'computation using both files, the scaled signal can '
                        'alternatively be output for the first or second file using '
                        'the \'--operation first\' or \'--operation second\' (Default: %(default)s)',
                        default='log2',
                        choices=['log2', 'ratio', 'subtract', 'add', 'mean',
                                 'reciprocal_ratio', 'first', 'second'],
                        required=False)

    parser.add_argument('--skipNonCoveredRegions', '--skipNAs',
                        help='This parameter determines if non-covered regions (regions without a score) '
                        'in the bigWig files should be skipped. The default is to treat those '
                        'regions as having a value of zero. '
                        'The decision to skip non-covered regions '
                        'depends on the interpretation of the data. Non-covered regions '
                        'in a bigWig file may represent repetitive regions that should '
                        'be skipped. Alternatively, the interpretation of non-covered regions as '
                        'zeros may be wrong and this option should be used ',
                        action='store_true')

    parser.add_argument('--fixedStep',
                        help='Write out all bins (of size --binSize) '
                        'instead of merging neighbouring bins with equal values.',
                        action='store_true')
    return parser
98
+
99
+
100
def getType(fname):
    """
    Guess a signal file's format from its file name suffix.

    Returns "wiggle" for .wig/.wiggle, "bedgraph" for a (lower-case)
    .bedgraph suffix, and "bigwig" for anything else.
    """
    if fname.endswith((".wig", ".wiggle")):
        return "wiggle"
    if fname.endswith(".bedgraph"):
        return "bedgraph"
    return "bigwig"
110
+
111
+
112
def main(args=None):
    """
    Entry point for bigwigCompare.

    Parses the command line, normalizes scale factors and pseudocounts,
    and streams the per-bin comparison values to the output file via
    writeBedGraph using getRatio as the per-tile function.

    :param args: optional list of command line arguments (defaults to
                 sys.argv when None)
    :raises argparse.ArgumentTypeError: when --scaleFactors does not
            contain one or two colon-separated values
    """
    args = parse_arguments().parse_args(args)

    if args.scaleFactors:
        scaleFactors = [float(x) for x in args.scaleFactors.split(":")]
        # FIX: validate the factor count up front (mirroring bigwigAverage).
        # The original code accepted any count and could later fail with an
        # obscure IndexError inside getRatio when fewer than two factors
        # were given.
        if len(scaleFactors) == 1:
            scaleFactors = scaleFactors * 2
        elif len(scaleFactors) != 2:
            raise argparse.ArgumentTypeError(
                "Format of scaleFactors is factor or factor1:factor2. "
                "The value given ( {} ) is not valid".format(args.scaleFactors))
    else:
        scaleFactors = [1, 1]

    if not isinstance(args.pseudocount, list):
        args.pseudocount = [args.pseudocount]

    # A single pseudocount applies to both numerator and denominator.
    if len(args.pseudocount) == 1:
        args.pseudocount *= 2

    # the getRatio function is called and receives
    # the function_args per each tile that is considered
    FUNC = getRatio
    function_args = {'valueType': args.operation,
                     'scaleFactors': scaleFactors,
                     'pseudocount': args.pseudocount}

    writeBedGraph_bam_and_bw.writeBedGraph(
        [(args.bigwig1, getType(args.bigwig1)),
         (args.bigwig2, getType(args.bigwig2))],
        args.outFileName, 0, FUNC,
        function_args, tileSize=args.binSize, region=args.region,
        blackListFileName=args.blackListFileName,
        verbose=args.verbose,
        numberOfProcessors=args.numberOfProcessors,
        skipZeroOverZero=args.skipZeroOverZero,
        format=args.outFileFormat,
        smoothLength=False,
        missingDataAsZero=not args.skipNonCoveredRegions,
        extendPairedEnds=False,
        fixedStep=args.fixedStep)
deepTools/source/deeptools/cm.py ADDED
@@ -0,0 +1,1088 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # This file comes from the seaborn project and is under a BSD license:
4
+
5
+ # Copyright (c) 2012-2019, Michael L. Waskom
6
+ # All rights reserved.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without
9
+ # modification, are permitted provided that the following conditions are met:
10
+ #
11
+ # * Redistributions of source code must retain the above copyright notice, this
12
+ # list of conditions and the following disclaimer.
13
+ #
14
+ # * Redistributions in binary form must reproduce the above copyright notice,
15
+ # this list of conditions and the following disclaimer in the documentation
16
+ # and/or other materials provided with the distribution.
17
+ #
18
+ # * Neither the name of the project nor the names of its
19
+ # contributors may be used to endorse or promote products derived from
20
+ # this software without specific prior written permission.
21
+ #
22
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
+
33
+ from matplotlib import colors, colormaps as mpl_cm
34
+
35
+
36
+ _rocket_lut = [
37
+ [0.01060815, 0.01808215, 0.10018654],
38
+ [0.01428972, 0.02048237, 0.10374486],
39
+ [0.01831941, 0.0229766, 0.10738511],
40
+ [0.02275049, 0.02554464, 0.11108639],
41
+ [0.02759119, 0.02818316, 0.11483751],
42
+ [0.03285175, 0.03088792, 0.11863035],
43
+ [0.03853466, 0.03365771, 0.12245873],
44
+ [0.04447016, 0.03648425, 0.12631831],
45
+ [0.05032105, 0.03936808, 0.13020508],
46
+ [0.05611171, 0.04224835, 0.13411624],
47
+ [0.0618531, 0.04504866, 0.13804929],
48
+ [0.06755457, 0.04778179, 0.14200206],
49
+ [0.0732236, 0.05045047, 0.14597263],
50
+ [0.0788708, 0.05305461, 0.14995981],
51
+ [0.08450105, 0.05559631, 0.15396203],
52
+ [0.09011319, 0.05808059, 0.15797687],
53
+ [0.09572396, 0.06050127, 0.16200507],
54
+ [0.10132312, 0.06286782, 0.16604287],
55
+ [0.10692823, 0.06517224, 0.17009175],
56
+ [0.1125315, 0.06742194, 0.17414848],
57
+ [0.11813947, 0.06961499, 0.17821272],
58
+ [0.12375803, 0.07174938, 0.18228425],
59
+ [0.12938228, 0.07383015, 0.18636053],
60
+ [0.13501631, 0.07585609, 0.19044109],
61
+ [0.14066867, 0.0778224, 0.19452676],
62
+ [0.14633406, 0.07973393, 0.1986151],
63
+ [0.15201338, 0.08159108, 0.20270523],
64
+ [0.15770877, 0.08339312, 0.20679668],
65
+ [0.16342174, 0.0851396, 0.21088893],
66
+ [0.16915387, 0.08682996, 0.21498104],
67
+ [0.17489524, 0.08848235, 0.2190294],
68
+ [0.18065495, 0.09009031, 0.22303512],
69
+ [0.18643324, 0.09165431, 0.22699705],
70
+ [0.19223028, 0.09317479, 0.23091409],
71
+ [0.19804623, 0.09465217, 0.23478512],
72
+ [0.20388117, 0.09608689, 0.23860907],
73
+ [0.20973515, 0.09747934, 0.24238489],
74
+ [0.21560818, 0.09882993, 0.24611154],
75
+ [0.22150014, 0.10013944, 0.2497868],
76
+ [0.22741085, 0.10140876, 0.25340813],
77
+ [0.23334047, 0.10263737, 0.25697736],
78
+ [0.23928891, 0.10382562, 0.2604936],
79
+ [0.24525608, 0.10497384, 0.26395596],
80
+ [0.25124182, 0.10608236, 0.26736359],
81
+ [0.25724602, 0.10715148, 0.27071569],
82
+ [0.26326851, 0.1081815, 0.27401148],
83
+ [0.26930915, 0.1091727, 0.2772502],
84
+ [0.27536766, 0.11012568, 0.28043021],
85
+ [0.28144375, 0.11104133, 0.2835489],
86
+ [0.2875374, 0.11191896, 0.28660853],
87
+ [0.29364846, 0.11275876, 0.2896085],
88
+ [0.29977678, 0.11356089, 0.29254823],
89
+ [0.30592213, 0.11432553, 0.29542718],
90
+ [0.31208435, 0.11505284, 0.29824485],
91
+ [0.31826327, 0.1157429, 0.30100076],
92
+ [0.32445869, 0.11639585, 0.30369448],
93
+ [0.33067031, 0.11701189, 0.30632563],
94
+ [0.33689808, 0.11759095, 0.3088938],
95
+ [0.34314168, 0.11813362, 0.31139721],
96
+ [0.34940101, 0.11863987, 0.3138355],
97
+ [0.355676, 0.11910909, 0.31620996],
98
+ [0.36196644, 0.1195413, 0.31852037],
99
+ [0.36827206, 0.11993653, 0.32076656],
100
+ [0.37459292, 0.12029443, 0.32294825],
101
+ [0.38092887, 0.12061482, 0.32506528],
102
+ [0.38727975, 0.12089756, 0.3271175],
103
+ [0.39364518, 0.12114272, 0.32910494],
104
+ [0.40002537, 0.12134964, 0.33102734],
105
+ [0.40642019, 0.12151801, 0.33288464],
106
+ [0.41282936, 0.12164769, 0.33467689],
107
+ [0.41925278, 0.12173833, 0.33640407],
108
+ [0.42569057, 0.12178916, 0.33806605],
109
+ [0.43214263, 0.12179973, 0.33966284],
110
+ [0.43860848, 0.12177004, 0.34119475],
111
+ [0.44508855, 0.12169883, 0.34266151],
112
+ [0.45158266, 0.12158557, 0.34406324],
113
+ [0.45809049, 0.12142996, 0.34540024],
114
+ [0.46461238, 0.12123063, 0.34667231],
115
+ [0.47114798, 0.12098721, 0.34787978],
116
+ [0.47769736, 0.12069864, 0.34902273],
117
+ [0.48426077, 0.12036349, 0.35010104],
118
+ [0.49083761, 0.11998161, 0.35111537],
119
+ [0.49742847, 0.11955087, 0.35206533],
120
+ [0.50403286, 0.11907081, 0.35295152],
121
+ [0.51065109, 0.11853959, 0.35377385],
122
+ [0.51728314, 0.1179558, 0.35453252],
123
+ [0.52392883, 0.11731817, 0.35522789],
124
+ [0.53058853, 0.11662445, 0.35585982],
125
+ [0.53726173, 0.11587369, 0.35642903],
126
+ [0.54394898, 0.11506307, 0.35693521],
127
+ [0.5506426, 0.11420757, 0.35737863],
128
+ [0.55734473, 0.11330456, 0.35775059],
129
+ [0.56405586, 0.11235265, 0.35804813],
130
+ [0.57077365, 0.11135597, 0.35827146],
131
+ [0.5774991, 0.11031233, 0.35841679],
132
+ [0.58422945, 0.10922707, 0.35848469],
133
+ [0.59096382, 0.10810205, 0.35847347],
134
+ [0.59770215, 0.10693774, 0.35838029],
135
+ [0.60444226, 0.10573912, 0.35820487],
136
+ [0.61118304, 0.10450943, 0.35794557],
137
+ [0.61792306, 0.10325288, 0.35760108],
138
+ [0.62466162, 0.10197244, 0.35716891],
139
+ [0.63139686, 0.10067417, 0.35664819],
140
+ [0.63812122, 0.09938212, 0.35603757],
141
+ [0.64483795, 0.0980891, 0.35533555],
142
+ [0.65154562, 0.09680192, 0.35454107],
143
+ [0.65824241, 0.09552918, 0.3536529],
144
+ [0.66492652, 0.09428017, 0.3526697],
145
+ [0.67159578, 0.09306598, 0.35159077],
146
+ [0.67824099, 0.09192342, 0.3504148],
147
+ [0.684863, 0.09085633, 0.34914061],
148
+ [0.69146268, 0.0898675, 0.34776864],
149
+ [0.69803757, 0.08897226, 0.3462986],
150
+ [0.70457834, 0.0882129, 0.34473046],
151
+ [0.71108138, 0.08761223, 0.3430635],
152
+ [0.7175507, 0.08716212, 0.34129974],
153
+ [0.72398193, 0.08688725, 0.33943958],
154
+ [0.73035829, 0.0868623, 0.33748452],
155
+ [0.73669146, 0.08704683, 0.33543669],
156
+ [0.74297501, 0.08747196, 0.33329799],
157
+ [0.74919318, 0.08820542, 0.33107204],
158
+ [0.75535825, 0.08919792, 0.32876184],
159
+ [0.76145589, 0.09050716, 0.32637117],
160
+ [0.76748424, 0.09213602, 0.32390525],
161
+ [0.77344838, 0.09405684, 0.32136808],
162
+ [0.77932641, 0.09634794, 0.31876642],
163
+ [0.78513609, 0.09892473, 0.31610488],
164
+ [0.79085854, 0.10184672, 0.313391],
165
+ [0.7965014, 0.10506637, 0.31063031],
166
+ [0.80205987, 0.10858333, 0.30783],
167
+ [0.80752799, 0.11239964, 0.30499738],
168
+ [0.81291606, 0.11645784, 0.30213802],
169
+ [0.81820481, 0.12080606, 0.29926105],
170
+ [0.82341472, 0.12535343, 0.2963705],
171
+ [0.82852822, 0.13014118, 0.29347474],
172
+ [0.83355779, 0.13511035, 0.29057852],
173
+ [0.83850183, 0.14025098, 0.2876878],
174
+ [0.84335441, 0.14556683, 0.28480819],
175
+ [0.84813096, 0.15099892, 0.281943],
176
+ [0.85281737, 0.15657772, 0.27909826],
177
+ [0.85742602, 0.1622583, 0.27627462],
178
+ [0.86196552, 0.16801239, 0.27346473],
179
+ [0.86641628, 0.17387796, 0.27070818],
180
+ [0.87079129, 0.17982114, 0.26797378],
181
+ [0.87507281, 0.18587368, 0.26529697],
182
+ [0.87925878, 0.19203259, 0.26268136],
183
+ [0.8833417, 0.19830556, 0.26014181],
184
+ [0.88731387, 0.20469941, 0.25769539],
185
+ [0.89116859, 0.21121788, 0.2553592],
186
+ [0.89490337, 0.21785614, 0.25314362],
187
+ [0.8985026, 0.22463251, 0.25108745],
188
+ [0.90197527, 0.23152063, 0.24918223],
189
+ [0.90530097, 0.23854541, 0.24748098],
190
+ [0.90848638, 0.24568473, 0.24598324],
191
+ [0.911533, 0.25292623, 0.24470258],
192
+ [0.9144225, 0.26028902, 0.24369359],
193
+ [0.91717106, 0.26773821, 0.24294137],
194
+ [0.91978131, 0.27526191, 0.24245973],
195
+ [0.92223947, 0.28287251, 0.24229568],
196
+ [0.92456587, 0.29053388, 0.24242622],
197
+ [0.92676657, 0.29823282, 0.24285536],
198
+ [0.92882964, 0.30598085, 0.24362274],
199
+ [0.93078135, 0.31373977, 0.24468803],
200
+ [0.93262051, 0.3215093, 0.24606461],
201
+ [0.93435067, 0.32928362, 0.24775328],
202
+ [0.93599076, 0.33703942, 0.24972157],
203
+ [0.93752831, 0.34479177, 0.25199928],
204
+ [0.93899289, 0.35250734, 0.25452808],
205
+ [0.94036561, 0.36020899, 0.25734661],
206
+ [0.94167588, 0.36786594, 0.2603949],
207
+ [0.94291042, 0.37549479, 0.26369821],
208
+ [0.94408513, 0.3830811, 0.26722004],
209
+ [0.94520419, 0.39062329, 0.27094924],
210
+ [0.94625977, 0.39813168, 0.27489742],
211
+ [0.94727016, 0.4055909, 0.27902322],
212
+ [0.94823505, 0.41300424, 0.28332283],
213
+ [0.94914549, 0.42038251, 0.28780969],
214
+ [0.95001704, 0.42771398, 0.29244728],
215
+ [0.95085121, 0.43500005, 0.29722817],
216
+ [0.95165009, 0.44224144, 0.30214494],
217
+ [0.9524044, 0.44944853, 0.3072105],
218
+ [0.95312556, 0.45661389, 0.31239776],
219
+ [0.95381595, 0.46373781, 0.31769923],
220
+ [0.95447591, 0.47082238, 0.32310953],
221
+ [0.95510255, 0.47787236, 0.32862553],
222
+ [0.95569679, 0.48489115, 0.33421404],
223
+ [0.95626788, 0.49187351, 0.33985601],
224
+ [0.95681685, 0.49882008, 0.34555431],
225
+ [0.9573439, 0.50573243, 0.35130912],
226
+ [0.95784842, 0.51261283, 0.35711942],
227
+ [0.95833051, 0.51946267, 0.36298589],
228
+ [0.95879054, 0.52628305, 0.36890904],
229
+ [0.95922872, 0.53307513, 0.3748895],
230
+ [0.95964538, 0.53983991, 0.38092784],
231
+ [0.96004345, 0.54657593, 0.3870292],
232
+ [0.96042097, 0.55328624, 0.39319057],
233
+ [0.96077819, 0.55997184, 0.39941173],
234
+ [0.9611152, 0.5666337, 0.40569343],
235
+ [0.96143273, 0.57327231, 0.41203603],
236
+ [0.96173392, 0.57988594, 0.41844491],
237
+ [0.96201757, 0.58647675, 0.42491751],
238
+ [0.96228344, 0.59304598, 0.43145271],
239
+ [0.96253168, 0.5995944, 0.43805131],
240
+ [0.96276513, 0.60612062, 0.44471698],
241
+ [0.96298491, 0.6126247, 0.45145074],
242
+ [0.96318967, 0.61910879, 0.45824902],
243
+ [0.96337949, 0.6255736, 0.46511271],
244
+ [0.96355923, 0.63201624, 0.47204746],
245
+ [0.96372785, 0.63843852, 0.47905028],
246
+ [0.96388426, 0.64484214, 0.4861196],
247
+ [0.96403203, 0.65122535, 0.4932578],
248
+ [0.96417332, 0.65758729, 0.50046894],
249
+ [0.9643063, 0.66393045, 0.5077467],
250
+ [0.96443322, 0.67025402, 0.51509334],
251
+ [0.96455845, 0.67655564, 0.52251447],
252
+ [0.96467922, 0.68283846, 0.53000231],
253
+ [0.96479861, 0.68910113, 0.53756026],
254
+ [0.96492035, 0.69534192, 0.5451917],
255
+ [0.96504223, 0.7015636, 0.5528892],
256
+ [0.96516917, 0.70776351, 0.5606593],
257
+ [0.96530224, 0.71394212, 0.56849894],
258
+ [0.96544032, 0.72010124, 0.57640375],
259
+ [0.96559206, 0.72623592, 0.58438387],
260
+ [0.96575293, 0.73235058, 0.59242739],
261
+ [0.96592829, 0.73844258, 0.60053991],
262
+ [0.96612013, 0.74451182, 0.60871954],
263
+ [0.96632832, 0.75055966, 0.61696136],
264
+ [0.96656022, 0.75658231, 0.62527295],
265
+ [0.96681185, 0.76258381, 0.63364277],
266
+ [0.96709183, 0.76855969, 0.64207921],
267
+ [0.96739773, 0.77451297, 0.65057302],
268
+ [0.96773482, 0.78044149, 0.65912731],
269
+ [0.96810471, 0.78634563, 0.66773889],
270
+ [0.96850919, 0.79222565, 0.6764046],
271
+ [0.96893132, 0.79809112, 0.68512266],
272
+ [0.96935926, 0.80395415, 0.69383201],
273
+ [0.9698028, 0.80981139, 0.70252255],
274
+ [0.97025511, 0.81566605, 0.71120296],
275
+ [0.97071849, 0.82151775, 0.71987163],
276
+ [0.97120159, 0.82736371, 0.72851999],
277
+ [0.97169389, 0.83320847, 0.73716071],
278
+ [0.97220061, 0.83905052, 0.74578903],
279
+ [0.97272597, 0.84488881, 0.75440141],
280
+ [0.97327085, 0.85072354, 0.76299805],
281
+ [0.97383206, 0.85655639, 0.77158353],
282
+ [0.97441222, 0.86238689, 0.78015619],
283
+ [0.97501782, 0.86821321, 0.78871034],
284
+ [0.97564391, 0.87403763, 0.79725261],
285
+ [0.97628674, 0.87986189, 0.8057883],
286
+ [0.97696114, 0.88568129, 0.81430324],
287
+ [0.97765722, 0.89149971, 0.82280948],
288
+ [0.97837585, 0.89731727, 0.83130786],
289
+ [0.97912374, 0.90313207, 0.83979337],
290
+ [0.979891, 0.90894778, 0.84827858],
291
+ [0.98067764, 0.91476465, 0.85676611],
292
+ [0.98137749, 0.92061729, 0.86536915]
293
+ ]
294
+
295
+
296
+ _mako_lut = [
297
+ [0.04503935, 0.01482344, 0.02092227],
298
+ [0.04933018, 0.01709292, 0.02535719],
299
+ [0.05356262, 0.01950702, 0.03018802],
300
+ [0.05774337, 0.02205989, 0.03545515],
301
+ [0.06188095, 0.02474764, 0.04115287],
302
+ [0.06598247, 0.0275665, 0.04691409],
303
+ [0.07005374, 0.03051278, 0.05264306],
304
+ [0.07409947, 0.03358324, 0.05834631],
305
+ [0.07812339, 0.03677446, 0.06403249],
306
+ [0.08212852, 0.0400833, 0.06970862],
307
+ [0.08611731, 0.04339148, 0.07538208],
308
+ [0.09009161, 0.04664706, 0.08105568],
309
+ [0.09405308, 0.04985685, 0.08673591],
310
+ [0.09800301, 0.05302279, 0.09242646],
311
+ [0.10194255, 0.05614641, 0.09813162],
312
+ [0.10587261, 0.05922941, 0.103854],
313
+ [0.1097942, 0.06227277, 0.10959847],
314
+ [0.11370826, 0.06527747, 0.11536893],
315
+ [0.11761516, 0.06824548, 0.12116393],
316
+ [0.12151575, 0.07117741, 0.12698763],
317
+ [0.12541095, 0.07407363, 0.1328442],
318
+ [0.12930083, 0.07693611, 0.13873064],
319
+ [0.13317849, 0.07976988, 0.14465095],
320
+ [0.13701138, 0.08259683, 0.15060265],
321
+ [0.14079223, 0.08542126, 0.15659379],
322
+ [0.14452486, 0.08824175, 0.16262484],
323
+ [0.14820351, 0.09106304, 0.16869476],
324
+ [0.15183185, 0.09388372, 0.17480366],
325
+ [0.15540398, 0.09670855, 0.18094993],
326
+ [0.15892417, 0.09953561, 0.18713384],
327
+ [0.16238588, 0.10236998, 0.19335329],
328
+ [0.16579435, 0.10520905, 0.19960847],
329
+ [0.16914226, 0.10805832, 0.20589698],
330
+ [0.17243586, 0.11091443, 0.21221911],
331
+ [0.17566717, 0.11378321, 0.21857219],
332
+ [0.17884322, 0.11666074, 0.2249565],
333
+ [0.18195582, 0.11955283, 0.23136943],
334
+ [0.18501213, 0.12245547, 0.23781116],
335
+ [0.18800459, 0.12537395, 0.24427914],
336
+ [0.19093944, 0.1283047, 0.25077369],
337
+ [0.19381092, 0.13125179, 0.25729255],
338
+ [0.19662307, 0.13421303, 0.26383543],
339
+ [0.19937337, 0.13719028, 0.27040111],
340
+ [0.20206187, 0.14018372, 0.27698891],
341
+ [0.20469116, 0.14319196, 0.28359861],
342
+ [0.20725547, 0.14621882, 0.29022775],
343
+ [0.20976258, 0.14925954, 0.29687795],
344
+ [0.21220409, 0.15231929, 0.30354703],
345
+ [0.21458611, 0.15539445, 0.31023563],
346
+ [0.21690827, 0.15848519, 0.31694355],
347
+ [0.21916481, 0.16159489, 0.32366939],
348
+ [0.2213631, 0.16471913, 0.33041431],
349
+ [0.22349947, 0.1678599, 0.33717781],
350
+ [0.2255714, 0.1710185, 0.34395925],
351
+ [0.22758415, 0.17419169, 0.35075983],
352
+ [0.22953569, 0.17738041, 0.35757941],
353
+ [0.23142077, 0.18058733, 0.3644173],
354
+ [0.2332454, 0.18380872, 0.37127514],
355
+ [0.2350092, 0.18704459, 0.3781528],
356
+ [0.23670785, 0.190297, 0.38504973],
357
+ [0.23834119, 0.19356547, 0.39196711],
358
+ [0.23991189, 0.19684817, 0.39890581],
359
+ [0.24141903, 0.20014508, 0.4058667],
360
+ [0.24286214, 0.20345642, 0.4128484],
361
+ [0.24423453, 0.20678459, 0.41985299],
362
+ [0.24554109, 0.21012669, 0.42688124],
363
+ [0.2467815, 0.21348266, 0.43393244],
364
+ [0.24795393, 0.21685249, 0.4410088],
365
+ [0.24905614, 0.22023618, 0.448113],
366
+ [0.25007383, 0.22365053, 0.45519562],
367
+ [0.25098926, 0.22710664, 0.46223892],
368
+ [0.25179696, 0.23060342, 0.46925447],
369
+ [0.25249346, 0.23414353, 0.47623196],
370
+ [0.25307401, 0.23772973, 0.48316271],
371
+ [0.25353152, 0.24136961, 0.49001976],
372
+ [0.25386167, 0.24506548, 0.49679407],
373
+ [0.25406082, 0.2488164, 0.50348932],
374
+ [0.25412435, 0.25262843, 0.51007843],
375
+ [0.25404842, 0.25650743, 0.51653282],
376
+ [0.25383134, 0.26044852, 0.52286845],
377
+ [0.2534705, 0.26446165, 0.52903422],
378
+ [0.25296722, 0.2685428, 0.53503572],
379
+ [0.2523226, 0.27269346, 0.54085315],
380
+ [0.25153974, 0.27691629, 0.54645752],
381
+ [0.25062402, 0.28120467, 0.55185939],
382
+ [0.24958205, 0.28556371, 0.55701246],
383
+ [0.24842386, 0.28998148, 0.56194601],
384
+ [0.24715928, 0.29446327, 0.56660884],
385
+ [0.24580099, 0.29899398, 0.57104399],
386
+ [0.24436202, 0.30357852, 0.57519929],
387
+ [0.24285591, 0.30819938, 0.57913247],
388
+ [0.24129828, 0.31286235, 0.58278615],
389
+ [0.23970131, 0.3175495, 0.5862272],
390
+ [0.23807973, 0.32226344, 0.58941872],
391
+ [0.23644557, 0.32699241, 0.59240198],
392
+ [0.2348113, 0.33173196, 0.59518282],
393
+ [0.23318874, 0.33648036, 0.59775543],
394
+ [0.2315855, 0.34122763, 0.60016456],
395
+ [0.23001121, 0.34597357, 0.60240251],
396
+ [0.2284748, 0.35071512, 0.6044784],
397
+ [0.22698081, 0.35544612, 0.60642528],
398
+ [0.22553305, 0.36016515, 0.60825252],
399
+ [0.22413977, 0.36487341, 0.60994938],
400
+ [0.22280246, 0.36956728, 0.61154118],
401
+ [0.22152555, 0.37424409, 0.61304472],
402
+ [0.22030752, 0.37890437, 0.61446646],
403
+ [0.2191538, 0.38354668, 0.61581561],
404
+ [0.21806257, 0.38817169, 0.61709794],
405
+ [0.21703799, 0.39277882, 0.61831922],
406
+ [0.21607792, 0.39736958, 0.61948028],
407
+ [0.21518463, 0.40194196, 0.62059763],
408
+ [0.21435467, 0.40649717, 0.62167507],
409
+ [0.21358663, 0.41103579, 0.62271724],
410
+ [0.21288172, 0.41555771, 0.62373011],
411
+ [0.21223835, 0.42006355, 0.62471794],
412
+ [0.21165312, 0.42455441, 0.62568371],
413
+ [0.21112526, 0.42903064, 0.6266318],
414
+ [0.21065161, 0.43349321, 0.62756504],
415
+ [0.21023306, 0.43794288, 0.62848279],
416
+ [0.20985996, 0.44238227, 0.62938329],
417
+ [0.20951045, 0.44680966, 0.63030696],
418
+ [0.20916709, 0.45122981, 0.63124483],
419
+ [0.20882976, 0.45564335, 0.63219599],
420
+ [0.20849798, 0.46005094, 0.63315928],
421
+ [0.20817199, 0.46445309, 0.63413391],
422
+ [0.20785149, 0.46885041, 0.63511876],
423
+ [0.20753716, 0.47324327, 0.63611321],
424
+ [0.20722876, 0.47763224, 0.63711608],
425
+ [0.20692679, 0.48201774, 0.63812656],
426
+ [0.20663156, 0.48640018, 0.63914367],
427
+ [0.20634336, 0.49078002, 0.64016638],
428
+ [0.20606303, 0.49515755, 0.6411939],
429
+ [0.20578999, 0.49953341, 0.64222457],
430
+ [0.20552612, 0.50390766, 0.64325811],
431
+ [0.20527189, 0.50828072, 0.64429331],
432
+ [0.20502868, 0.51265277, 0.64532947],
433
+ [0.20479718, 0.51702417, 0.64636539],
434
+ [0.20457804, 0.52139527, 0.64739979],
435
+ [0.20437304, 0.52576622, 0.64843198],
436
+ [0.20418396, 0.53013715, 0.64946117],
437
+ [0.20401238, 0.53450825, 0.65048638],
438
+ [0.20385896, 0.53887991, 0.65150606],
439
+ [0.20372653, 0.54325208, 0.65251978],
440
+ [0.20361709, 0.5476249, 0.6535266],
441
+ [0.20353258, 0.55199854, 0.65452542],
442
+ [0.20347472, 0.55637318, 0.655515],
443
+ [0.20344718, 0.56074869, 0.65649508],
444
+ [0.20345161, 0.56512531, 0.65746419],
445
+ [0.20349089, 0.56950304, 0.65842151],
446
+ [0.20356842, 0.57388184, 0.65936642],
447
+ [0.20368663, 0.57826181, 0.66029768],
448
+ [0.20384884, 0.58264293, 0.6612145],
449
+ [0.20405904, 0.58702506, 0.66211645],
450
+ [0.20431921, 0.59140842, 0.66300179],
451
+ [0.20463464, 0.59579264, 0.66387079],
452
+ [0.20500731, 0.60017798, 0.66472159],
453
+ [0.20544449, 0.60456387, 0.66555409],
454
+ [0.20596097, 0.60894927, 0.66636568],
455
+ [0.20654832, 0.61333521, 0.66715744],
456
+ [0.20721003, 0.61772167, 0.66792838],
457
+ [0.20795035, 0.62210845, 0.66867802],
458
+ [0.20877302, 0.62649546, 0.66940555],
459
+ [0.20968223, 0.63088252, 0.6701105],
460
+ [0.21068163, 0.63526951, 0.67079211],
461
+ [0.21177544, 0.63965621, 0.67145005],
462
+ [0.21298582, 0.64404072, 0.67208182],
463
+ [0.21430361, 0.64842404, 0.67268861],
464
+ [0.21572716, 0.65280655, 0.67326978],
465
+ [0.21726052, 0.65718791, 0.6738255],
466
+ [0.21890636, 0.66156803, 0.67435491],
467
+ [0.220668, 0.66594665, 0.67485792],
468
+ [0.22255447, 0.67032297, 0.67533374],
469
+ [0.22458372, 0.67469531, 0.67578061],
470
+ [0.22673713, 0.67906542, 0.67620044],
471
+ [0.22901625, 0.6834332, 0.67659251],
472
+ [0.23142316, 0.68779836, 0.67695703],
473
+ [0.23395924, 0.69216072, 0.67729378],
474
+ [0.23663857, 0.69651881, 0.67760151],
475
+ [0.23946645, 0.70087194, 0.67788018],
476
+ [0.24242624, 0.70522162, 0.67813088],
477
+ [0.24549008, 0.70957083, 0.67835215],
478
+ [0.24863372, 0.71392166, 0.67854868],
479
+ [0.25187832, 0.71827158, 0.67872193],
480
+ [0.25524083, 0.72261873, 0.67887024],
481
+ [0.25870947, 0.72696469, 0.67898912],
482
+ [0.26229238, 0.73130855, 0.67907645],
483
+ [0.26604085, 0.73564353, 0.67914062],
484
+ [0.26993099, 0.73997282, 0.67917264],
485
+ [0.27397488, 0.74429484, 0.67917096],
486
+ [0.27822463, 0.74860229, 0.67914468],
487
+ [0.28264201, 0.75290034, 0.67907959],
488
+ [0.2873016, 0.75717817, 0.67899164],
489
+ [0.29215894, 0.76144162, 0.67886578],
490
+ [0.29729823, 0.76567816, 0.67871894],
491
+ [0.30268199, 0.76989232, 0.67853896],
492
+ [0.30835665, 0.77407636, 0.67833512],
493
+ [0.31435139, 0.77822478, 0.67811118],
494
+ [0.3206671, 0.78233575, 0.67786729],
495
+ [0.32733158, 0.78640315, 0.67761027],
496
+ [0.33437168, 0.79042043, 0.67734882],
497
+ [0.34182112, 0.79437948, 0.67709394],
498
+ [0.34968889, 0.79827511, 0.67685638],
499
+ [0.35799244, 0.80210037, 0.67664969],
500
+ [0.36675371, 0.80584651, 0.67649539],
501
+ [0.3759816, 0.80950627, 0.67641393],
502
+ [0.38566792, 0.81307432, 0.67642947],
503
+ [0.39579804, 0.81654592, 0.67656899],
504
+ [0.40634556, 0.81991799, 0.67686215],
505
+ [0.41730243, 0.82318339, 0.67735255],
506
+ [0.4285828, 0.82635051, 0.6780564],
507
+ [0.44012728, 0.82942353, 0.67900049],
508
+ [0.45189421, 0.83240398, 0.68021733],
509
+ [0.46378379, 0.83530763, 0.6817062],
510
+ [0.47573199, 0.83814472, 0.68347352],
511
+ [0.48769865, 0.84092197, 0.68552698],
512
+ [0.49962354, 0.84365379, 0.68783929],
513
+ [0.5114027, 0.8463718, 0.69029789],
514
+ [0.52301693, 0.84908401, 0.69288545],
515
+ [0.53447549, 0.85179048, 0.69561066],
516
+ [0.54578602, 0.8544913, 0.69848331],
517
+ [0.55695565, 0.85718723, 0.70150427],
518
+ [0.56798832, 0.85987893, 0.70468261],
519
+ [0.57888639, 0.86256715, 0.70802931],
520
+ [0.5896541, 0.8652532, 0.71154204],
521
+ [0.60028928, 0.86793835, 0.71523675],
522
+ [0.61079441, 0.87062438, 0.71910895],
523
+ [0.62116633, 0.87331311, 0.72317003],
524
+ [0.63140509, 0.87600675, 0.72741689],
525
+ [0.64150735, 0.87870746, 0.73185717],
526
+ [0.65147219, 0.8814179, 0.73648495],
527
+ [0.66129632, 0.8841403, 0.74130658],
528
+ [0.67097934, 0.88687758, 0.74631123],
529
+ [0.68051833, 0.88963189, 0.75150483],
530
+ [0.68991419, 0.89240612, 0.75687187],
531
+ [0.69916533, 0.89520211, 0.76241714],
532
+ [0.70827373, 0.89802257, 0.76812286],
533
+ [0.71723995, 0.90086891, 0.77399039],
534
+ [0.72606665, 0.90374337, 0.7800041],
535
+ [0.73475675, 0.90664718, 0.78615802],
536
+ [0.74331358, 0.90958151, 0.79244474],
537
+ [0.75174143, 0.91254787, 0.79884925],
538
+ [0.76004473, 0.91554656, 0.80536823],
539
+ [0.76827704, 0.91856549, 0.81196513],
540
+ [0.77647029, 0.921603, 0.81855729],
541
+ [0.78462009, 0.92466151, 0.82514119],
542
+ [0.79273542, 0.92773848, 0.83172131],
543
+ [0.8008109, 0.93083672, 0.83829355],
544
+ [0.80885107, 0.93395528, 0.84485982],
545
+ [0.81685878, 0.9370938, 0.85142101],
546
+ [0.82483206, 0.94025378, 0.8579751],
547
+ [0.83277661, 0.94343371, 0.86452477],
548
+ [0.84069127, 0.94663473, 0.87106853],
549
+ [0.84857662, 0.9498573, 0.8776059],
550
+ [0.8564431, 0.95309792, 0.88414253],
551
+ [0.86429066, 0.95635719, 0.89067759],
552
+ [0.87218969, 0.95960708, 0.89725384]
553
+ ]
554
+
555
+
556
+ _vlag_lut = [
557
+ [0.13850039, 0.41331206, 0.74052025],
558
+ [0.15077609, 0.41762684, 0.73970427],
559
+ [0.16235219, 0.4219191, 0.7389667],
560
+ [0.1733322, 0.42619024, 0.73832537],
561
+ [0.18382538, 0.43044226, 0.73776764],
562
+ [0.19394034, 0.4346772, 0.73725867],
563
+ [0.20367115, 0.43889576, 0.73685314],
564
+ [0.21313625, 0.44310003, 0.73648045],
565
+ [0.22231173, 0.44729079, 0.73619681],
566
+ [0.23125148, 0.45146945, 0.73597803],
567
+ [0.23998101, 0.45563715, 0.7358223],
568
+ [0.24853358, 0.45979489, 0.73571524],
569
+ [0.25691416, 0.4639437, 0.73566943],
570
+ [0.26513894, 0.46808455, 0.73568319],
571
+ [0.27322194, 0.47221835, 0.73575497],
572
+ [0.28117543, 0.47634598, 0.73588332],
573
+ [0.28901021, 0.48046826, 0.73606686],
574
+ [0.2967358, 0.48458597, 0.73630433],
575
+ [0.30436071, 0.48869986, 0.73659451],
576
+ [0.3118955, 0.49281055, 0.73693255],
577
+ [0.31935389, 0.49691847, 0.73730851],
578
+ [0.32672701, 0.5010247, 0.73774013],
579
+ [0.33402607, 0.50512971, 0.73821941],
580
+ [0.34125337, 0.50923419, 0.73874905],
581
+ [0.34840921, 0.51333892, 0.73933402],
582
+ [0.35551826, 0.51744353, 0.73994642],
583
+ [0.3625676, 0.52154929, 0.74060763],
584
+ [0.36956356, 0.52565656, 0.74131327],
585
+ [0.37649902, 0.52976642, 0.74207698],
586
+ [0.38340273, 0.53387791, 0.74286286],
587
+ [0.39025859, 0.53799253, 0.7436962],
588
+ [0.39706821, 0.54211081, 0.744578],
589
+ [0.40384046, 0.54623277, 0.74549872],
590
+ [0.41058241, 0.55035849, 0.74645094],
591
+ [0.41728385, 0.55448919, 0.74745174],
592
+ [0.42395178, 0.55862494, 0.74849357],
593
+ [0.4305964, 0.56276546, 0.74956387],
594
+ [0.4372044, 0.56691228, 0.75068412],
595
+ [0.4437909, 0.57106468, 0.75183427],
596
+ [0.45035117, 0.5752235, 0.75302312],
597
+ [0.45687824, 0.57938983, 0.75426297],
598
+ [0.46339713, 0.58356191, 0.75551816],
599
+ [0.46988778, 0.58774195, 0.75682037],
600
+ [0.47635605, 0.59192986, 0.75816245],
601
+ [0.48281101, 0.5961252, 0.75953212],
602
+ [0.4892374, 0.60032986, 0.76095418],
603
+ [0.49566225, 0.60454154, 0.76238852],
604
+ [0.50206137, 0.60876307, 0.76387371],
605
+ [0.50845128, 0.61299312, 0.76538551],
606
+ [0.5148258, 0.61723272, 0.76693475],
607
+ [0.52118385, 0.62148236, 0.76852436],
608
+ [0.52753571, 0.62574126, 0.77013939],
609
+ [0.53386831, 0.63001125, 0.77180152],
610
+ [0.54020159, 0.63429038, 0.7734803],
611
+ [0.54651272, 0.63858165, 0.77521306],
612
+ [0.55282975, 0.64288207, 0.77695608],
613
+ [0.55912585, 0.64719519, 0.77875327],
614
+ [0.56542599, 0.65151828, 0.78056551],
615
+ [0.57170924, 0.65585426, 0.78242747],
616
+ [0.57799572, 0.6602009, 0.78430751],
617
+ [0.58426817, 0.66456073, 0.78623458],
618
+ [0.590544, 0.66893178, 0.78818117],
619
+ [0.59680758, 0.67331643, 0.79017369],
620
+ [0.60307553, 0.67771273, 0.79218572],
621
+ [0.60934065, 0.68212194, 0.79422987],
622
+ [0.61559495, 0.68654548, 0.7963202],
623
+ [0.62185554, 0.69098125, 0.79842918],
624
+ [0.62810662, 0.69543176, 0.80058381],
625
+ [0.63436425, 0.69989499, 0.80275812],
626
+ [0.64061445, 0.70437326, 0.80497621],
627
+ [0.6468706, 0.70886488, 0.80721641],
628
+ [0.65312213, 0.7133717, 0.80949719],
629
+ [0.65937818, 0.71789261, 0.81180392],
630
+ [0.66563334, 0.72242871, 0.81414642],
631
+ [0.67189155, 0.72697967, 0.81651872],
632
+ [0.67815314, 0.73154569, 0.81892097],
633
+ [0.68441395, 0.73612771, 0.82136094],
634
+ [0.69068321, 0.74072452, 0.82382353],
635
+ [0.69694776, 0.7453385, 0.82633199],
636
+ [0.70322431, 0.74996721, 0.8288583],
637
+ [0.70949595, 0.75461368, 0.83143221],
638
+ [0.7157774, 0.75927574, 0.83402904],
639
+ [0.72206299, 0.76395461, 0.83665922],
640
+ [0.72835227, 0.76865061, 0.8393242],
641
+ [0.73465238, 0.7733628, 0.84201224],
642
+ [0.74094862, 0.77809393, 0.84474951],
643
+ [0.74725683, 0.78284158, 0.84750915],
644
+ [0.75357103, 0.78760701, 0.85030217],
645
+ [0.75988961, 0.79239077, 0.85313207],
646
+ [0.76621987, 0.79719185, 0.85598668],
647
+ [0.77255045, 0.8020125, 0.85888658],
648
+ [0.77889241, 0.80685102, 0.86181298],
649
+ [0.78524572, 0.81170768, 0.86476656],
650
+ [0.79159841, 0.81658489, 0.86776906],
651
+ [0.79796459, 0.82148036, 0.8707962],
652
+ [0.80434168, 0.82639479, 0.87385315],
653
+ [0.8107221, 0.83132983, 0.87695392],
654
+ [0.81711301, 0.8362844, 0.88008641],
655
+ [0.82351479, 0.84125863, 0.88325045],
656
+ [0.82992772, 0.84625263, 0.88644594],
657
+ [0.83634359, 0.85126806, 0.8896878],
658
+ [0.84277295, 0.85630293, 0.89295721],
659
+ [0.84921192, 0.86135782, 0.89626076],
660
+ [0.85566206, 0.866432, 0.89959467],
661
+ [0.86211514, 0.87152627, 0.90297183],
662
+ [0.86857483, 0.87663856, 0.90638248],
663
+ [0.87504231, 0.88176648, 0.90981938],
664
+ [0.88151194, 0.88690782, 0.91328493],
665
+ [0.88797938, 0.89205857, 0.91677544],
666
+ [0.89443865, 0.89721298, 0.9202854],
667
+ [0.90088204, 0.90236294, 0.92380601],
668
+ [0.90729768, 0.90749778, 0.92732797],
669
+ [0.91367037, 0.91260329, 0.93083814],
670
+ [0.91998105, 0.91766106, 0.93431861],
671
+ [0.92620596, 0.92264789, 0.93774647],
672
+ [0.93231683, 0.9275351, 0.94109192],
673
+ [0.93827772, 0.9322888, 0.94432312],
674
+ [0.94404755, 0.93686925, 0.94740137],
675
+ [0.94958284, 0.94123072, 0.95027696],
676
+ [0.95482682, 0.9453245, 0.95291103],
677
+ [0.9597248, 0.94909728, 0.95525103],
678
+ [0.96422552, 0.95249273, 0.95723271],
679
+ [0.96826161, 0.95545812, 0.95882188],
680
+ [0.97178458, 0.95793984, 0.95995705],
681
+ [0.97474105, 0.95989142, 0.96059997],
682
+ [0.97708604, 0.96127366, 0.96071853],
683
+ [0.97877855, 0.96205832, 0.96030095],
684
+ [0.97978484, 0.96222949, 0.95935496],
685
+ [0.9805997, 0.96155216, 0.95813083],
686
+ [0.98152619, 0.95993719, 0.95639322],
687
+ [0.9819726, 0.95766608, 0.95399269],
688
+ [0.98191855, 0.9547873, 0.95098107],
689
+ [0.98138514, 0.95134771, 0.94740644],
690
+ [0.98040845, 0.94739906, 0.94332125],
691
+ [0.97902107, 0.94300131, 0.93878672],
692
+ [0.97729348, 0.93820409, 0.93385135],
693
+ [0.9752533, 0.933073, 0.92858252],
694
+ [0.97297834, 0.92765261, 0.92302309],
695
+ [0.97049104, 0.92200317, 0.91723505],
696
+ [0.96784372, 0.91616744, 0.91126063],
697
+ [0.96507281, 0.91018664, 0.90514124],
698
+ [0.96222034, 0.90409203, 0.89890756],
699
+ [0.9593079, 0.89791478, 0.89259122],
700
+ [0.95635626, 0.89167908, 0.88621654],
701
+ [0.95338303, 0.88540373, 0.87980238],
702
+ [0.95040174, 0.87910333, 0.87336339],
703
+ [0.94742246, 0.87278899, 0.86691076],
704
+ [0.94445249, 0.86646893, 0.86045277],
705
+ [0.94150476, 0.86014606, 0.85399191],
706
+ [0.93857394, 0.85382798, 0.84753642],
707
+ [0.93566206, 0.84751766, 0.84108935],
708
+ [0.93277194, 0.8412164, 0.83465197],
709
+ [0.92990106, 0.83492672, 0.82822708],
710
+ [0.92704736, 0.82865028, 0.82181656],
711
+ [0.92422703, 0.82238092, 0.81541333],
712
+ [0.92142581, 0.81612448, 0.80902415],
713
+ [0.91864501, 0.80988032, 0.80264838],
714
+ [0.91587578, 0.80365187, 0.79629001],
715
+ [0.9131367, 0.79743115, 0.78994],
716
+ [0.91041602, 0.79122265, 0.78360361],
717
+ [0.90771071, 0.78502727, 0.77728196],
718
+ [0.90501581, 0.77884674, 0.7709771],
719
+ [0.90235365, 0.77267117, 0.76467793],
720
+ [0.8997019, 0.76650962, 0.75839484],
721
+ [0.89705346, 0.76036481, 0.752131],
722
+ [0.89444021, 0.75422253, 0.74587047],
723
+ [0.89183355, 0.74809474, 0.73962689],
724
+ [0.88923216, 0.74198168, 0.73340061],
725
+ [0.88665892, 0.73587283, 0.72717995],
726
+ [0.88408839, 0.72977904, 0.72097718],
727
+ [0.88153537, 0.72369332, 0.71478461],
728
+ [0.87899389, 0.7176179, 0.70860487],
729
+ [0.87645157, 0.71155805, 0.7024439],
730
+ [0.8739399, 0.70549893, 0.6962854],
731
+ [0.87142626, 0.6994551, 0.69014561],
732
+ [0.8689268, 0.69341868, 0.68401597],
733
+ [0.86643562, 0.687392, 0.67789917],
734
+ [0.86394434, 0.68137863, 0.67179927],
735
+ [0.86147586, 0.67536728, 0.665704],
736
+ [0.85899928, 0.66937226, 0.6596292],
737
+ [0.85654668, 0.66337773, 0.6535577],
738
+ [0.85408818, 0.65739772, 0.64750494],
739
+ [0.85164413, 0.65142189, 0.64145983],
740
+ [0.84920091, 0.6454565, 0.63542932],
741
+ [0.84676427, 0.63949827, 0.62941],
742
+ [0.84433231, 0.63354773, 0.62340261],
743
+ [0.84190106, 0.62760645, 0.61740899],
744
+ [0.83947935, 0.62166951, 0.61142404],
745
+ [0.8370538, 0.61574332, 0.60545478],
746
+ [0.83463975, 0.60981951, 0.59949247],
747
+ [0.83221877, 0.60390724, 0.593547],
748
+ [0.82980985, 0.59799607, 0.58760751],
749
+ [0.82740268, 0.59209095, 0.58167944],
750
+ [0.82498638, 0.5861973, 0.57576866],
751
+ [0.82258181, 0.5803034, 0.56986307],
752
+ [0.82016611, 0.57442123, 0.56397539],
753
+ [0.81776305, 0.56853725, 0.55809173],
754
+ [0.81534551, 0.56266602, 0.55222741],
755
+ [0.81294293, 0.55679056, 0.5463651],
756
+ [0.81052113, 0.55092973, 0.54052443],
757
+ [0.80811509, 0.54506305, 0.53468464],
758
+ [0.80568952, 0.53921036, 0.52886622],
759
+ [0.80327506, 0.53335335, 0.52305077],
760
+ [0.80084727, 0.52750583, 0.51725256],
761
+ [0.79842217, 0.5216578, 0.51146173],
762
+ [0.79599382, 0.51581223, 0.50568155],
763
+ [0.79355781, 0.50997127, 0.49991444],
764
+ [0.79112596, 0.50412707, 0.49415289],
765
+ [0.78867442, 0.49829386, 0.48841129],
766
+ [0.7862306, 0.49245398, 0.48267247],
767
+ [0.7837687, 0.48662309, 0.47695216],
768
+ [0.78130809, 0.4807883, 0.47123805],
769
+ [0.77884467, 0.47495151, 0.46553236],
770
+ [0.77636283, 0.46912235, 0.45984473],
771
+ [0.77388383, 0.46328617, 0.45416141],
772
+ [0.77138912, 0.45745466, 0.44849398],
773
+ [0.76888874, 0.45162042, 0.44283573],
774
+ [0.76638802, 0.44577901, 0.43718292],
775
+ [0.76386116, 0.43994762, 0.43155211],
776
+ [0.76133542, 0.43410655, 0.42592523],
777
+ [0.75880631, 0.42825801, 0.42030488],
778
+ [0.75624913, 0.42241905, 0.41470727],
779
+ [0.7536919, 0.41656866, 0.40911347],
780
+ [0.75112748, 0.41071104, 0.40352792],
781
+ [0.74854331, 0.40485474, 0.3979589],
782
+ [0.74594723, 0.39899309, 0.39240088],
783
+ [0.74334332, 0.39312199, 0.38685075],
784
+ [0.74073277, 0.38723941, 0.3813074],
785
+ [0.73809409, 0.38136133, 0.37578553],
786
+ [0.73544692, 0.37547129, 0.37027123],
787
+ [0.73278943, 0.36956954, 0.36476549],
788
+ [0.73011829, 0.36365761, 0.35927038],
789
+ [0.72743485, 0.35773314, 0.35378465],
790
+ [0.72472722, 0.35180504, 0.34831662],
791
+ [0.72200473, 0.34586421, 0.34285937],
792
+ [0.71927052, 0.33990649, 0.33741033],
793
+ [0.71652049, 0.33393396, 0.33197219],
794
+ [0.71375362, 0.32794602, 0.32654545],
795
+ [0.71096951, 0.32194148, 0.32113016],
796
+ [0.70816772, 0.31591904, 0.31572637],
797
+ [0.70534784, 0.30987734, 0.31033414],
798
+ [0.70250944, 0.30381489, 0.30495353],
799
+ [0.69965211, 0.2977301, 0.2995846],
800
+ [0.6967754, 0.29162126, 0.29422741],
801
+ [0.69388446, 0.28548074, 0.28887769],
802
+ [0.69097561, 0.2793096, 0.28353795],
803
+ [0.68803513, 0.27311993, 0.27821876],
804
+ [0.6850794, 0.26689144, 0.27290694],
805
+ [0.682108, 0.26062114, 0.26760246],
806
+ [0.67911013, 0.2543177, 0.26231367],
807
+ [0.67609393, 0.24796818, 0.25703372],
808
+ [0.67305921, 0.24156846, 0.25176238],
809
+ [0.67000176, 0.23511902, 0.24650278],
810
+ [0.66693423, 0.22859879, 0.24124404],
811
+ [0.6638441, 0.22201742, 0.2359961],
812
+ [0.66080672, 0.21526712, 0.23069468]
813
+ ]
814
+
815
+
816
+ _icefire_lut = [
817
+ [0.73936227, 0.90443867, 0.85757238],
818
+ [0.72888063, 0.89639109, 0.85488394],
819
+ [0.71834255, 0.88842162, 0.8521605],
820
+ [0.70773866, 0.88052939, 0.849422],
821
+ [0.69706215, 0.87271313, 0.84668315],
822
+ [0.68629021, 0.86497329, 0.84398721],
823
+ [0.67543654, 0.85730617, 0.84130969],
824
+ [0.66448539, 0.84971123, 0.83868005],
825
+ [0.65342679, 0.84218728, 0.83611512],
826
+ [0.64231804, 0.83471867, 0.83358584],
827
+ [0.63117745, 0.827294, 0.83113431],
828
+ [0.62000484, 0.81991069, 0.82876741],
829
+ [0.60879435, 0.81256797, 0.82648905],
830
+ [0.59754118, 0.80526458, 0.82430414],
831
+ [0.58624247, 0.79799884, 0.82221573],
832
+ [0.57489525, 0.7907688, 0.82022901],
833
+ [0.56349779, 0.78357215, 0.81834861],
834
+ [0.55204294, 0.77640827, 0.81657563],
835
+ [0.54052516, 0.76927562, 0.81491462],
836
+ [0.52894085, 0.76217215, 0.81336913],
837
+ [0.51728854, 0.75509528, 0.81194156],
838
+ [0.50555676, 0.74804469, 0.81063503],
839
+ [0.49373871, 0.7410187, 0.80945242],
840
+ [0.48183174, 0.73401449, 0.80839675],
841
+ [0.46982587, 0.72703075, 0.80747097],
842
+ [0.45770893, 0.72006648, 0.80667756],
843
+ [0.44547249, 0.71311941, 0.80601991],
844
+ [0.43318643, 0.70617126, 0.80549278],
845
+ [0.42110294, 0.69916972, 0.80506683],
846
+ [0.40925101, 0.69211059, 0.80473246],
847
+ [0.3976693, 0.68498786, 0.80448272],
848
+ [0.38632002, 0.67781125, 0.80431024],
849
+ [0.37523981, 0.67057537, 0.80420832],
850
+ [0.36442578, 0.66328229, 0.80417474],
851
+ [0.35385939, 0.65593699, 0.80420591],
852
+ [0.34358916, 0.64853177, 0.8043],
853
+ [0.33355526, 0.64107876, 0.80445484],
854
+ [0.32383062, 0.63356578, 0.80467091],
855
+ [0.31434372, 0.62600624, 0.8049475],
856
+ [0.30516161, 0.618389, 0.80528692],
857
+ [0.29623491, 0.61072284, 0.80569021],
858
+ [0.28759072, 0.60300319, 0.80616055],
859
+ [0.27923924, 0.59522877, 0.80669803],
860
+ [0.27114651, 0.5874047, 0.80730545],
861
+ [0.26337153, 0.57952055, 0.80799113],
862
+ [0.25588696, 0.57157984, 0.80875922],
863
+ [0.248686, 0.56358255, 0.80961366],
864
+ [0.24180668, 0.55552289, 0.81055123],
865
+ [0.23526251, 0.54739477, 0.8115939],
866
+ [0.22921445, 0.53918506, 0.81267292],
867
+ [0.22397687, 0.53086094, 0.8137141],
868
+ [0.21977058, 0.52241482, 0.81457651],
869
+ [0.21658989, 0.51384321, 0.81528511],
870
+ [0.21452772, 0.50514155, 0.81577278],
871
+ [0.21372783, 0.49630865, 0.81589566],
872
+ [0.21409503, 0.48734861, 0.81566163],
873
+ [0.2157176, 0.47827123, 0.81487615],
874
+ [0.21842857, 0.46909168, 0.81351614],
875
+ [0.22211705, 0.45983212, 0.81146983],
876
+ [0.22665681, 0.45052233, 0.80860217],
877
+ [0.23176013, 0.44119137, 0.80494325],
878
+ [0.23727775, 0.43187704, 0.80038017],
879
+ [0.24298285, 0.42261123, 0.79493267],
880
+ [0.24865068, 0.41341842, 0.78869164],
881
+ [0.25423116, 0.40433127, 0.78155831],
882
+ [0.25950239, 0.39535521, 0.77376848],
883
+ [0.2644736, 0.38651212, 0.76524809],
884
+ [0.26901584, 0.37779582, 0.75621942],
885
+ [0.27318141, 0.36922056, 0.746605],
886
+ [0.27690355, 0.3607736, 0.73659374],
887
+ [0.28023585, 0.35244234, 0.72622103],
888
+ [0.28306009, 0.34438449, 0.71500731],
889
+ [0.28535896, 0.33660243, 0.70303975],
890
+ [0.28708711, 0.32912157, 0.69034504],
891
+ [0.28816354, 0.32200604, 0.67684067],
892
+ [0.28862749, 0.31519824, 0.66278813],
893
+ [0.28847904, 0.30869064, 0.6482815],
894
+ [0.28770912, 0.30250126, 0.63331265],
895
+ [0.28640325, 0.29655509, 0.61811374],
896
+ [0.28458943, 0.29082155, 0.60280913],
897
+ [0.28233561, 0.28527482, 0.58742866],
898
+ [0.27967038, 0.2798938, 0.57204225],
899
+ [0.27665361, 0.27465357, 0.55667809],
900
+ [0.27332564, 0.2695165, 0.54145387],
901
+ [0.26973851, 0.26447054, 0.52634916],
902
+ [0.2659204, 0.25949691, 0.511417],
903
+ [0.26190145, 0.25458123, 0.49668768],
904
+ [0.2577151, 0.24971691, 0.48214874],
905
+ [0.25337618, 0.24490494, 0.46778758],
906
+ [0.24890842, 0.24013332, 0.45363816],
907
+ [0.24433654, 0.23539226, 0.4397245],
908
+ [0.23967922, 0.23067729, 0.4260591],
909
+ [0.23495608, 0.22598894, 0.41262952],
910
+ [0.23018113, 0.22132414, 0.39945577],
911
+ [0.22534609, 0.21670847, 0.38645794],
912
+ [0.22048761, 0.21211723, 0.37372555],
913
+ [0.2156198, 0.20755389, 0.36125301],
914
+ [0.21074637, 0.20302717, 0.34903192],
915
+ [0.20586893, 0.19855368, 0.33701661],
916
+ [0.20101757, 0.19411573, 0.32529173],
917
+ [0.19619947, 0.18972425, 0.31383846],
918
+ [0.19140726, 0.18540157, 0.30260777],
919
+ [0.1866769, 0.1811332, 0.29166583],
920
+ [0.18201285, 0.17694992, 0.28088776],
921
+ [0.17745228, 0.17282141, 0.27044211],
922
+ [0.17300684, 0.16876921, 0.26024893],
923
+ [0.16868273, 0.16479861, 0.25034479],
924
+ [0.16448691, 0.16091728, 0.24075373],
925
+ [0.16043195, 0.15714351, 0.23141745],
926
+ [0.15652427, 0.15348248, 0.22238175],
927
+ [0.15277065, 0.14994111, 0.21368395],
928
+ [0.14918274, 0.14653431, 0.20529486],
929
+ [0.14577095, 0.14327403, 0.19720829],
930
+ [0.14254381, 0.14016944, 0.18944326],
931
+ [0.13951035, 0.13723063, 0.18201072],
932
+ [0.13667798, 0.13446606, 0.17493774],
933
+ [0.13405762, 0.13188822, 0.16820842],
934
+ [0.13165767, 0.12950667, 0.16183275],
935
+ [0.12948748, 0.12733187, 0.15580631],
936
+ [0.12755435, 0.1253723, 0.15014098],
937
+ [0.12586516, 0.12363617, 0.1448459],
938
+ [0.12442647, 0.12213143, 0.13992571],
939
+ [0.12324241, 0.12086419, 0.13539995],
940
+ [0.12232067, 0.11984278, 0.13124644],
941
+ [0.12166209, 0.11907077, 0.12749671],
942
+ [0.12126982, 0.11855309, 0.12415079],
943
+ [0.12114244, 0.11829179, 0.1212385],
944
+ [0.12127766, 0.11828837, 0.11878534],
945
+ [0.12284806, 0.1179729, 0.11772022],
946
+ [0.12619498, 0.11721796, 0.11770203],
947
+ [0.129968, 0.11663788, 0.11792377],
948
+ [0.13410011, 0.11625146, 0.11839138],
949
+ [0.13855459, 0.11606618, 0.11910584],
950
+ [0.14333775, 0.11607038, 0.1200606],
951
+ [0.148417, 0.11626929, 0.12125453],
952
+ [0.15377389, 0.11666192, 0.12268364],
953
+ [0.15941427, 0.11723486, 0.12433911],
954
+ [0.16533376, 0.11797856, 0.12621303],
955
+ [0.17152547, 0.11888403, 0.12829735],
956
+ [0.17797765, 0.11994436, 0.13058435],
957
+ [0.18468769, 0.12114722, 0.13306426],
958
+ [0.19165663, 0.12247737, 0.13572616],
959
+ [0.19884415, 0.12394381, 0.1385669],
960
+ [0.20627181, 0.12551883, 0.14157124],
961
+ [0.21394877, 0.12718055, 0.14472604],
962
+ [0.22184572, 0.12893119, 0.14802579],
963
+ [0.22994394, 0.13076731, 0.15146314],
964
+ [0.23823937, 0.13267611, 0.15502793],
965
+ [0.24676041, 0.13462172, 0.15870321],
966
+ [0.25546457, 0.13661751, 0.16248722],
967
+ [0.26433628, 0.13865956, 0.16637301],
968
+ [0.27341345, 0.14070412, 0.17034221],
969
+ [0.28264773, 0.14277192, 0.1743957],
970
+ [0.29202272, 0.14486161, 0.17852793],
971
+ [0.30159648, 0.14691224, 0.1827169],
972
+ [0.31129002, 0.14897583, 0.18695213],
973
+ [0.32111555, 0.15103351, 0.19119629],
974
+ [0.33107961, 0.1530674, 0.19543758],
975
+ [0.34119892, 0.15504762, 0.1996803],
976
+ [0.35142388, 0.15701131, 0.20389086],
977
+ [0.36178937, 0.1589124, 0.20807639],
978
+ [0.37229381, 0.16073993, 0.21223189],
979
+ [0.38288348, 0.16254006, 0.2163249],
980
+ [0.39359592, 0.16426336, 0.22036577],
981
+ [0.40444332, 0.16588767, 0.22434027],
982
+ [0.41537995, 0.16745325, 0.2282297],
983
+ [0.42640867, 0.16894939, 0.23202755],
984
+ [0.43754706, 0.17034847, 0.23572899],
985
+ [0.44878564, 0.1716535, 0.23932344],
986
+ [0.4601126, 0.17287365, 0.24278607],
987
+ [0.47151732, 0.17401641, 0.24610337],
988
+ [0.48300689, 0.17506676, 0.2492737],
989
+ [0.49458302, 0.17601892, 0.25227688],
990
+ [0.50623876, 0.17687777, 0.255096],
991
+ [0.5179623, 0.17765528, 0.2577162],
992
+ [0.52975234, 0.17835232, 0.2601134],
993
+ [0.54159776, 0.17898292, 0.26226847],
994
+ [0.55348804, 0.17956232, 0.26416003],
995
+ [0.56541729, 0.18010175, 0.26575971],
996
+ [0.57736669, 0.180631, 0.26704888],
997
+ [0.58932081, 0.18117827, 0.26800409],
998
+ [0.60127582, 0.18175888, 0.26858488],
999
+ [0.61319563, 0.1824336, 0.2687872],
1000
+ [0.62506376, 0.18324015, 0.26858301],
1001
+ [0.63681202, 0.18430173, 0.26795276],
1002
+ [0.64842603, 0.18565472, 0.26689463],
1003
+ [0.65988195, 0.18734638, 0.26543435],
1004
+ [0.67111966, 0.18948885, 0.26357955],
1005
+ [0.68209194, 0.19216636, 0.26137175],
1006
+ [0.69281185, 0.19535326, 0.25887063],
1007
+ [0.70335022, 0.19891271, 0.25617971],
1008
+ [0.71375229, 0.20276438, 0.25331365],
1009
+ [0.72401436, 0.20691287, 0.25027366],
1010
+ [0.73407638, 0.21145051, 0.24710661],
1011
+ [0.74396983, 0.21631913, 0.24380715],
1012
+ [0.75361506, 0.22163653, 0.24043996],
1013
+ [0.7630579, 0.22731637, 0.23700095],
1014
+ [0.77222228, 0.23346231, 0.23356628],
1015
+ [0.78115441, 0.23998404, 0.23013825],
1016
+ [0.78979746, 0.24694858, 0.22678822],
1017
+ [0.79819286, 0.25427223, 0.22352658],
1018
+ [0.80630444, 0.26198807, 0.22040877],
1019
+ [0.81417437, 0.27001406, 0.21744645],
1020
+ [0.82177364, 0.27837336, 0.21468316],
1021
+ [0.82915955, 0.28696963, 0.21210766],
1022
+ [0.83628628, 0.2958499, 0.20977813],
1023
+ [0.84322168, 0.30491136, 0.20766435],
1024
+ [0.84995458, 0.31415945, 0.2057863],
1025
+ [0.85648867, 0.32358058, 0.20415327],
1026
+ [0.86286243, 0.33312058, 0.20274969],
1027
+ [0.86908321, 0.34276705, 0.20157271],
1028
+ [0.87512876, 0.3525416, 0.20064949],
1029
+ [0.88100349, 0.36243385, 0.19999078],
1030
+ [0.8866469, 0.37249496, 0.1997976],
1031
+ [0.89203964, 0.38273475, 0.20013431],
1032
+ [0.89713496, 0.39318156, 0.20121514],
1033
+ [0.90195099, 0.40380687, 0.20301555],
1034
+ [0.90648379, 0.41460191, 0.20558847],
1035
+ [0.9106967, 0.42557857, 0.20918529],
1036
+ [0.91463791, 0.43668557, 0.21367954],
1037
+ [0.91830723, 0.44790913, 0.21916352],
1038
+ [0.92171507, 0.45922856, 0.22568002],
1039
+ [0.92491786, 0.4705936, 0.23308207],
1040
+ [0.92790792, 0.48200153, 0.24145932],
1041
+ [0.93073701, 0.49341219, 0.25065486],
1042
+ [0.93343918, 0.5048017, 0.26056148],
1043
+ [0.93602064, 0.51616486, 0.27118485],
1044
+ [0.93850535, 0.52748892, 0.28242464],
1045
+ [0.94092933, 0.53875462, 0.29416042],
1046
+ [0.94330011, 0.5499628, 0.30634189],
1047
+ [0.94563159, 0.56110987, 0.31891624],
1048
+ [0.94792955, 0.57219822, 0.33184256],
1049
+ [0.95020929, 0.5832232, 0.34508419],
1050
+ [0.95247324, 0.59419035, 0.35859866],
1051
+ [0.95471709, 0.60510869, 0.37236035],
1052
+ [0.95698411, 0.61595766, 0.38629631],
1053
+ [0.95923863, 0.62676473, 0.40043317],
1054
+ [0.9615041, 0.6375203, 0.41474106],
1055
+ [0.96371553, 0.64826619, 0.42928335],
1056
+ [0.96591497, 0.65899621, 0.44380444],
1057
+ [0.96809871, 0.66971662, 0.45830232],
1058
+ [0.9702495, 0.6804394, 0.47280492],
1059
+ [0.9723881, 0.69115622, 0.48729272],
1060
+ [0.97450723, 0.70187358, 0.50178034],
1061
+ [0.9766108, 0.712592, 0.51626837],
1062
+ [0.97871716, 0.72330511, 0.53074053],
1063
+ [0.98082222, 0.73401769, 0.54520694],
1064
+ [0.9829001, 0.74474445, 0.5597019],
1065
+ [0.98497466, 0.75547635, 0.57420239],
1066
+ [0.98705581, 0.76621129, 0.58870185],
1067
+ [0.98913325, 0.77695637, 0.60321626],
1068
+ [0.99119918, 0.78771716, 0.61775821],
1069
+ [0.9932672, 0.79848979, 0.63231691],
1070
+ [0.99535958, 0.80926704, 0.64687278],
1071
+ [0.99740544, 0.82008078, 0.66150571],
1072
+ [0.9992197, 0.83100723, 0.6764127]
1073
+ ]
1074
+
1075
+
1076
# Expose each palette as a ListedColormap (plus its reversed "_r" variant)
# as module-level attributes and register both with matplotlib's registry.
_luts = [_rocket_lut, _mako_lut, _vlag_lut, _icefire_lut]
_names = ["rocket", "mako", "vlag", "icefire"]

for _lut, _name in zip(_luts, _names):

    _fwd = colors.ListedColormap(_lut, _name)
    _rev = colors.ListedColormap(_lut[::-1], _name + "_r")

    # at module scope locals() is the module namespace, so this makes the
    # colormaps importable from this module (e.g. `rocket`, `rocket_r`)
    locals()[_name] = _fwd
    locals()[_name + "_r"] = _rev

    mpl_cm.register(_fwd, name=_name)
    mpl_cm.register(_rev, name=_name + "_r")
deepTools/source/deeptools/computeGCBias.py ADDED
@@ -0,0 +1,800 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import time
5
+
6
+ import multiprocessing
7
+ import numpy as np
8
+ import argparse
9
+ from scipy.stats import poisson
10
+ import py2bit
11
+ import sys
12
+
13
+ from deeptoolsintervals import GTF
14
+ from deeptools.utilities import tbitToBamChrName, getGC_content
15
+ from deeptools import parserCommon, mapReduce
16
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
17
+ from deeptools import bamHandler
18
+
19
+ debug = 0
20
+ old_settings = np.seterr(all='ignore')
21
+
22
+
23
def parse_arguments(args=None):
    """Build the top-level argument parser for computeGCBias.

    Combines the shared parent parser (processors / region / blacklist
    options) with the tool-specific required and optional arguments.
    """
    parentParser = parserCommon.getParentArgParse(binSize=False, blackList=True)
    requiredArgs = getRequiredArgs()
    parser = argparse.ArgumentParser(
        parents=[requiredArgs, parentParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # FIX: added the missing space between the concatenated string
        # fragments ("used to" + "correct" rendered as "used tocorrect")
        description='Computes the GC-bias using Benjamini\'s method '
        '[Benjamini & Speed (2012). Nucleic Acids Research, 40(10). doi: 10.1093/nar/gks001]. '
        'The GC-bias is visualized and the resulting table can be used to '
        'correct the bias with `correctGCBias`.',
        usage='computeGCBias '
        '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit -l 200 --GCbiasFrequenciesFile freq.txt\n'
        'help: computeGCBias -h / computeGCBias --help',
        conflict_handler='resolve',
        add_help=False)

    return parser
40
+
41
+
42
def getRequiredArgs():
    """Build the computeGCBias-specific argument groups (required,
    optional and diagnostic-plot arguments) on a bare parser that is
    used as a parent by :func:`parse_arguments`.
    """
    parser = argparse.ArgumentParser(add_help=False)

    required = parser.add_argument_group('Required arguments')

    required.add_argument('--bamfile', '-b',
                          metavar='bam file',
                          help='Sorted BAM file. ',
                          required=True)

    required.add_argument('--effectiveGenomeSize',
                          help='The effective genome size is the portion '
                          'of the genome that is mappable. Large fractions of '
                          'the genome are stretches of NNNN that should be '
                          'discarded. Also, if repetitive regions were not '
                          'included in the mapping of reads, the effective '
                          'genome size needs to be adjusted accordingly. '
                          'A table of values is available here: '
                          'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .',
                          default=None,
                          type=int,
                          required=True)

    required.add_argument('--genome', '-g',
                          help='Genome in two bit format. Most genomes can be '
                          'found here: http://hgdownload.cse.ucsc.edu/gbdb/ '
                          'Search for the .2bit ending. Otherwise, fasta '
                          'files can be converted to 2bit using the UCSC '
                          'programm called faToTwoBit available for different '
                          'plattforms at '
                          'http://hgdownload.cse.ucsc.edu/admin/exe/',
                          metavar='2bit FILE',
                          required=True)

    required.add_argument('--GCbiasFrequenciesFile', '-freq', '-o',
                          help='Path to save the file containing '
                          'the observed and expected read frequencies per %%GC-'
                          'content. This file is needed to run the '
                          'correctGCBias tool. This is a text file.',
                          type=argparse.FileType('w'),
                          metavar='FILE',
                          required=True)

    # define the optional arguments
    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--fragmentLength', '-l',
                          help='Fragment length used for the sequencing. If '
                          'paired-end reads are used, the fragment length is '
                          'computed based from the bam file',
                          type=int)

    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")

    optional.add_argument('--sampleSize',
                          default=5e7,
                          help='Number of sampling points to be considered. (Default: %(default)s)',
                          type=int)

    optional.add_argument('--extraSampling',
                          help='BED file containing genomic regions for which '
                          'extra sampling is required because they are '
                          'underrepresented in the genome.',
                          type=argparse.FileType('r'),
                          metavar='BED file')

    plot = parser.add_argument_group('Diagnostic plot options')

    plot.add_argument('--biasPlot',
                      metavar='FILE NAME',
                      help='If given, a diagnostic image summarizing '
                      'the GC-bias will be saved.')

    plot.add_argument('--plotFileFormat',
                      metavar='',
                      help='image format type. If given, this '
                      'option overrides the '
                      'image format based on the plotFile ending. '
                      'The available options are: "png", '
                      '"eps", "pdf", "plotly" and "svg"',
                      choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    # FIX: added the missing space between the concatenated help string
    # fragments ("over a region" + "the size" rendered as "regionthe size")
    plot.add_argument('--regionSize',
                      metavar='INT',
                      type=int,
                      default=300,
                      help='To plot the reads per %%GC over a region '
                      'the size of the region is required. By default, '
                      'the bin size is set to 300 bases, which is close to the '
                      'standard fragment size for Illumina machines. However, '
                      'if the depth of sequencing is low, a larger bin size '
                      'will be required, otherwise many bins will not '
                      'overlap with any read (Default: %(default)s)')

    return parser
137
+
138
+
139
def getPositionsToSample(chrom, start, end, stepSize):
    """
    Return a numpy array of genomic positions to sample in [start, end)
    at intervals of `stepSize`.

    If an extra-sampling BED file is configured
    (global_vars['extra_sampling_file']), positions inside its intervals
    are added at the same stepSize. If a blacklist
    (global_vars['filter_out']) is configured, positions falling inside
    those intervals are removed.
    """
    sample_pos = np.arange(start, end, stepSize)

    blacklist_tree = GTF(global_vars['filter_out']) if global_vars['filter_out'] else None
    extra_tree = GTF(global_vars['extra_sampling_file']) if global_vars['extra_sampling_file'] else None

    if extra_tree:
        n_before = len(sample_pos)
        try:
            hits = extra_tree.findOverlaps(chrom, start, end)
        except KeyError:
            hits = []

        if len(hits) > 0:
            # densify sampling inside each extra-effort interval
            for iv in hits:
                sample_pos = np.append(sample_pos,
                                       list(range(iv[0], iv[1], stepSize)))
            # remove duplicates
            sample_pos = np.unique(np.sort(sample_pos))
            if debug:
                print("sampling increased to {} from {}".format(
                    len(sample_pos),
                    n_before))

    # skip regions that are filtered out
    if blacklist_tree:
        try:
            hits = blacklist_tree.findOverlaps(chrom, start, end)
        except KeyError:
            hits = []

        if len(hits) > 0:
            for iv in hits:
                sample_pos = sample_pos[(sample_pos < iv[0]) | (sample_pos >= iv[1])]
    return sample_pos
192
+
193
+
194
def countReadsPerGC_wrapper(args):
    """Adapter for multiprocessing: unpack the packed argument tuple."""
    return countReadsPerGC_worker(*args)
196
+
197
+
198
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """For windows of `regionSize` bp sampled every `stepSize` bp within
    (start, end), return a list of (read count, GC fraction) tuples.
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    reads_and_gc = []

    for pos in getPositionsToSample(chromNameBit, start, end, stepSize):
        # stop once the window would run past the chromosome end
        if tbit.chroms(chromNameBit) < pos + regionSize:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(pos), int(pos + regionSize))
        except Exception as detail:
            if verbose:
                print("{}:{}-{}".format(chromNameBit, pos, pos + regionSize))
                print(detail)
            continue
        reads_and_gc.append((bam.count(chromNameBam, pos, pos + regionSize), gc))

    return reads_and_gc
232
+
233
+
234
def tabulateGCcontent_wrapper(args):
    """Adapter for multiprocessing: unpack the packed argument tuple."""
    return tabulateGCcontent_worker(*args)
236
+
237
+
238
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1, 4, 10, 10, 16, 18]
    which correspond to a GC of
    [1, 1, 1, 1, 2, 1]

    The evaluated position are
    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2, 1, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0  # count of positions skipped because of excessive coverage
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # if the sample regions are far apart from each
    # other is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions that lie close together
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and not r.is_unmapped and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    # FIX: define `index` before the loop so the verbose summary below
    # cannot raise a NameError when positions_to_sample is empty
    index = -1
    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit.chroms(chromNameBit):
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False)
        except Exception as detail:
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                             if x.is_reverse is False and x.pos == i])
        else:
            num_reads = read_counts[index]

        # skip positions whose coverage exceeds the poisson-derived
        # threshold (likely artifacts / peaks, see main())
        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose and index % 50000 == 0:
            endTime = time.time()
            print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                  (multiprocessing.current_process().name,
                   index, index / (endTime - countTime),
                   chromNameBit, start, end, stepSize))

    if verbose and index >= 0:
        endTime = time.time()
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name,
                                                    (endTime - startTime), chromNameBit, start, end, stepSize))

    return subN_gc, subF_gc
395
+
396
+
397
def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize,
                      chromSizes, numberOfProcessors=None, verbose=False,
                      region=None):
    r"""
    Subdivides the genome into chunks that are processed in parallel by
    :func:`tabulateGCcontent_worker` and integrates the per-chunk results.

    Returns an array with one row per GC value containing
    (observed read count F_gc, expected fragment count N_gc,
    scaled observed/expected ratio R_gc).

    Exits the program if no fragments were sampled at all.
    """
    global global_vars

    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    # keep only chromosomes that exist in the 2bit genome
    chromSizes = [(name, size) for name, size in chromSizes
                  if name in chrNameBamToBit]

    imap_res = mapReduce.mapReduce((stepSize,
                                    fragmentLength, chrNameBamToBit,
                                    verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    # accumulate the per-chunk N_gc / F_gc histograms
    N_gc = None
    F_gc = None
    for subN_gc, subF_gc in imap_res:
        if N_gc is None:
            N_gc = subN_gc
            F_gc = subF_gc
        else:
            N_gc += subN_gc
            F_gc += subF_gc

    if sum(F_gc) == 0:
        sys.exit("No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter")
    scaling = float(sum(N_gc)) / float(sum(F_gc))

    # observed/expected ratio per GC value; 1 when a GC value was never seen
    R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling
                     if N_gc[x] and F_gc[x] > 0 else 1
                     for x in range(len(F_gc))])

    return np.transpose(np.vstack((F_gc, N_gc, R_gc)))
454
+
455
+
456
def countReadsPerGC(regionSize, chrNameBitToBam, stepSize,
                    chromSizes, numberOfProcessors=None, verbose=False,
                    region=None):
    r"""
    For windows of size `regionSize` sampled every `stepSize` bp,
    computes the GC fraction of the window and the number of reads
    overlapping it, in parallel via :func:`countReadsPerGC_worker`.

    Returns a numpy array with one (read count, GC fraction) row per window.
    """
    global global_vars

    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))

    imap_res = mapReduce.mapReduce((stepSize,
                                    regionSize, chrNameBamToBit,
                                    verbose),
                                   countReadsPerGC_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    # flatten the per-chunk lists into a single array
    collected = []
    for chunk in imap_res:
        collected.extend(chunk)

    return np.asarray(collected)
492
+
493
+
494
def smooth(x, window_len=3):
    """
    *CURRENTLY* not being used
    Smooths the values by replacing each one with the average of the
    'window_len' values centred on it. window_len has to be an odd number.

    Values closer than (window_len - 1) // 2 to either end are left
    unchanged; arrays shorter than 2 * window_len are returned as-is.
    """
    # do not smooth small arrays
    if len(x) < window_len * 2:
        return x
    y = x[:]
    # FIX: use integer division; "/" produces a float in Python 3, which
    # then raises a TypeError when used as a slice index below
    half_width = (window_len - 1) // 2
    for i in range(0, len(x)):
        if i < half_width or i + half_width + 1 > len(x):
            # too close to either boundary to centre a full window
            continue
        y[i] = np.mean(x[i - half_width:i + half_width + 1])
    # clip low values, this avoid problems with zeros
    return y
513
+
514
+
515
def bin_by(x, y, nbins=10):
    """
    Bin the values in `x` according to their corresponding `y` values
    (assumed to lie in [0, 1]).

    Returns (binned_x, left_edges), where binned_x[i] holds the x values
    whose y falls into bin i and left_edges are the bins' left borders.
    """
    edges = np.linspace(0, 1, nbins + 1)
    # widen the last bin so y == 1 does not fall into an extra bin
    edges[-1] += 1

    bin_index = np.digitize(y, edges)
    grouped = [x[bin_index == b] for b in range(1, len(edges))]

    # report only the left edge of each bin
    return grouped, edges[:-1]
534
+
535
+
536
def plotlyGCbias(file_name, frequencies, reads_per_gc, region_size):
    """Save an interactive (plotly) diagnostic figure of the GC bias.

    Top panel: boxplots of read counts per region, binned by GC fraction.
    Bottom panel: log2(observed/expected) read counts vs GC fraction.

    file_name : output HTML file path.
    frequencies : array produced by tabulateGCcontent (F_gc, N_gc, R_gc columns).
    reads_per_gc : array produced by countReadsPerGC (read count, GC columns).
    region_size : window size used for reads_per_gc, shown in the title.
    """
    import plotly.offline as py
    import plotly.graph_objs as go
    import matplotlib.cbook as cbook

    # two stacked panels sharing the figure: axis1 on top, axis2 below
    fig = go.Figure()
    fig['layout']['xaxis1'] = dict(domain=[0.0, 1.0], anchor="y1", title="GC fraction")
    fig['layout']['yaxis1'] = dict(domain=[0.55, 1.0], anchor="x1", title="Number of reads")
    fig['layout']['xaxis2'] = dict(domain=[0.0, 1.0], anchor="y2", title="GC fraction", range=[0.2, 0.7])
    fig['layout']['yaxis2'] = dict(domain=[0.0, 0.45], anchor="x2", title="log2(observed/expected)")
    text = "reads per {} base region".format(region_size)
    annos = [{'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 1.0, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False}]
    text = "normalized observed/expected read counts"
    annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 0.5, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False})

    # prepare data for boxplot: bin read counts into 100 GC bins and keep
    # only bins with a GC fraction in [0.2, 0.7]
    reads, GC = reads_per_gc.T
    reads_per_gc, bin_labels = bin_by(reads, GC, nbins=100)
    to_keep = [idx for idx, x in enumerate(bin_labels) if 0.2 <= x <= 0.7]
    reads_per_gc = [reads_per_gc[x] for x in to_keep]
    bin_labels = [bin_labels[x] for x in to_keep]

    # precompute the boxplot statistics (same as matplotlib would) to
    # vastly reduce the output file size vs embedding all raw points
    bins = []
    for b in reads_per_gc:
        s = cbook.boxplot_stats(b)[0]
        bins.append([s['whislo'], s['q1'], s['q1'], s['med'], s['med'], s['med'], s['q3'], s['q3'], s['whishi']])

    data = []

    # top plot: one box trace per GC bin
    for x, y in zip(bin_labels, bins):
        trace = go.Box(x=x, y=y, xaxis='x1', yaxis='y1', boxpoints='outliers', showlegend=False, name="{}".format(x), line=dict(color='rgb(107,174,214)'))
        data.append(trace)

    # bottom plot: log2 of the observed/expected ratio column (R_gc)
    x = np.linspace(0, 1, frequencies.shape[0])
    trace = go.Scatter(x=x, y=np.log2(frequencies[:, 2]), xaxis='x2', yaxis='y2', showlegend=False, line=dict(color='rgb(107,174,214)'))
    data.append(trace)
    fig.add_traces(data)
    fig['layout']['annotations'] = annos
    py.plot(fig, filename=file_name, auto_open=False)
578
+
579
+
580
def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=None):
    """Save a static (matplotlib) diagnostic figure of the GC bias.

    Top panel: boxplots of read counts per region, binned by GC fraction.
    Bottom panel: log2(observed/expected) read counts vs GC fraction.

    file_name : output image file path.
    frequencies : array produced by tabulateGCcontent (F_gc, N_gc, R_gc columns).
    reads_per_gc : array produced by countReadsPerGC (read count, GC columns).
    region_size : window size used for reads_per_gc, shown in the title.
    image_format : if given, overrides the format inferred from file_name.
    """
    import matplotlib
    matplotlib.use('Agg')
    # keep text editable in pdf/svg output
    matplotlib.rcParams['pdf.fonttype'] = 42
    matplotlib.rcParams['svg.fonttype'] = 'none'
    import matplotlib.pyplot as plt

    # prepare data for boxplot: bin read counts into 100 GC bins and keep
    # only bins with a GC fraction in [0.2, 0.7]
    reads, GC = reads_per_gc.T
    reads_per_gc, bin_labels = bin_by(reads, GC, nbins=100)
    to_keep = [idx for idx, x in enumerate(bin_labels) if 0.2 <= x <= 0.7]
    reads_per_gc = [reads_per_gc[x] for x in to_keep]
    bin_labels = [bin_labels[x] for x in to_keep]

    title = "reads per regions of {} bp".format(region_size)
    fig = plt.figure(figsize=(6, 8))
    ax1 = fig.add_subplot(211, title=title)
    ax2 = fig.add_subplot(212,
                          title='normalized observed/expected read counts')

    # make boxplot

    bp = ax1.boxplot(reads_per_gc, notch=0, patch_artist=True)
    plt.setp(bp['boxes'], color='black', facecolor='LightGreen')
    plt.setp(bp['medians'], color='black')
    plt.setp(bp['whiskers'], color='black', linestyle='dashed')
    plt.setp(bp['fliers'], marker='None')
    # get the whisker that spans the most, to set sensible y-axis limits
    y_max = np.nanmax([x.get_data()[1][1] for x in bp['whiskers']])
    ax1.set_ylim(0 - (y_max * 0.05), y_max * 1.05)
    ax1.set_ylabel('Number of reads')
    ax1.set_xlabel('GC fraction')

    # label only every 10th GC percentage
    xticks = [idx for idx, x in enumerate(bin_labels) if int(x * 100) % 10 == 0]

    ax1.set_xticks(xticks)
    ax1.set_xticklabels(["{:.1f}".format(bin_labels[x]) for x in xticks])

    # bottom panel: log2 of the observed/expected ratio column (R_gc)
    x = np.linspace(0, 1, frequencies.shape[0])
    y = np.log2(frequencies[:, 2])
    ax2.plot(x, y, color='#8c96f0')
    ax2.set_xlabel('GC fraction')
    ax2.set_ylabel('log2ratio observed/expected')
    ax2.set_xlim(0.2, 0.7)
    # scale the y axis to the data inside the displayed x range, with a
    # 10% margin on whichever side of zero each extreme lies
    y_max = max(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1])
    y_min = min(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1])
    if y_max > 0:
        y_max *= 1.1
    else:
        y_max *= 0.9
    if y_min < 0:
        y_min *= 1.1
    else:
        y_min *= 0.9
    ax2.set_ylim(y_min, y_max)
    plt.tight_layout()
    plt.savefig(file_name, bbox_inches='tight', dpi=100, format=image_format)
    plt.close()
638
+
639
+
640
def main(args=None):
    """Entry point for computeGCBias.

    Samples the genome, tabulates observed vs expected read frequencies
    per GC content, writes the frequency table, and optionally saves a
    diagnostic plot.
    """
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors)

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        # estimate the fragment length from the (paired-end) BAM file
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            # FIX: use sys.exit() instead of the interactive-only exit()
            # builtin, which is provided by the site module and may be
            # missing (e.g. `python -S` or frozen executables)
            sys.exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]
    chromSizes = [x for x in chromSizes if x[0] in tbit.chroms()]

    # use poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # vary depending on the gc content
    # and the global number of reads per bp may a be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of not use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        if args.plotFileFormat == "plotly":
            plotlyGCbias(args.biasPlot, data, reads_per_gc, args.regionSize)
        else:
            plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
728
+
729
+
730
class Tester():
    """Helper used by the doctests: points at the small test data set
    shipped with deeptools and installs a matching `global_vars` dict.
    """

    def __init__(self):
        import os
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
        self.tbitFile = self.root + "sequence.2bit"
        self.bamFile = self.root + "test.bam"
        self.mappability = self.root + "mappability.bw"
        self.chrNameBam = '2L'
        self.chrNameBit = 'chr2L'
        bam, mapped, unmapped, stats = bamHandler.openBam(self.bamFile, returnStats=True)
        tbit = py2bit.open(self.tbitFile)
        global debug
        debug = 0
        global global_vars
        # FIX: the original dict literal listed 'min_reads' twice; the
        # duplicate key (with the identical value 0) has been removed
        global_vars = {'2bit': self.tbitFile,
                       'bam': self.bamFile,
                       'filter_out': None,
                       'mappability': self.mappability,
                       'extra_sampling_file': None,
                       'max_reads': 5,
                       'min_reads': 0,
                       'reads_per_bp': 0.3,
                       'total_reads': mapped,
                       'genome_size': sum(tbit.chroms().values())
                       }

    def testTabulateGCcontentWorker(self):
        # argument tuple for tabulateGCcontent_worker over a tiny region
        stepSize = 2
        fragmentLength = {'min': 1, 'median': 3, 'max': 5}
        start = 0
        end = 20
        chrNameBam2bit = {'2L': 'chr2L'}
        return (self.chrNameBam,
                start, end, stepSize, fragmentLength, chrNameBam2bit)

    def set_filter_out_file(self):
        # enable the blacklist used by the filter-out doctest
        global global_vars
        global_vars['filter_out'] = self.root + "filter_out.bed"

    def unset_filter_out_file(self):
        global global_vars
        global_vars['filter_out'] = None

    def set_extra_sampling_file(self):
        # enable the extra-sampling BED used by the extra-sampling doctest
        global global_vars
        global_vars['extra_sampling_file'] = self.root + "extra_sampling.bed"

    def testTabulateGCcontent(self):
        # argument tuple for tabulateGCcontent on the test BAM
        fragmentLength = {'median': 10}
        chrNameBitToBam = {'chr2L': '2L'}
        stepSize = 1
        bam = bamHandler.openBam(global_vars['bam'])
        chromSizes = [(bam.references[i], bam.lengths[i])
                      for i in range(len(bam.references))]
        return (fragmentLength,
                chrNameBitToBam, stepSize, chromSizes, 1)

    def testCountReadsPerGC(self):
        # argument tuple for countReadsPerGC on the test BAM
        regionSize = 300
        chrNameBitToBam = {'chr2L': '2L'}
        stepSize = 1
        bam = bamHandler.openBam(global_vars['bam'])
        chromSizes = [(bam.references[i], bam.lengths[i])
                      for i in range(len(bam.references))]
        return (regionSize,
                chrNameBitToBam, stepSize, chromSizes, 1)
797
+
798
+
799
+ if __name__ == "__main__":
800
+ main()
deepTools/source/deeptools/computeMatrix.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ from deeptools.parserCommon import writableFile, numberOfProcessors
7
+ from deeptools import parserCommon
8
+ from deeptools import heatmapper
9
+ import deeptools.computeMatrixOperations as cmo
10
+ from importlib.metadata import version
11
+
12
+
13
def parse_arguments(args=None):
    """Construct the top-level computeMatrix argument parser.

    Two sub-commands are exposed, ``scale-regions`` and ``reference-point``,
    which share the required/output/optional argument groups defined in this
    module plus the common GTF options from parserCommon.

    Returns:
        argparse.ArgumentParser: the fully configured parser.
    """
    parser = \
        argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="""

This tool calculates scores per genome regions and prepares an intermediate file that can be used with ``plotHeatmap`` and ``plotProfiles``.
Typically, the genome regions are genes, but any other regions defined in a BED file can be used.
computeMatrix accepts multiple score files (bigWig format) and multiple regions files (BED format).
This tool can also be used to filter and sort regions according
to their score.

To learn more about the specific parameters, type:

$ computeMatrix reference-point --help or

$ computeMatrix scale-regions --help

""",
            epilog='An example usage is:\n computeMatrix reference-point -S '
            '<bigwig file(s)> -R <bed file(s)> -b 1000\n \n')

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    subparsers = parser.add_subparsers(
        title='Commands',
        dest='command',
        metavar='')

    # scale-regions mode options
    # (typo fix: usage previously read "<biwig file(s)>")
    subparsers.add_parser(
        'scale-regions',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[
            computeMatrixRequiredArgs(),
            computeMatrixOutputArgs(),
            computeMatrixOptArgs(case='scale-regions'),
            parserCommon.gtf_options()
        ],
        help="In the scale-regions mode, all regions in the BED file are "
        "stretched or shrunken to the length (in bases) indicated by the user.",
        usage='An example usage is:\n computeMatrix scale-regions -S '
        '<bigwig file(s)> -R <bed file> -b 1000\n\n')

    # reference point arguments
    # (typo fix: usage previously read "<biwig file(s)>")
    subparsers.add_parser(
        'reference-point',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[computeMatrixRequiredArgs(),
                 computeMatrixOutputArgs(),
                 computeMatrixOptArgs(case='reference-point'),
                 parserCommon.gtf_options()
                 ],
        help="Reference-point refers to a position within a BED region "
        "(e.g., the starting point). In this mode, only those genomic"
        "positions before (upstream) and/or after (downstream) of the "
        "reference point will be plotted.",
        usage='An example usage is:\n computeMatrix reference-point -S '
        '<bigwig file(s)> -R <bed file> -a 3000 -b 3000\n\n')

    return parser
75
+
76
+
77
def computeMatrixRequiredArgs(args=None):
    """Return a parent parser holding the mandatory -R/-S arguments.

    Both sub-commands of computeMatrix inherit this group, so it is built
    with ``add_help=False`` for use via argparse ``parents``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    required = parser.add_argument_group('Required arguments')
    required.add_argument('--regionsFileName', '-R',
                          metavar='File',
                          help='File name or names, in BED or GTF format, containing '
                          'the regions to plot. If multiple bed files are given, each one is considered a '
                          'group that can be plotted separately. Also, adding a "#" symbol in the bed file '
                          'causes all the regions until the previous "#" to be considered one group.',
                          nargs='+',
                          required=True)
    # typo fix: help text previously read "separated by spaced"
    required.add_argument('--scoreFileName', '-S',
                          help='bigWig file(s) containing '
                          'the scores to be plotted. Multiple files should be separated by spaces. BigWig '
                          'files can be obtained by using the bamCoverage '
                          'or bamCompare tools. More information about '
                          'the bigWig file format can be found at '
                          'http://genome.ucsc.edu/goldenPath/help/bigWig.html ',
                          metavar='File',
                          nargs='+',
                          required=True)
    return parser
99
+
100
+
101
def computeMatrixOutputArgs(args=None):
    """Return a parent parser holding the output-file arguments.

    Built with ``add_help=False`` for use via argparse ``parents``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    output = parser.add_argument_group('Output options')
    output.add_argument('--outFileName', '-out', '-o',
                        help='File name to save the gzipped matrix file '
                        'needed by the "plotHeatmap" and "plotProfile" tools.',
                        type=writableFile,
                        required=True)

    output.add_argument('--outFileNameMatrix',
                        help='If this option is given, then the matrix '
                        'of values underlying the heatmap will be saved '
                        'using the indicated name, e.g. IndividualValues.tab.'
                        'This matrix can easily be loaded into R or '
                        'other programs.',
                        metavar='FILE',
                        type=writableFile)
    # typo fix: help text previously read "skiping"
    output.add_argument('--outFileSortedRegions',
                        help='File name in which the regions are saved '
                        'after skipping zeros or min/max threshold values. The '
                        'order of the regions in the file follows the sorting '
                        'order selected. This is useful, for example, to '
                        'generate other heatmaps keeping the sorting of the '
                        'first heatmap. Example: Heatmap1sortedRegions.bed',
                        metavar='BED file',
                        type=argparse.FileType('w'))
    return parser
128
+
129
+
130
def computeMatrixOptArgs(case='scale-regions'):
    """Return a parent parser holding the optional arguments for one mode.

    Args:
        case: either 'scale-regions' or 'reference-point'; selects which
            mode-specific options are exposed. (The previous default,
            ``['scale-regions', 'reference-point'][0]``, evaluated to the
            same value; it is now written as a plain literal.)

    Returns:
        argparse.ArgumentParser built with ``add_help=False`` for use via
        argparse ``parents``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--version', action='version',
                          version='%(prog)s {}'.format(version('deeptools')))

    if case == 'scale-regions':
        optional.add_argument('--regionBodyLength', '-m',
                              default=1000,
                              type=int,
                              help='Distance in bases to which all regions will '
                              'be fit. (Default: %(default)s)')
        optional.add_argument('--startLabel',
                              default='TSS',
                              help='Label shown in the plot for the start of '
                              'the region. Default is TSS (transcription '
                              'start site), but could be changed to anything, '
                              'e.g. "peak start". Note that this is only '
                              'useful if you plan to plot the results yourself '
                              'and not, for example, with plotHeatmap, which '
                              'will override this. (Default: %(default)s)')
        optional.add_argument('--endLabel',
                              default='TES',
                              help='Label shown in the plot for the region '
                              'end. Default is TES (transcription end site). '
                              'See the --startLabel option for more '
                              'information. (Default: %(default)s) ')
        optional.add_argument('--beforeRegionStartLength', '-b', '--upstream',
                              default=0,
                              type=int,
                              help='Distance upstream of the start site of '
                              'the regions defined in the region file. If the '
                              'regions are genes, this would be the distance '
                              'upstream of the transcription start site. (Default: %(default)s)')
        optional.add_argument('--afterRegionStartLength', '-a', '--downstream',
                              default=0,
                              type=int,
                              help='Distance downstream of the end site '
                              'of the given regions. If the '
                              'regions are genes, this would be the distance '
                              'downstream of the transcription end site. (Default: %(default)s)')
        optional.add_argument("--unscaled5prime",
                              default=0,
                              type=int,
                              help='Number of bases at the 5-prime end of the '
                              'region to exclude from scaling. By default, '
                              'each region is scaled to a given length (see the --regionBodyLength option). In some cases it is useful to look at unscaled signals around region boundaries, so this setting specifies the number of unscaled bases on the 5-prime end of each boundary. (Default: %(default)s)')
        optional.add_argument("--unscaled3prime",
                              default=0,
                              type=int,
                              help='Like --unscaled5prime, but for the 3-prime '
                              'end. (Default: %(default)s)')

    elif case == 'reference-point':
        optional.add_argument('--referencePoint',
                              default='TSS',
                              choices=['TSS', 'TES', 'center'],
                              help='The reference point for the plotting '
                              'could be either the region start (TSS), the '
                              'region end (TES) or the center of the region. '
                              'Note that regardless of what you specify, '
                              'plotHeatmap/plotProfile will default to using "TSS" as the '
                              'label. (Default: %(default)s)')

        # set region body length to zero for reference point mode
        optional.add_argument('--regionBodyLength', help=argparse.SUPPRESS,
                              default=0, type=int)
        optional.add_argument('--unscaled5prime', default=0, type=int, help=argparse.SUPPRESS)
        optional.add_argument('--unscaled3prime', default=0, type=int, help=argparse.SUPPRESS)
        optional.add_argument('--beforeRegionStartLength', '-b', '--upstream',
                              default=500,
                              type=int,
                              metavar='INT bp',
                              help='Distance upstream of the reference-point '
                              'selected. (Default: %(default)s)')
        optional.add_argument('--afterRegionStartLength', '-a', '--downstream',
                              default=1500,
                              metavar='INT bp',
                              type=int,
                              help='Distance downstream of the '
                              'reference-point selected. (Default: %(default)s)')
        optional.add_argument('--nanAfterEnd',
                              action='store_true',
                              help='If set, any values after the region end '
                              'are discarded. This is useful to visualize '
                              'the region end when not using the '
                              'scale-regions mode and when the reference-'
                              'point is set to the TSS.')

    optional.add_argument('--binSize', '-bs',
                          help='Length, in bases, of the non-overlapping '
                          'bins for averaging the score over the '
                          'regions length. (Default: %(default)s)',
                          type=int,
                          default=10)

    optional.add_argument('--sortRegions',
                          help='Whether the output file should present the '
                          'regions sorted. The default is to not sort the regions. '
                          'Note that this is only useful if you plan to plot '
                          'the results yourself and not, for example, with '
                          'plotHeatmap, which will override this. Note also that '
                          'unsorted output will be in whatever order the regions '
                          'happen to be processed in and not match the order in '
                          'the input files. If you require the output order to '
                          'match that of the input regions, then either specify '
                          '"keep" or use computeMatrixOperations to resort the '
                          'results file. (Default: %(default)s)',
                          choices=["descend", "ascend", "no", "keep"],
                          default='keep')

    optional.add_argument('--sortUsing',
                          help='Indicate which method should be used for '
                          'sorting. The value is computed for each row.'
                          'Note that the region_length option will lead '
                          'to a dotted line within the heatmap that indicates '
                          'the end of the regions. (Default: %(default)s)',
                          choices=["mean", "median", "max", "min", "sum",
                                   "region_length"],
                          default='mean')

    optional.add_argument('--sortUsingSamples',
                          help='List of sample numbers (order as in matrix), '
                          'that are used for sorting by --sortUsing, '
                          'no value uses all samples, '
                          'example: --sortUsingSamples 1 3',
                          type=int, nargs='+')

    optional.add_argument('--averageTypeBins',
                          default='mean',
                          choices=["mean", "median", "min",
                                   "max", "std", "sum"],
                          help='Define the type of statistic that should be '
                          'used over the bin size range. The '
                          'options are: "mean", "median", "min", "max", "sum" '
                          'and "std". The default is "mean". (Default: %(default)s)')

    optional.add_argument('--missingDataAsZero',
                          help='If set, missing data (NAs) will be treated as zeros. '
                          'The default is to ignore such cases, which will be depicted as black areas in '
                          'a heatmap. (see the --missingDataColor argument '
                          'of the plotHeatmap command for additional options).',
                          action='store_true')

    optional.add_argument('--skipZeros',
                          help='Whether regions with only scores of zero '
                          'should be included or not. Default is to include '
                          'them.',
                          action='store_true')

    optional.add_argument('--minThreshold',
                          default=None,
                          type=float,
                          help='Numeric value. Any region containing a '
                          'value that is less than or equal to this '
                          'will be skipped. This is useful to skip, '
                          'for example, genes where the read count is zero '
                          'for any of the bins. This could be the result of '
                          'unmappable areas and can bias the overall results. (Default: %(default)s)')

    optional.add_argument('--maxThreshold',
                          default=None,
                          type=float,
                          help='Numeric value. Any region containing a value '
                          'greater than or equal to this '
                          'will be skipped. The maxThreshold is useful to '
                          'skip those few regions with very high read counts '
                          '(e.g. micro satellites) that may bias the average '
                          'values. (Default: %(default)s)')

    optional.add_argument('--blackListFileName', '-bl',
                          help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
                          metavar="BED file",
                          required=False)

    # typo fix: help text previously ran the words "itself" and "contains" together
    optional.add_argument('--samplesLabel',
                          help='Labels for the samples. This will then be passed to plotHeatmap and plotProfile. The '
                          'default is to use the file name of the '
                          'sample. The sample labels should be separated '
                          'by spaces and quoted if a label itself '
                          'contains a space E.g. --samplesLabel label-1 "label 2" ',
                          nargs='+')

    optional.add_argument('--smartLabels',
                          action='store_true',
                          help='Instead of manually specifying labels for the input '
                          'bigWig and BED/GTF files, this causes deepTools to use the file name '
                          'after removing the path and extension.')

    # in contrast to other tools,
    # computeMatrix by default outputs
    # messages and the --quiet flag suppresses them
    optional.add_argument('--quiet', '-q',
                          help='Set to remove any warning or processing '
                          'messages.',
                          action='store_true')

    optional.add_argument('--verbose',
                          help='Being VERY verbose in the status messages. --quiet will disable this.',
                          action='store_true')

    optional.add_argument('--scale',
                          help='If set, all values are multiplied by '
                          'this number. (Default: %(default)s)',
                          type=float,
                          default=1)
    optional.add_argument('--numberOfProcessors', '-p',
                          help='Number of processors to use. Type "max/2" to '
                          'use half the maximum number of processors or "max" '
                          'to use all available processors. (Default: %(default)s)',
                          metavar="INT",
                          type=numberOfProcessors,
                          default=1,
                          required=False)
    return parser
346
+
347
+
348
def process_args(args=None):
    """Parse and normalize computeMatrix command-line arguments.

    Negative flank lengths are converted to their absolute value (with a
    notice printed), and mode-specific attributes are filled in so that
    downstream code can rely on them being present regardless of the
    sub-command used.

    Args:
        args: optional argument list (defaults to sys.argv).

    Returns:
        argparse.Namespace with the processed arguments.
    """
    args = parse_arguments().parse_args(args)

    # With no arguments at all, show the help instead of failing later.
    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    if args.quiet is True:
        args.verbose = False

    # Ensure before and after region length is positive
    if args.beforeRegionStartLength < 0:
        print(f"beforeRegionStartLength changed from {args.beforeRegionStartLength} into {abs(args.beforeRegionStartLength)}")
        args.beforeRegionStartLength = abs(args.beforeRegionStartLength)
    if args.afterRegionStartLength < 0:
        print(f"afterRegionStartLength changed from {args.afterRegionStartLength} into {abs(args.afterRegionStartLength)}")
        args.afterRegionStartLength = abs(args.afterRegionStartLength)

    if args.command == 'scale-regions':
        # These options only exist in reference-point mode; give them
        # neutral values so later code can read them unconditionally.
        args.nanAfterEnd = False
        args.referencePoint = None
    elif args.command == 'reference-point':
        if args.beforeRegionStartLength == 0 and \
                args.afterRegionStartLength == 0:
            # typo fix: message previously read "Upstrean"
            sys.exit("\nUpstream and downstream regions are both "
                     "set to 0. Nothing to output. Maybe you want to "
                     "use the scale-regions mode?\n")

    return args
377
+
378
+
379
def main(args=None):
    """computeMatrix entry point: compute the matrix, optionally sort it, and save it."""

    args = process_args(args)

    # Per-run settings consumed by heatmapper.computeMatrix; keys follow the
    # names expected by the heatmapper module.
    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing,
                  'unscaled 5 prime': args.unscaled5prime,
                  'unscaled 3 prime': args.unscaled3prime
                  }

    hm = heatmapper.heatmapper()

    scores_file_list = args.scoreFileName
    hm.computeMatrix(scores_file_list, args.regionsFileName, parameters, blackListFileName=args.blackListFileName, verbose=args.verbose, allArgs=args)
    if args.sortRegions not in ['no', 'keep']:
        # Convert the 1-based sample numbers from --sortUsingSamples into
        # 0-based indices, rejecting anything out of range.
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. Only values from 1 to {1} are allowed.".format(args.sortUsingSamples, hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing, sort_method=args.sortRegions, sample_list=sortUsingSamples)
    elif args.sortRegions == 'keep':
        # Preserve the input region order by resorting the matrix to match
        # the regions files.
        # NOTE(review): args.transcriptID / args.transcript_id_designator are
        # not defined by the parsers in this file; presumably they come from
        # parserCommon.gtf_options() — confirm.
        hm.parameters['group_labels'] = hm.matrix.group_labels
        hm.parameters["group_boundaries"] = hm.matrix.group_boundaries
        cmo.sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator, verbose=not args.quiet)

    hm.save_matrix(args.outFileName)

    # Optional plain-text dump of the underlying values.
    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    # Optional BED output of the (possibly filtered/sorted) regions.
    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)
deepTools/source/deeptools/computeMatrixOperations.py ADDED
@@ -0,0 +1,852 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import deeptools.heatmapper as heatmapper
3
+ import deeptoolsintervals.parse as dti
4
+ import numpy as np
5
+ import argparse
6
+ import sys
7
+ import os
8
+ import csv
9
+ from importlib.metadata import version
10
+
11
+
12
def parse_arguments():
    """Build the computeMatrixOperations argument parser.

    One sub-command is registered per supported operation; each borrows its
    argument groups from the small parser-fragment helpers in this module.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
This tool performs a variety of operations on files produced by computeMatrix.

detailed help:

  computeMatrixOperations info -h

or

  computeMatrixOperations relabel -h

or

  computeMatrixOperations subset -h

or

  computeMatrixOperations filterStrand -h

or

  computeMatrixOperations filterValues -h

or

  computeMatrixOperations rbind -h

or

  computeMatrixOperations cbind -h

or
  computeMatrixOperations sort -h

or
  computeMatrixOperations dataRange -h

""",
        epilog='example usages:\n'
               'computeMatrixOperations subset -m input.mat.gz -o output.mat.gz --group "group 1" "group 2" --samples "sample 3" "sample 10"\n\n'
               ' \n\n')

    sub = parser.add_subparsers(
        title='Commands',
        dest='command',
        metavar='')

    # info: read-only inspection of the matrix metadata
    sub.add_parser(
        'info',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs()],
        help="Print group and sample information",
        usage='An example usage is:\n computeMatrixOperations info -m input.mat.gz\n\n')

    # relabel: rewrite sample/group labels
    sub.add_parser(
        'relabel',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), relabelArgs()],
        help="Change sample and/or group label information",
        usage='An example usage is:\n computeMatrixOperations relabel -m input.mat.gz -o output.mat.gz --sampleLabels "sample 1" "sample 2"\n\n')

    # subset: select (and thereby reorder) groups/samples
    sub.add_parser(
        'subset',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), subsetArgs()],
        help="Actually subset the matrix. The group and sample orders are honored, so one can also reorder files.",
        usage='An example usage is:\n computeMatrixOperations subset -m '
              'input.mat.gz -o output.mat.gz --groups "group 1" "group 2" '
              '--samples "sample 3" "sample 10"\n\n')

    # filterStrand: keep only entries on a given strand
    sub.add_parser(
        'filterStrand',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), filterStrandArgs()],
        help="Filter entries by strand.",
        usage='Example usage:\n computeMatrixOperations filterStrand -m '
              'input.mat.gz -o output.mat.gz --strand +\n\n')

    # filterValues: keep only rows within a value range
    sub.add_parser(
        'filterValues',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), filterValuesArgs()],
        help="Filter entries by min/max value.",
        usage='Example usage:\n computeMatrixOperations filterValues -m '
              'input.mat.gz -o output.mat.gz --min 10 --max 1000\n\n')

    # rbind: stack matrices vertically
    sub.add_parser(
        'rbind',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[bindArgs()],
        help="merge multiple matrices by concatenating them head to tail. This assumes that the same samples are present in each in the same order.",
        usage='Example usage:\n computeMatrixOperations rbind -m '
              'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n')

    # cbind: join matrices side by side, matched on region coordinates
    sub.add_parser(
        'cbind',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[bindArgs()],
        help="merge multiple matrices by concatenating them left to right. No assumptions are made about the row order. Regions not present in the first file specified are ignored. Regions missing in subsequent files will result in NAs. Regions are matches based on the first 6 columns of the computeMatrix output (essentially the columns in a BED file).",
        usage='Example usage:\n computeMatrixOperations cbind -m '
              'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n')

    # sort: reorder rows to match the given region file(s)
    sub.add_parser(
        'sort',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[sortArgs()],
        help='Sort a matrix file to correspond to the order of entries in the desired input file(s). The groups of regions designated by the files must be present in the order found in the output of computeMatrix (otherwise, use the subset command first). Note that this subcommand can also be used to remove unwanted regions, since regions not present in the input file(s) will be omitted from the output.',
        usage='Example usage:\n computeMatrixOperations sort -m input.mat.gz -R regions1.bed regions2.bed regions3.gtf -o input.sorted.mat.gz\n\n')

    # dataRange: per-sample summary statistics
    sub.add_parser(
        'dataRange',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs()],
        help='Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.',
        usage='Example usage:\n computeMatrixOperations dataRange -m input.mat.gz\n\n')

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    return parser
144
+
145
+
146
def bindArgs():
    """Parser fragment for rbind/cbind: several input matrices, one output file."""
    fragment = argparse.ArgumentParser(add_help=False)
    group = fragment.add_argument_group('Required arguments')

    group.add_argument('--matrixFile', '-m',
                       help='Matrix files from the computeMatrix tool.',
                       nargs='+',
                       required=True)
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    return fragment
160
+
161
+
162
def infoArgs():
    """Parser fragment shared by most sub-commands: a single input matrix."""
    fragment = argparse.ArgumentParser(add_help=False)
    group = fragment.add_argument_group('Required arguments')

    group.add_argument('--matrixFile', '-m',
                       help='Matrix file from the computeMatrix tool.',
                       required=True)

    return fragment
171
+
172
+
173
def relabelArgs():
    """Parser fragment for the relabel sub-command: output plus new labels."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--groupLabels',
                        nargs='+',
                        help="Groups labels. If none are specified then the current labels will be kept.")
    extras.add_argument('--sampleLabels',
                        nargs='+',
                        help="Sample labels. If none are specified then the current labels will be kept.")

    return fragment
192
+
193
+
194
def subsetArgs():
    """Parser fragment for the subset sub-command: output plus group/sample selections."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--groups',
                        nargs='+',
                        help="Groups to include. If none are specified then all will be included.")
    extras.add_argument('--samples',
                        nargs='+',
                        help="Samples to include. If none are specified then all will be included.")

    return fragment
213
+
214
+
215
def filterStrandArgs():
    """Parser fragment for filterStrand: output file and the strand to keep."""
    fragment = argparse.ArgumentParser(add_help=False)
    group = fragment.add_argument_group('Required arguments')

    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)
    group.add_argument('--strand', '-s',
                       help='Strand',
                       choices=['+', '-', '.'],
                       required=True)

    return fragment
229
+
230
+
231
def filterValuesArgs():
    """Parser fragment for filterValues: output file plus optional min/max bounds."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--min',
                        help='Minimum value. Any row having a single entry less than this will be excluded. The default is no minimum.',
                        type=float,
                        default=None)
    extras.add_argument('--max',
                        help='Maximum value. Any row having a single entry more than this will be excluded. The default is no maximum.',
                        type=float,
                        default=None)

    return fragment
251
+
252
+
253
def sortArgs():
    """Parser fragment for the sort sub-command: matrix, output, regions, GTF knobs."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--matrixFile', '-m',
                       help='Matrix file from the computeMatrix tool.',
                       required=True)
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)
    group.add_argument('--regionsFileName', '-R',
                       help='File name(s), in BED or GTF format, containing the regions. '
                       'If multiple bed files are given, each one is '
                       'considered a group that can be plotted separately. '
                       'Also, adding a "#" symbol in the bed file causes all '
                       'the regions until the previous "#" to be considered '
                       'one group. Alternatively for BED files, putting '
                       'deepTools_group in the header can be used to indicate a '
                       'column with group labels. Note that these should be '
                       'sorted such that all group entries are together.',
                       required=True,
                       nargs='+')

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--transcriptID',
                        default='transcript',
                        help='When a GTF file is used to provide regions, only '
                        'entries with this value as their feature (column 3) '
                        'will be processed as transcripts. (Default: %(default)s)')
    extras.add_argument('--transcript_id_designator',
                        default='transcript_id',
                        help='Each region has an ID (e.g., ACTB) assigned to it, '
                        'which for BED files is either column 4 (if it exists) '
                        'or the interval bounds. For GTF files this is instead '
                        'stored in the last column as a key:value pair (e.g., as '
                        '\'transcript_id "ACTB"\', for a key of transcript_id '
                        'and a value of ACTB). In some cases it can be '
                        'convenient to use a different identifier. To do so, set '
                        'this to the desired key. (Default: %(default)s)')

    return fragment
298
+
299
+
300
def printInfo(matrix):
    """
    Print the group and sample labels stored in the matrix, one per line.
    """
    for heading, labels in (("Groups:", matrix.matrix.group_labels),
                            ("Samples:", matrix.matrix.sample_labels)):
        print(heading)
        for label in labels:
            print("\t{0}".format(label))
312
+
313
+
314
def printDataRange(matrix):
    """
    Print a TSV with min, max, median, 10th and 90th percentile per sample.
    """
    print("Samples\tMin\tMax\tMedian\t10th\t90th")
    for idx, name in enumerate(matrix.matrix.sample_labels):
        # Each sample occupies a contiguous slice of the last matrix axis.
        lo = matrix.matrix.sample_boundaries[idx]
        hi = matrix.matrix.sample_boundaries[idx + 1]
        sub = matrix.matrix.matrix[..., lo:hi]
        stats = (name,
                 np.amin(sub),
                 np.amax(sub),
                 np.ma.median(sub),
                 np.percentile(sub, 10),
                 np.percentile(sub, 90))
        print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(*stats))
328
+
329
+
330
def relabelMatrix(matrix, args):
    """
    Replace the group and/or sample labels of a matrix with those supplied
    on the command line. Exits if the counts do not match.
    """
    new_groups = args.groupLabels
    if new_groups:
        expected = len(matrix.matrix.group_labels)
        if len(new_groups) != expected:
            sys.exit("You specified {} group labels, but {} are required.\n".format(len(new_groups), expected))
        matrix.matrix.group_labels = new_groups

    new_samples = args.sampleLabels
    if new_samples:
        expected = len(matrix.matrix.sample_labels)
        if len(new_samples) != expected:
            sys.exit("You specified {} sample labels, but {} are required.\n".format(len(new_samples), expected))
        matrix.matrix.sample_labels = new_samples
342
+
343
+
344
def getGroupBounds(args, matrix):
    """
    Given the requested group labels, return (row indices, new boundaries).

    Without --groups all rows are kept and the stored boundaries returned
    unchanged; otherwise rows are gathered group by group, in the order the
    groups were requested. Exits on an unknown group name.
    """
    bounds = matrix.parameters['group_boundaries']
    if args.groups is None:
        return range(0, matrix.matrix.matrix.shape[0]), np.array(bounds)

    indices = list()
    sizes = [0]
    for group in args.groups:
        if group not in matrix.matrix.group_labels:
            sys.exit("Error: '{0}' is not a valid group\n".format(group))
        pos = matrix.matrix.group_labels.index(group)
        indices.extend(range(bounds[pos], bounds[pos + 1]))
        sizes.append(bounds[pos + 1] - bounds[pos])
    # Cumulative sizes are the new group boundaries
    return indices, np.cumsum(sizes)
361
+
362
+
363
def getSampleBounds(args, matrix):
    """
    Given the requested sample labels, return the matching column indices.

    Without --samples every column is kept. Exits on an unknown sample name.
    """
    bounds = matrix.parameters['sample_boundaries']
    if args.samples is None:
        return np.arange(0, matrix.matrix.matrix.shape[1])

    cols = list()
    for sample in args.samples:
        if sample not in matrix.matrix.sample_labels:
            sys.exit("Error: '{0}' is not a valid sample\n".format(sample))
        pos = matrix.matrix.sample_labels.index(sample)
        cols.extend(range(bounds[pos], bounds[pos + 1]))
    return cols
378
+
379
+
380
def subsetRegions(hm, bounds):
    """
    Collect the regions at the given row indices, normalizing dict-style
    entries into the list form:
    [chrom, [(start, end), ...], name, 0, strand, score]
    """
    out = []
    for i in bounds:
        reg = hm.matrix.regions[i]
        if isinstance(reg, dict):
            # Regions are occasionally stored as dicts with comma-separated
            # coordinate strings; convert to the list representation.
            starts = [int(v) for v in reg["start"].split(",")]
            ends = [int(v) for v in reg["end"].split(",")]
            out.append([reg["chrom"], list(zip(starts, ends)), reg["name"],
                        0, reg["strand"], reg["score"]])
        else:
            out.append(reg)
    return out
396
+
397
+
398
def filterHeatmap(hm, args):
    """
    Keep only regions whose strand (field 4) equals args.strand, updating
    the matrix rows, region list and group boundaries in place.
    """
    mask = []
    kept_regions = []
    for region in hm.matrix.regions:
        matches = (region[4] == args.strand)
        mask.append(matches)
        if matches:
            kept_regions.append(region)
    mask = np.array(mask)

    # Recompute each group boundary from the number of surviving rows
    new_bounds = [0]
    for idx in range(1, len(hm.matrix.group_boundaries)):
        survivors = int(np.sum(mask[hm.matrix.group_boundaries[idx - 1]:hm.matrix.group_boundaries[idx]]))
        new_bounds.append(new_bounds[idx - 1] + survivors)
    hm.matrix.group_boundaries = new_bounds

    # Subset the matrix and region list
    hm.matrix.matrix = hm.matrix.matrix[mask, :]
    hm.matrix.regions = kept_regions
420
+
421
+
422
def filterHeatmapValues(hm, minVal, maxVal):
    """
    Drop rows whose values fall outside [minVal, maxVal], updating the
    matrix, regions and group boundaries in place.

    minVal / maxVal may be None, meaning unbounded on that side. Rows that
    are entirely NaN are never filtered, since no value violates the bounds.
    """
    import warnings

    bounds = [0]
    regions = []
    keep = []
    if minVal is None:
        minVal = -np.inf
    if maxVal is None:
        maxVal = np.inf
    # Bug fix: the original called np.warnings.filterwarnings('ignore');
    # the np.warnings alias was removed in NumPy 1.24, so this raised
    # AttributeError. Use the stdlib warnings module instead, scoped to
    # the nanmin/nanmax calls that warn on all-NaN rows.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        rowMins = np.nanmin(hm.matrix.matrix, axis=1)
        rowMaxs = np.nanmax(hm.matrix.matrix, axis=1)
    for i, (x, y) in enumerate(zip(rowMins, rowMaxs)):
        # x/y will be nan iff a row is entirely nan. Don't filter.
        if np.isnan(x) or (x >= minVal and y <= maxVal):
            keep.append(True)
            regions.append(hm.matrix.regions[i])
        else:
            keep.append(False)
    keep = np.array(keep)

    # Get the new bounds
    for idx in range(1, len(hm.matrix.group_boundaries)):
        i = int(np.sum(keep[hm.matrix.group_boundaries[idx - 1]:hm.matrix.group_boundaries[idx]]))
        bounds.append(bounds[idx - 1] + i)

    hm.matrix.group_boundaries = bounds

    # subset the matrix
    hm.matrix.matrix = hm.matrix.matrix[keep, :]
    hm.matrix.regions = regions
450
+
451
+
452
def insertMatrix(hm, hm2, groupName):
    """
    Splice the rows of `groupName` from hm2 into hm, directly after hm's
    existing rows for that group. Rows, regions and group boundaries of hm
    are all updated in place.
    """
    # Destination: the end of this group's span in hm
    gIdx = hm.parameters["group_labels"].index(groupName)
    insertAt = hm.parameters["group_boundaries"][gIdx + 1]
    # Source: this group's row span in hm2
    gIdx2 = hm2.parameters["group_labels"].index(groupName)
    srcStart = hm2.parameters["group_boundaries"][gIdx2]
    srcEnd = hm2.parameters["group_boundaries"][gIdx2 + 1]

    # Rows first, then the region list at the same offset
    hm.matrix.matrix = np.insert(hm.matrix.matrix, insertAt,
                                 hm2.matrix.matrix[srcStart:srcEnd, :], axis=0)
    hm.matrix.regions[insertAt:insertAt] = hm2.matrix.regions[srcStart:srcEnd]

    # Every boundary after the insertion point shifts by the number of new rows
    added = srcEnd - srcStart
    hm.parameters["group_boundaries"] = [
        bound + added if pos > gIdx else bound
        for pos, bound in enumerate(hm.parameters["group_boundaries"])
    ]
478
+
479
+
480
def appendMatrix(hm, hm2, groupName):
    """
    Append the rows of `groupName` from hm2 to the bottom of hm, adding a
    new group boundary for them. hm is modified in place.
    """
    # Locate this group's row span in hm2
    gIdx2 = hm2.parameters["group_labels"].index(groupName)
    srcStart = hm2.parameters["group_boundaries"][gIdx2]
    srcEnd = hm2.parameters["group_boundaries"][gIdx2 + 1]

    # Rows, then the new boundary, then the region list
    hm.matrix.matrix = np.concatenate(
        [hm.matrix.matrix, hm2.matrix.matrix[srcStart:srcEnd, :]], axis=0)
    hm.parameters["group_boundaries"].append(
        hm.parameters["group_boundaries"][-1] + srcEnd - srcStart)
    hm.matrix.regions.extend(hm2.matrix.regions[srcStart:srcEnd])
496
+
497
+
498
def rbindMatrices(hm, args):
    """
    Stack matrices top to bottom, merging rows group-wise.

    Groups already present in hm are spliced together with the incoming
    rows; new groups are appended at the bottom. It's assumed that the same
    samples are present in every file, in the exact same order.
    """
    other = heatmapper.heatmapper()
    hm.read_matrix_file(args.matrixFile[0])
    for fname in args.matrixFile[1:]:
        other.read_matrix_file(fname)
        for group in other.parameters["group_labels"]:
            if group in hm.parameters["group_labels"]:
                insertMatrix(hm, other, group)
            else:
                appendMatrix(hm, other, group)
                hm.parameters["group_labels"].append(group)

    # Mirror the merged group parameters onto the matrix object
    hm.matrix.group_labels = hm.parameters['group_labels']
    hm.matrix.group_boundaries = hm.parameters['group_boundaries']
518
+
519
+
520
def cbindMatrices(hm, args):
    """
    Bind columns from different matrices according to the group and region names

    Missing regions are left as NA. The first file in args.matrixFile is
    loaded into hm; each subsequent file contributes additional sample
    columns, matched row-by-row on (group label, region name).
    """
    hm2 = heatmapper.heatmapper()

    # Make a dict of region name:row associations
    hm.read_matrix_file(args.matrixFile[0])
    d = dict({x: dict() for x in hm.parameters["group_labels"]})
    for idx, group in enumerate(hm.parameters["group_labels"]):
        s = hm.parameters["group_boundaries"][idx]
        e = hm.parameters["group_boundaries"][idx + 1]
        for idx2, reg in enumerate(hm.matrix.regions[s:e]):
            # reg[2] is the region name; map it to its absolute row index
            d[group][reg[2]] = idx2 + s

    # Iterate through the other matrices
    for idx in range(1, len(args.matrixFile)):
        hm2.read_matrix_file(args.matrixFile[idx])
        # Add the sample labels
        hm.parameters['sample_labels'].extend(hm2.parameters['sample_labels'])
        # Add the sample boundaries, shifted by the current rightmost column
        lens = [x + hm.parameters['sample_boundaries'][-1] for x in hm2.parameters['sample_boundaries']][1:]
        hm.parameters['sample_boundaries'].extend(lens)

        # Add on additional NA initialized columns
        # NOTE(review): np.hstack assumes hm2 has the same number of rows
        # as hm — confirm upstream guarantees this for cbind inputs.
        ncol = hm.matrix.matrix.shape[1]
        hm.matrix.matrix = np.hstack((hm.matrix.matrix, np.empty(hm2.matrix.matrix.shape)))
        hm.matrix.matrix[:, ncol:] = np.nan

        # Update the values: copy each matching region's row from hm2 into
        # the freshly added columns of hm; unmatched rows stay NaN
        for idx2, group in enumerate(hm2.parameters["group_labels"]):
            if group not in d:
                continue
            s = hm2.parameters["group_boundaries"][idx2]
            e = hm2.parameters["group_boundaries"][idx2 + 1]
            for idx3, reg in enumerate(hm2.matrix.regions[s:e]):
                if reg[2] not in d[group]:
                    continue
                hm.matrix.matrix[d[group][reg[2]], ncol:] = hm2.matrix.matrix[s + idx3, :]

        # Append the special params (per-sample settings) from this file
        for s in hm.special_params:
            hm.parameters[s].extend(hm2.parameters[s])

    # Update the sample parameters on the matrix object
    hm.matrix.sample_labels = hm.parameters['sample_labels']
    hm.matrix.sample_boundaries = hm.parameters['sample_boundaries']
569
+
570
+
571
def loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup):
    """
    Given a first line, possibly a label column and a list of labels and regions, add the labels and regions in the file to them

    Arguments:
        line: the first data line (already consumed from fp by the caller)
        fp: open file handle positioned after `line`
        fname: the file name, used as a fallback group label
        labelColumn: index of a column holding group labels, or None
        labels: dict mapping group label -> group index (updated in place)
        regions: list of per-group {region name: position} dicts (updated in place)
        defaultGroup: label for the final unlabeled group, or None
    """

    # This is largely parseBED from deeptoolsintervals
    labelIdx = None
    localRegions = {}

    # Handle the first line, which the caller has already read
    cols = line.strip().split("\t")
    if labelColumn is not None:
        label = cols.pop(labelColumn)
        if label not in labels:
            labels[label] = len(labels)
        labelIdx = labels[label]
        if labelIdx >= len(regions):
            regions.append(localRegions)
        else:
            localRegions = regions[labelIdx]

    if len(cols) >= 6:
        # BED6+: use the name column
        name = cols[3]
    else:
        # No usable name column; synthesize one from the coordinates
        name = "{0}:{1}-{2}".format(cols[0], cols[1], cols[2])
    localRegions[name] = len(localRegions)

    for line in fp:
        if line.startswith("#") and labelColumn is None:
            # A "#" line closes the current group (when groups are not
            # taken from a label column)
            if len(localRegions) > 0:
                label = line[1:].strip()
                if len(label):
                    labels[dti.findRandomLabel(labels, label)] = len(labels)
                else:
                    # Empty label after "#": fall back to the file name
                    labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len(labels)
                regions.append(localRegions)
                localRegions = dict()
            continue
        elif line.startswith("#") and labelColumn is not None:
            continue

        cols = line.strip().split("\t")
        if len(cols) < 3:
            continue
        if labelColumn is not None:
            label = cols.pop(labelColumn)
            if label not in labels:
                labels[label] = len(labels)
            labelIdx = labels[label]
            if labelIdx >= len(regions):
                regions.append({})
            localRegions = regions[labelIdx]

        if len(cols) >= 6:
            name = cols[3]
        else:
            name = "{0}:{1}-{2}".format(cols[0], cols[1], cols[2])
        # Deduplicate region names within the group
        name = dti.findRandomLabel(localRegions, name)
        localRegions[name] = len(localRegions)

    # Handle the last group if there is no label
    if labelIdx is None and len(localRegions) > 0:
        if defaultGroup is not None:
            labels[dti.findRandomLabel(labels, defaultGroup)] = len(labels)
        else:
            labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len(labels)
        regions.append(localRegions)
637
+
638
+
639
def loadGTFtranscript(cols, label, defaultGroup, transcript_id_designator):
    """
    Extract a (group label, transcript name) pair from a GTF line.

    The attribute column (cols[8]) is split on spaces; a 'deepTools_group'
    attribute overrides the supplied label, otherwise defaultGroup (if not
    None) does. Returns (None, None), with a warning on stderr, when the
    transcript ID attribute is missing or has no value.
    """
    attrs = next(csv.reader([cols[8]], delimiter=' '))
    if "deepTools_group" in attrs and attrs[-1] != "deepTools_group":
        label = attrs[attrs.index("deepTools_group") + 1].rstrip(";")
    elif defaultGroup is not None:
        label = defaultGroup

    if transcript_id_designator not in attrs or attrs[-1] == transcript_id_designator:
        sys.stderr.write("Warning: {0} is malformed!\n".format("\t".join(cols)))
        return None, None

    name = attrs[attrs.index(transcript_id_designator) + 1].rstrip(";")
    return label, name
652
+
653
+
654
def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup):
    """
    Like loadBED, but for a GTF file

    This is largely a copy of what's in deeptoolsintervals. `labels`
    (group label -> index) and `regions` (list of per-group
    {name: position} dicts) are updated in place. Only lines whose feature
    column matches transcriptID (case-insensitively) are processed.
    """
    file_label = dti.findRandomLabel(labels, os.path.basename(fname))

    def _register(cols):
        # Register one transcript line under its group label
        label, name = loadGTFtranscript(cols, file_label, defaultGroup, transcript_id_designator)
        if label is None:
            return
        if label not in labels:
            labels[label] = len(labels)
            regions.append(dict())
        labelIdx = labels[label]
        regions[labelIdx][name] = len(regions[labelIdx])

    # handle the first line (already consumed by the caller)
    cols = line.split("\t")
    if cols[2].lower() == transcriptID.lower():
        _register(cols)

    for line in fp:
        if not isinstance(line, str):
            line = line.decode('ascii')
        if line.startswith('#'):
            continue
        cols = line.strip().split('\t')
        if len(cols) == 0:
            continue
        # Bug fix: the original compared cols[2].lower() against the raw
        # transcriptID here (no .lower()), so any non-lowercase
        # --transcriptID silently skipped every line after the first.
        if cols[2].lower() == transcriptID.lower():
            _register(cols)
689
+
690
+
691
def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator, verbose=True):
    """
    Iterate through the files noted by regionsFileName and sort hm accordingly

    The BED/GTF files define the desired group order and, within each group,
    the desired region order. hm is reordered in place; regions present in
    the files but absent from the matrix are skipped (with a warning when
    verbose).
    """

    labels = dict()    # group label -> group index
    regions = []       # one {region name: position} dict per group
    defaultGroup = None
    if len(regionsFileName) == 1:
        defaultGroup = "genes"
    for fname in regionsFileName:
        fp = dti.openPossiblyCompressed(fname)
        line = dti.getNext(fp)
        labelColumn = None
        # Skip comment lines, remembering a declared label column if any
        while line.startswith("#"):
            if not labelColumn:
                labelColumn = dti.getLabel(line)
            line = dti.getNext(fp)
        while line.startswith("track "):
            line = dti.getNext(fp)

        # Find the label column
        subtract = 0
        if labelColumn is not None:
            subtract = 1

        # Determine the file type and load into a list (or list of lists)
        cols = line.strip().split("\t")
        if len(cols) - subtract < 3:
            raise RuntimeError('{0} does not seem to be a recognized file type!'.format(fname))
        elif len(cols) - subtract <= 6:
            loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
        elif len(cols) and dti.seemsLikeGTF(cols):
            loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup)
        else:
            loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
        fp.close()

    # Do some sanity checking on the group labels and region names within them
    s1 = set(hm.parameters['group_labels'])
    if verbose:
        for e in labels:
            if e not in s1:
                sys.exit("The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format(e, s1, labels.keys()))

    # Make a dictionary out of current labels and regions
    # NOTE(review): `pos` counts rows only over groups that appear in
    # `labels`; if the matrix contains extra groups they are skipped here,
    # which would shift the row indices collected below — confirm inputs
    # always cover every matrix group.
    d = dict()
    pos = 0
    groupSizes = dict()
    for idx, label in enumerate(hm.parameters['group_labels']):
        s = hm.parameters['group_boundaries'][idx]
        e = hm.parameters['group_boundaries'][idx + 1]
        if label not in labels:
            continue
        d[label] = dict()
        groupSize = 0
        for reg in hm.matrix.regions[s:e]:
            d[label][reg[2]] = pos
            pos += 1
            groupSize += 1
        groupSizes[label] = groupSize

    # Convert labels to an ordered list
    labelsList = [""] * len(labels)
    for k, v in labels.items():
        labelsList[v] = k

    # Reorder
    order = []
    boundaries = [0]
    for idx, label in enumerate(labelsList):
        # Make an ordered list out of the region names in this region group
        _ = [""] * len(regions[idx])
        for k, v in regions[idx].items():
            _[v] = k
        sz = 0  # Track the number of entries actually matched
        for name in _:
            if name not in d[label]:
                if verbose:
                    sys.stderr.write("Skipping {}, due to being absent in the computeMatrix output.\n".format(name))
                continue
            sz += 1
            order.append(d[label][name])
        if sz == 0 and verbose:
            sys.exit("The region group {} had no matching entries!\n".format(label))
        boundaries.append(sz + boundaries[-1])
    hm.matrix.regions = [hm.matrix.regions[i] for i in order]
    order = np.array(order)
    hm.matrix.matrix = hm.matrix.matrix[order, :]

    # Update the parameters
    hm.parameters["group_labels"] = labelsList
    hm.matrix.group_labels = labelsList
    hm.parameters["group_boundaries"] = boundaries
    hm.matrix.group_boundaries = boundaries
786
+
787
+
788
def main(args=None):
    """
    Entry point: parse the command line and dispatch to the requested
    subcommand (info, dataRange, subset, filterStrand, filterValues,
    rbind, cbind, sort, relabel).
    """
    # if args none is need since otherwise pytest passes 'pytest' as sys.argv
    if args is None:
        if len(sys.argv) == 1:
            args = ["-h"]
        if len(sys.argv) == 2:
            args = [sys.argv[1], "-h"]

    args = parse_arguments().parse_args(args)

    hm = heatmapper.heatmapper()
    # rbind/cbind take a list of matrix files and load them themselves;
    # every other command loads the single input matrix here.
    if not isinstance(args.matrixFile, list):
        hm.read_matrix_file(args.matrixFile)
    if args.command == 'info':
        printInfo(hm)
    elif args.command == 'dataRange':
        printDataRange(hm)
    elif args.command == 'subset':
        sIdx = getSampleBounds(args, hm)
        gIdx, gBounds = getGroupBounds(args, hm)

        # groups
        hm.matrix.regions = subsetRegions(hm, gIdx)
        # matrix: subset rows (groups) then columns (samples)
        hm.matrix.matrix = hm.matrix.matrix[gIdx, :]
        hm.matrix.matrix = hm.matrix.matrix[:, sIdx]
        # boundaries
        if args.samples is None:
            args.samples = hm.matrix.sample_labels
        hm.matrix.sample_boundaries = hm.matrix.sample_boundaries[0:len(args.samples) + 1]
        hm.matrix.group_boundaries = gBounds.tolist()
        # special params: keep only entries for the retained samples
        keepIdx = set()
        for _, sample in enumerate(hm.matrix.sample_labels):
            if sample in args.samples:
                keepIdx.add(_)
        for param in hm.special_params:
            hm.parameters[param] = [v for k, v in enumerate(hm.parameters[param]) if k in keepIdx]
        # labels
        hm.matrix.sample_labels = args.samples
        if args.groups is None:
            args.groups = hm.matrix.group_labels
        hm.matrix.group_labels = args.groups
        # save
        hm.save_matrix(args.outFileName)
    elif args.command == 'filterStrand':
        filterHeatmap(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'filterValues':
        filterHeatmapValues(hm, args.min, args.max)
        hm.save_matrix(args.outFileName)
    elif args.command == 'rbind':
        rbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'cbind':
        cbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'sort':
        sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator)
        hm.save_matrix(args.outFileName)
    elif args.command == 'relabel':
        relabelMatrix(hm, args)
        hm.save_matrix(args.outFileName)
    else:
        sys.exit("Unknown command {0}!\n".format(args.command))
deepTools/source/deeptools/correctGCBias.py ADDED
@@ -0,0 +1,746 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import shutil
6
+ import time
7
+ import subprocess
8
+ import sys
9
+
10
+ import py2bit
11
+ import pysam
12
+ import multiprocessing
13
+ import numpy as np
14
+ import argparse
15
+
16
+ from scipy.stats import binom
17
+
18
+ from deeptools.utilities import tbitToBamChrName, getGC_content
19
+ from deeptools import writeBedGraph, parserCommon, mapReduce
20
+ from deeptools import utilities
21
+ from deeptools.bamHandler import openBam
22
+
23
+ old_settings = np.seterr(all='ignore')
24
+
25
+
26
def parse_arguments(args=None):
    """
    Assemble the correctGCBias command-line parser from the tool-specific
    required arguments and the shared parent parser (bin size, processors,
    verbosity).
    """
    common = parserCommon.getParentArgParse(binSize=True, blackList=False)
    tool_specific = getRequiredArgs()
    return argparse.ArgumentParser(
        parents=[tool_specific, common],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='This tool corrects the GC-bias using the'
        ' method proposed by [Benjamini & Speed (2012). '
        'Nucleic Acids Research, 40(10)]. It will remove reads'
        ' from regions with too high coverage compared to the'
        ' expected values (typically GC-rich regions) and will'
        ' add reads to regions where too few reads are seen '
        '(typically AT-rich regions). '
        'The tool ``computeGCBias`` needs to be run first to generate the '
        'frequency table needed here.',
        usage='correctGCBias '
        '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit '
        '--GCbiasFrequenciesFile freq.txt -o gc_corrected.bam\n'
        'help: correctGCBias -h / correctGCBias --help',
        conflict_handler='resolve',
        add_help=False)
48
+
49
+
50
def process_args(args=None):
    """Parse the command line (or the given argument list) and return the namespace."""
    return parse_arguments().parse_args(args)
54
+
55
+
56
def getRequiredArgs():
    """
    Build the ArgumentParser holding correctGCBias's tool-specific
    arguments (required inputs, output, and the help flag).
    """
    argp = argparse.ArgumentParser(add_help=False)

    grp_required = argp.add_argument_group('Required arguments')

    # Input BAM and the genome parameters needed to interpret it
    grp_required.add_argument('--bamfile', '-b',
                              metavar='BAM file',
                              help='Sorted BAM file to correct.',
                              required=True)
    grp_required.add_argument('--effectiveGenomeSize',
                              help='The effective genome size is the portion '
                              'of the genome that is mappable. Large fractions of '
                              'the genome are stretches of NNNN that should be '
                              'discarded. Also, if repetitive regions were not '
                              'included in the mapping of reads, the effective '
                              'genome size needs to be adjusted accordingly. '
                              'A table of values is available here: '
                              'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .',
                              default=None,
                              type=int,
                              required=True)

    grp_required.add_argument('--genome', '-g',
                              help='Genome in two bit format. Most genomes can be '
                              'found here: http://hgdownload.cse.ucsc.edu/gbdb/ '
                              'Search for the .2bit ending. Otherwise, fasta '
                              'files can be converted to 2bit using faToTwoBit '
                              'available here: '
                              'http://hgdownload.cse.ucsc.edu/admin/exe/',
                              metavar='two bit file',
                              required=True)

    grp_required.add_argument('--GCbiasFrequenciesFile', '-freq',
                              help='Indicate the output file from '
                              'computeGCBias containing '
                              'the observed and expected read frequencies per GC-'
                              'content.',
                              type=argparse.FileType('r'),
                              metavar='FILE',
                              required=True)

    grp_output = argp.add_argument_group('Output options')
    grp_output.add_argument('--correctedFile', '-o',
                            help='Name of the corrected file. The ending will '
                            'be used to decide the output file format. The options '
                            'are ".bam", ".bw" for a bigWig file, ".bg" for a '
                            'bedGraph file.',
                            metavar='FILE',
                            type=argparse.FileType('w'),
                            required=True)

    # add_help=False above, so help must be declared explicitly
    grp_optional = argp.add_argument_group('Optional arguments')
    grp_optional.add_argument("--help", "-h", action="help",
                              help="show this help message and exit")

    return argp
114
+
115
+
116
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment a read belongs to, scaled to an
    integer in [0, fragmentLength], or None when it cannot be computed.

    The fragments for forward and reverse reads are defined as follows::

               |- read.pos |- read.aend
        ---+=================>-----------------------+--------- Forward strand

           |-fragStart                         |-fragEnd

        ---+-----------------------<=================+--------- Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                             read.tlen

    """
    fragStart = None
    fragEnd = None

    # For proper pairs with a sane insert size, use the actual template span
    if read.is_paired and read.is_proper_pair and abs(read.tlen) < 2 * fragmentLength:
        if read.is_reverse and read.tlen < 0:
            fragEnd = read.reference_end
            fragStart = read.reference_end + read.template_length
        elif read.template_length >= read.query_alignment_length:
            fragStart = read.pos
            fragEnd = read.pos + read.template_length

    # Otherwise, extend the read to the expected fragment length.
    # Bug fix: the original tested `if not fragStart:`, which also triggered
    # for a legitimate proper-pair fragment starting at position 0 and
    # re-estimated it from the read alone; test for None explicitly.
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.reference_end
            fragStart = read.reference_end - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit, chrNameBit, fragStart, fragEnd)
    except Exception:
        # e.g. coordinates past the chromosome end
        return None
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    gc = int(np.round(gc * fragmentLength))
    return gc
161
+
162
+
163
def writeCorrected_wrapper(args):
    """Adapter so a multiprocessing map can call writeCorrected_worker with a tuple."""
    return writeCorrected_worker(*args)
165
+
166
+
167
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Relies on the module-level globals R_gc (per-GC correction factors),
    global_vars ('2bit', 'bam', 'max_dup_gc') and, optionally, debug.
    Returns the temp file name, or None if no reads were used.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    # Corrected coverage accumulator, one cell per base in [start, end)
    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are asigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    r_index = -1
    for read in reads:
        if read.is_unmapped:
            continue
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            print(detail)
            """ this exception happens when the end of a
            chromosome is reached """
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            # Cap duplicates at the GC-dependent maximum
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
            # Clip the fragment to the window and convert to vector offsets
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        # Each read contributes 1/R_gc[gc], i.e. down-weighted where the
        # GC bias over-represents reads and up-weighted where it doesn't
        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            print("{}, processing {} ({:.1f} per sec) "
                  "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
                                            i, i / (endTime - startTime),
                                            chrNameBit, start, end))
    except NameError:
        # debug is only defined when debugging is enabled
        pass

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in range(0, len(cvg_corr), step):
        # NOTE(review): min(bin + step, end) mixes a vector index with the
        # genomic end coordinate; slicing clamps at len(cvg_corr) anyway,
        # so this appears harmless but misleading — confirm.
        value = np.mean(cvg_corr[bin:min(bin + step, end)])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
265
+
266
+
267
def numCopiesOfRead(value):
    """
    Based on the R_gc value, decides
    whether to keep, duplicate, triplicate or delete the read.
    It returns an integer, that tells the number of copies of the read
    that should be kept. (Docstring typos fixed; behavior unchanged.)
    >>> np.random.seed(1)
    >>> numCopiesOfRead(0.8)
    1
    >>> numCopiesOfRead(2.5)
    2
    >>> numCopiesOfRead(None)
    1
    """
    copies = 1
    if value:
        # Keep floor(value) copies, plus one more with probability frac(value)
        copies = int(value) + (1 if np.random.rand() < value % 1 else 0)
    return copies
285
+
286
+
287
def writeCorrectedSam_wrapper(args):
    """Adapter so a multiprocessing map can call writeCorrectedSam_worker with a tuple."""
    return writeCorrectedSam_worker(*args)
289
+
290
+
291
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # FIX: was a bare `except:`. Only KeyError is expected here:
            # the mate is not present, e.g. because it was removed
            # by some earlier filtering.
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # FIX: was `readTag[2]`, which inspected the third *tag*
                # (and could raise IndexError with fewer than three tags)
                # instead of the type field of the first tag.
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            # YG: GC percentage of the fragment; YC: correction factor;
            # YN: number of copies written for this read.
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the correction applied to the forward mate so the
        # reverse mate gets the identical treatment
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
462
+
463
+
464
def getFragmentFromRead(read, defaultFragmentLength, extendPairedEnds=True):
    """
    The read has to be pysam object.

    The following values are defined (for forward reads)::


             |-- -- read.tlen -- --|
             |-- read.alen --|
        -----|===============>------------<==============|----
             |               |            |
          read.pos      read.aend      read.pnext


    and for reverse reads


             |-- -- read.tlen -- --|
                               |-- read.alen --|
        -----|===============>-----------<===============|----
             |               |                           |
          read.pnext     read.pos                    read.aend

    this is a sketch of a pair-end reads

    The function returns the fragment start and end, either
    using the paired end information (if available) or
    extending the read in the appropriate direction if this
    is single-end.

    Parameters
    ----------
    read : pysam read object


    Returns
    -------
    tuple
        (fragment start, fragment end)

    """
    # Paired-end case: the two mates delimit the fragment. The
    # 0 < |tlen| < 1000 guard skips pairs spanning implausibly long
    # distances (thousands of base pairs).
    if extendPairedEnds is True and read.is_paired and 0 < abs(read.tlen) < 1000:
        if read.is_reverse:
            return read.pnext, read.aend
        # forward read: the fragment runs from the read start over
        # the full insert length
        return read.pos, read.pos + read.tlen

    # Single-end (or unusable pair): when the alignment already covers
    # at least defaultFragmentLength, use it directly; otherwise extend
    # the read to defaultFragmentLength in its own direction.
    if defaultFragmentLength <= read.aend - read.pos:
        return read.pos, read.aend
    if read.is_reverse:
        return read.aend - defaultFragmentLength, read.aend
    return read.pos, read.pos + defaultFragmentLength
534
+
535
+
536
def run_shell_command(command):
    """
    Runs the given shell command. Reports
    any errors found and terminates the program on failure.
    """
    # NOTE(security): shell=True executes `command` through the shell;
    # only internally-built command strings should be passed here.
    try:
        subprocess.check_call(command, shell=True)

    except subprocess.CalledProcessError as error:
        # FIX: message previously read 'Error{}' — missing the ': '
        # separator used by the generic handler below.
        sys.stderr.write('Error: {}\n'.format(error))
        exit(1)
    except Exception as error:
        sys.stderr.write('Error: {}\n'.format(error))
        exit(1)
550
+
551
+
552
def main(args=None):
    """
    Entry point: loads the GC bias frequencies, partitions the genome into
    chunks, applies the GC correction in parallel and writes the corrected
    output as BAM, bedGraph or bigWig (chosen by the extension of
    args.correctedFile).
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    # columns of the frequencies file: observed reads (F), expected
    # positions (N) and the observed/expected ratio (R) per GC value
    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                # FIX: this message was previously split across two
                # statements, so the chromosome name was never printed.
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(
                writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        # clean up the per-chunk temporary BAMs
        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            # plain bedGraph: concatenate the per-chunk files
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
679
+
680
+
681
class Tester():
    """
    Builds the fixture state (test BAM/2bit files and the module-level
    global_vars) used by the doctests in this module.
    """
    def __init__(self):
        import os
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
        self.tbitFile = self.root + "sequence.2bit"
        self.bamFile = self.root + "test.bam"
        self.chrNameBam = '2L'
        self.chrNameBit = 'chr2L'
        bam, mapped, unmapped, stats = openBam(self.bamFile, returnStats=True)
        tbit = py2bit.open(self.tbitFile)
        global debug
        debug = 0
        global global_vars
        # FIX: the 'min_reads' key was duplicated in this literal.
        global_vars = {'2bit': self.tbitFile,
                       'bam': self.bamFile,
                       'filter_out': None,
                       'extra_sampling_file': None,
                       'max_reads': 5,
                       'min_reads': 0,
                       'reads_per_bp': 0.3,
                       'total_reads': mapped,
                       'genome_size': sum(tbit.chroms().values())}

    def testWriteCorrectedChunk(self):
        """ prepare arguments for test
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        global_vars['max_dup_gc'] = np.ones(301)

        start = 200
        end = 300
        bedGraphStep = 25
        return (self.chrNameBam,
                self.chrNameBit, start, end, bedGraphStep)

    def testWriteCorrectedSam(self):
        """ prepare arguments for test
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        global_vars['max_dup_gc'] = np.ones(301)

        start = 200
        end = 250
        return (self.chrNameBam,
                self.chrNameBit, start, end)

    def testWriteCorrectedSam_paired(self):
        """ prepare arguments for test.
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        start = 0
        end = 500
        global global_vars
        global_vars['bam'] = self.root + "paired.bam"
        return 'chr2L', 'chr2L', start, end
743
+
744
+
745
# Allow running this module directly as a command-line script.
if __name__ == "__main__":
    main()
deepTools/source/deeptools/correlation.py ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import itertools
3
+ import copy
4
+ import numpy as np
5
+ import scipy.cluster.hierarchy as sch
6
+ import scipy.stats
7
+ import matplotlib as mpl
8
+ mpl.use('Agg')
9
+ mpl.rcParams['pdf.fonttype'] = 42
10
+ mpl.rcParams['svg.fonttype'] = 'none'
11
+ from deeptools import cm # noqa: F401
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.gridspec as gridspec
14
+ import matplotlib.ticker
15
+ import matplotlib.mlab
16
+ import matplotlib.markers
17
+ import matplotlib.colors as pltcolors
18
+ from deeptools.utilities import toString, convertCmap
19
+
20
+ import plotly.offline as offline
21
+ import plotly.graph_objs as go
22
+ import plotly.figure_factory as ff
23
+
24
+
25
# Silence all numpy floating-point warnings (divide/overflow/invalid);
# the previous error settings are kept in old_settings.
old_settings = np.seterr(all='ignore')
26
+
27
+
28
+ class Correlation:
29
+ """
30
+ class to work with matrices
31
+ having sample data
32
+ to compute correlations, plot
33
+ them and make scatter plots
34
+ """
35
+
36
    def __init__(self, matrix_file,
                 corr_method=None,
                 labels=None,
                 remove_outliers=False,
                 skip_zeros=False,
                 log1p=False):
        """
        Load a sample matrix and optionally filter/transform it.

        Parameters
        ----------
        matrix_file : str
            npz file containing 'matrix' (one sample per column) and
            'labels' (see load_matrix).
        corr_method : str or None
            'pearson' or 'spearman'; when given, the correlation matrix
            is computed immediately.
        labels : list or None
            replaces the labels stored in the matrix file.
        remove_outliers : bool
            drop rows flagged as outliers in every column.
        skip_zeros : bool
            drop rows that are entirely zeros/NaNs.
        log1p : bool
            apply log(1 + x) to the matrix before any correlation.
        """
        # NOTE: the order below matters — rows are filtered before the
        # log1p transform and before the correlation is computed.
        self.load_matrix(matrix_file)
        self.skip_zeros = skip_zeros
        self.corr_method = corr_method
        self.corr_matrix = None  # correlation matrix, computed lazily
        self.column_order = None
        self.rowCenter = False
        if labels is not None:
            # test that the length of labels
            # corresponds to the length of
            # samples

            self.labels = labels
            self.labels = [toString(x) for x in self.labels]

        if self.matrix.shape[1] == 1:
            # There's nothing that can be done with a single sample
            sys.exit("\nPlease use a matrix with more than one sample\n")

        if skip_zeros is True:
            # remove rows containing only nans or zeros
            # that could be unmappable regions.
            self.remove_rows_of_zeros()

        if remove_outliers is True:
            # remove outliers, otherwise outliers will produce a very
            # high pearson correlation. Unnecessary for spearman correlation
            self.remove_outliers()

        if log1p is True:
            self.matrix = np.log1p(self.matrix)

        if corr_method:
            self.compute_correlation()
76
+
77
+ def load_matrix(self, matrix_file):
78
+ """
79
+ loads a matrix file saved using the numpy
80
+ savez method. Two keys are expected:
81
+ 'matrix' and 'labels'. The matrix should
82
+ contain one sample per row
83
+ """
84
+
85
+ _ma = np.load(matrix_file)
86
+ # matrix: cols correspond to samples
87
+ self.matrix = np.asarray(_ma['matrix'].tolist())
88
+ if np.any(np.isnan(self.matrix)):
89
+ num_nam = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
90
+ sys.stderr.write("*Warning*. {} NaN values were found. They will be removed along with the "
91
+ "corresponding bins in other samples for the computation "
92
+ "and plotting\n".format(num_nam))
93
+
94
+ self.matrix = np.ma.compress_rows(np.ma.masked_invalid(self.matrix))
95
+
96
+ self.labels = list(map(toString, _ma['labels']))
97
+
98
+ assert len(self.labels) == self.matrix.shape[1], "ERROR, length of labels is not equal " \
99
+ "to length of matrix samples"
100
+
101
+ @staticmethod
102
+ def get_outlier_indices(data, max_deviation=200):
103
+ """
104
+ The method is based on the median absolute deviation. See
105
+ Boris Iglewicz and David Hoaglin (1993),
106
+ "Volume 16: How to Detect and Handle Outliers",
107
+ The ASQC Basic References in Quality Control:
108
+ Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
109
+
110
+ returns the list, without the outliers
111
+
112
+ The max_deviation=200 is like selecting a z-score
113
+ larger than 200, just that it is based on the median
114
+ and the median absolute deviation instead of the
115
+ mean and the standard deviation.
116
+ """
117
+ median = np.median(data)
118
+ b_value = 1.4826 # value set for a normal distribution
119
+ mad = b_value * np.median(np.abs(data))
120
+ outliers = []
121
+ if mad > 0:
122
+ deviation = abs(data - median) / mad
123
+ """
124
+ outliers = data[deviation > max_deviation]
125
+ print "outliers removed {}".format(len(outliers))
126
+ print outliers
127
+ """
128
+ outliers = np.flatnonzero(deviation > max_deviation)
129
+ return outliers
130
+
131
+ def remove_outliers(self, verbose=True):
132
+ """
133
+ get the outliers *per column* using the median absolute
134
+ deviation method
135
+
136
+ Returns the filtered matrix
137
+ """
138
+
139
+ unfiltered = len(self.matrix)
140
+ to_remove = None
141
+ for col in self.matrix.T:
142
+ outliers = self.get_outlier_indices(col)
143
+ if to_remove is None:
144
+ to_remove = set(outliers)
145
+ else:
146
+ # only set to remove those bins in which
147
+ # the outliers are present in all cases (colums)
148
+ # that's why the intersection is used
149
+ to_remove = to_remove.intersection(outliers)
150
+ if len(to_remove):
151
+ to_keep = [x for x in range(self.matrix.shape[0])
152
+ if x not in to_remove]
153
+ self.matrix = self.matrix[to_keep, :]
154
+ if verbose:
155
+ sys.stderr.write(
156
+ "total/filtered/left: "
157
+ "{}/{}/{}\n".format(unfiltered,
158
+ unfiltered - len(to_keep),
159
+ len(to_keep)))
160
+
161
+ return self.matrix
162
+
163
+ def remove_rows_of_zeros(self):
164
+ # remove rows containing all zeros or all nans
165
+ _mat = np.nan_to_num(self.matrix)
166
+ to_keep = _mat.sum(1) != 0
167
+
168
+ self.matrix = self.matrix[to_keep, :]
169
+
170
+ def save_corr_matrix(self, file_handle):
171
+ """
172
+ saves the correlation matrix
173
+ """
174
+ if self.column_order:
175
+ self.corr_matrix = self.corr_matrix[:, self.column_order][self.column_order]
176
+ self.labels = [self.labels[i] for i in self.column_order]
177
+
178
+ self.labels = [toString(x) for x in self.labels]
179
+ file_handle.write("\t'" + "'\t'".join(self.labels) + "'\n")
180
+ fmt = "\t".join(np.repeat('%.4f', self.corr_matrix.shape[1])) + "\n"
181
+ i = 0
182
+ for row in self.corr_matrix:
183
+ file_handle.write(
184
+ "'%s'\t" % self.labels[i] + fmt % tuple(row))
185
+ i += 1
186
+
187
    def compute_correlation(self):
        """
        computes spearman or pearson
        correlation for the samples in the matrix

        The matrix should contain the values of each sample per column
        that's why the transpose is used.

        >>> matrix = np.array([[1, 2, 3, np.nan],
        ...                    [1, 2, 3, 4],
        ...                    [6, 4, 3, 1]]).T
        >>> np.savez_compressed("/tmp/test_matrix.npz", matrix=matrix, labels=['a', 'b', 'c'])

        >>> c = Correlation("/tmp/test_matrix.npz", corr_method='pearson')

        the results should be as in R

        >>> c.compute_correlation().filled(np.nan)
        array([[ 1.        ,  1.        , -0.98198051],
               [ 1.        ,  1.        , -0.98198051],
               [-0.98198051, -0.98198051,  1.        ]])
        >>> c.corr_method = 'spearman'
        >>> c.corr_matrix = None
        >>> c.compute_correlation()
        array([[ 1.,  1., -1.],
               [ 1.,  1., -1.],
               [-1., -1.,  1.]])
        """
        # memoized: reuse a previously computed matrix
        if self.corr_matrix is not None:
            return self.corr_matrix

        num_samples = len(self.labels)
        # initialize correlation matrix

        if self.corr_method == 'pearson':
            # masked corrcoef tolerates masked (NaN-derived) entries
            self.corr_matrix = np.ma.corrcoef(self.matrix.T, allow_masked=True)

        else:
            corr_matrix = np.zeros((num_samples, num_samples), dtype='float')
            # do an all vs all correlation using the
            # indices of the upper triangle
            rows, cols = np.triu_indices(num_samples)

            for index in range(len(rows)):
                row = rows[index]
                col = cols[index]
                corr_matrix[row, col] = scipy.stats.spearmanr(self.matrix[:, row], self.matrix[:, col])[0]
            # make the matrix symmetric
            self.corr_matrix = corr_matrix + np.triu(corr_matrix, 1).T

        return self.corr_matrix
238
+
239
+ def plotly_correlation(self, corr_matrix, plot_filename, labels, plot_title='',
240
+ vmax=None, vmin=None, plot_numbers=True,
241
+ colormap='jet'):
242
+ """plot_correlation, but using plotly"""
243
+ textElement = []
244
+ for row in range(corr_matrix.shape[0]):
245
+ trow = []
246
+ for col in range(corr_matrix.shape[0]):
247
+ if plot_numbers:
248
+ trow.append("{:0.2f}".format(corr_matrix[row, col]))
249
+ else:
250
+ trow.append('')
251
+ textElement.append(trow)
252
+
253
+ zauto = True
254
+ if vmax is not None or vmin is not None:
255
+ zauto = False
256
+
257
+ convertedCmap = convertCmap(colormap)
258
+ fig = ff.create_annotated_heatmap(corr_matrix, x=labels, y=labels, colorscale=convertedCmap, showscale=True, zauto=zauto, zmin=vmin, zmax=vmax, annotation_text=textElement)
259
+ fig.layout['title'] = plot_title
260
+ offline.plot(fig, filename=plot_filename, auto_open=False)
261
+
262
    def plot_correlation(self, plot_filename, plot_title='', vmax=None,
                         vmin=None, colormap='jet', image_format=None,
                         plot_numbers=False, plotWidth=11, plotHeight=9.5):
        """
        plots a correlation using a symmetric heatmap

        A dendrogram (centroid linkage over the correlation matrix) is
        drawn on the left, and rows/columns of the heatmap are reordered
        to follow the dendrogram leaves. The resulting order is stored in
        self.column_order so save_corr_matrix can reuse it.
        When image_format == "plotly" the drawing is delegated to
        plotly_correlation.
        """
        num_rows = len(self.labels)
        corr_matrix = self.compute_correlation()
        # set a font size according to figure length
        if num_rows < 6:
            font_size = 14
        elif num_rows > 40:
            font_size = 5
        else:
            font_size = int(14 - 0.25 * num_rows)
        mpl.rcParams.update({'font.size': font_size})
        # set the minimum and maximum values
        if vmax is None:
            vmax = 1
        if vmin is None:
            vmin = 0 if corr_matrix .min() >= 0 else -1

        # Compute and plot dendrogram.
        fig = plt.figure(figsize=(plotWidth, plotHeight))
        plt.suptitle(plot_title)

        axdendro = fig.add_axes([0.015, 0.1, 0.1, 0.7])
        axdendro.set_axis_off()
        y_var = sch.linkage(corr_matrix, method='centroid')
        z_var = sch.dendrogram(y_var, orientation='left',
                               link_color_func=lambda k: 'darkred')
        axdendro.set_xticks([])
        axdendro.set_yticks([])
        cmap = copy.copy(plt.get_cmap(colormap))

        # this line simply makes a new cmap, based on the original
        # colormap that goes from 0.0 to 0.9
        # This is done to avoid colors that
        # are too dark at the end of the range that do not offer
        # a good contrast between the correlation numbers that are
        # plotted on black.
        if plot_numbers:
            cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped",
                                                               cmap(np.linspace(0, 0.9, 10)))

        cmap.set_under((0., 0., 1.))
        # Plot distance matrix.
        axmatrix = fig.add_axes([0.12, 0.1, 0.6, 0.7])
        # reorder the matrix to match the dendrogram leaf order
        index = z_var['leaves']
        corr_matrix = corr_matrix[index, :]
        corr_matrix = corr_matrix[:, index]
        if corr_matrix.shape[0] > 30:
            # when there are too many rows it is better to remove
            # the black lines surrounding the boxes in the heatmap
            edge_color = 'none'
        else:
            edge_color = 'black'

        if image_format == "plotly":
            self.plotly_correlation(corr_matrix,
                                    plot_filename,
                                    self.labels,
                                    plot_title=plot_title,
                                    vmax=vmax,
                                    vmin=vmin,
                                    colormap=colormap,
                                    plot_numbers=plot_numbers)
            return

        img_mat = axmatrix.pcolormesh(corr_matrix,
                                      edgecolors=edge_color,
                                      cmap=cmap,
                                      vmax=vmax,
                                      vmin=vmin)
        axmatrix.set_xlim(0, num_rows)
        axmatrix.set_ylim(0, num_rows)

        axmatrix.yaxis.tick_right()
        axmatrix.set_yticks(np.arange(corr_matrix .shape[0]) + 0.5)
        axmatrix.set_yticklabels(np.array(self.labels).astype('str')[index])

        axmatrix.xaxis.set_tick_params(labeltop=True)
        axmatrix.xaxis.set_tick_params(labelbottom=False)
        axmatrix.set_xticks(np.arange(corr_matrix .shape[0]) + 0.5)
        axmatrix.set_xticklabels(np.array(self.labels).astype('str')[index], rotation=45, ha='left')

        axmatrix.tick_params(
            axis='x',
            which='both',
            bottom=False,
            top=False)

        axmatrix.tick_params(
            axis='y',
            which='both',
            left=False,
            right=False)

        # Plot colorbar
        axcolor = fig.add_axes([0.12, 0.065, 0.6, 0.02])
        cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal')
        cobar.solids.set_edgecolor("face")
        if plot_numbers:
            for row in range(num_rows):
                for col in range(num_rows):
                    axmatrix.text(row + 0.5, col + 0.5,
                                  "{:.2f}".format(corr_matrix[row, col]),
                                  ha='center', va='center')

        # remember the dendrogram order for save_corr_matrix
        self.column_order = index
        fig.savefig(plot_filename, format=image_format)
        plt.close()
374
+
375
    def plotly_scatter(self, plot_filename, corr_matrix, plot_title='', minXVal=None, maxXVal=None, minYVal=None, maxYVal=None):
        """Make the scatter plot of a matrix with plotly

        Draws an n-by-n grid of 2D-histogram heatmaps (one per sample
        pair, lower triangle only) and writes an offline plotly HTML file.
        """
        n = self.matrix.shape[1]
        self.matrix = self.matrix  # NOTE(review): no-op assignment, likely leftover
        fig = go.Figure()
        # each sample gets an equal slice of the paper coordinate system
        domainWidth = 1. / n

        # diagonal labels, positioned in paper coordinates
        annos = []
        for i in range(n):
            x = domainWidth * (i + 1)
            y = 1 - (domainWidth * i + 0.5 * domainWidth)
            anno = dict(text=self.labels[i], showarrow=False, xref='paper', yref='paper', x=x, y=y, xanchor='right', yanchor='middle')
            annos.append(anno)

        data = []
        # track the global z-range so all heatmaps share one color scale
        zMin = np.inf
        zMax = -np.inf
        for x in range(n):
            xanchor = 'x{}'.format(x + 1)
            base = x * domainWidth
            domain = [base, base + domainWidth]
            if x > 0:
                base = 1 - base
            fig['layout']['xaxis{}'.format(x + 1)] = dict(domain=domain, range=[minXVal, maxXVal], anchor='free', position=base)
            for y in range(0, n):
                yanchor = 'y{}'.format(y + 1)
                # NOTE(review): y-axes are only defined on the x == 1 pass;
                # presumably intentional so each is created exactly once — confirm
                if x == 1:
                    base = 1 - y * domainWidth
                    domain = [base - domainWidth, base]
                    fig['layout']['yaxis{}'.format(y + 1)] = dict(domain=domain, range=[minYVal, maxYVal], side='right', anchor='free', position=1.0)

                if x > y:
                    # lower-triangle cell: 2D histogram of the sample pair
                    vector1 = self.matrix[:, x]
                    vector2 = self.matrix[:, y]
                    Z, xEdges, yEdges = np.histogram2d(vector1, vector2, bins=50)
                    # log scale; empty bins become -inf and are clamped below
                    Z = np.log10(Z)
                    if np.min(Z) < zMin:
                        zMin = np.min(Z)
                    if np.max(Z) > zMax:
                        zMax = np.max(Z)
                    name = '{}={:.2f}'.format(self.corr_method, corr_matrix[x, y])
                    trace = go.Heatmap(z=Z, x=xEdges, y=yEdges, showlegend=False, xaxis=xanchor, yaxis=yanchor, name=name, showscale=False)
                    data.append(trace)

        # Fix the colorbar bounds
        for trace in data:
            trace.update(zmin=zMin, zmax=zMax)
        # only the last trace carries the (shared) colorbar
        data[-1]['colorbar'].update(title="log10(instances per bin)", titleside="right")
        data[-1].update(showscale=True)

        fig.add_traces(data)
        fig['layout'].update(title=plot_title, showlegend=False, annotations=annos)

        offline.plot(fig, filename=plot_filename, auto_open=False)
429
+
430
    def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=False, xRange=None, yRange=None):
        """
        Plot the scatter plots of a matrix
        in which each row is a sample

        Draws a grid of pairwise 2D histograms (upper triangle) with the
        sample names on the diagonal; delegates to plotly_scatter when
        image_format == 'plotly'.
        """

        num_samples = self.matrix.shape[1]
        corr_matrix = self.compute_correlation()
        grids = gridspec.GridSpec(num_samples, num_samples)
        grids.update(wspace=0, hspace=0)
        fig = plt.figure(figsize=(2 * num_samples, 2 * num_samples))
        plt.rcParams['font.size'] = 8.0
        plt.suptitle(plot_title)
        if log1p is True:
            self.matrix = np.log1p(self.matrix)
        min_xvalue = self.matrix.min()
        max_xvalue = self.matrix.max()
        min_yvalue = min_xvalue
        max_yvalue = max_xvalue
        if xRange is not None:
            min_xvalue = xRange[0]
            max_xvalue = xRange[1]
        if yRange is not None:
            min_yvalue = yRange[0]
            max_yvalue = yRange[1]
        # NOTE(review): the second condition mixes % 1 and % 2 — looks
        # like a typo for (min % 2 == 0 and max % 2 == 1); confirm intent
        if (min_xvalue % 2 == 0 and max_xvalue % 2 == 0) or \
                (min_xvalue % 1 == 0 and max_xvalue % 2 == 1):
            # make one value odd and the other even
            max_xvalue += 1
        if (min_yvalue % 2 == 0 and max_yvalue % 2 == 0) or \
                (min_yvalue % 1 == 0 and max_yvalue % 2 == 1):
            # make one value odd and the other even
            max_yvalue += 1

        # plotly output
        if image_format == 'plotly':
            self.plotly_scatter(plot_filename, corr_matrix, plot_title=plot_title, minXVal=min_xvalue, maxXVal=max_xvalue, minYVal=min_yvalue, maxYVal=max_yvalue)
            return

        rows, cols = np.triu_indices(num_samples)

        for index in range(len(rows)):
            row = rows[index]
            col = cols[index]
            if row == col:
                # add titles as
                # empty plot in the diagonal
                ax = fig.add_subplot(grids[row, col])
                ax.text(0.5, 0.5, self.labels[row],
                        verticalalignment='center',
                        horizontalalignment='center',
                        fontsize=10, fontweight='bold',
                        transform=ax.transAxes)
                ax.set_axis_off()
                continue

            ax = fig.add_subplot(grids[row, col])

            vector1 = self.matrix[:, row]
            vector2 = self.matrix[:, col]

            # annotate the panel with the correlation coefficient
            ax.text(0.2, 0.8, "{}={:.2f}".format(self.corr_method,
                                                 corr_matrix[row, col]),
                    horizontalalignment='left',
                    transform=ax.transAxes)
            ax.get_yaxis().set_tick_params(
                which='both',
                left=False,
                right=False,
                direction='out')

            ax.get_xaxis().set_tick_params(
                which='both',
                top=False,
                bottom=False,
                direction='out')
            ax.get_xaxis().set_tick_params(
                which='major',
                labelrotation=45)

            # only the rightmost column keeps y tick labels
            if col != num_samples - 1:
                ax.set_yticklabels([])
            else:
                ax.yaxis.tick_right()
                ax.get_yaxis().set_tick_params(
                    which='both',
                    left=False,
                    right=True,
                    direction='out')
            # only panels adjacent to the diagonal keep x tick labels
            if col - row == 1:
                ax.xaxis.tick_bottom()
                ax.get_xaxis().set_tick_params(
                    which='both',
                    top=False,
                    bottom=True,
                    direction='out')
                ax.get_xaxis().set_tick_params(
                    which='major',
                    labelrotation=45)

            else:
                ax.set_xticklabels([])

            ax.set_xlim(min_xvalue, max_xvalue)
            ax.set_ylim(min_yvalue, max_yvalue)
            ax.hist2d(vector2, vector1, bins=200, cmin=0.1)

        plt.savefig(plot_filename, format=image_format)
        plt.close()
539
+
540
+ def plotly_pca(self, plotFile, Wt, pvar, PCs, eigenvalues, cols, plotTitle):
541
+ """
542
+ A plotly version of plot_pca, that's called by it to do the actual plotting
543
+ """
544
+ fig = go.Figure()
545
+ fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1])}
546
+ fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1])}
547
+ fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'title': 'Principal Component'}
548
+ fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Eigenvalue', 'rangemode': 'tozero', 'showgrid': False}
549
+ fig['layout']['yaxis3'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Cumulative variability', 'rangemode': 'tozero', 'side': 'right', 'overlaying': 'y2'}
550
+ fig['layout'].update(title=plotTitle)
551
+
552
+ # PCA
553
+ if cols is not None:
554
+ colors = itertools.cycle(cols)
555
+ n = len(self.labels)
556
+ data = []
557
+ for i in range(n):
558
+ trace = go.Scatter(x=[Wt[PCs[0] - 1, i]],
559
+ y=[Wt[PCs[1] - 1, i]],
560
+ mode='marker',
561
+ xaxis='x1',
562
+ yaxis='y1',
563
+ name=self.labels[i])
564
+ trace['marker'].update(size=20)
565
+ if cols is not None:
566
+ trace['marker'].update(color=next(colors))
567
+ data.append(trace)
568
+
569
+ # Scree plot
570
+ trace = go.Bar(showlegend=False,
571
+ name='Eigenvalues',
572
+ x=range(1, n + 1),
573
+ y=eigenvalues[:n],
574
+ xaxis='x2',
575
+ yaxis='y2')
576
+ data.append(trace)
577
+
578
+ # Cumulative variability
579
+ trace = go.Scatter(showlegend=False,
580
+ x=range(1, n + 1),
581
+ y=pvar.cumsum()[:n],
582
+ mode='lines+markers',
583
+ name='Cumulative variability',
584
+ xaxis='x2',
585
+ yaxis='y3',
586
+ line={'color': 'red'},
587
+ marker={'symbol': 'circle-open-dot', 'color': 'black'})
588
+ data.append(trace)
589
+
590
+ annos = []
591
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'PCA', 'y': 1.0, 'x': 0.25, 'font': {'size': 16}, 'showarrow': False})
592
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'Scree plot', 'y': 1.0, 'x': 0.75, 'font': {'size': 16}, 'showarrow': False})
593
+
594
+ fig.add_traces(data)
595
+ fig['layout']['annotations'] = annos
596
+ offline.plot(fig, filename=plotFile, auto_open=False)
597
+
598
    def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=None, log1p=False, plotWidth=5, plotHeight=10, cols=None, marks=None):
        """
        Plot the PCA of a matrix

        Returns the matrix of plotted values.

        Parameters
        ----------
        plot_filename : str or None
            Output file; when None no plot is produced and only the
            projections and eigenvalues are returned.
        PCs : list of int
            1-based indices of the two components to display.
        plot_title : str
            Title of the PCA panel ('PCA' when empty).
        image_format : str or None
            Output format; the special value 'plotly' dispatches to
            ``plotly_pca`` instead of matplotlib.
        log1p : bool
            NOTE(review): accepted but never used inside this method —
            confirm whether it should transform the matrix as elsewhere.
        plotWidth, plotHeight : int
            Figure size in inches.
        cols : list or None
            Optional per-sample colors.
        marks : list or None
            Optional per-sample marker symbols.

        Returns
        -------
        tuple
            (Wt, eigenvalues): the per-sample component coordinates and the
            eigenvalues of the decomposition.
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(plotWidth, plotHeight))

        # Filter: in sample-space (transpose) mode, drop zero-variance rows
        # and optionally keep only the ntop most variable rows.
        m = self.matrix
        rvs = m.var(axis=1)
        if self.transpose:
            m = m[np.nonzero(rvs)[0], :]
            rvs = rvs[np.nonzero(rvs)[0]]
            if self.ntop > 0 and m.shape[0] > self.ntop:
                m = m[np.argpartition(rvs, -self.ntop)[-self.ntop:], :]
                rvs = rvs[np.argpartition(rvs, -self.ntop)[-self.ntop:]]

        # log2 (if requested)
        # NOTE(review): this rebinds self.matrix, while 'm' may still refer
        # to the (filtered) pre-log data used below — confirm intended.
        if self.log2:
            self.matrix = np.log2(self.matrix + 0.01)

        # Row center / transpose
        if self.rowCenter and not self.transpose:
            _ = self.matrix.mean(axis=1)
            self.matrix -= _[:, None]
        if self.transpose:
            m = m.T

        # Center and scale each column before the decomposition.
        m2 = (m - np.mean(m, axis=0))
        m2 /= np.std(m2, axis=0, ddof=1)  # Use the unbiased std. dev.

        # SVD
        U, s, Vh = np.linalg.svd(m2, full_matrices=False, compute_uv=True)  # Is full_matrices ever needed?

        # % variance, eigenvalues
        eigenvalues = s**2
        variance = eigenvalues / float(np.max([1, m2.shape[1] - 1]))
        pvar = variance / variance.sum()

        # Weights/projections
        Wt = Vh
        if self.transpose:
            # Use the projected coordinates for the transposed matrix
            Wt = np.dot(m2, Vh.T).T

        if plot_filename is not None:
            # Cap the number of scree-plot bars at the number of eigenvalues.
            n = n_bars = len(self.labels)
            if eigenvalues.size < n:
                n_bars = eigenvalues.size
            markers = itertools.cycle(matplotlib.markers.MarkerStyle.filled_markers)
            if cols is not None:
                colors = itertools.cycle(cols)
            else:
                colors = itertools.cycle(plt.cm.gist_rainbow(np.linspace(0, 1, n)))

            if marks is not None:
                markers = itertools.cycle(marks)

            if image_format == 'plotly':
                self.plotly_pca(plot_filename, Wt, pvar, PCs, eigenvalues, cols, plot_title)
            else:
                # Dotted axes through the origin, drawn behind the points.
                ax1.axhline(y=0, color="black", linestyle="dotted", zorder=1)
                ax1.axvline(x=0, color="black", linestyle="dotted", zorder=2)
                for i in range(n):
                    color = next(colors)
                    marker = next(markers)
                    if isinstance(color, np.ndarray):
                        color = pltcolors.to_hex(color, keep_alpha=True)
                    ax1.scatter(Wt[PCs[0] - 1, i], Wt[PCs[1] - 1, i],
                                marker=marker, color=color, s=150, label=self.labels[i], zorder=i + 3)
                if plot_title == '':
                    ax1.set_title('PCA')
                else:
                    ax1.set_title(plot_title)
                ax1.set_xlabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1]))
                ax1.set_ylabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1]))
                # Legend outside the axes; passed to savefig so it is not clipped.
                lgd = ax1.legend(scatterpoints=1, loc='center left', borderaxespad=0.5,
                                 bbox_to_anchor=(1, 0.5),
                                 prop={'size': 12}, markerscale=0.9)

                # Scree plot
                ind = np.arange(n_bars)  # the x locations for the groups
                width = 0.35  # the width of the bars

                # matplotlib 2.x changed bar alignment, hence the version check
                # (string comparison of versions — works for these prefixes).
                if mpl.__version__ >= "2.0.0":
                    ax2.bar(2 * width + ind, eigenvalues[:n_bars], width * 2)
                else:
                    ax2.bar(width + ind, eigenvalues[:n_bars], width * 2)
                ax2.set_ylabel('Eigenvalue')
                ax2.set_xlabel('Principal Component')
                ax2.set_title('Scree plot')
                ax2.set_xticks(ind + width * 2)
                ax2.set_xticklabels(ind + 1)

                # Cumulative explained variance on a twin y-axis.
                ax3 = ax2.twinx()
                ax3.axhline(y=1, color="black", linestyle="dotted")
                ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "r-")
                ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "wo", markeredgecolor="black")
                ax3.set_ylim([0, 1.05])
                ax3.set_ylabel('Cumulative variability')

                plt.subplots_adjust(top=3.85)
                plt.tight_layout()
                plt.savefig(plot_filename, format=image_format, bbox_extra_artists=(lgd,), bbox_inches='tight')
                plt.close()

        return Wt, eigenvalues
deepTools/source/deeptools/correlation_heatmap.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from matplotlib import use as mplt_use
2
+ mplt_use('Agg')
3
+ from deeptools import cm # noqa: F401
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import scipy.cluster.hierarchy as sch
7
+ from matplotlib import rcParams
8
+ import matplotlib.colors as pltcolors
9
+ import copy
10
+
11
+ rcParams['pdf.fonttype'] = 42
12
+ rcParams['svg.fonttype'] = 'none'
13
+ old_settings = np.seterr(all='ignore')
14
+
15
+
16
def plot_correlation(corr_matrix, labels, plotFileName, vmax=None,
                     vmin=None, colormap='jet', image_format=None,
                     plot_numbers=False, plot_title=''):
    """
    Plot a hierarchically clustered heatmap of a correlation matrix.

    Parameters
    ----------
    corr_matrix : numpy array
        Square matrix of pairwise correlation values.
    labels : list
        One label per row/column of ``corr_matrix``.
    plotFileName : str
        Output image file name.
    vmax, vmin : float or None
        Color-scale limits. Default to 1 and 0 (or -1 when the matrix
        contains negative values).
    colormap : str
        Name of the matplotlib colormap to use.
    image_format : str or None
        Format passed to ``savefig`` (e.g. 'png', 'svg', 'pdf').
    plot_numbers : bool
        If True, write each correlation value inside its heatmap cell.
    plot_title : str
        Optional title placed above the figure.
    """
    num_rows = corr_matrix.shape[0]

    # set a font size according to figure length
    if num_rows < 6:
        font_size = 14
    elif num_rows > 40:
        font_size = 5
    else:
        font_size = int(14 - 0.25 * num_rows)
    rcParams.update({'font.size': font_size})
    # set the minimum and maximum values
    if vmax is None:
        vmax = 1
    if vmin is None:
        vmin = 0 if corr_matrix.min() >= 0 else -1

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=(11, 9.5))
    if plot_title:
        plt.suptitle(plot_title)
    axdendro = fig.add_axes([0.02, 0.12, 0.1, 0.66])
    axdendro.set_axis_off()
    y_var = sch.linkage(corr_matrix, method='complete')
    z_var = sch.dendrogram(y_var, orientation='right',
                           link_color_func=lambda k: 'darkred')
    axdendro.set_xticks([])
    axdendro.set_yticks([])
    cmap = copy.copy(plt.get_cmap(colormap))

    # this line simply makes a new cmap, based on the original
    # colormap that goes from 0.0 to 0.9
    # This is done to avoid colors that
    # are too dark at the end of the range that do not offer
    # a good contrast between the correlation numbers that are
    # plotted on black.
    if plot_numbers:
        cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped",
                                                           cmap(np.linspace(0, 0.9, 10)))

    cmap.set_under((0., 0., 1.))
    # Plot distance matrix, reordered to match the dendrogram leaves.
    axmatrix = fig.add_axes([0.13, 0.1, 0.6, 0.7])
    index = z_var['leaves']
    corr_matrix = corr_matrix[index, :]
    corr_matrix = corr_matrix[:, index]
    img_mat = axmatrix.pcolormesh(corr_matrix,
                                  edgecolors='black',
                                  cmap=cmap,
                                  vmax=vmax,
                                  vmin=vmin)
    axmatrix.set_xlim(0, num_rows)
    axmatrix.set_ylim(0, num_rows)

    axmatrix.yaxis.tick_right()
    axmatrix.set_yticks(np.arange(corr_matrix.shape[0]) + 0.5)
    axmatrix.set_yticklabels(np.array(labels).astype('str')[index])

    # axmatrix.xaxis.set_label_position('top')
    axmatrix.xaxis.set_tick_params(labeltop=True)
    axmatrix.xaxis.set_tick_params(labelbottom=False)
    axmatrix.set_xticks(np.arange(corr_matrix.shape[0]) + 0.5)
    axmatrix.set_xticklabels(np.array(labels).astype('str')[index],
                             rotation=45,
                             ha='left')

    axmatrix.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False)

    axmatrix.tick_params(
        axis='y',
        which='both',
        left=False,
        right=False)

    # axmatrix.set_xticks([])
    # Plot colorbar.
    axcolor = fig.add_axes([0.13, 0.065, 0.6, 0.02])
    cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal')
    cobar.solids.set_edgecolor("face")
    if plot_numbers:
        for row in range(num_rows):
            for col in range(num_rows):
                axmatrix.text(row + 0.5, col + 0.5,
                              "{:.2f}".format(corr_matrix[row, col]),
                              ha='center', va='center')

    fig.savefig(plotFileName, format=image_format)
    # BUG FIX: Figure objects have no close() method; fig.close() raised
    # AttributeError after saving. Close through pyplot instead so the
    # figure is released from pyplot's state.
    plt.close(fig)
deepTools/source/deeptools/countReadsPerBin.py ADDED
@@ -0,0 +1,1033 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ import os
3
+ import time
4
+ import sys
5
+ import multiprocessing
6
+ import numpy as np
7
+
8
+ # deepTools packages
9
+ import deeptools.utilities
10
+ from deeptools import bamHandler
11
+ from deeptools import mapReduce
12
+ from deeptoolsintervals import GTF
13
+ import pyBigWig
14
+
15
+ debug = 0
16
+ old_settings = np.seterr(all='ignore')
17
+
18
+
19
def countReadsInRegions_wrapper(args):
    """
    Unpack a single argument tuple and dispatch it to
    CountReadsPerBin.count_reads_in_region.

    The first element of ``args`` is the CountReadsPerBin instance
    itself (the 'self' value); this indirection is required because the
    multiprocessing module can only hand a single argument object to a
    worker function.
    """
    self_obj = args[0]
    return CountReadsPerBin.count_reads_in_region(self_obj, *args[1:])
29
+
30
+
31
+ class CountReadsPerBin(object):
32
+
33
+ r"""Collects coverage over multiple bam files using multiprocessing
34
+
35
+ This function collects read counts (coverage) from several bam files and returns
36
+ an numpy array with the results. This class uses multiprocessing to compute the coverage.
37
+
38
+ Parameters
39
+ ----------
40
+ bamFilesList : list
41
+ List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']
42
+
43
+ binLength : int
44
+ Length of the window/bin. This value is overruled by ``bedFile`` if present.
45
+
46
+ numberOfSamples : int
47
+ Total number of samples. The genome is divided into ``numberOfSamples``, each
48
+ with a window/bin length equal to ``binLength``. This value is overruled
49
+ by ``stepSize`` in case such value is present and by ``bedFile`` in which
50
+ case the number of samples and bins are defined in the bed file
51
+
52
+ numberOfProcessors : int
53
+ Number of processors to use. Default is 4
54
+
55
+ verbose : bool
56
+ Output messages. Default: False
57
+
58
+ region : str
59
+ Region to limit the computation in the form chrom:start:end.
60
+
61
+ bedFile : list of file_handles.
62
+ Each file handle corresponds to a bed file containing the regions for which to compute the coverage. This option
63
+ overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.
64
+
65
+ blackListFileName : str
66
+ A string containing a BED file with blacklist regions.
67
+
68
+ extendReads : bool, int
69
+
70
+ Whether coverage should be computed for the extended read length (i.e. the region covered
71
+ by the two mates or the regions expected to be covered by single-reads).
72
+ If the value is 'int', then then this is interpreted as the fragment length to extend reads
73
+ that are not paired. For Illumina reads, usual values are around 300.
74
+ This value can be determined using the peak caller MACS2 or can be
75
+ approximated by the fragment lengths computed when preparing the library for sequencing. If the value
76
+ is of the variable is true and not value is given, the fragment size is sampled from the library but
77
+ only if the library is paired-end. Default: False
78
+
79
+
80
+ minMappingQuality : int
81
+ Reads of a mapping quality less than the give value are not considered. Default: None
82
+
83
+ ignoreDuplicates : bool
84
+ Whether read duplicates (same start, end position. If paired-end, same start-end for mates) are
85
+ to be excluded. Default: false
86
+
87
+ chrToSkip: list
88
+ List with names of chromosomes that do not want to be included in the coverage computation.
89
+ This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').
90
+
91
+ stepSize : int
92
+ the positions for which the coverage is computed are defined as follows:
93
+ ``range(start, end, stepSize)``. Thus, a stepSize of 1, will compute
94
+ the coverage at each base pair. If the stepSize is equal to the
95
+ binLength then the coverage is computed for consecutive bins. If seepSize is
96
+ smaller than the binLength, then teh bins will overlap.
97
+
98
+ center_read : bool
99
+ Determines if reads should be centered with respect to the fragment length.
100
+
101
+ samFlag_include : int
102
+ Extracts only those reads having the SAM flag. For example, to get only
103
+ reads that are the first mates a samFlag of 64 could be used. Similarly, the
104
+ samFlag_include can be used to select only reads mapping on the reverse strand
105
+ or to get only properly paired reads.
106
+
107
+ samFlag_exclude : int
108
+ Removes reads that match the SAM flag. For example to get all reads
109
+ that map to the forward strand a samFlag_exlude 16 should be used. Which
110
+ translates into exclude all reads that map to the reverse strand.
111
+
112
+ zerosToNans : bool
113
+ If true, zero values encountered are transformed to Nans. Default false.
114
+
115
+ skipZeroOverZero : bool
116
+ If true, skip bins where all input BAM files have no coverage (only applicable to bamCompare).
117
+
118
+ minFragmentLength : int
119
+ If greater than 0, fragments below this size are excluded.
120
+
121
+ maxFragmentLength : int
122
+ If greater than 0, fragments above this size are excluded.
123
+
124
+ out_file_for_raw_data : str
125
+ File name to save the raw counts computed
126
+
127
+ statsList : list
128
+ For each BAM file in bamFilesList, the associated per-chromosome statistics returned by openBam
129
+
130
+ mappedList : list
131
+ For each BAM file in bamFilesList, the number of mapped reads in the file.
132
+
133
+ bed_and_bin : boolean
134
+ If true AND a bedFile is given, compute coverage of each bin of the given size in each region of bedFile
135
+
136
+ genomeChunkSize : int
137
+ If not None, the length of the genome used for multiprocessing.
138
+
139
+ Returns
140
+ -------
141
+ numpy array
142
+
143
+ Each row correspond to each bin/bed region and each column correspond to each of
144
+ the bamFiles.
145
+
146
+
147
+ Examples
148
+ --------
149
+
150
+ The test data contains reads for 200 bp.
151
+
152
+ >>> test = Tester()
153
+
154
+ The transpose function is used to get a nicer looking output.
155
+ The first line corresponds to the number of reads per bin in bam file 1
156
+
157
+ >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 50, 4)
158
+ >>> np.transpose(c.run())
159
+ array([[0., 0., 1., 1.],
160
+ [0., 1., 1., 2.]])
161
+ """
162
+
163
+ def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1,
164
+ verbose=False, region=None,
165
+ bedFile=None, extendReads=False,
166
+ genomeChunkSize=None,
167
+ blackListFileName=None,
168
+ minMappingQuality=None,
169
+ ignoreDuplicates=False,
170
+ chrsToSkip=[],
171
+ stepSize=None,
172
+ center_read=False,
173
+ samFlag_include=None,
174
+ samFlag_exclude=None,
175
+ zerosToNans=False,
176
+ skipZeroOverZero=False,
177
+ smoothLength=0,
178
+ minFragmentLength=0,
179
+ maxFragmentLength=0,
180
+ out_file_for_raw_data=None,
181
+ bed_and_bin=False,
182
+ statsList=[],
183
+ mappedList=[]):
184
+
185
+ self.bamFilesList = bamFilesList
186
+ self.binLength = binLength
187
+ self.numberOfSamples = numberOfSamples
188
+ self.blackListFileName = blackListFileName
189
+ self.statsList = statsList
190
+ self.mappedList = mappedList
191
+ self.skipZeroOverZero = skipZeroOverZero
192
+ self.bed_and_bin = bed_and_bin
193
+ self.genomeChunkSize = genomeChunkSize
194
+
195
+ if extendReads and len(bamFilesList):
196
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
197
+ frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
198
+ return_lengths=False,
199
+ blackListFileName=blackListFileName,
200
+ numberOfProcessors=numberOfProcessors,
201
+ verbose=verbose)
202
+ if extendReads is True:
203
+ # try to guess fragment length if the bam file contains paired end reads
204
+ if frag_len_dict:
205
+ self.defaultFragmentLength = int(frag_len_dict['median'])
206
+ else:
207
+ exit("*ERROR*: library is not paired-end. Please provide an extension length.")
208
+ if verbose:
209
+ print(("Fragment length based on paired en data "
210
+ "estimated to be {}".format(frag_len_dict['median'])))
211
+
212
+ elif extendReads < read_len_dict['median']:
213
+ sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
214
+ "Reads will not be extended.\n".format(int(read_len_dict['median'])))
215
+ self.defaultFragmentLength = 'read length'
216
+
217
+ elif extendReads > 2000:
218
+ exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads))
219
+ else:
220
+ self.defaultFragmentLength = int(extendReads)
221
+
222
+ else:
223
+ self.defaultFragmentLength = 'read length'
224
+
225
+ self.numberOfProcessors = numberOfProcessors
226
+ self.verbose = verbose
227
+ self.region = region
228
+ self.bedFile = bedFile
229
+ self.minMappingQuality = minMappingQuality
230
+ self.ignoreDuplicates = ignoreDuplicates
231
+ self.chrsToSkip = chrsToSkip
232
+ self.stepSize = stepSize
233
+ self.center_read = center_read
234
+ self.samFlag_include = samFlag_include
235
+ self.samFlag_exclude = samFlag_exclude
236
+ self.minFragmentLength = minFragmentLength
237
+ self.maxFragmentLength = maxFragmentLength
238
+ self.zerosToNans = zerosToNans
239
+ self.smoothLength = smoothLength
240
+
241
+ if out_file_for_raw_data:
242
+ self.save_data = True
243
+ self.out_file_for_raw_data = out_file_for_raw_data
244
+ else:
245
+ self.save_data = False
246
+ self.out_file_for_raw_data = None
247
+
248
+ # check that wither numberOfSamples or stepSize are set
249
+ if numberOfSamples is None and stepSize is None and bedFile is None:
250
+ raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")
251
+
252
+ if self.defaultFragmentLength != 'read length':
253
+ self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
254
+ else:
255
+ self.maxPairedFragmentLength = 1000
256
+ if self.maxFragmentLength > 0:
257
+ self.maxPairedFragmentLength = self.maxFragmentLength
258
+
259
+ if len(self.mappedList) == 0:
260
+ try:
261
+ for fname in self.bamFilesList:
262
+ bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors)
263
+ self.mappedList.append(mapped)
264
+ self.statsList.append(stats)
265
+ bam.close()
266
+ except:
267
+ self.mappedList = []
268
+ self.statsList = []
269
+
270
+ def get_chunk_length(self, bamFilesHandles, genomeSize, chromSizes, chrLengths):
271
+ # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
272
+ # workers for analysis. If too short, too much time is spent loading the files
273
+ # if too long, some processors end up free.
274
+ # the following values are empirical
275
+ if self.stepSize is None:
276
+ if self.region is None:
277
+ self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
278
+ else:
279
+ # compute the step size, based on the number of samples
280
+ # and the length of the region studied
281
+ (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
282
+ self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)
283
+
284
+ # number of samples is better if large
285
+ if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
286
+ min_num_of_samples = int(genomeSize / np.mean(chrLengths))
287
+ raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))
288
+
289
+ max_mapped = 0
290
+ if len(self.mappedList) > 0:
291
+ max_mapped = max(self.mappedList)
292
+
293
+ # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
294
+ if max_mapped == 0:
295
+ chunkSize = 10000 * self.binLength
296
+ self.stepSize = self.binLength
297
+ else:
298
+ reads_per_bp = float(max_mapped) / genomeSize
299
+ chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandles)))
300
+
301
+ # Ensure that chunkSize is always at least self.stepSize
302
+ if chunkSize < self.stepSize:
303
+ chunkSize = self.stepSize
304
+
305
+ # Ensure that chunkSize is always at least self.binLength
306
+ if self.binLength and chunkSize < self.binLength:
307
+ chunkSize = self.binLength
308
+
309
+ return chunkSize
310
+
311
    def run(self, allArgs=None):
        """
        Open every input file, partition the genome into chunks and collect
        the per-bin read counts via mapReduce.

        Parameters
        ----------
        allArgs : argparse.Namespace or None
            Full command-line arguments; only GTF-related options are
            extracted from it (via ``deeptools.utilities.gtfOptions``).

        Returns
        -------
        numpy array
            One row per bin/region and one column per input file. Exits the
            program with an error message when no coverage could be computed.
        """
        # Inputs may be BAM or bigWig: try BAM first, fall back to bigWig.
        bamFilesHandles = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                y = pyBigWig.open(x)
            bamFilesHandles.append(y)

        chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

        # skip chromosome in the list. This is usually for the
        # X chromosome which may have either one copy in a male sample
        # or a mixture of male/female and is unreliable.
        # Also the skip may contain heterochromatic regions and
        # mitochondrial DNA
        if len(self.chrsToSkip):
            chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromsizes))

        genomeSize = sum(chrLengths)

        # Chunking only applies to genome-wide runs; a BED file defines its
        # own regions.
        chunkSize = None
        if self.bedFile is None:
            if self.genomeChunkSize is None:
                chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
            else:
                chunkSize = self.genomeChunkSize

        # Handles were only needed for chromosome info; workers reopen files.
        [bam_h.close() for bam_h in bamFilesHandles]

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromsizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors,
                                       transcriptID=transcriptID,
                                       exonID=exonID,
                                       keepExons=keepExons,
                                       transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        # Concatenation fails (ValueError) when every worker returned empty
        # results, i.e. no coverage could be computed at all.
        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
399
+
400
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of list of tuples of the form (start, end)
            corresponding to bed regions to be processed.
            If not bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array that as rows each bin
            and as columns each bam file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[0., 0.],
               [0., 1.],
               [1., 1.],
               [1., 2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None and bed_regions_list is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        # Each worker reopens the input files; BAM first, bigWig as fallback.
        bam_handles = []
        for fname in self.bamFilesList:
            try:
                bam_handles.append(bamHandler.openBam(fname))
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                bam_handles.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        # Build the list of intervals to quantify: either the BED regions
        # (optionally re-binned) or regularly spaced genome-wide bins.
        transcriptsToConsider = []
        if bed_regions_list is not None:
            if self.bed_and_bin:
                transcriptsToConsider.append([(x[1][0][0], x[1][0][1], self.binLength) for x in bed_regions_list])
            else:
                transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                # Non-overlapping bins: a single tiled interval suffices.
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        # Per-region totals when counting whole BED regions; per-bin
        # coverage vectors otherwise.
        for bam in bam_handles:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None and not self.bed_and_bin:
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        # Fortran order: counts were appended file-by-file, so reshape turns
        # them into one column per input file.
        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

        if self.save_data:
            # Write one BED-like line per region (or per bin when binning).
            idx = 0
            for i, trans in enumerate(transcriptsToConsider):
                if len(trans[0]) != 3:
                    starts = ",".join([str(x[0]) for x in trans])
                    ends = ",".join([str(x[1]) for x in trans])
                    _file.write("\t".join([chrom, starts, ends]) + "\t")
                    _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
                else:
                    for exon in trans:
                        for startPos in range(exon[0], exon[1], exon[2]):
                            if idx >= subnum_reads_per_bin.shape[0]:
                                # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                # Counts there are added to the bin before them, but range() will still try to include them.
                                break
                            _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, min(startPos + exon[2], exon[1])))
                            _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                            idx += 1
            _file.close()

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
538
+
539
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested.

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following example a paired read is extended to the fragment length which is 100
        The first mate starts at 5000000 and the second at 5000064. Each mate is
        extended to the fragment length *independently*
        At position 500090-500100 one fragment of length 100 overlap, and after position 5000101
        there should be zero reads.

        >>> c.zerosToNans = True
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000090, 5000100), (5000100, 5000110)])
        array([ 1., nan])

        In the following case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([1., 2., 2.])


        """
        # Default to the standard fragment-resolution function unless the
        # caller supplied a replacement.
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        # Regions given as 3-tuples (start, end, tileSize) are themselves
        # subdivided into fixed-size bins; recompute the total bin count.
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
                if (reg[1] - reg[0]) % reg[2] > 0:
                    # A trailing partial bin still counts as a bin.
                    nbins += 1
        coverages = np.zeros(nbins, dtype='float64')

        # When counting at read length, no fetch-window extension is needed;
        # otherwise widen the window by the maximum fragment length so reads
        # starting upstream but extending into the region are not missed.
        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
                if (reg[1] - reg[0]) % tileSize > 0:
                    # Don't eliminate small bins! Issue 887
                    nRegBins += 1
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            if chrom not in bamHandle.references:
                raise NameError("chromosome {} not found in bam file".format(chrom))

            # prev_pos / lpos implement duplicate detection among reads
            # sharing the same start coordinate (see ignoreDuplicates below).
            prev_pos = set()
            lpos = None
            # of previous processed read pair
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = deeptools.utilities.getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    # Clip the fragment to the bounds of the coverage vector.
                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if last_eIdx is not None:
                        # Do not increment bins already covered by an earlier
                        # block of the same read.
                        sIdx = max(last_eIdx, sIdx)
                    if sIdx >= eIdx:
                        continue
                    sIdx = int(sIdx)
                    eIdx = int(eIdx)
                    coverages[sIdx:eIdx] += 1
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
722
+
723
    def getReadLength(self, read):
        # Length of the alignment object; for pysam reads len() is defined
        # by the object itself — TODO confirm this matches query length.
        return len(read)
725
+
726
+ @staticmethod
727
+ def is_proper_pair(read, maxPairedFragmentLength):
728
+ """
729
+ Checks if a read is proper pair meaning that both mates are facing each other and are in
730
+ the same chromosome and are not to far away. The sam flag for proper pair can not
731
+ always be trusted. Note that if the fragment size is > maxPairedFragmentLength (~2kb
732
+ usually) that False will be returned.
733
+ :return: bool
734
+
735
+ >>> import pysam
736
+ >>> import os
737
+ >>> from deeptools.countReadsPerBin import CountReadsPerBin as cr
738
+ >>> root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
739
+ >>> bam = pysam.AlignmentFile("{}/test_proper_pair_filtering.bam".format(root))
740
+ >>> iter = bam.fetch()
741
+ >>> read = next(iter)
742
+ >>> cr.is_proper_pair(read, 1000) # "keep" read
743
+ True
744
+ >>> cr.is_proper_pair(read, 200) # "keep" read, but maxPairedFragmentLength is too short
745
+ False
746
+ >>> read = next(iter)
747
+ >>> cr.is_proper_pair(read, 1000) # "improper pair"
748
+ False
749
+ >>> read = next(iter)
750
+ >>> cr.is_proper_pair(read, 1000) # "mismatch chr"
751
+ False
752
+ >>> read = next(iter)
753
+ >>> cr.is_proper_pair(read, 1000) # "same orientation1"
754
+ False
755
+ >>> read = next(iter)
756
+ >>> cr.is_proper_pair(read, 1000) # "same orientation2"
757
+ False
758
+ >>> read = next(iter)
759
+ >>> cr.is_proper_pair(read, 1000) # "rev first"
760
+ False
761
+ >>> read = next(iter)
762
+ >>> cr.is_proper_pair(read, 1000) # "rev first OK"
763
+ True
764
+ >>> read = next(iter)
765
+ >>> cr.is_proper_pair(read, 1000) # "for first"
766
+ False
767
+ >>> read = next(iter)
768
+ >>> cr.is_proper_pair(read, 1000) # "for first"
769
+ True
770
+ """
771
+ if not read.is_proper_pair:
772
+ return False
773
+ if read.reference_id != read.next_reference_id:
774
+ return False
775
+ if abs(read.template_length) > maxPairedFragmentLength:
776
+ return False
777
+ # check that the mates face each other (inward)
778
+ if read.is_reverse is read.mate_is_reverse:
779
+ return False
780
+ if read.is_reverse:
781
+ if read.reference_start >= read.next_reference_start:
782
+ return True
783
+ else:
784
+ if read.reference_start <= read.next_reference_start:
785
+ return True
786
+ return False
787
+
788
    def get_fragment_from_read(self, read):
        """Get read start and end position of a read.
        If given, the reads are extended as follows:
        If reads are paired end, each read mate is extended to match
        the fragment length, otherwise, a default fragment length
        is used. If reads are split (give by the CIGAR string) then
        the multiple positions of the read are returned.
        When reads are extended the cigar information is
        skipped.

        Parameters
        ----------
        read: pysam object.

        The following values are defined (for forward reads)::


                 |-- -- read.tlen -- --|
                 |-- read.alen --|
            -----|===============>------------<==============|----
                 |               |            |
            read.reference_start
                            read.reference_end read.pnext

            and for reverse reads


                 |-- -- read.tlen -- --|
                 |-- read.alen --|
            -----|===============>-----------<===============|----
                 |               |           |
            read.pnext   read.reference_start read.reference_end

        this is a sketch of a pair-end reads

        The function returns the fragment start and end, either
        using the paired end information (if available) or
        extending the read in the appropriate direction if this
        is single-end.

        Parameters
        ----------
        read : pysam read object


        Returns
        -------
        list of tuples
            [(fragment start, fragment end)]


        >>> test = Tester()
        >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True)
        >>> c.defaultFragmentLength=100
        >>> c.get_fragment_from_read(test.getRead("paired-forward"))
        [(5000000, 5000100)]
        >>> c.get_fragment_from_read(test.getRead("paired-reverse"))
        [(5000000, 5000100)]
        >>> c.defaultFragmentLength = 200
        >>> c.get_fragment_from_read(test.getRead("single-forward"))
        [(5001491, 5001691)]
        >>> c.get_fragment_from_read(test.getRead("single-reverse"))
        [(5001536, 5001736)]
        >>> c.defaultFragmentLength = 'read length'
        >>> c.get_fragment_from_read(test.getRead("single-forward"))
        [(5001491, 5001527)]
        >>> c.defaultFragmentLength = 'read length'
        >>> c.extendReads = False
        >>> c.get_fragment_from_read(test.getRead("paired-forward"))
        [(5000000, 5000036)]

        Tests for read centering.

        >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True, center_read=True)
        >>> c.defaultFragmentLength = 100
        >>> assert c.get_fragment_from_read(test.getRead("paired-forward")) == [(5000032, 5000068)]
        >>> c.defaultFragmentLength = 200
        >>> assert c.get_fragment_from_read(test.getRead("single-reverse")) == [(5001618, 5001654)]
        """
        # if no extension is needed, use pysam get_blocks
        # to identify start and end reference positions.
        # get_blocks return a list of start and end positions
        # based on the CIGAR if skipped regions are found.
        # E.g for a cigar of 40M260N22M
        # get blocks return two elements for the first 40 matches
        # and the for the last 22 matches.
        if self.defaultFragmentLength == 'read length':
            return read.get_blocks()

        else:
            # Prefer the mate information when the pair looks genuinely
            # proper; otherwise fall back to extending by the default length.
            if self.is_proper_pair(read, self.maxPairedFragmentLength):
                if read.is_reverse:
                    fragmentStart = read.next_reference_start
                    fragmentEnd = read.reference_end
                else:
                    fragmentStart = read.reference_start
                    # the end of the fragment is defined as
                    # the start of the forward read plus the insert length
                    fragmentEnd = read.reference_start + abs(read.template_length)

            # Extend using the default fragment length
            else:
                if read.is_reverse:
                    fragmentStart = read.reference_end - self.defaultFragmentLength
                    fragmentEnd = read.reference_end
                else:
                    fragmentStart = read.reference_start
                    fragmentEnd = read.reference_start + self.defaultFragmentLength

            if self.center_read:
                # Keep the read's own length but re-center it on the middle
                # of the (possibly extended) fragment.
                fragmentCenter = fragmentEnd - (fragmentEnd - fragmentStart) / 2
                fragmentStart = int(fragmentCenter - read.infer_query_length(always=False) / 2)
                fragmentEnd = fragmentStart + read.infer_query_length(always=False)

            assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \
                "end for read {}".format(read.query_name)
            return [(fragmentStart, fragmentEnd)]
905
+
906
+ def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition):
907
+ """
908
+ Given a tile index position and a tile size (length), return the a new indices
909
+ over a larger range, called the smoothRange.
910
+ This region is centered in the tileIndex an spans on both sizes
911
+ to cover the smoothRange. The smoothRange is trimmed in case it is less
912
+ than zero or greater than maxPosition ::
913
+
914
+
915
+ ---------------|==================|------------------
916
+ tileStart
917
+ |--------------------------------------|
918
+ | <-- smoothRange --> |
919
+ |
920
+ tileStart - (smoothRange-tileSize)/2
921
+
922
+ Test for a smooth range that spans 3 tiles.
923
+
924
+ Examples
925
+ --------
926
+
927
+ >>> c = CountReadsPerBin([], 1, 1, 1, 0)
928
+ >>> c.getSmoothRange(5, 1, 3, 10)
929
+ (4, 7)
930
+
931
+ Test smooth range truncated on start.
932
+
933
+ >>> c.getSmoothRange(0, 10, 30, 200)
934
+ (0, 2)
935
+
936
+ Test smooth range truncated on start.
937
+
938
+ >>> c.getSmoothRange(1, 10, 30, 4)
939
+ (0, 3)
940
+
941
+ Test smooth range truncated on end.
942
+
943
+ >>> c.getSmoothRange(5, 1, 3, 5)
944
+ (4, 5)
945
+
946
+ Test smooth range not multiple of tileSize.
947
+
948
+ >>> c.getSmoothRange(5, 10, 24, 10)
949
+ (4, 6)
950
+ """
951
+ smoothTiles = int(smoothRange / tileSize)
952
+ if smoothTiles == 1:
953
+ return (tileIndex, tileIndex + 1)
954
+
955
+ smoothTilesSide = float(smoothTiles - 1) / 2
956
+ smoothTilesLeft = int(np.ceil(smoothTilesSide))
957
+ smoothTilesRight = int(np.floor(smoothTilesSide)) + 1
958
+
959
+ indexStart = max(tileIndex - smoothTilesLeft, 0)
960
+ indexEnd = min(maxPosition, tileIndex + smoothTilesRight)
961
+ return (indexStart, indexEnd)
962
+
963
+
964
def remove_row_of_zeros(matrix):
    """Return *matrix* without the rows whose entries are all zero.

    NaNs are treated as zeros for the purpose of this test, so rows made up
    entirely of zeros and/or NaNs are dropped as well.
    """
    sanitized = np.nan_to_num(matrix)
    keep_mask = sanitized.sum(1) != 0
    return matrix[keep_mask, :]
969
+
970
+
971
def estimateSizeFactors(m):
    """
    Compute per-sample size factors with the DESeq2 median-of-ratios method
    and return their inverses (the inverse is what bamCoverage expects).

    m : a numpy ndarray, one row per feature and one column per sample.

    >>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4)
    >>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4)
    """
    # Per-row geometric mean in log space, computed BEFORE masking so that
    # rows containing zeros produce -inf and are excluded just below.
    log_geo_means = np.sum(np.log(m), axis=1) / m.shape[1]
    # Mask non-positive counts and the -inf geometric means.
    counts = np.ma.masked_where(m <= 0, m)
    log_geo_means = np.ma.masked_where(np.isinf(log_geo_means), log_geo_means)
    # Median of the per-sample log ratios, exponentiated back (DESeq2).
    size_factors = np.exp(np.ma.median((np.log(counts).T - log_geo_means).T, axis=0))
    return 1. / size_factors
992
+
993
+
994
class Tester(object):
    """Fixture pointing at the test BAM files bundled with the package.

    The two single-end files cover 200 bp with the following read layout::

        0                                  100                        200
        |------------------------------------------------------------|
        A                                  ===============
                                                          ===============

        B ===============                              ===============
                                             ===============
                                                          ===============

    ``bamFile_PE`` additionally provides paired-end reads on chr2.
    """

    def __init__(self):
        # Location of the test data shipped alongside this module.
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        # self.root = "./test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        self.bamFile_PE = self.root + "test_paired2.bam"
        self.chrom = '3R'
        global debug
        debug = 0

    def getRead(self, readType):
        """Return one alignment of the requested kind from the PE test file."""
        fetch_windows = {
            'paired-reverse': (5000081, 5000082),
            'single-forward': (5001491, 5001492),
            'single-reverse': (5001700, 5001701),
        }
        # Any unknown type falls back to a forward paired read.
        begin, stop = fetch_windows.get(readType, (5000027, 5000028))
        bam = bamHandler.openBam(self.bamFile_PE)
        reads = [r for r in bam.fetch('chr2', begin, stop)]
        return reads[0]
deepTools/source/deeptools/deeptools_list_tools.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ from importlib.metadata import version
7
+
8
+
9
def parse_arguments(args=None):
    """Build the top-level parser that prints the deepTools tool listing,
    citation information and the package version.

    Parameters
    ----------
    args : unused; kept for signature symmetry with the other parsers.

    Returns
    -------
    argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
deepTools is a suite of python tools particularly developed for the efficient analysis of
high-throughput sequencing data, such as ChIP-seq, RNA-seq or MNase-seq.

Each tool should be called by its own name as in the following example:

 $ bamCoverage -b reads.bam -o coverage.bw

If you find deepTools useful for your research please cite as:

Ramírez, Fidel, Devon P. Ryan, Björn Grüning, Vivek Bhardwaj, Fabian Kilpert,
Andreas S. Richter, Steffen Heyne, Friederike Dündar,
and Thomas Manke. 2016. "deepTools2: A next Generation Web Server for Deep-Sequencing
Data Analysis." Nucleic Acids Research, April. doi:10.1093/nar/gkw257.



[ Tools for BAM and bigWig file processing ]
    multiBamSummary         compute read coverages over bam files. Output used for plotCorrelation or plotPCA
    multiBigwigSummary      extract scores from bigwig files. Output used for plotCorrelation or plotPCA
    correctGCBias           corrects GC bias from bam file. Don't use it with ChIP data
    bamCoverage             computes read coverage per bins or regions
    bamCompare              computes log2 ratio and other operations of read coverage of two samples per bins or regions
    bigwigCompare           computes log2 ratio and other operations from bigwig scores of two samples per bins or regions
    bigwigAverage           computes average from bigwig scores of multiple samples per bins or regions
    computeMatrix           prepares the data from bigwig scores for plotting with plotHeatmap or plotProfile
    alignmentSieve          filters BAM alignments according to specified parameters, optionally producing a BEDPE file


[ Tools for QC ]
    plotCorrelation         plots heatmaps or scatterplots of data correlation
    plotPCA                 plots PCA
    plotFingerprint         plots the distribution of enriched regions
    bamPEFragmentSize       returns the read length and paired-end distance from a bam file
    computeGCBias           computes and plots the GC bias of a sample
    plotCoverage            plots a histogram of read coverage
    estimateReadFiltering   estimates the number of reads that will be filtered from a BAM file or files given certain criteria


[Heatmaps and summary plots]
    plotHeatmap             plots one or multiple heatmaps of user selected regions over different genomic scores
    plotProfile             plots the average profile of user selected regions over different genomic scores
    plotEnrichment          plots the read/fragment coverage of one or more sets of regions

[Miscellaneous]
    computeMatrixOperations Modifies the output of computeMatrix in a variety of ways.


For more information visit: http://deeptools.readthedocs.org
""")

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    return parser
67
+
68
+
69
def process_args(args=None):
    """Parse and return the command-line arguments for the tool listing CLI."""
    return parse_arguments().parse_args(args)
73
+
74
+
75
def main(args=None):
    """Entry point: with no explicit args and an empty command line, show help."""
    invoked_bare = len(sys.argv) == 1
    if args is None and invoked_bare:
        args = ["--help"]
    process_args(args)
deepTools/source/deeptools/estimateReadFiltering.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import argparse
3
+ import sys
4
+
5
+ from deeptools import parserCommon, bamHandler, utilities
6
+ from deeptools.mapReduce import mapReduce
7
+ from deeptools.utilities import smartLabels
8
+ from importlib.metadata import version
9
+
10
+
11
def parseArguments():
    """Build the argument parser for estimateReadFiltering.

    Returns
    -------
    argparse.ArgumentParser
        Parser with required BAM inputs plus general and filtering options.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
This tool estimates the number of reads that would be filtered given a set of
settings and prints this to the terminal. Further, it tracks the number of singleton reads. The following metrics will always be tracked regardless of what you specify (the order output also matches this):

 * Total reads (including unmapped)
 * Mapped reads
 * Reads in blacklisted regions (--blackListFileName)

The following metrics are estimated according to the --binSize and --distanceBetweenBins parameters
 * Estimated mapped reads filtered (the total number of mapped reads filtered for any reason)
 * Alignments with a below threshold MAPQ (--minMappingQuality)
 * Alignments with at least one missing flag (--samFlagInclude)
 * Alignments with undesirable flags (--samFlagExclude)
 * Duplicates determined by deepTools (--ignoreDuplicates)
 * Duplicates marked externally (e.g., by picard)
 * Singletons (paired-end reads with only one mate aligning)
 * Wrong strand (due to --filterRNAstrand)

The sum of these may be more than the total number of reads. Note that alignments are sampled from bins of size --binSize spaced --distanceBetweenBins apart.
""",
        usage='estimateReadFiltering -b sample1.bam sample2.bam\n'
        'help: estimateReadFiltering -h / estimateReadFiltering --help'
    )

    required = parser.add_argument_group('Required arguments')
    required.add_argument('--bamfiles', '-b',
                          metavar='FILE1 FILE2',
                          help='List of indexed bam files separated by spaces.',
                          nargs='+',
                          required=True)

    general = parser.add_argument_group('General arguments')

    general.add_argument('--outFile', '-o',
                         type=parserCommon.writableFile,
                         help='The file to write results to. By default, results are printed to the console')

    general.add_argument('--sampleLabels',
                         help='Labels for the samples. The '
                         'default is to use the file name of the '
                         'sample. The sample labels should be separated '
                         'by spaces and quoted if a label itself'
                         'contains a space E.g. --sampleLabels label-1 "label 2" ',
                         nargs='+')

    general.add_argument('--smartLabels',
                         action='store_true',
                         help='Instead of manually specifying labels for the input '
                         'BAM files, this causes deepTools to use the '
                         'file name after removing the path and extension.')

    general.add_argument('--binSize', '-bs',
                         metavar='INT',
                         help='Length in bases of the window used to sample the genome. (Default: %(default)s)',
                         default=1000000,
                         type=int)

    general.add_argument('--distanceBetweenBins', '-n',
                         metavar='INT',
                         help='To reduce the computation time, not every possible genomic '
                         'bin is sampled. This option allows you to set the distance '
                         'between bins actually sampled from. Larger numbers are sufficient '
                         'for high coverage samples, while smaller values are useful for '
                         'lower coverage samples. Note that if you specify a value that '
                         'results in too few (<1000) reads sampled, the value will be '
                         'decreased. (Default: %(default)s)',
                         default=10000,
                         type=int)

    general.add_argument('--numberOfProcessors', '-p',
                         help='Number of processors to use. Type "max/2" to '
                         'use half the maximum number of processors or "max" '
                         'to use all available processors. (Default: %(default)s)',
                         metavar="INT",
                         type=parserCommon.numberOfProcessors,
                         default=1,
                         required=False)

    general.add_argument('--verbose', '-v',
                         help='Set to see processing messages.',
                         action='store_true')

    general.add_argument('--version', action='version',
                         version='%(prog)s {}'.format(version('deeptools')))

    filtering = parser.add_argument_group('Optional arguments')

    filtering.add_argument('--filterRNAstrand',
                           help='Selects RNA-seq reads (single-end or paired-end) in '
                           'the given strand. (Default: %(default)s)',
                           choices=['forward', 'reverse'],
                           default=None)

    filtering.add_argument('--ignoreDuplicates',
                           help='If set, reads that have the same orientation '
                           'and start position will be considered only '
                           'once. If reads are paired, the mate\'s position '
                           'also has to coincide to ignore a read.',
                           action='store_true')

    filtering.add_argument('--minMappingQuality',
                           metavar='INT',
                           help='If set, only reads that have a mapping '
                           'quality score of at least this are '
                           'considered.',
                           type=int)

    filtering.add_argument('--samFlagInclude',
                           help='Include reads based on the SAM flag. For example, '
                           'to get only reads that are the first mate, use a flag of 64. '
                           'This is useful to count properly paired reads only once, '
                           'as otherwise the second mate will be also considered for the '
                           'coverage. (Default: %(default)s)',
                           metavar='INT',
                           default=None,
                           type=int,
                           required=False)

    filtering.add_argument('--samFlagExclude',
                           help='Exclude reads based on the SAM flag. For example, '
                           'to get only reads that map to the forward strand, use '
                           '--samFlagExclude 16, where 16 is the SAM flag for reads '
                           'that map to the reverse strand. (Default: %(default)s)',
                           metavar='INT',
                           default=None,
                           type=int,
                           required=False)

    filtering.add_argument('--blackListFileName', '-bl',
                           help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.",
                           metavar="BED file",
                           nargs="+",
                           required=False)

    return parser
149
+
150
+
151
def getFiltered_worker(arglist):
    """Count, for a single genomic chunk, how many alignments each filter
    would remove from each BAM file.

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args)`` — the chunk coordinates plus the parsed
        command-line namespace (as produced by parseArguments).

    Returns
    -------
    list of tuples, one per BAM file, each being
        (total, nFiltered, minMapq, samFlagInclude, samFlagExclude,
         internalDupes, externalDupes, singletons, filterRNAstrand)

    Note that a single read may trip several filters, so the per-filter
    counts can sum to more than ``nFiltered``.
    """
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        # Translate the chunk's chromosome name into this file's naming
        # scheme (e.g. "chr1" vs "1").
        chromUse = utilities.mungeChromosome(chrom, fh.references)
        # prev_pos / lpos support deepTools-internal duplicate detection
        # among reads sharing a start coordinate.
        prev_pos = set()
        lpos = None

        # Per-filter counters for this BAM file.
        minMapq = 0
        samFlagInclude = 0
        samFlagExclude = 0
        internalDupes = 0
        externalDupes = 0
        singletons = 0
        filterRNAstrand = 0
        nFiltered = 0
        total = 0  # This is only used to estimate the percentage affected
        for read in fh.fetch(chromUse, start, end):
            filtered = 0
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered = 1
                minMapq += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered = 1
                samFlagInclude += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered = 1
                samFlagExclude += 1
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if read.tlen >= 0:
                    s = read.pos
                    e = s + read.tlen
                else:
                    s = read.pnext
                    e = s - read.tlen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered = 1
                    internalDupes += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            if read.is_duplicate:
                filtered = 1
                externalDupes += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered = 1
                singletons += 1

            # filterRNAstrand
            if args.filterRNAstrand:
                if read.is_paired:
                    # Flag masks test mate number + strand combinations;
                    # e.g. 144 == (reverse | second-in-pair).
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                else:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1

            total += 1
            nFiltered += filtered
        fh.close()

        # Append a tuple to the output
        tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude, internalDupes, externalDupes, singletons, filterRNAstrand)
        o.append(tup)
    return o
256
+
257
+
258
def _extrapolated_filter_count(count, sampled_total, mapped, baseline=0.0):
    """Extrapolate a sampled per-filter count to the whole BAM file.

    ``count`` alignments out of ``sampled_total`` sampled ones matched the
    filter; scale that fraction up to ``mapped`` alignments, add ``baseline``
    (used for the blacklisted-read count), round to one decimal and cap at
    ``mapped``. Returns 0.0 when nothing was sampled.
    """
    metric = 0.0
    if sampled_total > 0:
        metric = baseline + float(count) / float(sampled_total) * mapped
    return min(round(metric, 1), mapped)


def main(args=None):
    """Estimate, per BAM file, how many alignments each filter would remove.

    Samples the genome via mapReduce/getFiltered_worker and writes one
    tab-separated row per BAM file (to --outFile or stdout) with the total,
    mapped and blacklisted counts plus the extrapolated effect of each
    filtering option. Returns 0 on success.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    of = sys.stdout if args.outFile is None else open(args.outFile, "w")

    # openBam(returnStats=True) yields (handle, mapped, unmapped, stats)
    bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = [utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors)
                       for bh in bhs]
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [m + u for m, u in zip(mapped, unmappedList)]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for bh in bhs:
        bh.close()

    # Get the remaining metrics by sampling the genome
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # Accumulate the 9 per-chunk counters for each BAM file; the column
    # order matches the tuple produced by getFiltered_worker.
    nCols = 9
    counters = [[0] * len(args.bamfiles) for _ in range(nCols)]
    for chunk in res:
        for idx, r in enumerate(chunk):
            for col in range(nCols):
                counters[col][idx] += r[col]
    (totals, nFiltered, MAPQs, flagIncludes, flagExcludes,
     internalDupes, externalDupes, singletons, rnaStrand) = counters

    # Print some output
    of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
    for idx, fname in enumerate(args.bamfiles):
        of.write(args.sampleLabels[idx] if args.sampleLabels else fname)
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))
        # overall filtered estimate additionally counts the blacklisted reads
        of.write("\t{}".format(_extrapolated_filter_count(nFiltered[idx], totals[idx],
                                                          mapped[idx], baseline=blacklisted[idx])))
        # one column per individual filter, in the header's order
        for counts in (MAPQs, flagIncludes, flagExcludes, internalDupes,
                       externalDupes, singletons, rnaStrand):
            of.write("\t{}".format(_extrapolated_filter_count(counts[idx], totals[idx], mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
deepTools/source/deeptools/estimateScaleFactor.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+
7
+ from deeptools.SES_scaleFactor import estimateScaleFactor
8
+ from deeptools.parserCommon import numberOfProcessors
9
+ from importlib.metadata import version
10
+ debug = 0
11
+
12
+
13
def parseArguments(args=None):
    """Build the estimateScaleFactor command-line parser and parse ``args``.

    Returns the parsed :class:`argparse.Namespace`. As a post-processing
    step, ``ignoreForNormalization`` is converted from a comma-separated
    string into a list of stripped chromosome names (an empty list when the
    option was not given).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Given two BAM files, this estimates scaling factors '
        '(bigger to smaller).',
        usage='estimateScaleFactor -b sample1.bam sample2.bam\n'
        'help: estimateScaleFactor -h / estimateScaleFactor --help'
    )

    # define the arguments
    parser.add_argument('--bamfiles', '-b',
                        metavar='list of bam files',
                        help='List of indexed BAM files, space delineated',
                        nargs='+',
                        required=True)

    parser.add_argument('--ignoreForNormalization', '-ignore',
                        help='A comma-separated list of chromosome names, '
                        'limited by quotes, '
                        'containing those '
                        'chromosomes that should be excluded '
                        'during normalization computations. For example, '
                        '--ignoreForNormalization "chrX, chrM" ')

    parser.add_argument('--sampleWindowLength', '-l',
                        help='Length in bases for a window used to '
                        'sample the genome and compute the size or scaling '
                        'factors',
                        default=1000,
                        type=int)

    parser.add_argument('--numberOfSamples', '-n',
                        help='Number of samplings taken from the genome '
                        'to compute the scaling factors',
                        default=100000,
                        type=int)

    parser.add_argument('--normalizationLength', '-nl',
                        help='By default, data is normalized to 1 '
                        'fragment per 100 bases. The expected value is an '
                        'integer. For example, if normalizationLength '
                        'is 1000, then the resulting scaling factor '
                        'will cause the average coverage of the BAM file to '
                        'have on average 1 fragment per kilobase',
                        type=int,
                        default=10)

    parser.add_argument('--skipZeros',
                        help='If set, then zero counts that happen for *all* '
                        'BAM files given are ignored. This will result in a '
                        'reduced number of read counts than that specified '
                        'in --numberOfSamples',
                        action='store_true',
                        required=False)

    parser.add_argument('--numberOfProcessors', '-p',
                        help='Number of processors to use. The default is '
                        'to use half the maximum number of processors.',
                        metavar="INT",
                        type=numberOfProcessors,
                        default="max/2",
                        required=False)

    parser.add_argument('--verbose', '-v',
                        help='Set to see processing messages.',
                        action='store_true')

    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    args = parser.parse_args(args)
    # normalize the ignore list to a plain Python list of chromosome names
    if args.ignoreForNormalization:
        args.ignoreForNormalization = [
            x.strip() for x in args.ignoreForNormalization.split(',')
        ]
    else:
        args.ignoreForNormalization = []
    return args
92
+
93
+
94
def main(args=None):
    """Estimate SES scale factors between two BAM files and print them.

    The genome is sampled --numberOfSamples times (see parseArguments) and
    the resulting per-sample scale factors are printed one per line as
    ``key: value``.
    """
    args = parseArguments(args)
    if len(args.bamfiles) > 2:
        # NOTE(review): a single BAM file also passes this check, and this
        # error path exits with status 0 — confirm whether that is intended.
        print("SES method to estimate scale factors only works for two samples")
        exit(0)

    sys.stderr.write("{:,} number of samples will be computed.\n".format(args.numberOfSamples))
    size_factors = estimateScaleFactor(args.bamfiles, args.sampleWindowLength,
                                       args.numberOfSamples,
                                       args.normalizationLength,
                                       numberOfProcessors=args.numberOfProcessors,
                                       chrsToSkip=args.ignoreForNormalization,
                                       verbose=args.verbose)

    for name, factor in size_factors.items():
        print("{}: {}".format(name, factor))
deepTools/source/deeptools/getFragmentAndReadSize.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ # own tools
4
+ from deeptools import bamHandler
5
+ from deeptools import mapReduce
6
+
7
+ old_settings = np.seterr(all='ignore')
8
+
9
+
10
def getFragmentLength_wrapper(args):
    """Adapter for mapReduce: expand the packed argument tuple for the worker."""
    return getFragmentLength_worker(*args)
12
+
13
+
14
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array, where first column is fragment length, the
        second is for read length

    Raises
    ------
    NameError
        if ``chrom`` is not a reference name in the BAM file
    """
    bam = bamHandler.openBam(bamFile)
    # trim the inter-bin spacing off the end, but always keep >= 1 bp
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        # prefer properly paired read1 alignments so each fragment is counted once
        reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1 and not r.is_unmapped])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired, then
            # we try with out filtering
            reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                              for r in bam.fetch(chrom, start, end) if not r.is_unmapped])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        # keep the two-column shape so callers can always concatenate
        reads = np.array([]).reshape(0, 2)

    return reads
57
+
58
+
59
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        if True, the raw sampled lengths are attached to the returned
        dictionaries under the key ``'lengths'``
    numberOfProcessors : int
    verbose : bool
    binSize : int
        size of the genomic windows that are sampled
    distanceBetweenBins : int
        spacing between sampled windows; halved repeatedly until enough
        reads are collected

    Returns
    -------
    d : dict
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc. values.
        Either entry may be None when no usable reads were sampled (the
        fragment dictionary is also None for single-end data, where the
        mean template length is 0).
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    # doubled up-front so the first halving inside the while loop restores
    # the caller's requested spacing
    distanceBetweenBins *= 2
    fl = []

    # Fix issue #522, allow distanceBetweenBins == 0
    if distanceBetweenBins == 0:
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=binSize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)
        fl = np.concatenate(imap_res)

    # Try to ensure we have at least 1000 regions from which to compute statistics, halving the intra-bin distance as needed
    # NOTE(review): true division turns distanceBetweenBins into a float after
    # the first pass; mapReduce presumably tolerates a float chunk length — confirm.
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        # column 0: |template length|, column 1: read length
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        # a zero mean template length indicates single-end data, for which
        # no fragment-length statistics can be reported
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length),
                                 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))),
                                 'qtile10': np.percentile(fragment_length, 10),
                                 'qtile20': np.percentile(fragment_length, 20),
                                 'qtile30': np.percentile(fragment_length, 30),
                                 'qtile40': np.percentile(fragment_length, 40),
                                 'qtile60': np.percentile(fragment_length, 60),
                                 'qtile70': np.percentile(fragment_length, 70),
                                 'qtile80': np.percentile(fragment_length, 80),
                                 'qtile90': np.percentile(fragment_length, 90),
                                 'qtile99': np.percentile(fragment_length, 99)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length),
                         'mad': np.median(np.abs(read_length - np.median(read_length))),
                         'qtile10': np.percentile(read_length, 10),
                         'qtile20': np.percentile(read_length, 20),
                         'qtile30': np.percentile(read_length, 30),
                         'qtile40': np.percentile(read_length, 40),
                         'qtile60': np.percentile(read_length, 60),
                         'qtile70': np.percentile(read_length, 70),
                         'qtile80': np.percentile(read_length, 80),
                         'qtile90': np.percentile(read_length, 90),
                         'qtile99': np.percentile(read_length, 99)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
deepTools/source/deeptools/getRatio.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ old_settings = np.seterr(all='ignore')
4
+
5
+
6
def compute_ratio(value1, value2, args):
    """Return the pseudocounted ratio of two values, optionally transformed.

    The per-sample pseudocounts from ``args['pseudocount']`` are added first.
    ``args['valueType']`` selects the transform: 'log2' returns the log2 of
    the ratio; 'reciprocal_ratio' returns a/b when a/b >= 1 and -b/a
    otherwise; anything else returns the plain ratio.
    """
    numerator = value1 + args['pseudocount'][0]
    denominator = value2 + args['pseudocount'][1]
    ratio = float(numerator) / denominator

    value_type = args['valueType']
    if value_type == 'log2':
        return np.log2(ratio)
    if value_type == 'reciprocal_ratio':
        # the reciprocal ratio of a/b
        # is a/b if a/b > 1 else -1* b/a
        return ratio if ratio >= 1 else -1.0 / ratio
    return ratio
20
+
21
+
22
def getRatio(tileCoverage, args):
    r"""
    Combine the two scaled coverage values of one tile into a single score.

    Invoked by mapReduce once per tile; ``args`` fixes the value type, the
    per-sample scale factors and the pseudocounts. NaN in either input
    yields NaN. Ratio-like value types are delegated to ``compute_ratio``.

    >>> funcArgs= {'valueType': 'ratio', 'scaleFactors': (1,1), 'pseudocount': [1, 1]}
    >>> getRatio([9, 19], funcArgs)
    0.5
    >>> getRatio([0, 0], funcArgs)
    1.0
    >>> getRatio([np.nan, np.nan], funcArgs)
    nan
    >>> getRatio([np.nan, 1.0], funcArgs)
    nan
    >>> funcArgs['valueType'] ='subtract'
    >>> getRatio([20, 10], funcArgs)
    10
    >>> funcArgs['scaleFactors'] = (1, 0.5)
    >>> getRatio([10, 20], funcArgs)
    0.0

    The reciprocal ratio is of a and b is:
    is a/b if a/b > 1 else -1* b/a
    >>> funcArgs['valueType'] ='reciprocal_ratio'
    >>> funcArgs['scaleFactors'] = (1, 1)
    >>> funcArgs['pseudocount'] = [0, 0]
    >>> getRatio([2, 1], funcArgs)
    2.0
    >>> getRatio([1, 2], funcArgs)
    -2.0
    >>> getRatio([1, 1], funcArgs)
    1.0
    """
    scaled_first = args['scaleFactors'][0] * tileCoverage[0]
    scaled_second = args['scaleFactors'][1] * tileCoverage[1]

    # NaN in either of the two values propagates directly
    if np.isnan(scaled_first) or np.isnan(scaled_second):
        return np.nan

    value_type = args['valueType']

    # ratio-like types share pseudocount handling in compute_ratio
    if value_type in ('ratio', 'log2', 'reciprocal_ratio'):
        return compute_ratio(scaled_first, scaled_second, args)

    # arithmetic combinations (diff, sum, selection, mean)
    if value_type == 'subtract':
        bin_value = scaled_first - scaled_second
    elif value_type == 'add':
        bin_value = scaled_first + scaled_second
    elif value_type == 'first':
        bin_value = scaled_first
    elif value_type == 'second':
        bin_value = scaled_second
    elif value_type == 'mean':
        bin_value = (scaled_first + scaled_second) / 2.0

    return bin_value
deepTools/source/deeptools/getScaleFactor.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import numpy as np
5
+ import deeptools.mapReduce as mapReduce
6
+ from deeptools import bamHandler
7
+ from deeptools import utilities
8
+ import sys
9
+
10
+ debug = 0
11
+
12
+
13
def getFractionKept_wrapper(args):
    """Unpack the argument tuple supplied by mapReduce and run the worker."""
    return getFractionKept_worker(*args)
15
+
16
+
17
def getFractionKept_worker(chrom, start, end, bamFile, args, offset):
    """
    Queries the BAM file and counts the number of alignments kept/found in
    a 50 kb window at ``start + offset * 50000``.

    Returns ``(filtered, tot)`` where ``tot`` is the number of alignments
    inspected and ``filtered`` is how many of them any of the active filters
    (mapping quality, SAM flags, fragment length, duplicates, RNA strand)
    would remove.
    """
    bam = bamHandler.openBam(bamFile)
    # each offset selects a distinct, non-overlapping 50 kb window of the bin
    start += offset * 50000
    end = min(end, start + 50000)
    tot = 0
    filtered = 0

    if end <= start:
        return (filtered, tot)

    # (frag start, frag end, mate chrom id, strand) tuples seen at the
    # current reference position; used for duplicate detection
    prev_pos = set()
    lpos = None
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            # NOTE(review): tot is incremented before the unmapped check, so
            # unmapped reads count toward the total but can never be filtered.
            tot += 1
            if read.is_unmapped:
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered += 1
                    continue
                # only fragments anchored at the current position matter,
                # so reset the set when the position advances
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            # SAM flag bits: 16 = read reverse, 32 = mate reverse,
            # 64 = first in pair, 128 = second in pair
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
102
+
103
+
104
def fraction_kept(args, stats):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Black list regions are already accounted for. This works by sampling the
    genome: by default we iterate until we have sampled 10% of the mapped
    alignments or 1,000,000 of them, whichever is larger (unless there are
    fewer than 1,000,000 alignments, or --exactScaling is set, in which case
    everything is sampled).

    The sampling works by dividing the genome into bins and only looking at
    50 kb windows at increasing offsets within each bin until enough
    alignments have been seen.

    Returns the fraction (0..1] of alignments that survive the filters.
    """
    # Do we even need to proceed? With no active filters everything is kept.
    if (not args.minMappingQuality or args.minMappingQuality == 0) and \
            (not args.samFlagInclude or args.samFlagInclude == 0) and \
            (not args.samFlagExclude or args.samFlagExclude == 0) and \
            (not args.minFragmentLength or args.minFragmentLength == 0) and \
            (not args.maxFragmentLength or args.maxFragmentLength == 0):
        if hasattr(args, "filterRNAstrand"):
            if args.filterRNAstrand not in ["forward", "reverse"]:
                return 1.0
        else:
            return 1.0

    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
    # decide how many alignments must be sampled
    if bam_mapped < 1000000:
        num_needed_to_sample = bam_mapped
    else:
        if 0.1 * bam_mapped >= 1000000:
            num_needed_to_sample = 0.1 * bam_mapped
        else:
            num_needed_to_sample = 1000000
    if args.exactScaling:
        num_needed_to_sample = bam_mapped
    if num_needed_to_sample == bam_mapped:
        # sampling everything: shrink the bin size so the 50 kb windows
        # cover (almost) the whole genome
        distanceBetweenBins = 55000
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    offset = 0
    # Iterate over bins at various non-overlapping offsets until we have enough data
    while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000):
        res = mapReduce.mapReduce((bam_handle.filename, args, offset),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            # sum the per-chunk (filtered, total) pairs
            foo, bar = np.sum(res, axis=0)
            filtered += foo
            total += bar
        offset += 1

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
181
+
182
+
183
def get_num_kept_reads(args, stats):
    """Estimate how many mapped reads survive blacklisting and filtering.

    Starts from the total number of mapped reads in ``args.bam``, removes
    alignments fully inside blacklisted regions, then scales by the
    fraction kept after filtering (see :func:`fraction_kept`).

    :return: tuple of (estimated kept reads, total mapped reads)
    """
    # obtain index statistics on demand when the caller did not supply them
    if stats is None:
        bam_handle, mapped, unmapped, stats = bamHandler.openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    else:
        bam_handle = bamHandler.openBam(args.bam)
    bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)

    num_kept_reads = bam_mapped_total
    if args.blackListFileName:
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                      args.blackListFileName, args.numberOfProcessors)
        print("There are {0} alignments, of which {1} are completely "
              "within a blacklist region.".format(bam_mapped_total, blacklisted))
        num_kept_reads -= blacklisted

    ftk = fraction_kept(args, stats)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used {1}".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
211
+
212
+
213
def get_scale_factor(args, stats):
    """Compute the per-sample scaling factor implied by --normalizeUsing.

    Starts from ``args.scaleFactor`` and multiplies in the
    normalization-specific factor: RPGC (1x genome coverage), RPKM, CPM,
    BPM, or — when no normalization is requested — the fraction of
    alignments kept after filtering.

    Parameters
    ----------
    args : argparse.Namespace
        must provide scaleFactor, normalizeUsing, bam, binSize,
        effectiveGenomeSize, extendReads and the filtering options used by
        get_num_kept_reads.
    stats :
        pre-computed BAM index statistics (may be None), forwarded to
        get_num_kept_reads.

    Returns
    -------
    float
        the final scaling factor.
    """
    scale_factor = args.scaleFactor
    # bam_mapped estimates the post-filtering count; bam_mapped_total ignores filters
    bam_mapped, bam_mapped_total = get_num_kept_reads(args, stats)
    if args.normalizeUsing == 'RPGC':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.effectiveGenomeSize))

        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print(("Fragment length based on paired en data "
                           "estimated to be {}".format(frag_len_dict['median'])))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.effectiveGenomeSize
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            # report the computed factor, not the unmodified input value
            print("Scaling factor {}".format(scale_factor))

    elif args.normalizeUsing == 'RPKM':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")

        # the RPKM is the # reads per tile / \
        # ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(scale_factor))

    elif args.normalizeUsing == 'CPM':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: CPM\n")

        # the CPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        million_reads_mapped = float(bam_mapped) / 1e6
        scale_factor *= 1.0 / (million_reads_mapped)

        if debug:
            print("scale factor using CPM is {0}".format(scale_factor))

    elif args.normalizeUsing == 'BPM':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: BPM\n")
        # the BPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        # sampled_bins_sum = getSampledSum(args.bam)
        tile_len_in_kb = float(args.binSize) / 1000
        tpm_scaleFactor = (bam_mapped / tile_len_in_kb) / 1e6

        scale_factor *= 1 / (tpm_scaleFactor * tile_len_in_kb)
        if debug:
            print("scale factor using BPM is {0}".format(scale_factor))

    else:
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: none (signal scaled by the fraction of alignments kept after filtering)\n")

        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
deepTools/source/deeptools/getScorePerBigWigBin.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pyBigWig
2
+ import numpy as np
3
+ import os
4
+ import sys
5
+ import shutil
6
+ import warnings
7
+
8
+ # deepTools packages
9
+ import deeptools.mapReduce as mapReduce
10
+ import deeptools.utilities
11
+ # debug = 0
12
+
13
+ old_settings = np.seterr(all='ignore')
14
+
15
+
16
def countReadsInRegions_wrapper(args):
    """Unpack a single tuple of arguments and forward it to
    countFragmentsInRegions_worker.

    Multiprocessing map-style APIs hand each task a single object, so the
    positional arguments travel bundled in one tuple.
    """
    packed = args
    return countFragmentsInRegions_worker(*packed)
19
+
20
+
21
def countFragmentsInRegions_worker(chrom, start, end,
                                   bigWigFiles,
                                   stepSize, binLength,
                                   save_data,
                                   bedRegions=None
                                   ):
    """ returns the average score in each bigwig file at each 'stepSize'
    position within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    and *not adjacent*.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Fragment coverage.
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
    array([[1., 1., 2., 2.],
           [1., 1., 1., 3.]])

    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
    array([[1.5],
           [1.5]])

    BED regions:
    >>> bedRegions = [[test.chrom, [(45, 55)]], [test.chrom, [(95, 105)]], [test.chrom, [(145, 155)]]]
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200,[test.bwFile1, test.bwFile2], 200, 200, False,
    ... bedRegions=bedRegions)[0])
    array([[1. , 1.5, 2. ],
           [1. , 1. , 2. ]])
    """
    assert start < end, "start {} bigger that end {}".format(start, end)

    # Flat list of per-region averages; reshaped into a (rows x files)
    # matrix just before returning.
    sub_score_per_bin = []

    rows = 0

    # Open one handle per bigWig file up front; they are reused for every
    # region and closed before returning (they were previously leaked).
    bigwig_handles = []
    for foo in bigWigFiles:
        bigwig_handles.append(pyBigWig.open(foo))

    # Build the list of regions to score: either the caller-supplied BED
    # regions (each possibly made of several exons) or evenly spaced,
    # possibly non-adjacent, fixed-size bins across [start, end).
    regions_to_consider = []
    if bedRegions:
        for reg in bedRegions:
            regs = []
            for exon in reg[1]:
                regs.append((exon[0], exon[1]))
            regions_to_consider.append(regs)
    else:
        for i in range(start, end, stepSize):
            if (i + binLength) > end:
                regions_to_consider.append([(i, end)])  # last bin (may be smaller)
            else:
                regions_to_consider.append([(i, i + binLength)])

    if save_data:
        # Per-worker temporary BED file holding the raw per-region scores;
        # the caller concatenates and removes these files later.
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''
    warnings.simplefilter("default")
    i = 0
    for reg in regions_to_consider:
        avgReadsArray = []
        i += 1

        for idx, bwh in enumerate(bigwig_handles):
            if chrom not in bwh.chroms():
                unmod_name = chrom
                if chrom.startswith('chr'):
                    # remove the chr part from chromosome name
                    chrom = chrom[3:]
                else:
                    # prefix with 'chr' the chromosome name
                    chrom = 'chr' + chrom
                # NOTE: the renamed `chrom` intentionally persists for the
                # remaining files and regions of this worker.
                if chrom not in bwh.chroms():
                    exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))

            # Average the per-exon bigWig means, weighting each exon by its
            # length, so multi-exon regions are scored per covered base.
            weights = []
            scores = []
            for exon in reg:
                weights.append(exon[1] - exon[0])
                score = bwh.stats(chrom, exon[0], exon[1])

                if score is None or score == [None] or np.isnan(score[0]):
                    score = [np.nan]
                scores.extend(score)
            avgReadsArray.append(np.average(scores, weights=weights))  # mean of fragment coverage for region

        sub_score_per_bin.extend(avgReadsArray)
        rows += 1
        if save_data:
            starts = []
            ends = []
            for exon in reg:
                starts.append(str(exon[0]))
                ends.append(str(exon[1]))
            starts = ",".join(starts)
            ends = ",".join(ends)
            _file.write("\t".join(map(str, [chrom, starts, ends])) + "\t")
            _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")

    if save_data:
        _file.close()
    warnings.resetwarnings()

    # Close the bigWig handles; previously they were left open, leaking one
    # file descriptor per input file per worker invocation.
    for bwh in bigwig_handles:
        bwh.close()

    # the output is a matrix having as many rows as the variable 'row'
    # and as many columns as bigwig files. The rows correspond to
    # each of the regions processed by the worker.
    # np.array([[score1_1, score1_2],
    #           [score2_1, score2_2]]
    return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name
137
+
138
+
139
def getChromSizes(bigwigFilesList):
    """
    Get chromosome sizes from bigWig file with pyBigWig

    Returns a tuple: (sorted list of (name, size) tuples common to all
    files, set of (name, size) tuples that did not match across files).
    Exits the process if any file shares no chromosomes with the others,
    even after toggling a 'chr' prefix.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Chromosome name(s) and size(s).
    >>> assert getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([]))
    """
    def print_chr_names_and_size(chr_set):
        # Helper: dump a (name, size) set to stderr as a small table.
        sys.stderr.write("chromosome\tlength\n")
        for name, size in chr_set:
            sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))

    # Work on a copy so the caller's list is never mutated.
    bigwigFilesList = bigwigFilesList[:]

    # First pass: union of every (name, size) pair across all files.
    common_chr = set()
    for fname in bigwigFilesList:
        fh = pyBigWig.open(fname)
        common_chr = common_chr.union(set(fh.chroms().items()))
        fh.close()

    # Second pass: intersect file by file, collecting mismatches.
    non_common_chr = set()
    for bw in bigwigFilesList:
        # Previously the handle opened here was never closed (fd leak).
        fh = pyBigWig.open(bw)
        _names_and_size = set(fh.chroms().items())
        fh.close()
        if len(common_chr & _names_and_size) == 0:
            # try to add remove 'chr' from the chromosme name
            _corr_names_size = set()
            for chrom_name, size in _names_and_size:
                if chrom_name.startswith('chr'):
                    _corr_names_size.add((chrom_name[3:], size))
                else:
                    _corr_names_size.add(('chr' + chrom_name, size))
            if len(common_chr & _corr_names_size) == 0:
                message = "No common chromosomes found. Are the bigwig files " \
                          "from the same species and same assemblies?\n"
                sys.stderr.write(message)
                print_chr_names_and_size(common_chr)

                sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n"
                                 "lengths from file\n{}\n".format(bw))
                print_chr_names_and_size(_names_and_size)
                exit(1)
            else:
                _names_and_size = _corr_names_size

        # Symmetric difference accumulates chromosomes seen in some files
        # but not all; the intersection shrinks common_chr accordingly.
        non_common_chr |= common_chr ^ _names_and_size
        common_chr = common_chr & _names_and_size

    if len(non_common_chr) > 0:
        sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n")
        print_chr_names_and_size(non_common_chr)

    # get the list of common chromosome names and sizes
    return sorted(common_chr), non_common_chr
195
+
196
+
197
def getScorePerBin(bigWigFiles, binLength,
                   numberOfProcessors=1,
                   verbose=False, region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=None,
                   out_file_for_raw_data=None,
                   allArgs=None):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()
    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[1., 1., 2., 2.],
           [1., 1., 1., 3.]])

    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files
    # if too long, some processors end up free.
    # the following is a heuristic

    # get list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)
    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    # NOTE: the default used to be a mutable [] (shared across calls);
    # None behaves identically in the truthiness test below.
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = list(zip(*chrom_sizes))
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunksize based on number of processors used.
    # Use floor division: '/' would yield a float in Python 3 and the
    # chunk length is used as a base-pair count downstream.
    chunkSize = max(sum(chrlengths) // numberOfProcessors, int(1e6))
    # make chunkSize multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    # Workers only write their raw per-bin scores to temp files when the
    # caller asked for them to be collected.
    save_file = bool(out_file_for_raw_data)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
                                   countReadsInRegions_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   blackListFileName=blackListFileName,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files, removing each temp file
        # once its content has been copied
        with open(out_file_for_raw_data, "w") as ofile:
            for _values, tempFileName in imap_res:
                if tempFileName:
                    with open(tempFileName, 'r') as f:
                        shutil.copyfileobj(f, ofile)
                    os.remove(tempFileName)

    # the matrix scores are in the first element of each of the entries in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin
290
+
291
+
292
class Tester(object):
    """Fixture holding paths to the bundled test bigWig files.

    The two bigWig files were built from these bedGraphs:

        $ cat /tmp/testA.bg
        3R 0 100 1
        3R 100 200 2

        $ cat /tmp/testB.bg
        3R 0 150 1
        3R 150 200 3

    Both cover 200 bp of chromosome 3R:

        0 50 100 150 200
        |------------------------------------------------------------|
        A 111111111111111111111111111111122222222222222222222222222222
        B 111111111111111111111111111111111111111111111333333333333333
    """

    def __init__(self):
        # Resolve the test-data directory relative to this module.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.root = module_dir + "/test/test_data/"
        self.bwFile1 = self.root + "testA.bw"
        self.bwFile2 = self.root + "testB.bw"
        self.bwFile_PE = self.root + "test_paired2.bw"
        self.chrom = '3R'
deepTools/source/deeptools/heatmapper.py ADDED
@@ -0,0 +1,1372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import gzip
3
+ from collections import OrderedDict
4
+ import numpy as np
5
+ from copy import deepcopy
6
+
7
+ import pyBigWig
8
+ from deeptools import getScorePerBigWigBin
9
+ from deeptools import mapReduce
10
+ from deeptools.utilities import toString, toBytes, smartLabels
11
+ from deeptools.heatmapper_utilities import getProfileTicks
12
+
13
+
14
+ old_settings = np.seterr(all='ignore')
15
+
16
+
17
def chopRegions(exonsInput, left=0, right=0):
    """Split a list of (start, end) exon tuples into unscaled flanks and a body.

    ``left`` and ``right`` give the number of bases to peel off the 5' and
    3' ends respectively (the "unscaled" regions); whatever remains in the
    middle is the body to be scaled.

    Returns a 5-tuple:
      leftBins  -- (start, end) tuples covering the 5' unscaled bases
      bodyBins  -- the remaining body tuples
      rightBins -- (start, end) tuples covering the 3' unscaled bases
      padLeft   -- bases of ``left`` that could not be satisfied
      padRight  -- bases of ``right`` that could not be satisfied
    """
    remaining = deepcopy(exonsInput)
    fivePrime = []
    threePrime = []
    needLeft = left
    needRight = right

    # Peel bases off the 5' end until `left` bases are collected or the
    # exons run out.
    while remaining and needLeft > 0:
        s, e = remaining[0]
        span = e - s
        if span <= needLeft:
            fivePrime.append(remaining.pop(0))
            needLeft -= span
        else:
            fivePrime.append((s, s + needLeft))
            remaining[0] = (s + needLeft, e)
            needLeft = 0
    padLeft = needLeft if needLeft > 0 else 0

    # Same from the 3' end, walking backwards.
    while remaining and needRight > 0:
        s, e = remaining[-1]
        span = e - s
        if span <= needRight:
            threePrime.append(remaining.pop())
            needRight -= span
        else:
            threePrime.append((e - needRight, e))
            remaining[-1] = (s, e - needRight)
            needRight = 0
    padRight = needRight if needRight > 0 else 0

    # threePrime was collected end-first; restore genomic order.
    return fivePrime, remaining, threePrime[::-1], padLeft, padRight
66
+
67
+
68
def chopRegionsFromMiddle(exonsInput, left=0, right=0):
    """
    Like chopRegions(), above, but returns two lists of tuples on each side of
    the center point of the exons.

    The steps are as follow:

    1) Find the center point of the set of exons (e.g., [(0, 200), (300, 400), (800, 900)] would be centered at 200)
       * If a given exon spans the center point then the exon is split
    2) The given number of bases at the end of the left-of-center list are extracted
       * If the set of exons don't contain enough bases, then padLeft is incremented accordingly
    3) As above but for the right-of-center list
    4) A tuple of (#2, #3, pading on the left, and padding on the right) is returned
    """
    leftBins = []
    rightBins = []
    # Total covered bases; `middle` is the base offset of the midpoint.
    size = sum([x[1] - x[0] for x in exonsInput])
    middle = size // 2
    cumulativeSum = 0
    padLeft = 0
    padRight = 0
    # Work on a copy so the caller's exon list is not modified.
    exons = deepcopy(exonsInput)

    # Split exons in half
    for exon in exons:
        size = exon[1] - exon[0]
        if cumulativeSum >= middle:
            rightBins.append(exon)
        elif cumulativeSum + size < middle:
            leftBins.append(exon)
        else:
            # This exon straddles the midpoint: split it in two.
            # Don't add 0-width exonic bins!
            if exon[0] < exon[1] - cumulativeSum - size + middle:
                leftBins.append((exon[0], exon[1] - cumulativeSum - size + middle))
            if exon[1] - cumulativeSum - size + middle < exon[1]:
                rightBins.append((exon[1] - cumulativeSum - size + middle, exon[1]))
        cumulativeSum += size

    # Trim leftBins/adjust padLeft
    lSum = sum([x[1] - x[0] for x in leftBins])
    if lSum > left:
        # Too many bases left of center: walk backwards from the midpoint,
        # keeping exons until `left` bases are covered, truncating the last.
        lSum = 0
        for i, exon in enumerate(leftBins[::-1]):
            size = exon[1] - exon[0]
            if lSum + size > left:
                leftBins[-i - 1] = (exon[1] + lSum - left, exon[1])
                break
            lSum += size
            if lSum == left:
                break
            i += 1
        # Keep only the last i exons (those closest to the midpoint).
        if i < len(leftBins):
            leftBins = leftBins[-i:]
    elif lSum < left:
        # Not enough bases: record the shortfall as left padding.
        padLeft = left - lSum

    # Trim rightBins/adjust padRight
    rSum = sum([x[1] - x[0] for x in rightBins])
    if rSum > right:
        # Mirror of the left-side trim, walking forward from the midpoint.
        rSum = 0
        for i, exon in enumerate(rightBins):
            size = exon[1] - exon[0]
            if rSum + size > right:
                rightBins[i] = (exon[0], exon[1] - rSum - size + right)
                break
            rSum += size
            if rSum == right:
                break
        rightBins = rightBins[:i + 1]
    elif rSum < right:
        padRight = right - rSum

    return leftBins, rightBins, padLeft, padRight
141
+
142
+
143
def trimZones(zones, maxLength, binSize, padRight):
    """Clip zone intervals that extend past ``maxLength`` (e.g. a chromosome end).

    ``zones`` is a list of (list of (start, end) tuples, nBins) pairs.
    Intervals entirely beyond ``maxLength`` are dropped; intervals crossing
    it are shortened. Every clipped base is added to ``padRight``. When a
    zone was altered, its bin count is recomputed from the surviving bases.

    Returns (trimmed zones, updated padRight).
    """
    trimmed = []
    for intervals, originalBins in zones:
        kept = []
        wasTrimmed = False
        for lo, hi in intervals:
            if lo >= maxLength:
                # Interval lies completely past the limit: drop it whole.
                wasTrimmed = True
                padRight += hi - lo
                continue
            if hi > maxLength:
                # Interval crosses the limit: clip the overhang.
                wasTrimmed = True
                padRight += hi - maxLength
                hi = maxLength
            if hi > lo:
                kept.append((lo, hi))
        if wasTrimmed:
            binCount = sum(e - s for s, e in kept) // binSize
        else:
            binCount = originalBins
        trimmed.append((kept, binCount))
    return trimmed, padRight
171
+
172
+
173
def compute_sub_matrix_wrapper(args):
    """Unpack a single tuple of arguments and forward it to
    heatmapper.compute_sub_matrix_worker (map-style APIs pass one object).
    """
    packed = args
    return heatmapper.compute_sub_matrix_worker(*packed)
175
+
176
+
177
+ class heatmapper(object):
178
+ """
179
+ Class to handle the reading and
180
+ plotting of matrices.
181
+ """
182
+
183
+ def __init__(self):
184
+ self.parameters = None
185
+ self.lengthDict = None
186
+ self.matrix = None
187
+ self.regions = None
188
+ self.blackList = None
189
+ self.quiet = True
190
+ # These are parameters that were single values in versions <3 but are now internally lists. See issue #614
191
+ self.special_params = set(['unscaled 5 prime', 'unscaled 3 prime', 'body', 'downstream', 'upstream', 'ref point', 'bin size'])
192
+
193
    def getTicks(self, idx):
        """Return (xticks, xtickslabel) for sample column ``idx``.

        Thin wrapper around getProfileTicks() to accommodate the fact that
        each column can have its own reference-point label and therefore
        its own ticks.
        """
        # NOTE(review): reads self.reference_point_label, self.startLabel and
        # self.endLabel, which are not set in __init__ — presumably assigned
        # by the caller before this is used; verify against call sites.
        xticks, xtickslabel = getProfileTicks(self, self.reference_point_label[idx], self.startLabel, self.endLabel, idx)
        return xticks, xtickslabel
199
+
200
    def computeMatrix(self, score_file_list, regions_file, parameters, blackListFileName=None, verbose=False, allArgs=None):
        """
        Splits into
        multiple cores the computation of the scores
        per bin for each region (defined by a hash '#'
        in the regions (BED/GFF) file.
        """
        # --- Validate that every length parameter is a multiple of the bin
        # size, since zones are later divided into whole bins.
        if parameters['body'] > 0 and \
                parameters['body'] % parameters['bin size'] > 0:
            exit("The --regionBodyLength has to be "
                 "a multiple of --binSize.\nCurrently the "
                 "values are {} {} for\nregionsBodyLength and "
                 "binSize respectively\n".format(parameters['body'],
                                                parameters['bin size']))

        # the beforeRegionStartLength is extended such that
        # length is a multiple of binSize
        if parameters['downstream'] % parameters['bin size'] > 0:
            exit("Length of region after the body has to be "
                 "a multiple of --binSize.\nCurrent value "
                 "is {}\n".format(parameters['downstream']))

        if parameters['upstream'] % parameters['bin size'] > 0:
            exit("Length of region before the body has to be a multiple of "
                 "--binSize\nCurrent value is {}\n".format(parameters['upstream']))

        if parameters['unscaled 5 prime'] % parameters['bin size'] > 0:
            exit("Length of the unscaled 5 prime region has to be a multiple of "
                 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 5 prime']))

        # NOTE(review): this message says "5 prime" but checks the 3 prime
        # value — looks like a copy-paste slip in the error text.
        if parameters['unscaled 3 prime'] % parameters['bin size'] > 0:
            exit("Length of the unscaled 5 prime region has to be a multiple of "
                 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 3 prime']))

        if parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] > 0 and parameters['body'] == 0:
            exit('Unscaled 5- and 3-prime regions only make sense with the scale-regions subcommand.\n')

        # Take care of GTF options: defaults first, then overrides from the
        # parsed argparse namespace (if one was supplied).
        transcriptID = "transcript"
        exonID = "exon"
        transcript_id_designator = "transcript_id"
        keepExons = False
        self.quiet = False
        if allArgs is not None:
            allArgs = vars(allArgs)
            transcriptID = allArgs.get("transcriptID", transcriptID)
            exonID = allArgs.get("exonID", exonID)
            transcript_id_designator = allArgs.get("transcript_id_designator", transcript_id_designator)
            keepExons = allArgs.get("keepExons", keepExons)
            self.quiet = allArgs.get("quiet", self.quiet)

        # Fan the per-chromosome work out to worker processes.
        chromSizes, _ = getScorePerBigWigBin.getChromSizes(score_file_list)
        res, labels = mapReduce.mapReduce([score_file_list, parameters],
                                          compute_sub_matrix_wrapper,
                                          chromSizes,
                                          self_=self,
                                          bedFile=regions_file,
                                          blackListFileName=blackListFileName,
                                          numberOfProcessors=parameters['proc number'],
                                          includeLabels=True,
                                          transcriptID=transcriptID,
                                          exonID=exonID,
                                          transcript_id_designator=transcript_id_designator,
                                          keepExons=keepExons,
                                          verbose=verbose)
        # each worker in the pool returns a tuple containing
        # the submatrix data, the regions that correspond to the
        # submatrix, and the number of regions lacking scores
        # Since this is largely unsorted, we need to sort by group

        # merge all the submatrices into matrix
        matrix = np.concatenate([r[0] for r in res], axis=0)
        regions = []
        regions_no_score = 0
        for idx in range(len(res)):
            if len(res[idx][1]):
                regions.extend(res[idx][1])
                regions_no_score += res[idx][2]
        # Sort rows by group index (regions[x][3]) while keeping the
        # original positions so the matrix rows can be reordered to match.
        groups = [x[3] for x in regions]
        foo = sorted(zip(groups, list(range(len(regions))), regions))
        sortIdx = [x[1] for x in foo]
        regions = [x[2] for x in foo]
        matrix = matrix[sortIdx]

        # mask invalid (nan) values
        matrix = np.ma.masked_invalid(matrix)

        assert matrix.shape[0] == len(regions), \
            "matrix length does not match regions length"

        if len(regions) == 0:
            sys.stderr.write("\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n")
            exit(1)
        if regions_no_score == len(regions):
            exit("\nERROR: None of the BED regions could be found in the bigWig"
                 "file.\nPlease check that the bigwig file is valid and "
                 "that the chromosome names between the BED file and "
                 "the bigWig file correspond to each other\n")

        # Warn (but continue) when most regions have no score at all —
        # usually a chromosome-naming mismatch between the inputs.
        if regions_no_score > len(regions) * 0.75:
            file_type = 'bigwig' if score_file_list[0].endswith(".bw") else "BAM"
            prcnt = 100 * float(regions_no_score) / len(regions)
            sys.stderr.write(
                "\n\nWarning: {0:.2f}% of regions are *not* associated\n"
                "to any score in the given {1} file. Check that the\n"
                "chromosome names from the BED file are consistent with\n"
                "the chromosome names in the given {2} file and that both\n"
                "files refer to the same species\n\n".format(prcnt,
                                                             file_type,
                                                             file_type))

        self.parameters = parameters

        # Column boundaries: one slice of num_ind_cols columns per sample.
        numcols = matrix.shape[1]
        num_ind_cols = self.get_num_individual_matrix_cols()
        sample_boundaries = list(range(0, numcols + num_ind_cols, num_ind_cols))
        if allArgs is not None and allArgs['samplesLabel'] is not None:
            sample_labels = allArgs['samplesLabel']
        else:
            sample_labels = smartLabels(score_file_list)

        # Determine the group boundaries: a new group starts wherever the
        # group index (regions[x][3]) changes in the sorted region list.
        group_boundaries = []
        group_labels_filtered = []
        last_idx = -1
        for x in range(len(regions)):
            if regions[x][3] != last_idx:
                last_idx = regions[x][3]
                group_boundaries.append(x)
                group_labels_filtered.append(labels[last_idx])
        group_boundaries.append(len(regions))

        # check if a given group is too small. Groups that
        # are too small can't be plotted and an exception is thrown.
        group_len = np.diff(group_boundaries)
        if len(group_len) > 1:
            sum_len = sum(group_len)
            group_frac = [float(x) / sum_len for x in group_len]
            if min(group_frac) <= 0.002:
                sys.stderr.write(
                    "One of the groups defined in the bed file is "
                    "too small.\nGroups that are too small can't be plotted. "
                    "\n")

        # Wrap everything in the internal _matrix container.
        self.matrix = _matrix(regions, matrix,
                              group_boundaries,
                              sample_boundaries,
                              group_labels_filtered,
                              sample_labels)

        if parameters['skip zeros']:
            self.matrix.removeempty()
352
+
353
+ @staticmethod
354
+ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, parameters, regions):
355
+ """
356
+ Returns
357
+ -------
358
+ numpy matrix
359
+ A numpy matrix that contains per each row the values found per each of the regions given
360
+ """
361
+ if parameters['verbose']:
362
+ sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))
363
+
364
+ # read BAM or scores file
365
+ score_file_handles = []
366
+ for sc_file in score_file_list:
367
+ score_file_handles.append(pyBigWig.open(sc_file))
368
+
369
+ # determine the number of matrix columns based on the lengths
370
+ # given by the user, times the number of score files
371
+ matrix_cols = len(score_file_list) * \
372
+ ((parameters['downstream'] +
373
+ parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] +
374
+ parameters['upstream'] + parameters['body']) //
375
+ parameters['bin size'])
376
+
377
+ # create an empty matrix to store the values
378
+ sub_matrix = np.zeros((len(regions), matrix_cols))
379
+ sub_matrix[:] = np.nan
380
+
381
+ j = 0
382
+ sub_regions = []
383
+ regions_no_score = 0
384
+ for transcript in regions:
385
+ feature_chrom = transcript[0]
386
+ exons = transcript[1]
387
+ feature_start = exons[0][0]
388
+ feature_end = exons[-1][1]
389
+ feature_name = transcript[2]
390
+ feature_strand = transcript[4]
391
+ padLeft = 0
392
+ padRight = 0
393
+ padLeftNaN = 0
394
+ padRightNaN = 0
395
+ upstream = []
396
+ downstream = []
397
+
398
+ # get the body length
399
+ body_length = np.sum([x[1] - x[0] for x in exons]) - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']
400
+
401
+ # print some information
402
+ if parameters['body'] > 0 and \
403
+ body_length < parameters['bin size']:
404
+ if not self.quiet:
405
+ sys.stderr.write("A region that is shorter than the bin size (possibly only after accounting for unscaled regions) was found: "
406
+ "({0}) {1} {2}:{3}:{4}. Skipping...\n".format((body_length - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']),
407
+ feature_name, feature_chrom,
408
+ feature_start, feature_end))
409
+ coverage = np.zeros(matrix_cols)
410
+ if not parameters['missing data as zero']:
411
+ coverage[:] = np.nan
412
+ else:
413
+ if feature_strand == '-':
414
+ if parameters['downstream'] > 0:
415
+ upstream = [(feature_start - parameters['downstream'], feature_start)]
416
+ if parameters['upstream'] > 0:
417
+ downstream = [(feature_end, feature_end + parameters['upstream'])]
418
+ unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 3 prime'], right=parameters['unscaled 5 prime'])
419
+ # bins per zone
420
+ a = parameters['downstream'] // parameters['bin size']
421
+ b = parameters['unscaled 3 prime'] // parameters['bin size']
422
+ d = parameters['unscaled 5 prime'] // parameters['bin size']
423
+ e = parameters['upstream'] // parameters['bin size']
424
+ else:
425
+ if parameters['upstream'] > 0:
426
+ upstream = [(feature_start - parameters['upstream'], feature_start)]
427
+ if parameters['downstream'] > 0:
428
+ downstream = [(feature_end, feature_end + parameters['downstream'])]
429
+ unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 5 prime'], right=parameters['unscaled 3 prime'])
430
+ a = parameters['upstream'] // parameters['bin size']
431
+ b = parameters['unscaled 5 prime'] // parameters['bin size']
432
+ d = parameters['unscaled 3 prime'] // parameters['bin size']
433
+ e = parameters['downstream'] // parameters['bin size']
434
+ c = parameters['body'] // parameters['bin size']
435
+
436
+ # build zones (each is a list of tuples)
437
+ # zone0: region before the region start,
438
+ # zone1: unscaled 5 prime region
439
+ # zone2: the body of the region
440
+ # zone3: unscaled 3 prime region
441
+ # zone4: the region from the end of the region downstream
442
+ # the format for each zone is: [(start, end), ...], number of bins
443
+ # Note that for "reference-point", upstream/downstream will go
444
+ # through the exons (if requested) and then possibly continue
445
+ # on the other side (unless parameters['nan after end'] is true)
446
+ if parameters['body'] > 0:
447
+ zones = [(upstream, a), (unscaled5prime, b), (body, c), (unscaled3prime, d), (downstream, e)]
448
+ elif parameters['ref point'] == 'TES': # around TES
449
+ if feature_strand == '-':
450
+ downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['upstream'])
451
+ if padRight > 0 and parameters['nan after end'] is True:
452
+ padRightNaN += padRight
453
+ elif padRight > 0:
454
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
455
+ padRight = 0
456
+ else:
457
+ unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['upstream'])
458
+ if padLeft > 0 and parameters['nan after end'] is True:
459
+ padLeftNaN += padLeft
460
+ elif padLeft > 0:
461
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
462
+ padLeft = 0
463
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
464
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
465
+ zones = [(upstream, a), (downstream, e)]
466
+ elif parameters['ref point'] == 'center': # at the region center
467
+ if feature_strand == '-':
468
+ upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['downstream'], right=parameters['upstream'])
469
+ else:
470
+ upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['upstream'], right=parameters['downstream'])
471
+ if padLeft > 0 and parameters['nan after end'] is True:
472
+ padLeftNaN += padLeft
473
+ elif padLeft > 0:
474
+ if len(upstream) > 0:
475
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
476
+ else:
477
+ upstream = [(downstream[0][0] - padLeft, downstream[0][0])]
478
+ padLeft = 0
479
+ if padRight > 0 and parameters['nan after end'] is True:
480
+ padRightNaN += padRight
481
+ elif padRight > 0:
482
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
483
+ padRight = 0
484
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
485
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
486
+ # It's possible for a/e to be floats or 0 yet upstream/downstream isn't empty
487
+ if a < 1:
488
+ upstream = []
489
+ a = 0
490
+ if e < 1:
491
+ downstream = []
492
+ e = 0
493
+ zones = [(upstream, a), (downstream, e)]
494
+ else: # around TSS
495
+ if feature_strand == '-':
496
+ unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['downstream'])
497
+ if padLeft > 0 and parameters['nan after end'] is True:
498
+ padLeftNaN += padLeft
499
+ elif padLeft > 0:
500
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
501
+ padLeft = 0
502
+ else:
503
+ downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['downstream'])
504
+ if padRight > 0 and parameters['nan after end'] is True:
505
+ padRightNaN += padRight
506
+ elif padRight > 0:
507
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
508
+ padRight = 0
509
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
510
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
511
+ zones = [(upstream, a), (downstream, e)]
512
+
513
+ foo = parameters['upstream']
514
+ bar = parameters['downstream']
515
+ if feature_strand == '-':
516
+ foo, bar = bar, foo
517
+ if padLeftNaN > 0:
518
+ expected = foo // parameters['bin size']
519
+ padLeftNaN = int(round(float(padLeftNaN) / parameters['bin size']))
520
+ if expected - padLeftNaN - a > 0:
521
+ padLeftNaN += 1
522
+ if padRightNaN > 0:
523
+ expected = bar // parameters['bin size']
524
+ padRightNaN = int(round(float(padRightNaN) / parameters['bin size']))
525
+ if expected - padRightNaN - e > 0:
526
+ padRightNaN += 1
527
+
528
+ coverage = []
529
+ # compute the values for each of the files being processed.
530
+ # "cov" is a numpy array of bins
531
+ for sc_handler in score_file_handles:
532
+ # We're only supporting bigWig files at this point
533
+ cov = heatmapper.coverage_from_big_wig(
534
+ sc_handler, feature_chrom, zones,
535
+ parameters['bin size'],
536
+ parameters['bin avg type'],
537
+ parameters['missing data as zero'],
538
+ not self.quiet)
539
+
540
+ if padLeftNaN > 0:
541
+ cov = np.concatenate([[np.nan] * padLeftNaN, cov])
542
+ if padRightNaN > 0:
543
+ cov = np.concatenate([cov, [np.nan] * padRightNaN])
544
+
545
+ if feature_strand == "-":
546
+ cov = cov[::-1]
547
+
548
+ coverage = np.hstack([coverage, cov])
549
+
550
+ if coverage is None:
551
+ regions_no_score += 1
552
+ if not self.quiet:
553
+ sys.stderr.write(
554
+ "No data was found for region "
555
+ "{0} {1}:{2}-{3}. Skipping...\n".format(
556
+ feature_name, feature_chrom,
557
+ feature_start, feature_end))
558
+
559
+ coverage = np.zeros(matrix_cols)
560
+ if not parameters['missing data as zero']:
561
+ coverage[:] = np.nan
562
+
563
+ try:
564
+ temp = coverage.copy()
565
+ temp[np.isnan(temp)] = 0
566
+ except:
567
+ if not self.quiet:
568
+ sys.stderr.write(
569
+ "No scores defined for region "
570
+ "{0} {1}:{2}-{3}. Skipping...\n".format(feature_name,
571
+ feature_chrom,
572
+ feature_start,
573
+ feature_end))
574
+ coverage = np.zeros(matrix_cols)
575
+ if not parameters['missing data as zero']:
576
+ coverage[:] = np.nan
577
+
578
+ if parameters['min threshold'] is not None and coverage.min() <= parameters['min threshold']:
579
+ continue
580
+ if parameters['max threshold'] is not None and coverage.max() >= parameters['max threshold']:
581
+ continue
582
+ if parameters['scale'] != 1:
583
+ coverage = parameters['scale'] * coverage
584
+
585
+ sub_matrix[j, :] = coverage
586
+
587
+ sub_regions.append(transcript)
588
+ j += 1
589
+
590
+ # remove empty rows
591
+ sub_matrix = sub_matrix[0:j, :]
592
+ if len(sub_regions) != len(sub_matrix[:, 0]):
593
+ sys.stderr.write("regions lengths do not match\n")
594
+ return sub_matrix, sub_regions, regions_no_score
595
+
596
+ @staticmethod
597
+ def coverage_from_array(valuesArray, zones, binSize, avgType):
598
+ try:
599
+ valuesArray[0]
600
+ except (IndexError, TypeError) as detail:
601
+ sys.stderr.write("{0}\nvalues array value: {1}, zones {2}\n".format(detail, valuesArray, zones))
602
+
603
+ cvglist = []
604
+ zoneEnd = 0
605
+ valStart = 0
606
+ valEnd = 0
607
+ for zone, nBins in zones:
608
+ if nBins:
609
+ # linspace is used to more or less evenly partition the data points into the given number of bins
610
+ zoneEnd += nBins
611
+ valStart = valEnd
612
+ valEnd += np.sum([x[1] - x[0] for x in zone])
613
+ counts_list = []
614
+
615
+ # Partition the space into bins
616
+ if nBins == 1:
617
+ pos_array = np.array([valStart])
618
+ else:
619
+ pos_array = np.linspace(valStart, valEnd, nBins, endpoint=False, dtype=int)
620
+ pos_array = np.append(pos_array, valEnd)
621
+
622
+ idx = 0
623
+ while idx < nBins:
624
+ idxStart = int(pos_array[idx])
625
+ idxEnd = max(int(pos_array[idx + 1]), idxStart + 1)
626
+ try:
627
+ counts_list.append(heatmapper.my_average(valuesArray[idxStart:idxEnd], avgType))
628
+ except Exception as detail:
629
+ sys.stderr.write("Exception found: {0}\n".format(detail))
630
+ idx += 1
631
+ cvglist.append(np.array(counts_list))
632
+
633
+ return np.concatenate(cvglist)
634
+
635
+ @staticmethod
636
+ def change_chrom_names(chrom):
637
+ """
638
+ Changes UCSC chromosome names to ensembl chromosome names
639
+ and vice versa.
640
+ """
641
+ if chrom.startswith('chr'):
642
+ # remove the chr part from chromosome name
643
+ chrom = chrom[3:]
644
+ if chrom == "M":
645
+ chrom = "MT"
646
+ else:
647
+ # prefix with 'chr' the chromosome name
648
+ chrom = 'chr' + chrom
649
+ if chrom == "chrMT":
650
+ chrom = "chrM"
651
+
652
+ return chrom
653
+
654
+ @staticmethod
655
+ def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=False, verbose=True):
656
+
657
+ """
658
+ uses pyBigWig
659
+ to query a region define by chrom and zones.
660
+ The output is an array that contains the bigwig
661
+ value per base pair. The summary over bins is
662
+ done in a later step when coverage_from_array is called.
663
+ This method is more reliable than querying the bins
664
+ directly from the bigwig, which should be more efficient.
665
+
666
+ By default, any region, even if no chromosome match is found
667
+ on the bigwig file, produces a result. In other words
668
+ no regions are skipped.
669
+
670
+ zones: array as follows zone0: region before the region start,
671
+ zone1: 5' unscaled region (if present)
672
+ zone2: the body of the region (not always present)
673
+ zone3: 3' unscaled region (if present)
674
+ zone4: the region from the end of the region downstream
675
+
676
+ each zone is a tuple containing start, end, and number of bins
677
+
678
+
679
+ This is useful if several matrices wants to be merged
680
+ or if the sorted BED output of one computeMatrix operation
681
+ needs to be used for other cases
682
+ """
683
+ nVals = 0
684
+ for zone, _ in zones:
685
+ for region in zone:
686
+ nVals += region[1] - region[0]
687
+
688
+ values_array = np.zeros(nVals)
689
+ if not nansAsZeros:
690
+ values_array[:] = np.nan
691
+ if chrom not in list(bigwig.chroms().keys()):
692
+ unmod_name = chrom
693
+ chrom = heatmapper.change_chrom_names(chrom)
694
+ if chrom not in list(bigwig.chroms().keys()):
695
+ if verbose:
696
+ sys.stderr.write("Warning: Your chromosome names do not match.\nPlease check that the "
697
+ "chromosome names in your BED file\ncorrespond to the names in your "
698
+ "bigWig file.\nAn empty line will be added to your heatmap.\nThe problematic "
699
+ "chromosome name is {0}\n\n".format(unmod_name))
700
+
701
+ # return empty nan array
702
+ return heatmapper.coverage_from_array(values_array, zones, binSize, avgType)
703
+
704
+ maxLen = bigwig.chroms(chrom)
705
+ startIdx = 0
706
+ endIdx = 0
707
+ for zone, _ in zones:
708
+ for region in zone:
709
+ startIdx = endIdx
710
+ if region[0] < 0:
711
+ endIdx += abs(region[0])
712
+ values_array[startIdx:endIdx] = np.nan
713
+ startIdx = endIdx
714
+ start = max(0, region[0])
715
+ end = min(maxLen, region[1])
716
+ endIdx += end - start
717
+ if start < end:
718
+ # This won't be the case if we extend off the front of a chromosome, such as (-100, 0)
719
+ values_array[startIdx:endIdx] = bigwig.values(chrom, start, end)
720
+ if end < region[1]:
721
+ startIdx = endIdx
722
+ endIdx += region[1] - end
723
+ values_array[startIdx:endIdx] = np.nan
724
+
725
+ # replaces nans for zeros
726
+ if nansAsZeros:
727
+ values_array[np.isnan(values_array)] = 0
728
+
729
+ return heatmapper.coverage_from_array(values_array, zones,
730
+ binSize, avgType)
731
+
732
+ @staticmethod
733
+ def my_average(valuesArray, avgType='mean'):
734
+ """
735
+ computes the mean, median, etc but only for those values
736
+ that are not Nan
737
+ """
738
+ valuesArray = np.ma.masked_invalid(valuesArray)
739
+ avg = np.ma.__getattribute__(avgType)(valuesArray)
740
+ if isinstance(avg, np.ma.core.MaskedConstant):
741
+ return np.nan
742
+ else:
743
+ return avg
744
+
745
+ def matrix_from_dict(self, matrixDict, regionsDict, parameters):
746
+ self.regionsDict = regionsDict
747
+ self.matrixDict = matrixDict
748
+ self.parameters = parameters
749
+ self.lengthDict = OrderedDict()
750
+ self.matrixAvgsDict = OrderedDict()
751
+
752
+ def read_matrix_file(self, matrix_file):
753
+ # reads a bed file containing the position
754
+ # of genomic intervals
755
+ # In case a hash sign '#' is found in the
756
+ # file, this is considered as a delimiter
757
+ # to split the heatmap into groups
758
+
759
+ import json
760
+ regions = []
761
+ matrix_rows = []
762
+ current_group_index = 0
763
+ max_group_bound = None
764
+
765
+ fh = gzip.open(matrix_file)
766
+ for line in fh:
767
+ line = toString(line).strip()
768
+ # read the header file containing the parameters
769
+ # used
770
+ if line.startswith("@"):
771
+ # the parameters used are saved using
772
+ # json
773
+ self.parameters = json.loads(line[1:].strip())
774
+ max_group_bound = self.parameters['group_boundaries'][1]
775
+ continue
776
+
777
+ # split the line into bed interval and matrix values
778
+ region = line.split('\t')
779
+ chrom, start, end, name, score, strand = region[0:6]
780
+ matrix_row = np.ma.masked_invalid(np.fromiter(region[6:], float))
781
+ matrix_rows.append(matrix_row)
782
+ starts = start.split(",")
783
+ ends = end.split(",")
784
+ regs = [(int(x), int(y)) for x, y in zip(starts, ends)]
785
+ # get the group index
786
+ if len(regions) >= max_group_bound:
787
+ current_group_index += 1
788
+ max_group_bound = self.parameters['group_boundaries'][current_group_index + 1]
789
+ regions.append([chrom, regs, name, max_group_bound, strand, score])
790
+
791
+ matrix = np.vstack(matrix_rows)
792
+ self.matrix = _matrix(regions, matrix, self.parameters['group_boundaries'],
793
+ self.parameters['sample_boundaries'],
794
+ group_labels=self.parameters['group_labels'],
795
+ sample_labels=self.parameters['sample_labels'])
796
+
797
+ if 'sort regions' in self.parameters:
798
+ self.matrix.set_sorting_method(self.parameters['sort regions'],
799
+ self.parameters['sort using'])
800
+
801
+ # Versions of computeMatrix before 3.0 didn't have an entry of these per column, fix that
802
+ nSamples = len(self.matrix.sample_labels)
803
+ h = dict()
804
+ for k, v in self.parameters.items():
805
+ if k in self.special_params and type(v) is not list:
806
+ v = [v] * nSamples
807
+ if len(v) == 0:
808
+ v = [None] * nSamples
809
+ h[k] = v
810
+ self.parameters = h
811
+
812
+ return
813
+
814
+ def save_matrix(self, file_name):
815
+ """
816
+ saves the data required to reconstruct the matrix
817
+ the format is:
818
+ A header containing the parameters used to create the matrix
819
+ encoded as:
820
+ @key:value\tkey2:value2 etc...
821
+ The rest of the file has the same first 5 columns of a
822
+ BED file: chromosome name, start, end, name, score and strand,
823
+ all separated by tabs. After the fifth column the matrix
824
+ values are appended separated by tabs.
825
+ Groups are separated by adding a line starting with a hash (#)
826
+ and followed by the group name.
827
+
828
+ The file is gzipped.
829
+ """
830
+ import json
831
+ self.parameters['sample_labels'] = self.matrix.sample_labels
832
+ self.parameters['group_labels'] = self.matrix.group_labels
833
+ self.parameters['sample_boundaries'] = self.matrix.sample_boundaries
834
+ self.parameters['group_boundaries'] = self.matrix.group_boundaries
835
+
836
+ # Redo the parameters, ensuring things related to ticks and labels are repeated appropriately
837
+ nSamples = len(self.matrix.sample_labels)
838
+ h = dict()
839
+ for k, v in self.parameters.items():
840
+ if type(v) is list and len(v) == 0:
841
+ v = None
842
+ if k in self.special_params and type(v) is not list:
843
+ v = [v] * nSamples
844
+ if len(v) == 0:
845
+ v = [None] * nSamples
846
+ h[k] = v
847
+ fh = gzip.open(file_name, 'wb')
848
+ params_str = json.dumps(h, separators=(',', ':'))
849
+ fh.write(toBytes("@" + params_str + "\n"))
850
+ score_list = np.ma.masked_invalid(np.mean(self.matrix.matrix, axis=1))
851
+ for idx, region in enumerate(self.matrix.regions):
852
+ # join np_array values
853
+ # keeping nans while converting them to strings
854
+ if not np.ma.is_masked(score_list[idx]):
855
+ float(score_list[idx])
856
+ matrix_values = "\t".join(
857
+ np.char.mod('%f', self.matrix.matrix[idx, :]))
858
+ starts = ["{0}".format(x[0]) for x in region[1]]
859
+ ends = ["{0}".format(x[1]) for x in region[1]]
860
+ starts = ",".join(starts)
861
+ ends = ",".join(ends)
862
+ # BEDish format (we don't currently store the score)
863
+ fh.write(
864
+ toBytes('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'.format(
865
+ region[0],
866
+ starts,
867
+ ends,
868
+ region[2],
869
+ region[5],
870
+ region[4],
871
+ matrix_values)))
872
+ fh.close()
873
+
874
+ def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_label='TSS', end_label='TES', averagetype='mean'):
875
+ """
876
+ Saves the values averaged by col using the avg_type
877
+ given
878
+
879
+ Args:
880
+ file_handle: file name to save the file
881
+ reference_point_label: Name of the reference point label
882
+ start_label: Name of the star label
883
+ end_label: Name of the end label
884
+ averagetype: average type (e.g. mean, median, std)
885
+
886
+ """
887
+ # get X labels
888
+ w = self.parameters['bin size']
889
+ b = self.parameters['upstream']
890
+ a = self.parameters['downstream']
891
+ c = self.parameters.get('unscaled 5 prime', 0)
892
+ d = self.parameters.get('unscaled 3 prime', 0)
893
+ m = self.parameters['body']
894
+
895
+ xticks = []
896
+ xtickslabel = []
897
+ for idx in range(self.matrix.get_num_samples()):
898
+ if b[idx] < 1e5:
899
+ quotient = 1000
900
+ symbol = 'Kb'
901
+ else:
902
+ quotient = 1e6
903
+ symbol = 'Mb'
904
+
905
+ if m[idx] == 0:
906
+ last = 0
907
+ if len(xticks):
908
+ last = xticks[-1]
909
+ xticks.extend([last + (k / w[idx]) for k in [w[idx], b[idx], b[idx] + a[idx]]])
910
+ xtickslabel.extend(['{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol), reference_point_label,
911
+ '{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)])
912
+
913
+ else:
914
+ xticks_values = [w[idx]]
915
+
916
+ # only if upstream region is set, add a x tick
917
+ if b[idx] > 0:
918
+ xticks_values.append(b[idx])
919
+ xtickslabel.append('{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol))
920
+
921
+ xtickslabel.append(start_label)
922
+
923
+ if c[idx] > 0:
924
+ xticks_values.append(b[idx] + c[idx])
925
+ xtickslabel.append("")
926
+
927
+ if d[idx] > 0:
928
+ xticks_values.append(b[idx] + c[idx] + m[idx])
929
+ xtickslabel.append("")
930
+
931
+ xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx])
932
+ xtickslabel.append(end_label)
933
+
934
+ if a[idx] > 0:
935
+ xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx] + a[idx])
936
+ xtickslabel.append('{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol))
937
+
938
+ last = 0
939
+ if len(xticks):
940
+ last = xticks[-1]
941
+ xticks.extend([last + (k / w[idx]) for k in xticks_values])
942
+ x_axis = np.arange(xticks[-1]) + 1
943
+ labs = []
944
+ for x_value in x_axis:
945
+ if x_value in xticks and xtickslabel[xticks.index(x_value)]:
946
+ labs.append(xtickslabel[xticks.index(x_value)])
947
+ elif x_value in xticks:
948
+ labs.append("tick")
949
+ else:
950
+ labs.append("")
951
+
952
+ with open(file_handle, 'w') as fh:
953
+ # write labels
954
+ fh.write("bin labels\t\t{}\n".format("\t".join(labs)))
955
+ fh.write('bins\t\t{}\n'.format("\t".join([str(x) for x in x_axis])))
956
+
957
+ for sample_idx in range(self.matrix.get_num_samples()):
958
+ for group_idx in range(self.matrix.get_num_groups()):
959
+ sub_matrix = self.matrix.get_matrix(group_idx, sample_idx)
960
+ values = [str(x) for x in np.ma.__getattribute__(averagetype)(sub_matrix['matrix'], axis=0)]
961
+ fh.write("{}\t{}\t{}\n".format(sub_matrix['sample'], sub_matrix['group'], "\t".join(values)))
962
+
963
+ def save_matrix_values(self, file_name):
964
+ # print a header telling the group names and their length
965
+ fh = open(file_name, 'wb')
966
+ info = []
967
+ groups_len = np.diff(self.matrix.group_boundaries)
968
+ for i in range(len(self.matrix.group_labels)):
969
+ info.append("{}:{}".format(self.matrix.group_labels[i],
970
+ groups_len[i]))
971
+ fh.write(toBytes("#{}\n".format("\t".join(info))))
972
+ # add to header the x axis values
973
+ fh.write(toBytes("#downstream:{}\tupstream:{}\tbody:{}\tbin size:{}\tunscaled 5 prime:{}\tunscaled 3 prime:{}\n".format(
974
+ self.parameters['downstream'],
975
+ self.parameters['upstream'],
976
+ self.parameters['body'],
977
+ self.parameters['bin size'],
978
+ self.parameters.get('unscaled 5 prime', 0),
979
+ self.parameters.get('unscaled 3 prime', 0))))
980
+ sample_len = np.diff(self.matrix.sample_boundaries)
981
+ for i in range(len(self.matrix.sample_labels)):
982
+ info.extend([self.matrix.sample_labels[i]] * sample_len[i])
983
+ fh.write(toBytes("{}\n".format("\t".join(info))))
984
+
985
+ fh.close()
986
+ # reopen again using append mode
987
+ fh = open(file_name, 'ab')
988
+ np.savetxt(fh, self.matrix.matrix, fmt="%.4g", delimiter="\t")
989
+ fh.close()
990
+
991
+ def save_BED(self, file_handle):
992
+ boundaries = np.array(self.matrix.group_boundaries)
993
+ # Add a header
994
+ file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group")
995
+ if self.matrix.silhouette is not None:
996
+ file_handle.write("\tsilhouette")
997
+ file_handle.write("\n")
998
+ for idx, region in enumerate(self.matrix.regions):
999
+ # the label id corresponds to the last boundary
1000
+ # that is smaller than the region index.
1001
+ # for example for a boundary array = [0, 10, 20]
1002
+ # and labels ['a', 'b', 'c'],
1003
+ # for index 5, the label is 'a', for
1004
+ # index 10, the label is 'b' etc
1005
+ label_idx = np.flatnonzero(boundaries <= idx)[-1]
1006
+ starts = ["{0}".format(x[0]) for x in region[1]]
1007
+ ends = ["{0}".format(x[1]) for x in region[1]]
1008
+ starts = ",".join(starts)
1009
+ ends = ",".join(ends)
1010
+ file_handle.write(
1011
+ '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t0'.format(
1012
+ region[0],
1013
+ region[1][0][0],
1014
+ region[1][-1][1],
1015
+ region[2],
1016
+ region[5],
1017
+ region[4]))
1018
+ file_handle.write(
1019
+ '\t{0}\t{1}\t{2}\t{3}'.format(
1020
+ len(region[1]),
1021
+ ",".join([str(int(y) - int(x)) for x, y in region[1]]),
1022
+ ",".join([str(int(x) - int(starts[0])) for x, y in region[1]]),
1023
+ self.matrix.group_labels[label_idx]))
1024
+ if self.matrix.silhouette is not None:
1025
+ file_handle.write("\t{}".format(self.matrix.silhouette[idx]))
1026
+ file_handle.write("\n")
1027
+ file_handle.close()
1028
+
1029
+ @staticmethod
1030
+ def matrix_avg(matrix, avgType='mean'):
1031
+ matrix = np.ma.masked_invalid(matrix)
1032
+ return np.ma.__getattribute__(avgType)(matrix, axis=0)
1033
+
1034
+ def get_individual_matrices(self, matrix):
1035
+ """In case multiple matrices are saved one after the other
1036
+ this method splits them appart.
1037
+ Returns a list containing the matrices
1038
+ """
1039
+ num_cols = matrix.shape[1]
1040
+ num_ind_cols = self.get_num_individual_matrix_cols()
1041
+ matrices_list = []
1042
+ for i in range(0, num_cols, num_ind_cols):
1043
+ if i + num_ind_cols > num_cols:
1044
+ break
1045
+ matrices_list.append(matrix[:, i:i + num_ind_cols])
1046
+ return matrices_list
1047
+
1048
+ def get_num_individual_matrix_cols(self):
1049
+ """
1050
+ returns the number of columns that
1051
+ each matrix should have. This is done because
1052
+ the final matrix that is plotted can be composed
1053
+ of smaller matrices that are merged one after
1054
+ the other.
1055
+ """
1056
+ matrixCols = ((self.parameters['downstream'] + self.parameters['upstream'] + self.parameters['body'] + self.parameters['unscaled 5 prime'] + self.parameters['unscaled 3 prime']) //
1057
+ self.parameters['bin size'])
1058
+
1059
+ return matrixCols
1060
+
1061
+
1062
+ def computeSilhouetteScore(d, idx, labels):
1063
+ """
1064
+ Given a square distance matrix with NaN diagonals, compute the silhouette score
1065
+ of a given row (idx). Each row should have an associated label (labels).
1066
+ """
1067
+ keep = ~np.isnan(d[idx, ])
1068
+ foo = np.bincount(labels[keep], weights=d[idx, ][keep])
1069
+ groupSizes = np.bincount(labels[keep])
1070
+ intraIdx = labels[idx]
1071
+ if groupSizes[intraIdx] == 1:
1072
+ return 0
1073
+ intra = foo[labels[idx]] / groupSizes[intraIdx]
1074
+ interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]]
1075
+ inter = np.min(foo[interMask] / groupSizes[interMask])
1076
+ return (inter - intra) / max(inter, intra)
1077
+
1078
+
1079
+ class _matrix(object):
1080
+ """
1081
+ class to hold heatmapper matrices
1082
+ The base data is a large matrix
1083
+ with definition to know the boundaries for row and col divisions.
1084
+ Col divisions represent groups within a subset, e.g. Active and
1085
+ inactive from PolII bigwig data.
1086
+
1087
+ Row division represent different samples, for example
1088
+ PolII in males vs. PolII in females.
1089
+
1090
+ This is an internal class of the heatmapper class
1091
+ """
1092
+
1093
+ def __init__(self, regions, matrix, group_boundaries, sample_boundaries,
1094
+ group_labels=None, sample_labels=None):
1095
+
1096
+ # simple checks
1097
+ assert matrix.shape[0] == group_boundaries[-1], \
1098
+ "row max do not match matrix shape"
1099
+ assert matrix.shape[1] == sample_boundaries[-1], \
1100
+ "col max do not match matrix shape"
1101
+
1102
+ self.regions = regions
1103
+ self.matrix = matrix
1104
+ self.group_boundaries = group_boundaries
1105
+ self.sample_boundaries = sample_boundaries
1106
+ self.sort_method = None
1107
+ self.sort_using = None
1108
+ self.silhouette = None
1109
+
1110
+ if group_labels is None:
1111
+ self.group_labels = ['group {}'.format(x)
1112
+ for x in range(len(group_boundaries) - 1)]
1113
+ else:
1114
+ assert len(group_labels) == len(group_boundaries) - 1, \
1115
+ "number of group labels does not match number of groups"
1116
+ self.group_labels = group_labels
1117
+
1118
+ if sample_labels is None:
1119
+ self.sample_labels = ['sample {}'.format(x)
1120
+ for x in range(len(sample_boundaries) - 1)]
1121
+ else:
1122
+ assert len(sample_labels) == len(sample_boundaries) - 1, \
1123
+ "number of sample labels does not match number of samples"
1124
+ self.sample_labels = sample_labels
1125
+
1126
+ def get_matrix(self, group, sample):
1127
+ """
1128
+ Returns a sub matrix from the large
1129
+ matrix. Group and sample are ids,
1130
+ thus, row = 0, col=0 get the first group
1131
+ of the first sample.
1132
+
1133
+ Returns
1134
+ -------
1135
+ dictionary containing the matrix,
1136
+ the group label and the sample label
1137
+ """
1138
+ group_start = self.group_boundaries[group]
1139
+ group_end = self.group_boundaries[group + 1]
1140
+ sample_start = self.sample_boundaries[sample]
1141
+ sample_end = self.sample_boundaries[sample + 1]
1142
+
1143
+ return {'matrix': np.ma.masked_invalid(self.matrix[group_start:group_end, :][:, sample_start:sample_end]),
1144
+ 'group': self.group_labels[group],
1145
+ 'sample': self.sample_labels[sample]}
1146
+
1147
+ def get_num_samples(self):
1148
+ return len(self.sample_labels)
1149
+
1150
+ def get_num_groups(self):
1151
+ return len(self.group_labels)
1152
+
1153
+ def set_group_labels(self, new_labels):
1154
+ """ sets new labels for groups
1155
+ """
1156
+ if len(new_labels) != len(self.group_labels):
1157
+ raise ValueError("length new labels != length original labels")
1158
+ self.group_labels = new_labels
1159
+
1160
+ def set_sample_labels(self, new_labels):
1161
+ """ sets new labels for groups
1162
+ """
1163
+ if len(new_labels) != len(self.sample_labels):
1164
+ raise ValueError("length new labels != length original labels")
1165
+ self.sample_labels = new_labels
1166
+
1167
+ def set_sorting_method(self, sort_method, sort_using):
1168
+ self.sort_method = sort_method
1169
+ self.sort_using = sort_using
1170
+
1171
+ def get_regions(self):
1172
+ """Returns the regions per group
1173
+
1174
+ Returns
1175
+ ------
1176
+ list
1177
+
1178
+ Each element of the list is itself a list
1179
+ of dictionaries containing the regions info:
1180
+ chrom, start, end, strand, name etc.
1181
+
1182
+ Each element of the list corresponds to each
1183
+ of the groups
1184
+ """
1185
+ regions = []
1186
+ for idx in range(len(self.group_labels)):
1187
+ start = self.group_boundaries[idx]
1188
+ end = self.group_boundaries[idx + 1]
1189
+ regions.append(self.regions[start:end])
1190
+
1191
+ return regions
1192
+
1193
+ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None):
1194
+ """
1195
+ Sorts and rearranges the submatrices according to the
1196
+ sorting method given.
1197
+ """
1198
+ if sort_method == 'no':
1199
+ return
1200
+
1201
+ if (sample_list is not None) and (len(sample_list) > 0):
1202
+ # get the ids that correspond to the selected sample list
1203
+ idx_to_keep = []
1204
+ for sample_idx in sample_list:
1205
+ idx_to_keep += range(self.sample_boundaries[sample_idx], self.sample_boundaries[sample_idx + 1])
1206
+
1207
+ matrix = self.matrix[:, idx_to_keep]
1208
+
1209
+ else:
1210
+ matrix = self.matrix
1211
+
1212
+ # compute the row average:
1213
+ if sort_using == 'region_length':
1214
+ matrix_avgs = list()
1215
+ for x in self.regions:
1216
+ matrix_avgs.append(np.sum([bar[1] - bar[0] for bar in x[1]]))
1217
+ matrix_avgs = np.array(matrix_avgs)
1218
+ elif sort_using == 'mean':
1219
+ matrix_avgs = np.nanmean(matrix, axis=1)
1220
+ elif sort_using == 'mean':
1221
+ matrix_avgs = np.nanmean(matrix, axis=1)
1222
+ elif sort_using == 'median':
1223
+ matrix_avgs = np.nanmedian(matrix, axis=1)
1224
+ elif sort_using == 'max':
1225
+ matrix_avgs = np.nanmax(matrix, axis=1)
1226
+ elif sort_using == 'min':
1227
+ matrix_avgs = np.nanmin(matrix, axis=1)
1228
+ elif sort_using == 'sum':
1229
+ matrix_avgs = np.nansum(matrix, axis=1)
1230
+ else:
1231
+ sys.exit("{} is an unsupported sorting method".format(sort_using))
1232
+
1233
+ # order per group
1234
+ _sorted_regions = []
1235
+ _sorted_matrix = []
1236
+ for idx in range(len(self.group_labels)):
1237
+ start = self.group_boundaries[idx]
1238
+ end = self.group_boundaries[idx + 1]
1239
+ order = matrix_avgs[start:end].argsort()
1240
+ if sort_method == 'descend':
1241
+ order = order[::-1]
1242
+ _sorted_matrix.append(self.matrix[start:end, :][order, :])
1243
+ # sort the regions
1244
+ _reg = self.regions[start:end]
1245
+ for idx in order:
1246
+ _sorted_regions.append(_reg[idx])
1247
+
1248
+ self.matrix = np.vstack(_sorted_matrix)
1249
+ self.regions = _sorted_regions
1250
+ self.set_sorting_method(sort_method, sort_using)
1251
+
1252
+ def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None):
1253
+ matrix = np.asarray(self.matrix)
1254
+ matrix_to_cluster = matrix
1255
+ if clustering_samples is not None:
1256
+ assert all(i > 0 for i in clustering_samples), \
1257
+ "all indices should be bigger than or equal to 1."
1258
+ assert all(i <= len(self.sample_labels) for i in
1259
+ clustering_samples), \
1260
+ "each index should be smaller than or equal to {}(total "\
1261
+ "number of samples.)".format(len(self.sample_labels))
1262
+
1263
+ clustering_samples = np.asarray(clustering_samples) - 1
1264
+
1265
+ samples_cols = []
1266
+ for idx in clustering_samples:
1267
+ samples_cols += range(self.sample_boundaries[idx],
1268
+ self.sample_boundaries[idx + 1])
1269
+
1270
+ matrix_to_cluster = matrix_to_cluster[:, samples_cols]
1271
+ if np.any(np.isnan(matrix_to_cluster)):
1272
+ # replace nans for 0 otherwise kmeans produces a weird behaviour
1273
+ sys.stderr.write("*Warning* For clustering nan values have to be replaced by zeros \n")
1274
+ matrix_to_cluster[np.isnan(matrix_to_cluster)] = 0
1275
+
1276
+ if method == 'kmeans':
1277
+ from scipy.cluster.vq import vq, kmeans
1278
+
1279
+ centroids, _ = kmeans(matrix_to_cluster, k)
1280
+ # order the centroids in an attempt to
1281
+ # get the same cluster order
1282
+ cluster_labels, _ = vq(matrix_to_cluster, centroids)
1283
+
1284
+ if method == 'hierarchical':
1285
+ # normally too slow for large data sets
1286
+ from scipy.cluster.hierarchy import fcluster, linkage
1287
+ Z = linkage(matrix_to_cluster, method='ward', metric='euclidean')
1288
+ cluster_labels = fcluster(Z, k, criterion='maxclust')
1289
+ # hierarchical clustering labels from 1 .. k
1290
+ # while k-means labels 0 .. k -1
1291
+ # Thus, for consistency, we subtract 1
1292
+ cluster_labels -= 1
1293
+
1294
+ # sort clusters
1295
+ _clustered_mean = []
1296
+ _cluster_ids_list = []
1297
+ for cluster in range(k):
1298
+ cluster_ids = np.flatnonzero(cluster_labels == cluster)
1299
+ _cluster_ids_list.append(cluster_ids)
1300
+ _clustered_mean.append(matrix_to_cluster[cluster_ids, :].mean())
1301
+
1302
+ # reorder clusters based on mean
1303
+ cluster_order = np.argsort(_clustered_mean)[::-1]
1304
+ # create groups using the clustering
1305
+ self.group_labels = []
1306
+ self.group_boundaries = [0]
1307
+ _clustered_regions = []
1308
+ _clustered_matrix = []
1309
+ cluster_number = 1
1310
+ for cluster in cluster_order:
1311
+ self.group_labels.append("cluster_{}".format(cluster_number))
1312
+ cluster_number += 1
1313
+ cluster_ids = _cluster_ids_list[cluster]
1314
+ self.group_boundaries.append(self.group_boundaries[-1] +
1315
+ len(cluster_ids))
1316
+ _clustered_matrix.append(self.matrix[cluster_ids, :])
1317
+ for idx in cluster_ids:
1318
+ _clustered_regions.append(self.regions[idx])
1319
+
1320
+ self.regions = _clustered_regions
1321
+ self.matrix = np.vstack(_clustered_matrix)
1322
+
1323
+ return idx
1324
+
1325
+ def computeSilhouette(self, k):
1326
+ if k > 1:
1327
+ from scipy.spatial.distance import pdist, squareform
1328
+
1329
+ silhouette = np.repeat(0.0, self.group_boundaries[-1])
1330
+ groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1])
1331
+ labels = np.repeat(np.arange(k), groupSizes)
1332
+
1333
+ d = pdist(self.matrix)
1334
+ d2 = squareform(d)
1335
+ np.fill_diagonal(d2, np.nan) # This excludes the diagonal
1336
+ for idx in range(len(labels)):
1337
+ silhouette[idx] = computeSilhouetteScore(d2, idx, labels)
1338
+ sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette)))
1339
+ self.silhouette = silhouette
1340
+
1341
+ def removeempty(self):
1342
+ """
1343
+ removes matrix rows containing only zeros or nans
1344
+ """
1345
+ to_keep = []
1346
+ score_list = np.ma.masked_invalid(np.mean(self.matrix, axis=1))
1347
+ for idx, region in enumerate(self.regions):
1348
+ if np.ma.is_masked(score_list[idx]) or float(score_list[idx]) == 0:
1349
+ continue
1350
+ else:
1351
+ to_keep.append(idx)
1352
+ self.regions = [self.regions[x] for x in to_keep]
1353
+ self.matrix = self.matrix[to_keep, :]
1354
+ # adjust sample boundaries
1355
+ to_keep = np.array(to_keep)
1356
+ self.group_boundaries = [len(to_keep[to_keep < x]) for x in self.group_boundaries]
1357
+
1358
+ def flatten(self):
1359
+ """
1360
+ flatten and remove nans from matrix. Useful
1361
+ to get max and mins from matrix.
1362
+
1363
+ :return flattened matrix
1364
+ """
1365
+ matrix_flatten = np.asarray(self.matrix.flatten())
1366
+ # nans are removed from the flattened array
1367
+ matrix_flatten = matrix_flatten[~np.isnan(matrix_flatten)]
1368
+ if len(matrix_flatten) == 0:
1369
+ num_nan = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
1370
+ raise ValueError("matrix only contains nans "
1371
+ "(total nans: {})".format(num_nan))
1372
+ return matrix_flatten
deepTools/source/deeptools/heatmapper_utilities.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib
3
+ matplotlib.use('Agg')
4
+ matplotlib.rcParams['pdf.fonttype'] = 42
5
+ matplotlib.rcParams['svg.fonttype'] = 'none'
6
+ from deeptools import cm # noqa: F401
7
+ import matplotlib.colors as pltcolors
8
+ import plotly.graph_objs as go
9
+
10
+ old_settings = np.seterr(all='ignore')
11
+
12
+
13
def plot_single(ax, ma, average_type, color, label, plot_type='lines'):
    """
    Draws the column-wise summary of `ma` as a line on `ax`, optionally
    decorated with a fill or an error band.

    Parameters
    ----------
    ax : matplotlib axis
        axis to draw on
    ma : numpy array
        matrix that is summarized column-wise according to `average_type`.
    average_type : str
        string values are sum mean median min max std
    color : str
        a valid color: either a html color name, hex
        (e.g #002233), RGB + alpha tuple or list or RGB tuple or list
    label : str
        legend label
    plot_type: str
        'se' adds a standard-error band, 'std' a standard-deviation band,
        'fill' shades the area between the x axis and the line; any other
        value plots only the average line.

    Returns
    -------
    ax
        matplotlib axis

    Examples
    --------

    >>> import matplotlib.pyplot as plt
    >>> import os
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> matrix = np.array([[1,2,3],
    ...                    [4,5,6],
    ...                    [7,8,9]])
    >>> ax = plot_single(ax, matrix -2, 'mean', color=[0.6, 0.8, 0.9], label='fill light blue', plot_type='fill')
    >>> ax = plot_single(ax, matrix, 'mean', color='blue', label='red')
    >>> ax = plot_single(ax, matrix + 5, 'mean', color='red', label='red', plot_type='std')
    >>> ax = plot_single(ax, matrix + 10, 'mean', color='#cccccc', label='gray se', plot_type='se')
    >>> ax = plot_single(ax, matrix + 20, 'mean', color=(0.9, 0.5, 0.9), label='violet', plot_type='std')
    >>> ax = plot_single(ax, matrix + 30, 'mean', color=(0.9, 0.5, 0.9, 0.5), label='violet with alpha', plot_type='std')
    >>> leg = ax.legend()
    >>> plt.savefig("/tmp/test.pdf")
    >>> plt.close()
    >>> fig = plt.figure()
    >>> os.remove("/tmp/test.pdf")


    """
    summary = getattr(np.ma, average_type)(ma, axis=0)
    x = np.arange(len(summary))
    # numpy RGB(A) arrays are not directly usable as colors everywhere;
    # convert them to a hex string first.
    if isinstance(color, np.ndarray):
        color = pltcolors.to_hex(color, keep_alpha=True)
    ax.plot(x, summary, color=color, label=label, alpha=0.9)

    if plot_type == 'fill':
        ax.fill_between(x, summary, facecolor=color, alpha=0.6, edgecolor='none')
    elif plot_type in ('se', 'std'):
        spread = np.std(ma, axis=0)
        if plot_type == 'se':  # standard error of the mean
            spread /= np.sqrt(ma.shape[0])

        # The band between the summary and summary +/- spread is drawn in a
        # translucent version of the line color.
        band_color = pltcolors.colorConverter.to_rgba(color, 0.2)
        ax.fill_between(x, summary, summary + spread, facecolor=band_color, edgecolor='none')
        ax.fill_between(x, summary, summary - spread, facecolor=band_color, edgecolor='none')

    ax.set_xlim(0, max(x))

    return ax
94
def plotly_single(ma, average_type, color, label, plot_type='line'):
    """A plotly version of plot_single. Returns a list of traces.

    `ma` is summarized column-wise with the np.ma function named by
    `average_type`; `color` is either a matplotlib color name/hex string or
    an RGB sequence; `plot_type` of 'fill', 'se' or 'std' adds the matching
    decoration, anything else yields only the summary line.
    """
    summary = list(np.ma.__getattribute__(average_type)(ma, axis=0))
    x = list(np.arange(len(summary)))
    # Named/hex colors are converted to an [r, g, b] list so the components
    # can be spliced into the rgba() strings below. Non-str colors are
    # assumed to already be indexable RGB sequences.
    if isinstance(color, str):
        color = list(matplotlib.colors.to_rgb(color))
    traces = [go.Scatter(x=x, y=summary, name=label, line={'color': "rgba({},{},{},0.9)".format(color[0], color[1], color[2]), }, showlegend=False)]
    if plot_type == 'fill':
        # NOTE(review): fillcolor is given the raw RGB list here, unlike the
        # rgba() strings used elsewhere — confirm plotly accepts this form.
        traces[0].update(fill='tozeroy', fillcolor=color)

    if plot_type in ['se', 'std']:
        if plot_type == 'se':  # standard error
            std = np.std(ma, axis=0) / np.sqrt(ma.shape[0])
        else:
            std = np.std(ma, axis=0)

        # Build a closed polygon (x forward then reversed) whose top edge is
        # summary + std and bottom edge is summary - std.
        x_rev = x[::-1]
        # `summary` is a list but numpy broadcasting coerces it, so
        # list - ndarray yields an ndarray here.
        lower = summary - std
        trace = go.Scatter(x=x + x_rev,
                           y=np.concatenate([summary + std, lower[::-1]]),
                           fill='tozerox',
                           fillcolor="rgba({},{},{},0.2)".format(color[0], color[1], color[2]),
                           # NOTE(review): go.Line is deprecated in newer
                           # plotly releases — confirm supported version.
                           line=go.Line(color='transparent'),
                           showlegend=False,
                           name=label)
        traces.append(trace)

    return traces
124
def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx):
    """
    returns the position and labelling of the xticks that
    correspond to the heatmap

    As of deepTools 3, the various parameters can be lists, in which case we then need to index things (the idx parameter)

    As of matplotlib 3 the ticks in the heatmap need to have 0.5 added to them.

    As of matplotlib 3.1 there is no longer padding added to all ticks. Reference point ticks will be adjusted by width/2
    or width for spacing and the last half of scaled ticks will be shifted by 1 bin so the ticks are at the beginning of bins.

    Parameters
    ----------
    hm : object
        must expose a ``parameters`` mapping with 'bin size', 'upstream',
        'downstream', 'body' and, optionally, 'unscaled 5 prime' /
        'unscaled 3 prime' entries (scalars, or per-sample lists).
    referencePointLabel : str
        tick label for the reference point (used when 'body' is 0)
    startLabel, endLabel : str
        tick labels for the region start/end (used when 'body' > 0)
    idx : int or None
        sample index when the parameters are per-sample lists; None when
        they are scalars.

    Returns
    -------
    (xticks, xtickslabel)
        tick positions expressed in bins and the matching labels
    """
    w = hm.parameters['bin size']
    b = hm.parameters['upstream']
    a = hm.parameters['downstream']
    if idx is not None:
        w = w[idx]
        b = b[idx]
        a = a[idx]

    # The unscaled flank sizes are optional (absent in older matrices), so
    # fall back to 0 when missing. Only catch the errors a missing key or a
    # non-indexable/short value can raise — the previous bare `except:` also
    # swallowed KeyboardInterrupt/SystemExit and genuine bugs.
    try:
        c = hm.parameters['unscaled 5 prime']
        if idx is not None:
            c = c[idx]
    except (KeyError, IndexError, TypeError):
        c = 0
    try:
        d = hm.parameters['unscaled 3 prime']
        if idx is not None:
            d = d[idx]
    except (KeyError, IndexError, TypeError):
        d = 0
    m = hm.parameters['body']
    if idx is not None:
        m = m[idx]

    # Choose the unit for the flank labels from the upstream extent.
    if b < 1e5:
        quotient = 1000
        symbol = 'Kb'
    else:
        quotient = 1e6
        symbol = 'Mb'

    if m == 0:
        # Reference-point mode: upstream edge, reference point, downstream edge.
        xticks = [(k / w) for k in [0, b - 0.5 * w, b + a - w]]
        xtickslabel = ['{0:.1f}'.format(-(float(b) / quotient)),
                       referencePointLabel,
                       '{0:.1f}{1}'.format(float(a) / quotient, symbol)]
    else:
        xticks_values = [0]
        xtickslabel = []

        # only if upstream region is set, add a x tick
        if b > 0:
            xticks_values.append(b)
            xtickslabel.append('{0:.1f}'.format(-(float(b) / quotient)))

        xtickslabel.append(startLabel)

        # unscaled 5'/3' regions get unlabeled ticks at their boundaries,
        # regardless of whether upstream is 0 (not set)
        if c > 0:
            xticks_values.append(b + c)
            xtickslabel.append("")

        if d > 0:
            xticks_values.append(b + c + m)
            xtickslabel.append("")

        # We need to subtract the bin size from the last 2 points so they're placed at the beginning of the bin
        xticks_values.append(b + c + m + d - w)
        xtickslabel.append(endLabel)

        if a > 0:
            xticks_values.append(b + c + m + d + a - w)
            xtickslabel.append('{0:.1f}{1}'.format(float(a) / quotient, symbol))

        xticks = [(k / w) for k in xticks_values]
        xticks = [max(x, 0) for x in xticks]

    return xticks, xtickslabel
deepTools/source/deeptools/mapReduce.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import multiprocessing
2
+ from deeptoolsintervals import GTF
3
+ import random
4
+
5
+ debug = 0
6
+
7
+
8
def mapReduce(staticArgs, func, chromSize,
              genomeChunkLength=None,
              region=None,
              bedFile=None,
              blackListFileName=None,
              numberOfProcessors=4,
              verbose=False,
              includeLabels=False,
              keepExons=False,
              transcriptID="transcriptID",
              exonID="exonID",
              transcript_id_designator="transcript_id",
              self_=None):
    """
    Split the genome into parts that are sent to workers using a defined
    number of processors. Results are collected and returned.

    For each genomic region the given 'func' is called using
    the following parameters:

     chrom, start, end, staticArgs

    The *arg* are static, *pickable* variables that need to be sent
    to workers.

    The genome chunk length corresponds to a fraction of the genome, in bp,
    that is sent to each of the workers for processing.

    Depending on the type of process a larger or shorter region may be
    preferred

    :param chromSize: A list of tuples containing the chromosome
                      name and its length
    :param region: The format is chr:start:end:tileSize (see function
                   getUserRegion)
    :param staticArgs: tuple of arguments that are sent to the given 'func'

    :param func: function to call. The function is called using the
                 following parameters (chrom, start, end, staticArgs)
    :param bedFile: If a bed file is given, the args to the func to be
                    called are extended to include a list of bed
                    defined regions.
    :param blackListFileName: A list of regions to exclude from all computations.
                              Note that this has genomeChunkLength resolution...
    :param self_: In case mapreduce should make a call to an object
                  the self variable has to be passed.
    :param includeLabels: Pass group and transcript labels into the calling
                          function. These are added to the static args
                          (groupLabel and transcriptName).

    If "includeLabels" is true, a tuple of (results, labels) is returned
    """

    if not genomeChunkLength:
        genomeChunkLength = 1e5
    genomeChunkLength = int(genomeChunkLength)

    if verbose:
        print("genome partition size for multiprocessing: {0}".format(
            genomeChunkLength))

    region_start = 0
    region_end = None

    # if a region is set, that means that the task should only cover
    # the given genomic position

    if region:
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, region)
        if verbose:
            print("chrom size: {0}, region start: {1}, region end: {2}, "
                  "genome chunk length sent to each procesor: {3}".format(chromSize, region_start, region_end, genomeChunkLength))

    if bedFile:
        # A single BED file gets the default group name "genes".
        defaultGroup = None
        if len(bedFile) == 1:
            defaultGroup = "genes"
        bed_interval_tree = GTF(bedFile, defaultGroup=defaultGroup, transcriptID=transcriptID, exonID=exonID, transcript_id_designator=transcript_id_designator, keepExons=keepExons)

    if blackListFileName:
        blackList = GTF(blackListFileName)

    TASKS = []
    # iterate over all chromosomes
    for chrom, size in chromSize:
        # the start is zero unless a specific region is defined
        start = 0 if region_start == 0 else region_start
        for startPos in range(start, size, genomeChunkLength):
            endPos = min(size, startPos + genomeChunkLength)

            # Subtract blacklisted intervals from the chunk; otherwise the
            # chunk is used whole.
            if blackListFileName:
                regions = blSubtract(blackList, chrom, [startPos, endPos])
            else:
                regions = [[startPos, endPos]]

            for reg in regions:
                if self_ is not None:
                    argsList = [self_]
                else:
                    argsList = []

                argsList.extend([chrom, reg[0], reg[1]])
                # add to the argument list the static args passed to the function
                argsList.extend(staticArgs)

                # if a bed file is given, append to the TASK list,
                # a list of bed regions that overlap with the
                # current genomeChunk.
                if bedFile:
                    # This effectively creates batches of intervals, which is
                    # generally more performant due to the added overhead of
                    # initializing additional workers.

                    # TODO, there's no point in including the chromosome
                    if includeLabels:
                        bed_regions_list = [[chrom, x[4], x[2], x[3], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, numericGroups=True, includeStrand=True)]
                    else:
                        bed_regions_list = [[chrom, x[4], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, includeStrand=True)]

                    if len(bed_regions_list) == 0:
                        continue
                    # add to argument list, the position of the bed regions to use
                    argsList.append(bed_regions_list)

                TASKS.append(tuple(argsList))

    if len(TASKS) > 1 and numberOfProcessors > 1:
        if verbose:
            print(("using {} processors for {} "
                   "number of tasks".format(numberOfProcessors,
                                            len(TASKS))))
        random.shuffle(TASKS)
        pool = multiprocessing.Pool(numberOfProcessors)
        # NOTE(review): the huge get() timeout presumably keeps the wait
        # interruptible by Ctrl+C — confirm before changing.
        res = pool.map_async(func, TASKS).get(9999999)
        pool.close()
        pool.join()
    else:
        # With a single task or a single processor, run in-process.
        res = list(map(func, TASKS))

    if includeLabels:
        if bedFile:
            return res, bed_interval_tree.labels
        else:
            return res, None
    return res
156
def getUserRegion(chrom_sizes, region_string, max_chunk_size=1e6):
    r"""
    Verifies if a given region argument, given by the user
    is valid. The format of the region_string is chrom:start:end:tileSize
    where start, end and tileSize are optional.

    :param chrom_sizes: dictionary of chromosome/scaffold size. Key=chromosome name
    :param region_string: a string of the form chr:start:end
    :param max_chunk_size: upper limit for the chunk size
    :return: tuple chrom_size for the region start, region end, chunk size

    #>>> data = getUserRegion({'chr2': 1000}, "chr1:10:10")
    #Traceback (most recent call last):
    # ...
    #NameError: Unknown chromosome: chr1
    #Known chromosomes are: ['chr2']

    If the region end is biger than the chromosome size, this
    value is used instead
    >>> getUserRegion({'chr2': 1000}, "chr2:10:1001")
    ([('chr2', 1000)], 10, 1000, 990)

    Test chunk and regions size reduction to match tile size
    >>> getUserRegion({'chr2': 200000}, "chr2:10:123344:3")
    ([('chr2', 123344)], 9, 123345, 123336)

    Test chromosome name mismatch
    >>> getUserRegion({'2': 200000}, "chr2:10:123344:3")
    ([('2', 123344)], 9, 123345, 123336)
    >>> getUserRegion({'chrM': 200000}, "MT:10:123344:3")
    ([('chrM', 123344)], 9, 123345, 123336)
    """
    fields = region_string.split(":")
    chrom = fields[0]
    chrom_sizes = dict(chrom_sizes)

    if chrom not in chrom_sizes:
        # Try the common chr-prefix / mitochondrial naming aliases before
        # rejecting the chromosome outright.
        if chrom == "MT":
            alias = "chrM"
        elif chrom == "chrM":
            alias = "MT"
        elif chrom.startswith("chr"):
            alias = chrom[3:]
        else:
            alias = "chr" + chrom
        if alias not in chrom_sizes:
            raise NameError("Unknown chromosome: %s\nKnown "
                            "chromosomes are: %s " % (chrom, list(chrom_sizes.keys())))
        chrom = alias

    # Optional start / end fields; end is clamped to the chromosome length.
    region_start = int(fields[1]) if len(fields) > 1 else 0
    if len(fields) > 2:
        region_end = min(int(fields[2]), chrom_sizes[chrom])
    else:
        region_end = chrom_sizes[chrom]
    if region_start > region_end or region_start < 0:
        raise NameError("{} not valid. The format is chrom:start:end. "
                        "Without comas, dashes or dots. ".format(region_string))

    tilesize = int(fields[3]) if len(fields) > 3 else None

    chrom_sizes = [(chrom, region_end)]

    # With a tile size, snap the window outward so both edges land on tile
    # boundaries.
    if tilesize:
        region_start -= region_start % tilesize
        region_end += tilesize - (region_end % tilesize)

    chunk_size = int(region_end - region_start)
    if chunk_size > max_chunk_size:
        chunk_size = max_chunk_size
        if tilesize and tilesize < chunk_size:
            chunk_size -= chunk_size % tilesize

    return chrom_sizes, region_start, region_end, int(chunk_size)
239
def blSubtract(t, chrom, chunk):
    """
    Carves any blacklisted intervals found in `t` on `chrom` out of `chunk`
    (a [start, end] list) and returns the remaining pieces as a list of
    [start, end] lists. With no tree or no overlaps, the chunk is returned
    whole (inside a list).

    Note: `chunk` is modified in place while the overlaps are walked.
    """
    if t is None:
        return [chunk]

    hits = t.findOverlaps(chrom, chunk[0], chunk[1])
    if hits is None or len(hits) == 0:
        return [chunk]

    pieces = []
    for hit in hits:
        # Nothing left of the chunk to carve.
        if chunk[1] <= chunk[0]:
            break
        # Keep whatever lies before this blacklisted interval, then advance
        # the chunk start past it.
        if chunk[0] < hit[0]:
            pieces.append([chunk[0], hit[0]])
        chunk[0] = hit[1]
    if chunk[0] < chunk[1]:
        pieces.append([chunk[0], chunk[1]])

    return pieces
deepTools/source/deeptools/misc.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Force numpy to run single threaded -- see issue #697.
# This module MUST be imported before numpy.
# These environment variables are internal to deepTools (they won't exist
# on the shell after the command completes); values set by the user are
# left untouched.
for _var, _threads in (('MKL_NUM_THREADS', 'sequential'),
                       ('NUMEXPR_NUM_THREADS', '1'),
                       ('OMP_NUM_THREADS', '1'),
                       ('VECLIB_MAXIMUM_THREADS', '1')):
    os.environ.setdefault(_var, _threads)
del _var, _threads