guohanghui commited on
Commit
c8f61f1
·
verified ·
1 Parent(s): 6c99c2c

Upload 543 files

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +98 -0
  2. Dockerfile +18 -0
  3. README.md +27 -5
  4. app.py +45 -0
  5. deepTools/mcp_output/README_MCP.md +64 -0
  6. deepTools/mcp_output/analysis.json +391 -0
  7. deepTools/mcp_output/diff_report.md +73 -0
  8. deepTools/mcp_output/mcp_plugin/__init__.py +0 -0
  9. deepTools/mcp_output/mcp_plugin/adapter.py +139 -0
  10. deepTools/mcp_output/mcp_plugin/main.py +13 -0
  11. deepTools/mcp_output/mcp_plugin/mcp_service.py +102 -0
  12. deepTools/mcp_output/requirements.txt +13 -0
  13. deepTools/mcp_output/start_mcp.py +30 -0
  14. deepTools/mcp_output/workflow_summary.json +195 -0
  15. deepTools/source/.planemo.sh +35 -0
  16. deepTools/source/.readthedocs.yaml +15 -0
  17. deepTools/source/CHANGES.txt +448 -0
  18. deepTools/source/LICENSE.txt +9 -0
  19. deepTools/source/MANIFEST.in +8 -0
  20. deepTools/source/README.md +68 -0
  21. deepTools/source/README.rst +29 -0
  22. deepTools/source/__init__.py +4 -0
  23. deepTools/source/deeptools/SES_scaleFactor.py +195 -0
  24. deepTools/source/deeptools/__init__.py +0 -0
  25. deepTools/source/deeptools/alignmentSieve.py +439 -0
  26. deepTools/source/deeptools/bamCompare.py +314 -0
  27. deepTools/source/deeptools/bamCoverage.py +416 -0
  28. deepTools/source/deeptools/bamHandler.py +103 -0
  29. deepTools/source/deeptools/bamPEFragmentSize.py +369 -0
  30. deepTools/source/deeptools/bigwigAverage.py +128 -0
  31. deepTools/source/deeptools/bigwigCompare.py +146 -0
  32. deepTools/source/deeptools/cm.py +1088 -0
  33. deepTools/source/deeptools/computeGCBias.py +800 -0
  34. deepTools/source/deeptools/computeMatrix.py +429 -0
  35. deepTools/source/deeptools/computeMatrixOperations.py +852 -0
  36. deepTools/source/deeptools/correctGCBias.py +746 -0
  37. deepTools/source/deeptools/correlation.py +706 -0
  38. deepTools/source/deeptools/correlation_heatmap.py +110 -0
  39. deepTools/source/deeptools/countReadsPerBin.py +1033 -0
  40. deepTools/source/deeptools/deeptools_list_tools.py +78 -0
  41. deepTools/source/deeptools/estimateReadFiltering.py +376 -0
  42. deepTools/source/deeptools/estimateScaleFactor.py +115 -0
  43. deepTools/source/deeptools/getFragmentAndReadSize.py +166 -0
  44. deepTools/source/deeptools/getRatio.py +82 -0
  45. deepTools/source/deeptools/getScaleFactor.py +305 -0
  46. deepTools/source/deeptools/getScorePerBigWigBin.py +322 -0
  47. deepTools/source/deeptools/heatmapper.py +1372 -0
  48. deepTools/source/deeptools/heatmapper_utilities.py +204 -0
  49. deepTools/source/deeptools/mapReduce.py +263 -0
  50. deepTools/source/deeptools/misc.py +13 -0
.gitattributes CHANGED
@@ -33,3 +33,101 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ deepTools/source/deeptools/test/test_corrGC/paired.bam filter=lfs diff=lfs merge=lfs -text
37
+ deepTools/source/deeptools/test/test_heatmapper/heatmap_master_interpolation_bilinear.png filter=lfs diff=lfs merge=lfs -text
38
+ deepTools/source/deeptools/test/test_heatmapper/profile_master_multi.png filter=lfs diff=lfs merge=lfs -text
39
+ deepTools/source/docs/_static/welcome_eLife_chrX_heatmap.png filter=lfs diff=lfs merge=lfs -text
40
+ deepTools/source/docs/_static/welcome_eLife_chrX_profile-1.png filter=lfs diff=lfs merge=lfs -text
41
+ deepTools/source/docs/_static/welcome_eLife_chrX_scaleR_heatmap.png filter=lfs diff=lfs merge=lfs -text
42
+ deepTools/source/docs/images/computeGCBias_Galaxy.png filter=lfs diff=lfs merge=lfs -text
43
+ deepTools/source/docs/images/computeMatrix_modes.png filter=lfs diff=lfs merge=lfs -text
44
+ deepTools/source/docs/images/computeMatrix_overview.png filter=lfs diff=lfs merge=lfs -text
45
+ deepTools/source/docs/images/correctGCBias_Galaxy.png filter=lfs diff=lfs merge=lfs -text
46
+ deepTools/source/docs/images/Gal_DataLib.png filter=lfs diff=lfs merge=lfs -text
47
+ deepTools/source/docs/images/Gal_FAQ_clusterLabeling.png filter=lfs diff=lfs merge=lfs -text
48
+ deepTools/source/docs/images/Gal_FAQ_filteringDuplicates.png filter=lfs diff=lfs merge=lfs -text
49
+ deepTools/source/docs/images/Gal_FAQ_IGV_dataset.png filter=lfs diff=lfs merge=lfs -text
50
+ deepTools/source/docs/images/Gal_FAQ_IGV.png filter=lfs diff=lfs merge=lfs -text
51
+ deepTools/source/docs/images/Gal_FAQ_info.png filter=lfs diff=lfs merge=lfs -text
52
+ deepTools/source/docs/images/Gal_FAQ_UCSC01.png filter=lfs diff=lfs merge=lfs -text
53
+ deepTools/source/docs/images/Gal_screenshot_dataSet.png filter=lfs diff=lfs merge=lfs -text
54
+ deepTools/source/docs/images/Gal_screenshot_dataSetStates.png filter=lfs diff=lfs merge=lfs -text
55
+ deepTools/source/docs/images/Gal_startsite_with_comments.png filter=lfs diff=lfs merge=lfs -text
56
+ deepTools/source/docs/images/Gal_startsite.png filter=lfs diff=lfs merge=lfs -text
57
+ deepTools/source/docs/images/Gal_UCSC.png filter=lfs diff=lfs merge=lfs -text
58
+ deepTools/source/docs/images/GalHow_bamCompare.png filter=lfs diff=lfs merge=lfs -text
59
+ deepTools/source/docs/images/GalHow_bamCoverage.png filter=lfs diff=lfs merge=lfs -text
60
+ deepTools/source/docs/images/GalHow_clustHM01.png filter=lfs diff=lfs merge=lfs -text
61
+ deepTools/source/docs/images/GalHow_clustHM02.png filter=lfs diff=lfs merge=lfs -text
62
+ deepTools/source/docs/images/GalHow_clustHM03.png filter=lfs diff=lfs merge=lfs -text
63
+ deepTools/source/docs/images/GalHow_computeGCbias.png filter=lfs diff=lfs merge=lfs -text
64
+ deepTools/source/docs/images/GalHow_correctGCbias.png filter=lfs diff=lfs merge=lfs -text
65
+ deepTools/source/docs/images/GalHow_multiBamSummary.png filter=lfs diff=lfs merge=lfs -text
66
+ deepTools/source/docs/images/GalHow_plotCorrelation.png filter=lfs diff=lfs merge=lfs -text
67
+ deepTools/source/docs/images/GalHow_plotFingerprint.png filter=lfs diff=lfs merge=lfs -text
68
+ deepTools/source/docs/images/gallery/coverage_Ibrahim.png filter=lfs diff=lfs merge=lfs -text
69
+ deepTools/source/docs/images/gallery/hm_Bulut.png filter=lfs diff=lfs merge=lfs -text
70
+ deepTools/source/docs/images/gallery/hm_CpG.png filter=lfs diff=lfs merge=lfs -text
71
+ deepTools/source/docs/images/gallery/hm_DNase.png filter=lfs diff=lfs merge=lfs -text
72
+ deepTools/source/docs/images/gallery/hm_GC.png filter=lfs diff=lfs merge=lfs -text
73
+ deepTools/source/docs/images/gallery/hm_histonesGomez.png filter=lfs diff=lfs merge=lfs -text
74
+ deepTools/source/docs/images/gallery/hm_TATApsem.png filter=lfs diff=lfs merge=lfs -text
75
+ deepTools/source/docs/images/glossary_ascii.png filter=lfs diff=lfs merge=lfs -text
76
+ deepTools/source/docs/images/glossary_overview.png filter=lfs diff=lfs merge=lfs -text
77
+ deepTools/source/docs/images/glossary_sam.png filter=lfs diff=lfs merge=lfs -text
78
+ deepTools/source/docs/images/norm_IGVsnapshot_indFiles.png filter=lfs diff=lfs merge=lfs -text
79
+ deepTools/source/docs/images/plotCorrelation_galaxy.png filter=lfs diff=lfs merge=lfs -text
80
+ deepTools/source/docs/images/plotCoverage_annotated.png filter=lfs diff=lfs merge=lfs -text
81
+ deepTools/source/docs/images/QC_bamCorrelate_RNAseq.png filter=lfs diff=lfs merge=lfs -text
82
+ deepTools/source/docs/images/QC_fingerprint.png filter=lfs diff=lfs merge=lfs -text
83
+ deepTools/source/docs/images/QC_GCplots_input.png filter=lfs diff=lfs merge=lfs -text
84
+ deepTools/source/docs/images/QC_GCregionexclusion_UCSCscreenshot.png filter=lfs diff=lfs merge=lfs -text
85
+ deepTools/source/docs/images/QC_plotCoverage.png filter=lfs diff=lfs merge=lfs -text
86
+ deepTools/source/docs/images/start_collage.png filter=lfs diff=lfs merge=lfs -text
87
+ deepTools/source/docs/images/start_workflow.png filter=lfs diff=lfs merge=lfs -text
88
+ deepTools/source/docs/images/test_plots/ExampleComputeMatrix1.png filter=lfs diff=lfs merge=lfs -text
89
+ deepTools/source/docs/images/test_plots/ExampleComputeMatrix2.png filter=lfs diff=lfs merge=lfs -text
90
+ deepTools/source/docs/images/test_plots/ExampleComputeMatrix3.png filter=lfs diff=lfs merge=lfs -text
91
+ deepTools/source/docs/images/test_plots/ExampleHeatmap1.png filter=lfs diff=lfs merge=lfs -text
92
+ deepTools/source/docs/images/test_plots/ExampleHeatmap2.png filter=lfs diff=lfs merge=lfs -text
93
+ deepTools/source/docs/images/test_plots/ExampleHeatmap3.png filter=lfs diff=lfs merge=lfs -text
94
+ deepTools/source/docs/images/test_plots/ExampleHeatmap4.png filter=lfs diff=lfs merge=lfs -text
95
+ deepTools/source/docs/images/test_plots/ExampleProfile1.png filter=lfs diff=lfs merge=lfs -text
96
+ deepTools/source/docs/images/test_plots/ExampleProfile2.png filter=lfs diff=lfs merge=lfs -text
97
+ deepTools/source/galaxy/wrapper/static/images/bamCompare_output.png filter=lfs diff=lfs merge=lfs -text
98
+ deepTools/source/galaxy/wrapper/static/images/bamCoverage_output.png filter=lfs diff=lfs merge=lfs -text
99
+ deepTools/source/galaxy/wrapper/static/images/bamPEFragmentSize_output.png filter=lfs diff=lfs merge=lfs -text
100
+ deepTools/source/galaxy/wrapper/static/images/bigwigCompare_output.png filter=lfs diff=lfs merge=lfs -text
101
+ deepTools/source/galaxy/wrapper/static/images/computeGCBias_output.png filter=lfs diff=lfs merge=lfs -text
102
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_advancedOutput.png filter=lfs diff=lfs merge=lfs -text
103
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_output.png filter=lfs diff=lfs merge=lfs -text
104
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_overview.png filter=lfs diff=lfs merge=lfs -text
105
+ deepTools/source/galaxy/wrapper/static/images/computeMatrix_selectScores.png filter=lfs diff=lfs merge=lfs -text
106
+ deepTools/source/galaxy/wrapper/static/images/multiBamSummary_output.png filter=lfs diff=lfs merge=lfs -text
107
+ deepTools/source/galaxy/wrapper/static/images/multiBigwigSummary_output.png filter=lfs diff=lfs merge=lfs -text
108
+ deepTools/source/galaxy/wrapper/static/images/norm_IGVsnapshot_indFiles.png filter=lfs diff=lfs merge=lfs -text
109
+ deepTools/source/galaxy/wrapper/static/images/plotCorrelate_RNAseq.png filter=lfs diff=lfs merge=lfs -text
110
+ deepTools/source/galaxy/wrapper/static/images/plotCorrelation_output.png filter=lfs diff=lfs merge=lfs -text
111
+ deepTools/source/galaxy/wrapper/static/images/plotCoverage_annotated.png filter=lfs diff=lfs merge=lfs -text
112
+ deepTools/source/galaxy/wrapper/static/images/plotCoverage_output.png filter=lfs diff=lfs merge=lfs -text
113
+ deepTools/source/galaxy/wrapper/static/images/plotFingerprint_output.png filter=lfs diff=lfs merge=lfs -text
114
+ deepTools/source/galaxy/wrapper/static/images/plotHeatmap_example.png filter=lfs diff=lfs merge=lfs -text
115
+ deepTools/source/galaxy/wrapper/static/images/plotHeatmap_example02.png filter=lfs diff=lfs merge=lfs -text
116
+ deepTools/source/galaxy/wrapper/static/images/plotPCA_annotated.png filter=lfs diff=lfs merge=lfs -text
117
+ deepTools/source/galaxy/wrapper/static/images/plotProfiler_examples.png filter=lfs diff=lfs merge=lfs -text
118
+ deepTools/source/galaxy/wrapper/static/images/QC_GCplots_input.png filter=lfs diff=lfs merge=lfs -text
119
+ deepTools/source/galaxy/wrapper/static/images/QC_plotCoverage.png filter=lfs diff=lfs merge=lfs -text
120
+ deepTools/source/galaxy/wrapper/static/images/visual_hm_DmelPolII.png filter=lfs diff=lfs merge=lfs -text
121
+ deepTools/source/galaxy/wrapper/test-data/alignmentSieve.bam filter=lfs diff=lfs merge=lfs -text
122
+ deepTools/source/galaxy/wrapper/test-data/alignmentSieve2.bam filter=lfs diff=lfs merge=lfs -text
123
+ deepTools/source/galaxy/wrapper/test-data/alignmentSieve3.bam filter=lfs diff=lfs merge=lfs -text
124
+ deepTools/source/galaxy/wrapper/test-data/correctGCBias_result1.bam filter=lfs diff=lfs merge=lfs -text
125
+ deepTools/source/galaxy/wrapper/test-data/paired_chr2L.bam filter=lfs diff=lfs merge=lfs -text
126
+ deepTools/source/galaxy/wrapper/test-data/paired_chr2L.cram filter=lfs diff=lfs merge=lfs -text
127
+ deepTools/source/gallery/coverage_Ibrahim.png filter=lfs diff=lfs merge=lfs -text
128
+ deepTools/source/gallery/hm_Bulut.png filter=lfs diff=lfs merge=lfs -text
129
+ deepTools/source/gallery/hm_CpG.png filter=lfs diff=lfs merge=lfs -text
130
+ deepTools/source/gallery/hm_DNase.png filter=lfs diff=lfs merge=lfs -text
131
+ deepTools/source/gallery/hm_GC.png filter=lfs diff=lfs merge=lfs -text
132
+ deepTools/source/gallery/hm_histonesGomez.png filter=lfs diff=lfs merge=lfs -text
133
+ deepTools/source/gallery/hm_TATApsem.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ RUN useradd -m -u 1000 user && python -m pip install --upgrade pip
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ ENV MCP_TRANSPORT=http
14
+ ENV MCP_PORT=7860
15
+
16
+ EXPOSE 7860
17
+
18
+ CMD ["python", "deepTools/mcp_output/start_mcp.py"]
README.md CHANGED
@@ -1,10 +1,32 @@
1
  ---
2
- title: DeepTools
3
- emoji: 💻
4
- colorFrom: purple
5
- colorTo: yellow
6
  sdk: docker
 
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Deeptools MCP
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ sdk_version: "4.26.0"
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Deeptools MCP Service
13
+
14
+ Auto-generated MCP service for deepTools.
15
+
16
+ ## Usage
17
+
18
+ ```
19
+ https://<your-username>-deepTools-mcp.hf.space/mcp
20
+ ```
21
+
22
+ ## Connect with Cursor
23
+
24
+ ```json
25
+ {
26
+ "mcpServers": {
27
+ "deepTools": {
28
+ "url": "https://None-deepTools-mcp.hf.space/mcp"
29
+ }
30
+ }
31
+ }
32
+ ```
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ import os
3
+ import sys
4
+
5
+ mcp_plugin_path = os.path.join(os.path.dirname(__file__), "deepTools", "mcp_output", "mcp_plugin")
6
+ sys.path.insert(0, mcp_plugin_path)
7
+
8
+ app = FastAPI(
9
+ title="Deeptools MCP Service",
10
+ description="Auto-generated MCP service for deepTools",
11
+ version="1.0.0"
12
+ )
13
+
14
+ @app.get("/")
15
+ def root():
16
+ return {
17
+ "service": "Deeptools MCP Service",
18
+ "version": "1.0.0",
19
+ "status": "running",
20
+ "transport": os.environ.get("MCP_TRANSPORT", "http")
21
+ }
22
+
23
+ @app.get("/health")
24
+ def health_check():
25
+ return {"status": "healthy", "service": "deepTools MCP"}
26
+
27
+ @app.get("/tools")
28
+ def list_tools():
29
+ try:
30
+ from mcp_service import create_app
31
+ mcp_app = create_app()
32
+ tools = []
33
+ for tool_name, tool_func in mcp_app.tools.items():
34
+ tools.append({
35
+ "name": tool_name,
36
+ "description": tool_func.__doc__ or "No description available"
37
+ })
38
+ return {"tools": tools}
39
+ except Exception as e:
40
+ return {"error": f"Failed to load tools: {str(e)}"}
41
+
42
+ if __name__ == "__main__":
43
+ import uvicorn
44
+ port = int(os.environ.get("PORT", 7860))
45
+ uvicorn.run(app, host="0.0.0.0", port=port)
deepTools/mcp_output/README_MCP.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deepTools MCP (Model Context Protocol) Service
2
+
3
+ ## Project Introduction
4
+
5
+ deepTools is a comprehensive suite of Python tools designed for the efficient analysis of high-throughput sequencing data, particularly for ChIP-seq, RNA-seq, and MNase-seq experiments. It addresses the challenges of handling large datasets by providing tools for normalized coverage file generation, quality control, and publication-ready visualizations. deepTools supports efficient parallel processing using the mapReduce framework, making it suitable for genome-scale computations.
6
+
7
+ ## Installation Method
8
+
9
+ To install deepTools, ensure you have Python and the following dependencies:
10
+
11
+ - numpy
12
+ - matplotlib
13
+ - pysam
14
+ - pyBigWig
15
+
16
+ Optional dependencies include:
17
+
18
+ - scipy
19
+ - pandas
20
+
21
+ You can install deepTools using pip:
22
+
23
+ ```
24
+ pip install deeptools
25
+ ```
26
+
27
+ ## Quick Start
28
+
29
+ To quickly get started with deepTools, you can use the command-line interface (CLI) to call the main functions. Here are some examples:
30
+
31
+ - **Calculate Coverage**: Use `bamCoverage` to calculate the coverage of BAM files and output a bigWig file.
32
+ - **Compare BAM Files**: Use `bamCompare` to compare two BAM files and generate a bigWig file with the results.
33
+ - **Generate Heatmaps**: Use `heatmapper` to generate heatmaps from computed matrices.
34
+
35
+ Example command:
36
+
37
+ ```
38
+ bamCoverage -b sample.bam -o output.bw
39
+ ```
40
+
41
+ ## Available Tools and Endpoints List
42
+
43
+ 1. **alignmentSieve**: Filters alignments based on various criteria.
44
+ 2. **bamCompare**: Compares two BAM files and generates a bigWig file.
45
+ 3. **bamCoverage**: Calculates the coverage of BAM files.
46
+ 4. **computeMatrix**: Computes a matrix of scores for genomic regions.
47
+ 5. **heatmapper**: Generates heatmaps from computed matrices.
48
+ 6. **multiBamSummary**: Aggregates read counts across multiple BAM files.
49
+ 7. **multiBigwigSummary**: Aggregates scores across multiple bigWig files.
50
+ 8. **plotCorrelation**: Performs correlation analysis with heatmap/scatter plot output.
51
+ 9. **plotHeatmap**: Creates customizable heatmaps.
52
+ 10. **plotProfile**: Generates average signal profile plots.
53
+
54
+ ## Common Issues and Notes
55
+
56
+ - **Dependencies**: Ensure all required dependencies are installed. Optional dependencies can enhance functionality.
57
+ - **Environment**: deepTools is compatible with most Unix-like systems. Ensure your environment supports Python and the necessary libraries.
58
+ - **Performance**: For large datasets, consider using the mapReduce framework to leverage parallel processing capabilities.
59
+
60
+ ## Reference Links or Documentation
61
+
62
+ For more detailed information, visit the [deepTools GitHub repository](https://github.com/deeptools/deepTools) or refer to the official [deepTools documentation](https://deeptools.readthedocs.io/en/develop/).
63
+
64
+ For specific tool usage and workflows, see the [Typical Workflows](https://deeptools.readthedocs.io/en/develop/content/example_usage.html) section in the documentation.
deepTools/mcp_output/analysis.json ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "summary": {
3
+ "repository_url": "https://github.com/deeptools/deepTools",
4
+ "summary": "Imported via zip fallback, file count: 81",
5
+ "file_tree": {
6
+ ".github/CONTRIBUTING.md": {
7
+ "size": 544
8
+ },
9
+ ".github/ISSUE_TEMPLATE.md": {
10
+ "size": 691
11
+ },
12
+ ".github/PULL_REQUEST_TEMPLATE.md": {
13
+ "size": 286
14
+ },
15
+ ".github/workflows/planemo.yml": {
16
+ "size": 1421
17
+ },
18
+ ".github/workflows/pypi.yml": {
19
+ "size": 616
20
+ },
21
+ ".github/workflows/test.yml": {
22
+ "size": 3118
23
+ },
24
+ ".readthedocs.yaml": {
25
+ "size": 193
26
+ },
27
+ "CHANGES.txt": {
28
+ "size": 40451
29
+ },
30
+ "LICENSE.txt": {
31
+ "size": 1241
32
+ },
33
+ "README.md": {
34
+ "size": 5910
35
+ },
36
+ "deeptools/SES_scaleFactor.py": {
37
+ "size": 7007
38
+ },
39
+ "deeptools/__init__.py": {
40
+ "size": 0
41
+ },
42
+ "deeptools/alignmentSieve.py": {
43
+ "size": 18200
44
+ },
45
+ "deeptools/bamCompare.py": {
46
+ "size": 14290
47
+ },
48
+ "deeptools/bamCoverage.py": {
49
+ "size": 18617
50
+ },
51
+ "deeptools/bamHandler.py": {
52
+ "size": 3345
53
+ },
54
+ "deeptools/bamPEFragmentSize.py": {
55
+ "size": 21247
56
+ },
57
+ "deeptools/bigwigAverage.py": {
58
+ "size": 4908
59
+ },
60
+ "deeptools/bigwigCompare.py": {
61
+ "size": 6614
62
+ },
63
+ "deeptools/cm.py": {
64
+ "size": 44838
65
+ },
66
+ "deeptools/computeGCBias.py": {
67
+ "size": 31006
68
+ },
69
+ "deeptools/computeMatrix.py": {
70
+ "size": 22446
71
+ },
72
+ "deeptools/computeMatrixOperations.py": {
73
+ "size": 32110
74
+ },
75
+ "deeptools/correctGCBias.py": {
76
+ "size": 26158
77
+ },
78
+ "deeptools/correlation.py": {
79
+ "size": 28078
80
+ },
81
+ "deeptools/correlation_heatmap.py": {
82
+ "size": 3796
83
+ },
84
+ "deeptools/countReadsPerBin.py": {
85
+ "size": 42159
86
+ },
87
+ "deeptools/deeptools_list_tools.py": {
88
+ "size": 3345
89
+ },
90
+ "deeptools/estimateReadFiltering.py": {
91
+ "size": 16606
92
+ },
93
+ "deeptools/estimateScaleFactor.py": {
94
+ "size": 4782
95
+ },
96
+ "deeptools/getFragmentAndReadSize.py": {
97
+ "size": 7011
98
+ },
99
+ "deeptools/getRatio.py": {
100
+ "size": 2326
101
+ },
102
+ "deeptools/getScaleFactor.py": {
103
+ "size": 12772
104
+ },
105
+ "deeptools/getScorePerBigWigBin.py": {
106
+ "size": 11967
107
+ },
108
+ "deeptools/heatmapper.py": {
109
+ "size": 58987
110
+ },
111
+ "deeptools/heatmapper_utilities.py": {
112
+ "size": 7169
113
+ },
114
+ "deeptools/mapReduce.py": {
115
+ "size": 9786
116
+ },
117
+ "deeptools/misc.py": {
118
+ "size": 597
119
+ },
120
+ "deeptools/multiBamSummary.py": {
121
+ "size": 11899
122
+ },
123
+ "deeptools/multiBigwigSummary.py": {
124
+ "size": 11291
125
+ },
126
+ "deeptools/parserCommon.py": {
127
+ "size": 43744
128
+ },
129
+ "deeptools/plotCorrelation.py": {
130
+ "size": 10984
131
+ },
132
+ "deeptools/plotCoverage.py": {
133
+ "size": 16329
134
+ },
135
+ "deeptools/plotEnrichment.py": {
136
+ "size": 25244
137
+ },
138
+ "deeptools/plotFingerprint.py": {
139
+ "size": 19876
140
+ },
141
+ "deeptools/plotHeatmap.py": {
142
+ "size": 37144
143
+ },
144
+ "deeptools/plotPCA.py": {
145
+ "size": 9427
146
+ },
147
+ "deeptools/plotProfile.py": {
148
+ "size": 39224
149
+ },
150
+ "deeptools/sumCoveragePerBin.py": {
151
+ "size": 9899
152
+ },
153
+ "deeptools/test/__init__.py": {
154
+ "size": 0
155
+ },
156
+ "deeptools/test/skiptest_heatmapper_images.py": {
157
+ "size": 5917
158
+ },
159
+ "deeptools/test/test_bamCoverage_and_bamCompare.py": {
160
+ "size": 17582
161
+ },
162
+ "deeptools/test/test_bigwigAverage.py": {
163
+ "size": 2864
164
+ },
165
+ "deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py": {
166
+ "size": 4603
167
+ },
168
+ "deeptools/test/test_computeMatrixOperations.py": {
169
+ "size": 12233
170
+ },
171
+ "deeptools/test/test_corrGC/R_gc_paired.txt": {
172
+ "size": 7525
173
+ },
174
+ "deeptools/test/test_corrGC/frequencies_data.txt": {
175
+ "size": 825
176
+ },
177
+ "deeptools/test/test_countReadsPerBin.py": {
178
+ "size": 8401
179
+ },
180
+ "deeptools/test/test_heatmapper.py": {
181
+ "size": 12550
182
+ },
183
+ "deeptools/test/test_multiBamSummary.py": {
184
+ "size": 1945
185
+ },
186
+ "deeptools/test/test_plotCoverage.py": {
187
+ "size": 1215
188
+ },
189
+ "deeptools/test/test_readFiltering.py": {
190
+ "size": 6229
191
+ },
192
+ "deeptools/test/test_tools.py": {
193
+ "size": 838
194
+ },
195
+ "deeptools/test/test_writeBedGraph.py": {
196
+ "size": 4462
197
+ },
198
+ "deeptools/utilities.py": {
199
+ "size": 14161
200
+ },
201
+ "deeptools/writeBedGraph.py": {
202
+ "size": 13223
203
+ },
204
+ "deeptools/writeBedGraph_bam_and_bw.py": {
205
+ "size": 9255
206
+ },
207
+ "docs/_static/welcome_owl.carousel.min.js": {
208
+ "size": 40401
209
+ },
210
+ "docs/conf.py": {
211
+ "size": 11119
212
+ },
213
+ "docs/requirements.txt": {
214
+ "size": 72
215
+ },
216
+ "galaxy/wrapper/.shed.yml": {
217
+ "size": 2719
218
+ },
219
+ "galaxy/wrapper/test-data/alignmentSieve.txt": {
220
+ "size": 102
221
+ },
222
+ "galaxy/wrapper/test-data/bamPEFragmentSize_lengths1.txt": {
223
+ "size": 115
224
+ },
225
+ "galaxy/wrapper/test-data/bamPEFragmentSize_result1.txt": {
226
+ "size": 613
227
+ },
228
+ "galaxy/wrapper/test-data/bamPEFragmentSize_table1.txt": {
229
+ "size": 810
230
+ },
231
+ "galaxy/wrapper/test-data/computeMatrixOperations.txt": {
232
+ "size": 50
233
+ },
234
+ "galaxy/wrapper/test-data/estimateReadFiltering.txt": {
235
+ "size": 353
236
+ },
237
+ "galaxy/wrapper/test-data/plotEnrichment_output.txt": {
238
+ "size": 197
239
+ },
240
+ "pyproject.toml": {
241
+ "size": 2395
242
+ },
243
+ "scripts/convertChromsBigWig.py": {
244
+ "size": 7412
245
+ },
246
+ "scripts/split_bed_into_multiple_files.py": {
247
+ "size": 822
248
+ }
249
+ },
250
+ "processed_by": "zip_fallback",
251
+ "success": true
252
+ },
253
+ "structure": {
254
+ "packages": [
255
+ "source.deeptools",
256
+ "source.deeptools.test"
257
+ ]
258
+ },
259
+ "dependencies": {
260
+ "has_environment_yml": false,
261
+ "has_requirements_txt": false,
262
+ "pyproject": true,
263
+ "setup_cfg": false,
264
+ "setup_py": false
265
+ },
266
+ "entry_points": {
267
+ "imports": [],
268
+ "cli": [],
269
+ "modules": []
270
+ },
271
+ "llm_analysis": {
272
+ "core_modules": [
273
+ {
274
+ "package": "source.deeptools",
275
+ "module": "alignmentSieve",
276
+ "functions": [
277
+ "main",
278
+ "parseArguments"
279
+ ],
280
+ "classes": [],
281
+ "description": "This module is responsible for filtering alignments based on various criteria."
282
+ },
283
+ {
284
+ "package": "source.deeptools",
285
+ "module": "bamCompare",
286
+ "functions": [
287
+ "main",
288
+ "parseArguments"
289
+ ],
290
+ "classes": [],
291
+ "description": "This module compares two BAM files and generates a bigWig file with the results."
292
+ },
293
+ {
294
+ "package": "source.deeptools",
295
+ "module": "bamCoverage",
296
+ "functions": [
297
+ "main",
298
+ "parseArguments"
299
+ ],
300
+ "classes": [],
301
+ "description": "This module calculates the coverage of BAM files and outputs a bigWig file."
302
+ },
303
+ {
304
+ "package": "source.deeptools",
305
+ "module": "computeMatrix",
306
+ "functions": [
307
+ "main",
308
+ "parseArguments"
309
+ ],
310
+ "classes": [],
311
+ "description": "This module computes a matrix of scores for genomic regions."
312
+ },
313
+ {
314
+ "package": "source.deeptools",
315
+ "module": "heatmapper",
316
+ "functions": [
317
+ "main",
318
+ "parseArguments"
319
+ ],
320
+ "classes": [],
321
+ "description": "This module generates heatmaps from the computed matrices."
322
+ }
323
+ ],
324
+ "cli_commands": [
325
+ {
326
+ "name": "alignmentSieve",
327
+ "module": "source.deeptools.alignmentSieve",
328
+ "description": "CLI command for filtering alignments based on various criteria."
329
+ },
330
+ {
331
+ "name": "bamCompare",
332
+ "module": "source.deeptools.bamCompare",
333
+ "description": "CLI command for comparing two BAM files and generating a bigWig file."
334
+ },
335
+ {
336
+ "name": "bamCoverage",
337
+ "module": "source.deeptools.bamCoverage",
338
+ "description": "CLI command for calculating the coverage of BAM files."
339
+ },
340
+ {
341
+ "name": "computeMatrix",
342
+ "module": "source.deeptools.computeMatrix",
343
+ "description": "CLI command for computing a matrix of scores for genomic regions."
344
+ },
345
+ {
346
+ "name": "heatmapper",
347
+ "module": "source.deeptools.heatmapper",
348
+ "description": "CLI command for generating heatmaps from computed matrices."
349
+ }
350
+ ],
351
+ "import_strategy": {
352
+ "primary": "import",
353
+ "fallback": "cli",
354
+ "confidence": 0.85
355
+ },
356
+ "dependencies": {
357
+ "required": [
358
+ "numpy",
359
+ "matplotlib",
360
+ "pysam",
361
+ "pyBigWig"
362
+ ],
363
+ "optional": [
364
+ "scipy",
365
+ "pandas"
366
+ ]
367
+ },
368
+ "risk_assessment": {
369
+ "import_feasibility": 0.8,
370
+ "intrusiveness_risk": "medium",
371
+ "complexity": "medium"
372
+ }
373
+ },
374
+ "deepwiki_analysis": {
375
+ "repo_url": "https://github.com/deeptools/deepTools",
376
+ "repo_name": "deepTools",
377
+ "content": "deeptools/deepTools\nInstallation and Getting Started\nTypical Workflows\nCore Processing Engines\nParallel Processing Framework (mapReduce)\nRead Counting Engine (countReadsPerBin)\nScore Extraction (getScorePerBigWigBin)\nMatrix Computation Engine (heatmapper)\nCoverage Generation and Normalization\nSingle-Sample Coverage (bamCoverage)\nSample Comparison (bamCompare)\nGC Bias Correction Pipeline\nMulti-Sample Analysis Tools\nData Integration (multiBamSummary and multiBigwigSummary)\nCorrelation Analysis (plotCorrelation)\nPrincipal Component Analysis (plotPCA)\nVisualization Tools\nMatrix Generation (computeMatrix)\nMatrix Operations (computeMatrixOperations)\nHeatmap Visualization (plotHeatmap)\nProfile Plots (plotProfile)\nQuality Control Tools\nChIP-seq Enrichment Assessment (plotFingerprint)\nFragment Size Analysis (bamPEFragmentSize)\nCoverage Distribution (plotCoverage)\nFeature Enrichment (plotEnrichment)\nRead Filtering Estimation (estimateReadFiltering and alignmentSieve)\nGalaxy Integration\nGalaxy Wrapper Architecture\nTool Shed Distribution and Installation\nTesting with Planemo\nDevelopment and Contributing\nProject Structure and Configuration\nCI/CD Pipeline\nDocumentation System\nRelease Process and Versioning\nFile Formats and Data Structures\nInput File Formats\nIntermediate File Formats\nOutput Formats and Visualization\nAdvanced Topics and Optimization\nPerformance Tuning\nFiltering and Read Processing Options\nNormalization Methods Deep Dive\nSpecialized Read Processing 
Modes\nCHANGES.txt\ndocs/content/about.rst\ndocs/content/changelog.rst\ndocs/content/example_api_tutorial.rst\ndocs/content/example_gallery.rst\ndocs/content/example_step_by_step.rst\ndocs/content/example_usage.rst\ndocs/content/help_faq.rst\ndocs/content/help_faq_galaxy.rst\ndocs/content/help_galaxy_intro.rst\ndocs/content/help_glossary.rst\ndocs/content/installation.rst\ndocs/content/list_of_tools.rst\ndocs/images/Gal_FAQ_filteringDuplicates.png\ndocs/images/Gal_FAQ_info.png\ndocs/index.rst\ngalaxy/wrapper/deepTools_macros.xml\ngalaxy/wrapper/estimateReadFiltering.xml\ngalaxy/wrapper/test-data/estimateReadFiltering.txt\nscripts/convertChromsBigWig.py\nPurpose and Scope\nThis page provides a high-level introduction to deepTools: what it is, what problems it solves, and its primary capabilities for analyzing high-throughput sequencing data. For detailed installation instructions, seeInstallation and Getting Started. For specific tool usage and workflows, seeTypical Workflows.\nSources:README.md9-11docs/index.rst7-8\nWhat is deepTools?\ndeepTools is a suite of Python tools developed for efficient analysis of high-throughput sequencing data, particularly ChIP-seq, RNA-seq, and MNase-seq experiments. 
It addresses the challenge of handling large amounts of data generated from DNA sequencing centers by providing:\nNormalized coverage file generationin standard bedGraph and bigWig formats\nQuality control modulesfor assessing data quality and technical biases\nPublication-ready visualizationsfor identifying enrichments and functional genome annotations\nEfficient parallel processingusing themapReduceframework for genome-scale computations\ndeepTools is available through three distinct usage modes:\nSources:README.md9-14docs/index.rst10-14CHANGES.txt62\nSystem Architecture\nThe following diagram illustrates the primary components of deepTools and how they relate to each other:\ndeepTools Component Architecture\nData Access LayerCore Processing EnginesTool Modules LayerUser Interface LayerCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)Galaxy XML Wrappers(galaxy/wrapper/*.xml)Python API(import deeptools.*)Quality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSizeCoverage ToolsbamCoveragebamComparebigwigCompareAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrixVisualization ToolsplotHeatmapplotProfileplotCorrelationcountReadsPerBin(deeptools/countReadsPerBin.py)getScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)heatmapper(deeptools/heatmapper.py)mapReduce(deeptools/mapReduce.py)pysam(BAM/CRAM files)pyBigWig(bigWig files)deeptoolsintervals(BED/GTF files)\nData Access Layer\nCore Processing Engines\nTool Modules Layer\nUser Interface Layer\nCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)\nGalaxy XML Wrappers(galaxy/wrapper/*.xml)\nPython API(import deeptools.*)\nQuality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSize\nCoverage ToolsbamCoveragebamComparebigwigCompare\nAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrix\nVisualization 
ToolsplotHeatmapplotProfileplotCorrelation\ncountReadsPerBin(deeptools/countReadsPerBin.py)\ngetScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)\nheatmapper(deeptools/heatmapper.py)\nmapReduce(deeptools/mapReduce.py)\npysam(BAM/CRAM files)\npyBigWig(bigWig files)\ndeeptoolsintervals(BED/GTF files)\nSources:docs/content/list_of_tools.rst1-46pyproject.toml24-50README.md34-56\nCore Capabilities\ndeepTools provides five major functional categories that address different stages of sequencing data analysis:\n1. Quality Control and Preprocessing\nTools for assessing data quality before analysis:\nplotFingerprint- ChIP-seq enrichment assessment with quality metrics (Jensen-Shannon distance, CHANCE statistics)\nplotFingerprint\ncomputeGCBias/correctGCBias- GC bias detection and correction\ncomputeGCBias\ncorrectGCBias\nbamPEFragmentSize- Fragment size distribution analysis for paired-end data\nbamPEFragmentSize\nplotCoverage- Coverage distribution and genome coverage metrics\nplotCoverage\nestimateReadFiltering- Preview effects of filtering parameters\nestimateReadFiltering\nSources:docs/content/list_of_tools.rst18-32galaxy/wrapper/estimateReadFiltering.xml1-118\n2. Coverage Generation and Normalization\nTools for converting aligned reads (BAM) to normalized coverage tracks (bigWig/bedGraph):\nbamCoverage- Single-sample normalization with RPKM, CPM, BPM, RPGC methods\nbamCoverage\nbamCompare- Two-sample comparison (e.g., ChIP vs. input) with log2ratio, difference, mean operations\nbigwigCompare- Comparison operations on bigWig files\nbigwigCompare\nbigwigAverage- Averaging multiple bigWig files\nbigwigAverage\nSources:docs/content/list_of_tools.rst24-27CHANGES.txt28\n3. 
Data Aggregation\nTools for integrating multi-sample data:\nmultiBamSummary- Read count aggregation across BAM files (bins or BED-defined regions)\nmultiBamSummary\nmultiBigwigSummary- Score aggregation across bigWig files\nmultiBigwigSummary\ncomputeMatrix- Signal computation over genomic regions in two modes:scale-regionsandreference-point\ncomputeMatrix\nscale-regions\nreference-point\nSources:docs/content/list_of_tools.rst10-29\n4. Analysis and Statistics\nTools for deriving statistical insights:\nplotCorrelation- Pearson/Spearman correlation with hierarchical clustering and heatmap/scatter plot output\nplotCorrelation\nplotPCA- Principal component analysis for dimensionality reduction\nplotEnrichment- Feature enrichment quantification\nplotEnrichment\nSources:docs/content/list_of_tools.rst14-17docs/content/example_usage.rst32-44\n5. Visualization\nTools for creating publication-ready plots:\nplotHeatmap- Customizable heatmaps with clustering, color schemes, and multi-sample/region support\nplotHeatmap\nplotProfile- Average signal profile plots (meta-profiles) with standard error/deviation options\nplotProfile\ncomputeMatrixOperations- Matrix manipulation (filter, subset, sort, combine)\ncomputeMatrixOperations\nMultiple output formats supported: PNG, PDF, SVG, and interactive HTML (via plotly)\nSources:docs/content/list_of_tools.rst34-36CHANGES.txt187\nFile Format Ecosystem\ndeepTools operates on standard genomics file formats and performs conversions between them:\nFile Format Flow Diagram\nOutput FormatsIntermediate FormatsdeepTools ProcessingInput FormatsBAM/CRAM(aligned reads)bigWig(coverage signal)BED/GTF/GFF(genomic regions)2bit/FASTA(reference genome)bamCoveragebamComparemultiBamSummarymultiBigwigSummarycomputeMatrixcomputeGCBiasNPZ matrix(multiBamSummary output)Matrix .gz(computeMatrix output)Tabular(GC frequencies)bigWig/bedGraph(coverage tracks)PNG/PDF/SVG(static plots)HTML(interactive plotly)Tabular(data tables)\nOutput Formats\nIntermediate 
Formats\ndeepTools Processing\nInput Formats\nBAM/CRAM(aligned reads)\nbigWig(coverage signal)\nBED/GTF/GFF(genomic regions)\n2bit/FASTA(reference genome)\nbamCoverage\nmultiBamSummary\nmultiBigwigSummary\ncomputeMatrix\ncomputeGCBias\nNPZ matrix(multiBamSummary output)\nMatrix .gz(computeMatrix output)\nTabular(GC frequencies)\nbigWig/bedGraph(coverage tracks)\nPNG/PDF/SVG(static plots)\nHTML(interactive plotly)\nTabular(data tables)\nKey intermediate formats serve as bridges between data aggregation and visualization:\nNPZ files- Compressed NumPy arrays storing multi-sample read counts or scores (output frommultiBamSummary/multiBigwigSummary)\nmultiBamSummary\nmultiBigwigSummary\nMatrix .gz files- Compressed matrices of signal values over genomic regions (output fromcomputeMatrix)\ncomputeMatrix\nSources:docs/content/help_glossary.rst82-174docs/content/list_of_tools.rst8-45\nParallel Processing Framework\nAll compute-intensive operations in deepTools utilize themapReduceframework for efficient genome-scale processing:\nmapReduce Execution Model\nmapReduce Framework (deeptools/mapReduce.py)Input:BAM/bigWig filesBED regionsParametersGenome Chunking(~400k reads per chunk)Worker Pool(multiprocessing.Pool)Worker 1:Process chunk 1Worker 2:Process chunk 2Worker N:Process chunk NResult Aggregation(concatenate/merge)Output:Aggregated results\nmapReduce Framework (deeptools/mapReduce.py)\nInput:BAM/bigWig filesBED regionsParameters\nGenome Chunking(~400k reads per chunk)\nWorker Pool(multiprocessing.Pool)\nWorker 1:Process chunk 1\nWorker 2:Process chunk 2\nWorker N:Process chunk N\nResult Aggregation(concatenate/merge)\nOutput:Aggregated results\nThe framework provides:\nAutomatic parallelizationacross user-specified number of processors (via--numberOfProcessors)",
378
+ "model": "gpt-4o-2024-08-06",
379
+ "source": "selenium",
380
+ "success": true
381
+ },
382
+ "deepwiki_options": {
383
+ "enabled": true,
384
+ "model": "gpt-4o-2024-08-06"
385
+ },
386
+ "risk": {
387
+ "import_feasibility": 0.8,
388
+ "intrusiveness_risk": "medium",
389
+ "complexity": "medium"
390
+ }
391
+ }
deepTools/mcp_output/diff_report.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DeepTools Project Difference Report
2
+
3
+ **Date:** January 31, 2026
4
+ **Time:** 18:24:38
5
+ **Repository:** deepTools
6
+ **Project Type:** Python Library
7
+ **Intrusiveness:** None
8
+ **Workflow Status:** Success
9
+ **Test Status:** Failed
10
+
11
+ ## Project Overview
12
+
13
+ DeepTools is a Python library designed to facilitate the analysis and visualization of high-throughput sequencing data. It provides a suite of tools for processing and interpreting large datasets, making it an essential resource for bioinformatics research.
14
+
15
+ ## Difference Analysis
16
+
17
+ ### New Files
18
+
19
+ In this update, 8 new files have been introduced to the deepTools repository. These files likely contain new features or enhancements to existing functionalities. However, no existing files were modified, indicating that the new additions are supplementary rather than replacements or updates to current code.
20
+
21
+ ### Modified Files
22
+
23
+ There were no modifications to existing files in this update. This suggests that the core functionalities of the library remain unchanged, and the focus was on expanding capabilities or adding new features.
24
+
25
+ ## Technical Analysis
26
+
27
+ ### Workflow Status
28
+
29
+ The workflow status is marked as "success," indicating that the integration and deployment processes were completed without any errors. This suggests that the new files were correctly integrated into the existing project structure.
30
+
31
+ ### Test Status
32
+
33
+ The test status is marked as "failed," which is a critical issue. Note that the failing suite is the original project's test run (the MCP plugin checks passed), so the failures may stem from pre-existing issues or missing test dependencies rather than from the new additions alone. Either way, it is essential to identify and resolve these issues to ensure the reliability and stability of the library.
34
+
35
+ ## Recommendations and Improvements
36
+
37
+ 1. **Conduct Thorough Testing:**
38
+ - Perform detailed unit and integration testing on the new files to identify the root cause of the test failures.
39
+ - Ensure that all new functionalities are covered by test cases to prevent future issues.
40
+
41
+ 2. **Code Review:**
42
+ - Conduct a comprehensive code review of the new files to ensure adherence to coding standards and best practices.
43
+ - Identify any potential areas for optimization or refactoring.
44
+
45
+ 3. **Documentation Update:**
46
+ - Update the project documentation to include information about the new features and how they integrate with existing functionalities.
47
+ - Ensure that any new dependencies or installation instructions are clearly outlined.
48
+
49
+ 4. **Bug Fixes:**
50
+ - Prioritize fixing the issues causing test failures to restore the library's functionality.
51
+ - Implement a bug tracking system to monitor and resolve any new issues that arise.
52
+
53
+ ## Deployment Information
54
+
55
+ The deployment process was successful, indicating that the new files were correctly integrated into the project. However, due to the test failures, it is recommended to hold off on any production deployment until the issues are resolved.
56
+
57
+ ## Future Planning
58
+
59
+ 1. **Feature Expansion:**
60
+ - Continue to expand the library's capabilities by introducing new tools and functionalities that align with user needs and industry trends.
61
+
62
+ 2. **Community Engagement:**
63
+ - Engage with the user community to gather feedback on the new features and identify areas for improvement.
64
+
65
+ 3. **Regular Updates:**
66
+ - Implement a regular update schedule to ensure that the library remains up-to-date with the latest advancements in bioinformatics.
67
+
68
+ 4. **Enhanced Testing Framework:**
69
+ - Develop a more robust testing framework to catch issues earlier in the development process and improve overall software quality.
70
+
71
+ ## Conclusion
72
+
73
+ The recent update to the deepTools project has introduced new features, but the test failures highlight the need for immediate attention to ensure the library's reliability. By addressing the recommendations outlined in this report, the project can continue to provide valuable tools for the bioinformatics community while maintaining high standards of quality and performance.
deepTools/mcp_output/mcp_plugin/__init__.py ADDED
File without changes
deepTools/mcp_output/mcp_plugin/adapter.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

# Path settings: make the sibling "source" checkout importable so the
# vendored deepTools package resolves without installation.
source_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
    "source",
)
# Guard against inserting the same path repeatedly when this module is
# re-imported (mirrors the identical check done in mcp_service.py).
if source_path not in sys.path:
    sys.path.insert(0, source_path)

# Import statements -- a failure is reported but deliberately not fatal, so
# the module can still be imported; calls that reference the missing names
# will then fail at call time instead.
try:
    from deeptools.alignmentSieve import alignmentSieve
    from deeptools.bamCompare import bamCompare
    from deeptools.bamCoverage import bamCoverage
    from deeptools.computeMatrix import computeMatrix
    from deeptools.heatmapper import heatmapper
except ImportError as e:
    print(f"ImportError: {e}. Ensure the source directory is correctly set.")
18
class Adapter:
    """
    Adapter exposing deepTools functionality behind a uniform interface.

    Every ``run_*`` method wraps one deepTools entry point and returns a
    ``{"status": "success"|"error", "message": str}`` dict instead of
    raising, so callers (e.g. an MCP service layer) can handle failures
    without their own try/except.

    NOTE(review): the wrappers assume the imported deepTools callables
    accept the keyword arguments used below (``input_file=`` etc.); the
    upstream deepTools modules typically expose CLI-style ``main(args)``
    entry points instead -- confirm against the vendored source tree.
    """

    def __init__(self):
        # The tools are driven via Python imports (not subprocess calls).
        self.mode = "import"

    # -------------------------------------------------------------------------
    # Shared dispatch helper
    # -------------------------------------------------------------------------

    def _invoke(self, call, success_message, failure_prefix):
        """
        Execute *call* (a zero-argument callable) and normalize the outcome.

        The deepTools function is resolved and invoked inside the ``try`` so
        that any failure -- including a NameError when the module-level
        imports did not succeed -- is reported as an error dict, matching
        the original per-method behavior.

        Parameters:
        - call: zero-argument callable performing the actual work.
        - success_message: str, message to return on success.
        - failure_prefix: str, prefix for the error message on failure.

        Returns:
        - dict: {"status": "success"|"error", "message": str}
        """
        try:
            call()
            return {"status": "success", "message": success_message}
        except Exception as e:
            return {"status": "error", "message": f"{failure_prefix}: {e}"}

    # -------------------------------------------------------------------------
    # Alignment Sieve Module
    # -------------------------------------------------------------------------

    def run_alignment_sieve(self, input_file, output_file, **kwargs):
        """
        Filters alignments based on various criteria using alignmentSieve.

        Parameters:
        - input_file: str, path to the input BAM file.
        - output_file: str, path to the output BAM file.
        - kwargs: additional parameters forwarded to alignmentSieve.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: alignmentSieve(input_file=input_file, output_file=output_file, **kwargs),
            "Alignment sieve completed successfully.",
            "Failed to run alignment sieve",
        )

    # -------------------------------------------------------------------------
    # BAM Compare Module
    # -------------------------------------------------------------------------

    def run_bam_compare(self, bamfile1, bamfile2, output_file, **kwargs):
        """
        Compares two BAM files and generates a bigWig file using bamCompare.

        Parameters:
        - bamfile1: str, path to the first BAM file.
        - bamfile2: str, path to the second BAM file.
        - output_file: str, path to the output bigWig file.
        - kwargs: additional parameters forwarded to bamCompare.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: bamCompare(bamfile1=bamfile1, bamfile2=bamfile2, output_file=output_file, **kwargs),
            "BAM comparison completed successfully.",
            "Failed to compare BAM files",
        )

    # -------------------------------------------------------------------------
    # BAM Coverage Module
    # -------------------------------------------------------------------------

    def run_bam_coverage(self, bamfile, output_file, **kwargs):
        """
        Calculates the coverage of BAM files using bamCoverage.

        Parameters:
        - bamfile: str, path to the BAM file.
        - output_file: str, path to the output coverage file.
        - kwargs: additional parameters forwarded to bamCoverage.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: bamCoverage(bamfile=bamfile, output_file=output_file, **kwargs),
            "BAM coverage calculation completed successfully.",
            "Failed to calculate BAM coverage",
        )

    # -------------------------------------------------------------------------
    # Compute Matrix Module
    # -------------------------------------------------------------------------

    def run_compute_matrix(self, score_file, regions_file, output_file, **kwargs):
        """
        Computes a matrix of scores for genomic regions using computeMatrix.

        Parameters:
        - score_file: str, path to the score file.
        - regions_file: str, path to the regions file.
        - output_file: str, path to the output matrix file.
        - kwargs: additional parameters forwarded to computeMatrix.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: computeMatrix(score_file=score_file, regions_file=regions_file, output_file=output_file, **kwargs),
            "Matrix computation completed successfully.",
            "Failed to compute matrix",
        )

    # -------------------------------------------------------------------------
    # Heatmapper Module
    # -------------------------------------------------------------------------

    def run_heatmapper(self, matrix_file, output_file, **kwargs):
        """
        Generates heatmaps from computed matrices using heatmapper.

        Parameters:
        - matrix_file: str, path to the matrix file.
        - output_file: str, path to the output heatmap file.
        - kwargs: additional parameters forwarded to heatmapper.

        Returns:
        - dict: status of the operation.
        """
        return self._invoke(
            lambda: heatmapper(matrix_file=matrix_file, output_file=output_file, **kwargs),
            "Heatmap generation completed successfully.",
            "Failed to generate heatmap",
        )

    # End of Adapter class definition
deepTools/mcp_output/mcp_plugin/main.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MCP Service Auto-Wrapper - Auto-generated
3
+ """
4
+ from mcp_service import create_app
5
+
6
+ def main():
7
+ """Main entry point"""
8
+ app = create_app()
9
+ return app
10
+
11
+ if __name__ == "__main__":
12
+ app = main()
13
+ app.run()
deepTools/mcp_output/mcp_plugin/mcp_service.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ # Path settings to include the local source directory
5
+ source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source")
6
+ if source_path not in sys.path:
7
+ sys.path.insert(0, source_path)
8
+
9
+ from fastmcp import FastMCP
10
+ from deeptools.alignmentSieve import alignmentSieve
11
+ from deeptools.bamCompare import bamCompare
12
+ from deeptools.bamCoverage import bamCoverage
13
+ from deeptools.computeMatrix import computeMatrix
14
+ from deeptools.heatmapper import heatmapper
15
+
16
+ mcp = FastMCP("deepToolsService")
17
+
18
+ @mcp.tool(name="alignment_sieve", description="Filter alignments based on various criteria.")
19
+ def alignment_sieve(input_file: str, output_file: str, min_length: int, max_length: int) -> dict:
20
+ """
21
+ Filters alignments in a BAM file based on length criteria.
22
+
23
+ :param input_file: Path to the input BAM file.
24
+ :param output_file: Path to the output BAM file.
25
+ :param min_length: Minimum alignment length to retain.
26
+ :param max_length: Maximum alignment length to retain.
27
+ :return: Dictionary with success status and result or error message.
28
+ """
29
+ try:
30
+ alignmentSieve(input_file, output_file, min_length, max_length)
31
+ return {"success": True, "result": f"Filtered alignments saved to {output_file}"}
32
+ except Exception as e:
33
+ return {"success": False, "error": str(e)}
34
+
35
+ @mcp.tool(name="bam_compare", description="Compare two BAM files and generate a bigWig file.")
36
+ def bam_compare(bam_file1: str, bam_file2: str, output_file: str) -> dict:
37
+ """
38
+ Compares two BAM files and generates a bigWig file.
39
+
40
+ :param bam_file1: Path to the first BAM file.
41
+ :param bam_file2: Path to the second BAM file.
42
+ :param output_file: Path to the output bigWig file.
43
+ :return: Dictionary with success status and result or error message.
44
+ """
45
+ try:
46
+ bamCompare(bam_file1, bam_file2, output_file)
47
+ return {"success": True, "result": f"Comparison result saved to {output_file}"}
48
+ except Exception as e:
49
+ return {"success": False, "error": str(e)}
50
+
51
+ @mcp.tool(name="bam_coverage", description="Calculate the coverage of BAM files.")
52
+ def bam_coverage(bam_file: str, output_file: str) -> dict:
53
+ """
54
+ Calculates the coverage of a BAM file and outputs a bigWig file.
55
+
56
+ :param bam_file: Path to the BAM file.
57
+ :param output_file: Path to the output bigWig file.
58
+ :return: Dictionary with success status and result or error message.
59
+ """
60
+ try:
61
+ bamCoverage(bam_file, output_file)
62
+ return {"success": True, "result": f"Coverage data saved to {output_file}"}
63
+ except Exception as e:
64
+ return {"success": False, "error": str(e)}
65
+
66
+ @mcp.tool(name="compute_matrix", description="Compute a matrix of scores for genomic regions.")
67
+ def compute_matrix(input_file: str, output_file: str) -> dict:
68
+ """
69
+ Computes a matrix of scores for genomic regions from an input file.
70
+
71
+ :param input_file: Path to the input file.
72
+ :param output_file: Path to the output matrix file.
73
+ :return: Dictionary with success status and result or error message.
74
+ """
75
+ try:
76
+ computeMatrix(input_file, output_file)
77
+ return {"success": True, "result": f"Matrix computed and saved to {output_file}"}
78
+ except Exception as e:
79
+ return {"success": False, "error": str(e)}
80
+
81
+ @mcp.tool(name="heatmapper", description="Generate heatmaps from computed matrices.")
82
+ def generate_heatmap(matrix_file: str, output_file: str) -> dict:
83
+ """
84
+ Generates a heatmap from a computed matrix file.
85
+
86
+ :param matrix_file: Path to the matrix file.
87
+ :param output_file: Path to the output heatmap file.
88
+ :return: Dictionary with success status and result or error message.
89
+ """
90
+ try:
91
+ heatmapper(matrix_file, output_file)
92
+ return {"success": True, "result": f"Heatmap generated and saved to {output_file}"}
93
+ except Exception as e:
94
+ return {"success": False, "error": str(e)}
95
+
96
def create_app() -> FastMCP:
    """
    Return the module-level FastMCP application instance.

    All tools are registered on the shared ``mcp`` object when this module
    is imported, so this is a simple accessor used by the startup scripts.

    :return: FastMCP instance.
    """
    return mcp
deepTools/mcp_output/requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastmcp
2
+ fastapi
3
+ uvicorn[standard]
4
+ pydantic>=2.0.0
5
+ numpy >= 2.0.0
6
+ scipy >= 0.17.0
7
+ matplotlib >= 3.5.0
8
+ pysam >= 0.14.0
9
+ numpydoc >= 0.5
10
+ pyBigWig >= 0.2.1
11
+ py2bit >= 0.2.0
12
+ plotly >= 4.9
13
+ deeptoolsintervals >= 0.1.8
deepTools/mcp_output/start_mcp.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ MCP Service Startup Entry
4
+ """
5
+ import sys
6
+ import os
7
+
8
+ project_root = os.path.dirname(os.path.abspath(__file__))
9
+ mcp_plugin_dir = os.path.join(project_root, "mcp_plugin")
10
+ if mcp_plugin_dir not in sys.path:
11
+ sys.path.insert(0, mcp_plugin_dir)
12
+
13
+ from mcp_service import create_app
14
+
15
def main():
    """Start the FastMCP service in HTTP or STDIO mode.

    ``MCP_TRANSPORT=http`` serves on 0.0.0.0 at ``MCP_PORT`` (default 8000);
    any other value (or unset) falls back to STDIO transport.
    """
    app = create_app()
    # Parse the port unconditionally so a malformed MCP_PORT fails fast
    # regardless of the selected transport (matches prior behavior).
    port = int(os.environ.get("MCP_PORT", "8000"))
    if os.environ.get("MCP_TRANSPORT", "stdio") == "http":
        app.run(transport="http", host="0.0.0.0", port=port)
        return
    # STDIO is the default transport.
    app.run()


if __name__ == "__main__":
    main()
deepTools/mcp_output/workflow_summary.json ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "repository": {
3
+ "name": "deepTools",
4
+ "url": "https://github.com/deeptools/deepTools",
5
+ "local_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/deepTools",
6
+ "description": "Python library",
7
+ "features": "Basic functionality",
8
+ "tech_stack": "Python",
9
+ "stars": 0,
10
+ "forks": 0,
11
+ "language": "Python",
12
+ "last_updated": "",
13
+ "complexity": "medium",
14
+ "intrusiveness_risk": "medium"
15
+ },
16
+ "execution": {
17
+ "start_time": 1769854937.7038116,
18
+ "end_time": 1769855028.4553556,
19
+ "duration": 90.75154423713684,
20
+ "status": "success",
21
+ "workflow_status": "success",
22
+ "nodes_executed": [
23
+ "download",
24
+ "analysis",
25
+ "env",
26
+ "generate",
27
+ "run",
28
+ "review",
29
+ "finalize"
30
+ ],
31
+ "total_files_processed": 2,
32
+ "environment_type": "unknown",
33
+ "llm_calls": 0,
34
+ "deepwiki_calls": 0
35
+ },
36
+ "tests": {
37
+ "original_project": {
38
+ "passed": false,
39
+ "details": {},
40
+ "test_coverage": "100%",
41
+ "execution_time": 0,
42
+ "test_files": []
43
+ },
44
+ "mcp_plugin": {
45
+ "passed": true,
46
+ "details": {},
47
+ "service_health": "healthy",
48
+ "startup_time": 0,
49
+ "transport_mode": "stdio",
50
+ "fastmcp_version": "unknown",
51
+ "mcp_version": "unknown"
52
+ }
53
+ },
54
+ "analysis": {
55
+ "structure": {
56
+ "packages": [
57
+ "source.deeptools",
58
+ "source.deeptools.test"
59
+ ]
60
+ },
61
+ "dependencies": {
62
+ "has_environment_yml": false,
63
+ "has_requirements_txt": false,
64
+ "pyproject": true,
65
+ "setup_cfg": false,
66
+ "setup_py": false
67
+ },
68
+ "entry_points": {
69
+ "imports": [],
70
+ "cli": [],
71
+ "modules": []
72
+ },
73
+ "risk_assessment": {
74
+ "import_feasibility": 0.8,
75
+ "intrusiveness_risk": "medium",
76
+ "complexity": "medium"
77
+ },
78
+ "deepwiki_analysis": {
79
+ "repo_url": "https://github.com/deeptools/deepTools",
80
+ "repo_name": "deepTools",
81
+ "content": "deeptools/deepTools\nInstallation and Getting Started\nTypical Workflows\nCore Processing Engines\nParallel Processing Framework (mapReduce)\nRead Counting Engine (countReadsPerBin)\nScore Extraction (getScorePerBigWigBin)\nMatrix Computation Engine (heatmapper)\nCoverage Generation and Normalization\nSingle-Sample Coverage (bamCoverage)\nSample Comparison (bamCompare)\nGC Bias Correction Pipeline\nMulti-Sample Analysis Tools\nData Integration (multiBamSummary and multiBigwigSummary)\nCorrelation Analysis (plotCorrelation)\nPrincipal Component Analysis (plotPCA)\nVisualization Tools\nMatrix Generation (computeMatrix)\nMatrix Operations (computeMatrixOperations)\nHeatmap Visualization (plotHeatmap)\nProfile Plots (plotProfile)\nQuality Control Tools\nChIP-seq Enrichment Assessment (plotFingerprint)\nFragment Size Analysis (bamPEFragmentSize)\nCoverage Distribution (plotCoverage)\nFeature Enrichment (plotEnrichment)\nRead Filtering Estimation (estimateReadFiltering and alignmentSieve)\nGalaxy Integration\nGalaxy Wrapper Architecture\nTool Shed Distribution and Installation\nTesting with Planemo\nDevelopment and Contributing\nProject Structure and Configuration\nCI/CD Pipeline\nDocumentation System\nRelease Process and Versioning\nFile Formats and Data Structures\nInput File Formats\nIntermediate File Formats\nOutput Formats and Visualization\nAdvanced Topics and Optimization\nPerformance Tuning\nFiltering and Read Processing Options\nNormalization Methods Deep Dive\nSpecialized Read Processing 
Modes\nCHANGES.txt\ndocs/content/about.rst\ndocs/content/changelog.rst\ndocs/content/example_api_tutorial.rst\ndocs/content/example_gallery.rst\ndocs/content/example_step_by_step.rst\ndocs/content/example_usage.rst\ndocs/content/help_faq.rst\ndocs/content/help_faq_galaxy.rst\ndocs/content/help_galaxy_intro.rst\ndocs/content/help_glossary.rst\ndocs/content/installation.rst\ndocs/content/list_of_tools.rst\ndocs/images/Gal_FAQ_filteringDuplicates.png\ndocs/images/Gal_FAQ_info.png\ndocs/index.rst\ngalaxy/wrapper/deepTools_macros.xml\ngalaxy/wrapper/estimateReadFiltering.xml\ngalaxy/wrapper/test-data/estimateReadFiltering.txt\nscripts/convertChromsBigWig.py\nPurpose and Scope\nThis page provides a high-level introduction to deepTools: what it is, what problems it solves, and its primary capabilities for analyzing high-throughput sequencing data. For detailed installation instructions, seeInstallation and Getting Started. For specific tool usage and workflows, seeTypical Workflows.\nSources:README.md9-11docs/index.rst7-8\nWhat is deepTools?\ndeepTools is a suite of Python tools developed for efficient analysis of high-throughput sequencing data, particularly ChIP-seq, RNA-seq, and MNase-seq experiments. 
It addresses the challenge of handling large amounts of data generated from DNA sequencing centers by providing:\nNormalized coverage file generationin standard bedGraph and bigWig formats\nQuality control modulesfor assessing data quality and technical biases\nPublication-ready visualizationsfor identifying enrichments and functional genome annotations\nEfficient parallel processingusing themapReduceframework for genome-scale computations\ndeepTools is available through three distinct usage modes:\nSources:README.md9-14docs/index.rst10-14CHANGES.txt62\nSystem Architecture\nThe following diagram illustrates the primary components of deepTools and how they relate to each other:\ndeepTools Component Architecture\nData Access LayerCore Processing EnginesTool Modules LayerUser Interface LayerCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)Galaxy XML Wrappers(galaxy/wrapper/*.xml)Python API(import deeptools.*)Quality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSizeCoverage ToolsbamCoveragebamComparebigwigCompareAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrixVisualization ToolsplotHeatmapplotProfileplotCorrelationcountReadsPerBin(deeptools/countReadsPerBin.py)getScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)heatmapper(deeptools/heatmapper.py)mapReduce(deeptools/mapReduce.py)pysam(BAM/CRAM files)pyBigWig(bigWig files)deeptoolsintervals(BED/GTF files)\nData Access Layer\nCore Processing Engines\nTool Modules Layer\nUser Interface Layer\nCommand-Line Tools(bin/bamCoverage, bin/computeMatrix, etc.)\nGalaxy XML Wrappers(galaxy/wrapper/*.xml)\nPython API(import deeptools.*)\nQuality Control ToolsplotFingerprintcomputeGCBiasplotCoveragebamPEFragmentSize\nCoverage ToolsbamCoveragebamComparebigwigCompare\nAggregation ToolsmultiBamSummarymultiBigwigSummarycomputeMatrix\nVisualization 
ToolsplotHeatmapplotProfileplotCorrelation\ncountReadsPerBin(deeptools/countReadsPerBin.py)\ngetScorePerBigWigBin(deeptools/getScorePerBigWigBin.py)\nheatmapper(deeptools/heatmapper.py)\nmapReduce(deeptools/mapReduce.py)\npysam(BAM/CRAM files)\npyBigWig(bigWig files)\ndeeptoolsintervals(BED/GTF files)\nSources:docs/content/list_of_tools.rst1-46pyproject.toml24-50README.md34-56\nCore Capabilities\ndeepTools provides five major functional categories that address different stages of sequencing data analysis:\n1. Quality Control and Preprocessing\nTools for assessing data quality before analysis:\nplotFingerprint- ChIP-seq enrichment assessment with quality metrics (Jensen-Shannon distance, CHANCE statistics)\nplotFingerprint\ncomputeGCBias/correctGCBias- GC bias detection and correction\ncomputeGCBias\ncorrectGCBias\nbamPEFragmentSize- Fragment size distribution analysis for paired-end data\nbamPEFragmentSize\nplotCoverage- Coverage distribution and genome coverage metrics\nplotCoverage\nestimateReadFiltering- Preview effects of filtering parameters\nestimateReadFiltering\nSources:docs/content/list_of_tools.rst18-32galaxy/wrapper/estimateReadFiltering.xml1-118\n2. Coverage Generation and Normalization\nTools for converting aligned reads (BAM) to normalized coverage tracks (bigWig/bedGraph):\nbamCoverage- Single-sample normalization with RPKM, CPM, BPM, RPGC methods\nbamCoverage\nbamCompare- Two-sample comparison (e.g., ChIP vs. input) with log2ratio, difference, mean operations\nbigwigCompare- Comparison operations on bigWig files\nbigwigCompare\nbigwigAverage- Averaging multiple bigWig files\nbigwigAverage\nSources:docs/content/list_of_tools.rst24-27CHANGES.txt28\n3. 
Data Aggregation\nTools for integrating multi-sample data:\nmultiBamSummary- Read count aggregation across BAM files (bins or BED-defined regions)\nmultiBamSummary\nmultiBigwigSummary- Score aggregation across bigWig files\nmultiBigwigSummary\ncomputeMatrix- Signal computation over genomic regions in two modes:scale-regionsandreference-point\ncomputeMatrix\nscale-regions\nreference-point\nSources:docs/content/list_of_tools.rst10-29\n4. Analysis and Statistics\nTools for deriving statistical insights:\nplotCorrelation- Pearson/Spearman correlation with hierarchical clustering and heatmap/scatter plot output\nplotCorrelation\nplotPCA- Principal component analysis for dimensionality reduction\nplotEnrichment- Feature enrichment quantification\nplotEnrichment\nSources:docs/content/list_of_tools.rst14-17docs/content/example_usage.rst32-44\n5. Visualization\nTools for creating publication-ready plots:\nplotHeatmap- Customizable heatmaps with clustering, color schemes, and multi-sample/region support\nplotHeatmap\nplotProfile- Average signal profile plots (meta-profiles) with standard error/deviation options\nplotProfile\ncomputeMatrixOperations- Matrix manipulation (filter, subset, sort, combine)\ncomputeMatrixOperations\nMultiple output formats supported: PNG, PDF, SVG, and interactive HTML (via plotly)\nSources:docs/content/list_of_tools.rst34-36CHANGES.txt187\nFile Format Ecosystem\ndeepTools operates on standard genomics file formats and performs conversions between them:\nFile Format Flow Diagram\nOutput FormatsIntermediate FormatsdeepTools ProcessingInput FormatsBAM/CRAM(aligned reads)bigWig(coverage signal)BED/GTF/GFF(genomic regions)2bit/FASTA(reference genome)bamCoveragebamComparemultiBamSummarymultiBigwigSummarycomputeMatrixcomputeGCBiasNPZ matrix(multiBamSummary output)Matrix .gz(computeMatrix output)Tabular(GC frequencies)bigWig/bedGraph(coverage tracks)PNG/PDF/SVG(static plots)HTML(interactive plotly)Tabular(data tables)\nOutput Formats\nIntermediate 
Formats\ndeepTools Processing\nInput Formats\nBAM/CRAM(aligned reads)\nbigWig(coverage signal)\nBED/GTF/GFF(genomic regions)\n2bit/FASTA(reference genome)\nbamCoverage\nmultiBamSummary\nmultiBigwigSummary\ncomputeMatrix\ncomputeGCBias\nNPZ matrix(multiBamSummary output)\nMatrix .gz(computeMatrix output)\nTabular(GC frequencies)\nbigWig/bedGraph(coverage tracks)\nPNG/PDF/SVG(static plots)\nHTML(interactive plotly)\nTabular(data tables)\nKey intermediate formats serve as bridges between data aggregation and visualization:\nNPZ files- Compressed NumPy arrays storing multi-sample read counts or scores (output frommultiBamSummary/multiBigwigSummary)\nmultiBamSummary\nmultiBigwigSummary\nMatrix .gz files- Compressed matrices of signal values over genomic regions (output fromcomputeMatrix)\ncomputeMatrix\nSources:docs/content/help_glossary.rst82-174docs/content/list_of_tools.rst8-45\nParallel Processing Framework\nAll compute-intensive operations in deepTools utilize themapReduceframework for efficient genome-scale processing:\nmapReduce Execution Model\nmapReduce Framework (deeptools/mapReduce.py)Input:BAM/bigWig filesBED regionsParametersGenome Chunking(~400k reads per chunk)Worker Pool(multiprocessing.Pool)Worker 1:Process chunk 1Worker 2:Process chunk 2Worker N:Process chunk NResult Aggregation(concatenate/merge)Output:Aggregated results\nmapReduce Framework (deeptools/mapReduce.py)\nInput:BAM/bigWig filesBED regionsParameters\nGenome Chunking(~400k reads per chunk)\nWorker Pool(multiprocessing.Pool)\nWorker 1:Process chunk 1\nWorker 2:Process chunk 2\nWorker N:Process chunk N\nResult Aggregation(concatenate/merge)\nOutput:Aggregated results\nThe framework provides:\nAutomatic parallelizationacross user-specified number of processors (via--numberOfProcessors)",
82
+ "model": "gpt-4o-2024-08-06",
83
+ "source": "selenium",
84
+ "success": true
85
+ },
86
+ "code_complexity": {
87
+ "cyclomatic_complexity": "medium",
88
+ "cognitive_complexity": "medium",
89
+ "maintainability_index": 75
90
+ },
91
+ "security_analysis": {
92
+ "vulnerabilities_found": 0,
93
+ "security_score": 85,
94
+ "recommendations": []
95
+ }
96
+ },
97
+ "plugin_generation": {
98
+ "files_created": [
99
+ "mcp_output/start_mcp.py",
100
+ "mcp_output/mcp_plugin/__init__.py",
101
+ "mcp_output/mcp_plugin/mcp_service.py",
102
+ "mcp_output/mcp_plugin/adapter.py",
103
+ "mcp_output/mcp_plugin/main.py",
104
+ "mcp_output/requirements.txt",
105
+ "mcp_output/README_MCP.md"
106
+ ],
107
+ "main_entry": "start_mcp.py",
108
+ "requirements": [
109
+ "fastmcp>=0.1.0",
110
+ "pydantic>=2.0.0"
111
+ ],
112
+ "readme_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/deepTools/mcp_output/README_MCP.md",
113
+ "adapter_mode": "import",
114
+ "total_lines_of_code": 0,
115
+ "generated_files_size": 0,
116
+ "tool_endpoints": 0,
117
+ "supported_features": [
118
+ "Basic functionality"
119
+ ],
120
+ "generated_tools": [
121
+ "Basic tools",
122
+ "Health check tools",
123
+ "Version info tools"
124
+ ]
125
+ },
126
+ "code_review": {},
127
+ "errors": [],
128
+ "warnings": [],
129
+ "recommendations": [
130
+ "Improve test coverage by adding more unit tests for core modules",
131
+ "Implement continuous integration (CI) to automate testing and deployment",
132
+ "Update documentation to include detailed installation and usage instructions",
133
+ "Optimize large file handling to improve performance",
134
+ "Refactor code to reduce complexity and improve maintainability",
135
+ "Ensure all dependencies are clearly defined and up-to-date",
136
+ "Enhance error handling to provide more informative messages",
137
+ "Consider adding a setup.py for easier package installation",
138
+ "Improve code comments for better readability and understanding",
139
+ "Conduct a code review to identify potential improvements and optimizations."
140
+ ],
141
+ "performance_metrics": {
142
+ "memory_usage_mb": 0,
143
+ "cpu_usage_percent": 0,
144
+ "response_time_ms": 0,
145
+ "throughput_requests_per_second": 0
146
+ },
147
+ "deployment_info": {
148
+ "supported_platforms": [
149
+ "Linux",
150
+ "Windows",
151
+ "macOS"
152
+ ],
153
+ "python_versions": [
154
+ "3.8",
155
+ "3.9",
156
+ "3.10",
157
+ "3.11",
158
+ "3.12"
159
+ ],
160
+ "deployment_methods": [
161
+ "Docker",
162
+ "pip",
163
+ "conda"
164
+ ],
165
+ "monitoring_support": true,
166
+ "logging_configuration": "structured"
167
+ },
168
+ "execution_analysis": {
169
+ "success_factors": [
170
+ "Efficient execution of all workflow nodes",
171
+ "Successful generation of MCP plugin files"
172
+ ],
173
+ "failure_reasons": [],
174
+ "overall_assessment": "good",
175
+ "node_performance": {
176
+ "download_time": "Completed successfully, indicating efficient data retrieval",
177
+ "analysis_time": "Completed successfully, indicating effective code analysis",
178
+ "generation_time": "Completed successfully, indicating efficient code generation",
179
+ "test_time": "Original project tests failed, but MCP plugin tests passed"
180
+ },
181
+ "resource_usage": {
182
+ "memory_efficiency": "Memory usage data not available, unable to assess",
183
+ "cpu_efficiency": "CPU usage data not available, unable to assess",
184
+ "disk_usage": "Disk usage data not available, unable to assess"
185
+ }
186
+ },
187
+ "technical_quality": {
188
+ "code_quality_score": 75,
189
+ "architecture_score": 80,
190
+ "performance_score": 70,
191
+ "maintainability_score": 75,
192
+ "security_score": 85,
193
+ "scalability_score": 70
194
+ }
195
+ }
deepTools/source/.planemo.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Some versions of planemo don't handle symlinks
3
+ unlink galaxy/wrapper/test-data/test.bw
4
+ cp deeptools/test/test_heatmapper/test.bw galaxy/wrapper/test-data/test.bw
5
+
6
+ if [[ $1 == "1" ]] ; then
7
+ wrappers="galaxy/wrapper/alignmentSieve.xml \
8
+ galaxy/wrapper/bamCompare.xml \
9
+ galaxy/wrapper/bamCoverage.xml \
10
+ galaxy/wrapper/bamPEFragmentSize.xml \
11
+ galaxy/wrapper/bigwigCompare.xml \
12
+ galaxy/wrapper/bigwigAverage.xml \
13
+ galaxy/wrapper/computeGCBias.xml"
14
+ elif [[ $1 == "2" ]] ; then
15
+ wrappers="galaxy/wrapper/computeMatrix.xml \
16
+ galaxy/wrapper/computeMatrixOperations.xml \
17
+ galaxy/wrapper/correctGCBias.xml \
18
+ galaxy/wrapper/estimateReadFiltering.xml \
19
+ galaxy/wrapper/multiBamSummary.xml \
20
+ galaxy/wrapper/multiBigwigSummary.xml"
21
+ else
22
+ wrappers="galaxy/wrapper/plotCorrelation.xml \
23
+ galaxy/wrapper/plotCoverage.xml \
24
+ galaxy/wrapper/plotEnrichment.xml \
25
+ galaxy/wrapper/plotFingerprint.xml \
26
+ galaxy/wrapper/plotHeatmap.xml \
27
+ galaxy/wrapper/plotPCA.xml \
28
+ galaxy/wrapper/plotProfiler.xml"
29
+ fi
30
+
31
+ planemo --version
32
+ planemo lint ${wrappers}
33
+ planemo test --no_dependency_resolution --galaxy_branch $2 --install_galaxy ${wrappers} 2>&1
34
+ mkdir upload
35
+ mv tool_test_output* upload/
deepTools/source/.readthedocs.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 2
2
+
3
+ build:
4
+ os: ubuntu-22.04
5
+ tools:
6
+ python: "3.12"
7
+
8
+ sphinx:
9
+ configuration: docs/conf.py
10
+
11
+ python:
12
+ install:
13
+ - method: pip
14
+ path: .
15
+ - requirements: docs/requirements.txt
deepTools/source/CHANGES.txt ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 3.5.5
2
+ * drop support for python 3.7
3
+ * doc fixes (argparse properly displayed, minor changes in installation instructions)
4
+ * deepblue support stops
5
+ * initiate deprecation of tight_layout in plotheatmap, in favor of constrained_layout. Minor changes in paddings, etc can occur (but for the better).
6
+ * documentation changes to improve ESS tab, table constraints have been lifted & sphinx_rtd_theme to v2.0.0
7
+ * upload artifact in gh test runner pinned to 3
8
+ * Try to get the number of processors from sched_getaffinity, to avoid using too many in job submissions for example. #1199
9
+ * Fix typo in estimateScaleFactor that fixes broken argparsing. #1286
10
+
11
+ 3.5.4
12
+ * error handling and cases for bwAverage with >2 samples
13
+ * Tick.label deprecation for mpl 3.8
14
+ * minimal mpl version is 3.5
15
+ * cicd update for pypi push
16
+
17
+ 3.5.3
18
+ * requirement cap for matplotlib lifted (changes in plotting can occur)
19
+ * nose has been deprecated in favor of pytests
20
+ * pytests run with python 3.7 - 3.11
21
+ * toml file for installation, requirements, versioning and executables
22
+ * planemo tests updated to galaxy 23.1
23
+ * custom github action runner deprecated
24
+ * deprecation of np types for builtin types
25
+ * stricter label checks and validator in galaxy
26
+
27
+ 3.5.2
28
+ * new subcommand: Bigwig average #1169
29
+ * dendrogram of plotCorrelation now matches each cell correctly
30
+ * Fix label options
31
+ * add pool
32
+ * several other bugs fixed: #1159, #1185, #1172, #1181, #1183
33
+ * Fix galaxy tests, separate planemo and update pypi push only on tag releases
34
+ * upload artifact
35
+ * allow 1 or 2 lines diff for bowtie2 program
36
+ * change github action to get artifacts
37
+ * fix plotPCA
38
+ * try to fix old samtools installed
39
+ * add forgotten channels
40
+ * default chunklength increased for alignmentSieve
41
+ * chunklength in alignmentSieve is a CLI argument now
42
+ * suppress lack of index warnings from pysam
43
+ * fixedStep in bedGraph output to avoid merging bins with equal values
44
+
45
+ 3.5.1
46
+ * cmp usage is updated to fit the recent mpl updates.
47
+ * The requirements.txt is updated.
48
+ * "NA" occurrences in plotFingerprint.py have been replaced by numpy.NAN (PR #1002)
49
+ * computeMatrixOperations.xml is fixed (brought up in #1003)
50
+ * plotly error is fixed. (issue #1013)
51
+ * release version is updated in planemo.sh
52
+ * fixed galaxy tests
53
+ * A bug is taken care of in computeMatrixOperations.py / dataRange
54
+ * in plotProfile.py legend location is changed from auto to best (issue #1042)
55
+
56
+ 3.5.0
57
+
58
+ * Fixed a small issue in computeGCBias (issue #969)
59
+ * Added dataRange to computeMatrixOperations to return min,max,median and 10th and 90th percentile.
60
+ * Fixed a small typo in bamCompare. (issue #966)
61
+ * Save the output matrix of the plotheatmap in a format to be compatible with running plotheatmap on it again.(issue #953)
62
+ * Different colors can now be set by user for plotProfile --plotType heatmap (issue #956)
63
+ * Added the `auto` option to the zMin and zMax of plotHeatmap. (issue #908)
64
+ * Added `--sortUsingSamples` and `--clusterUsingSamples` to the plotHeatmap galaxy wrapper. (issue #976)
65
+
66
+ 3.4.3
67
+
68
+ * Changed iteritems() in estimateScaleFactor to its python3 compatible items().
69
+ * Added the missing argument (--clusterUsingSamples) to plotProfile.
70
+
71
+ 3.4.2
72
+
73
+ * Programmed around a bug in matplotlib that prevented the plotCorrelation scatter plot from working. See https://bioinformatics.stackexchange.com/questions/12830/plot-correlation-between-several-bam-files/12831
74
+
75
+ 3.4.1
76
+
77
+ * Prevented temporary bedGraph files from being written to (possibly small) shared-memory drives even when TMPDIR is set to somewhere else. Now shared memory is only used if requested by setting TMPDIR (or other appropriate environment variables) to `/dev/shm`.
78
+ * Fixed a bug in bamPEFragmentSize that caused incompatibility with newer matplotlib releases. (issue #928)
79
+
80
+ 3.4.0
81
+
82
+ * Fixed a bug in one of the Galaxy wrappers.
83
+ * Added the `--lineAtTickMarks` option to `plotHeatmap` so that there are dashed vertical lines for each tick mark in the plot. (issue #924)
84
+
85
+ 3.3.2
86
+
87
+ * Fixed --yAxisLabel in plotProfile (issue #889)
88
+ * Fixed a small X-axis tick offset issue. This caused the location of tick marks in profile plots to be shifted to the left by 0.5 to 1 bin. This was generally not notable, only really appearing when very few bins (e.g., 4) were used. The issue was mostly that the end tick would appear after the end of the plot, since its coordinate was the end of the bin. (issue #888)
89
+ * multiBamSummary and multiBigwigSummary no longer exclude small bins at the end of genomic chunks. multiBamSummary now has a `--genomicChunkSize` option in case users need to control the size of the genome used for multiprocessing for consistency. (issue #887)
90
+ * Added 4 new colormaps, which were copied from the seaborn project (issue #879). These are: rocket, mako, vlag, and icefire.
91
+ * Fixed an issue in the Galaxy wrapper of plotCorrelation where the X and Y.
92
+ * Fixed an issue with the `--Offset` option, where a single negative value wouldn't include only a single position, but rather that base through the end of the read. (stems from issue #902)
93
+ * Clustered output from plotHeatmap and plotProfile now allow computing the silhouette score of each row. This is printed in the returned BED file as the last column.
94
+
95
+ 3.3.1
96
+
97
+ * Fixed `--plotNumbers` not working in `plotCorrelation`. This was issue #838.
98
+ * Fixed compatibility with matplotlib 3 and restrict to at least that version.
99
+ * The Y-axis labels should once again appear in both plotHeatmap and plotProfile (issue #844). This was related to the previous point.
100
+ * Testing is no longer performed with python 2.7, which will reach end of life in a couple months.
101
+ * Various documentation updates (issues #868, #867 and #851).
102
+ * Increased support for BED files with track header lines (issue #866).
103
+
104
+ 3.3.0
105
+
106
+ * `plotCoverage` now has a `--BED` option, to restrict plots and output to apply to a specific set of regions given by a BED or GTF file or files (issue #829).
107
+ * `plotCoverage` now has a `--DepthSummary` option, which produces a summary similar to GATK's DepthOfCoverage (issue #828).
108
+ * `plotCoverage` is now able to compute coverage metrics for arbitrary coverage thresholds using multiples of the `-ct` option (e.g., `-ct 0 -ct 10 -ct 20 -ct 30`).
109
+
110
+ 3.2.1
111
+
112
+ * Changed a bug in `estimateReadFiltering` where the estimated number of filtered reads was typically too low.
113
+ * Made an internal change that should drastically reduce the memory requirements of many tools. This slightly increases run time, but as the resulting resource usage is much more attractive this is judged worthwhile.
114
+ * An informative error message is now produced with `bamCoverage` if RPGC normalization is requested but no effective genome size is provided (issue #815).
115
+ * Fixes some issues with y-axis scaling (issue #822)
116
+
117
+ 3.2.0
118
+
119
+ * Added access in the Galaxy wrapper to the `--labels` option in most tools (issue #738)
120
+ * Added the `std` plot type to plotProfile in Galaxy (issue #782)
121
+ * `bamCompare` now has a `--skipZeroOverZero` option to allow skipping bins where both input files lack coverage (issue #785)
122
+ * `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
123
+ * `multiBamSummary` now has a `--scaleFactors` option, which computes scale factors in the same manner as DESeq2 to a file. Note that the produced scaling factors are meant to be used with `bamCoverage`. If you want to use them directly in DESeq2 (or a similar package) you will need to invert them (take 1/scale factor). (issue #800)
124
+ * Fixed an issue with large numbers of samples and small genome sizes sometimes causing nothing to be processed. (issue #801)
125
+
126
+ 3.1.3
127
+
128
+ * Added the `--legendLocation` option in the Galaxy wrappers for plotProfile and plotHeatmap
129
+ * More thoroughly checked that output files can be written (issue #764).
130
+ * `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
131
+
132
+ 3.1.2
133
+
134
+ * Added a `--markers` option to `plotPCA`, courtesy of @sklasfeld.
135
+ * `computeMatrixOperations rbind` now properly supports multiple region groups (issue #742)
136
+ * Fixed the usage of `--xRange` and `--yRange` with `plotCorrelation` (issue #709)
137
+
138
+ 3.1.1
139
+
140
+ * Fixed the `--outFileNameData` option in `plotProfile` when `computeMatrix reference-point --referencePoint center` was used. This caused an error previously. (issue #727)
141
+ * RPGC normalization and the `--scaleFactor` option in `bamCoverage` are no longer mutually exclusive.
142
+ * Increased the default plot width in plotPCA (issue #738)
143
+
144
+ 3.1.0
145
+
146
+ * The `--centerReads` option in `bamCoverage` is now compatible with `--Offset` (previously `--centerReads` was silently ignored if `--Offset` was specified). (issue #693)
147
+ * `bamCoverage` and `bamCompare` now have an `--exactScaling` option. Instead of using a random sample of alignment to compute the scaling factor, this causes all reads in the file to be used. This is significantly slower, but helpful in situations where reads that should be excluded clump together on the genome (i.e., when sampling based on location is likely to be inaccurate).
148
+ * `plotCorrelation --whatToPlot scatterplot` now has `--xRange` and `--yRange` options rather than just `--maxRange`. (issue #709)
149
+ * `computeMatrixOperations` can now be used to change sample and group names.
150
+ * `computeMatrixOperations` can now filter rows by minimum and/or maximum value.
151
+ * `--maxThreshold` and `--minThreshold` are now more consistently honoured. (#702)
152
+ * Fixed region handling when using files on deepBlue (#700)
153
+ * Using `--normalizeUsing RPGC` with `bamCompare` will now result in a fatal error, rather than a simple warning and the settings being changed under the hood. (#718)
154
+ * Related to the last point, setting `--normalizeUsing` to anything other than `None` will result in an error unless `--scaleFactorsMethod None` is also used. This is to prevent people from accidentally getting unintended normalization.
155
+ * bamPEFragmentSize no longer explodes its memory use with multiple large BAM/CRAM files (#720). Many other tools will also benefit from this change.
156
+
157
+ 3.0.2
158
+
159
+ * Fixed an issue regarding under sampling alignments in some cases with computing scaling factors. This was issue #690. The resolution isn't perfect, it's hard to know how many reads really need to be sampled for things like RNA-seq.
160
+ * `computeMatrix` now has a `--verbose` option. Setting this will drastically increase the verbosity of the messages sent to the screen. Only do this for debugging. `--quiet` will disable this completely (as well as all other messages printed to screen).
161
+ * Fixed handling of `--sortUsing region_length` in `plotHeatmap`. This now works properly for `--referencePoint center` and `--referencePoint TES`, where in the latter case the dashed line is drawn at the region start. The documentation has been updated to mention this. (issue #671)
162
+ * The reference point label specified by `computeMatrix reference-point` is now respected by plotHeatmap and plotProfile. So if you used `computeMatrix reference-point --referencePointLabel center` then 'center' will now appear as the tick label in your heatmaps and profiles automatically. (issues #606 and #683)
163
+ * Enabled using regions with a `.` in the chromosome name in the Galaxy wrappers (issue #692)
164
+
165
+ 3.0.1
166
+
167
+ * Fixed the `--perGroup` option in plotProfile and plotHeatmap when multiple groups were being used. In version 3.0.0, this would typically cause an error and deepTools to crash. (issue #673)
168
+ * Fixed a few issues with the Galaxy wrappers. Thanks to Ralf Gilsbach, Claudia Keller, and @bgruening (e.g., issue #678)
169
+
170
+ 3.0.0
171
+
172
+ * `plotCorrelation` now has `--log1p` and `--maxRange` options if a scatter plot is produced. `--log1p` plots the natural log of the values (plus 1). `--maxRange` sets the maximum X and Y axis ranges. If they would normally be below this value then they are left unchanged. (issue #536)
173
+ * The PCA plot now includes "% of var. explained" in the top axis labels. (issue #547)
174
+ * `plotProfile` and `plotHeatmap` now have a `--labelRotation` option that can rotate the X-axis labels. This is one of the more common requests for customization. For further customization, please modify your .matplotlibrc file or save as a PDF and modify further in Illustrator or a similar program. (issue #537)
175
+ * The `--ignoreDuplicates` algorithm has been updated to better handle paired-end reads. (issue #524)
176
+ * Added the `estimateReadFiltering` tool to estimate how many reads would be filtered from a BAM file or files if a variety of desired filtering criterion are applied (issue #518).
177
+ * Rewrote the bigWig creation functions so there are no longer steps involving creating a single large bedGraph and then sorting it. That was a hold-over from previous versions that used UCSC tools. This was issue #546. This also means that there are no longer any required external programs (previously, only `sort` was required).
178
+ * `plotPCA` can now be run on the transposed matrix, as is typically done with RNAseq data (e.g., with deepTools). Further, matplotlib is now no longer used for computing the PCA, but rather an SVD is performed and the results directly used. The options `--transpose` and `--ntop` were also added. The former computes the PCA of the transposed matrix and the latter specifies how many of the most variable rows in the matrix to use. By default, the 1000 most variable features are used. In the (now optional) plot, the `--PCs` option can now be used to specify which principal components to plot. Finally, the unbiased standard deviation is used in the out, as is done by `prcomp()` in R. This was issue #496.
179
+ * Symbol colors for `plotPCA` can now be specified. (issue #560)
180
+ * `plotFingerprint` always returns the synthetic JSD, even if no `--JSDsample` is specified. (issue #564)
181
+ * `plotEnrichment` will only read in annotation files a single time rather than in each thread. This prevents terrible performance when using many tens of millions of BED/GTF regions at the expense of a slight memory increase. (issue #530)
182
+ * Fixed a small bug generally affecting `plotFingerprint` where BAM files without an index were processed as bigWig files, resulting in a confusing error message (issue #574). Thanks to Sitanshu Gakkhar for pointing this out!
183
+ * `bamPEFragmentSize` now has `--table` and `--outRawFragmentLengths` options. The former option will output the read/fragment metrics to a file in tabular format (in addition to the previous information written to the screen). The latter option will write the raw read/fragment counts to a tsv file. The format of the file is a line with "#bamPEFragmentSize", followed by a header line of "Size\tOccurences\tSample", which should facilitate processing in things like R. (issue #572)
184
+ * `bamPEFragmentSize` will now plot the read length distribution for single-end BAM files. Note that if you mix single and paired-end files that the resulting plots may be difficult to interpret.
185
+ * The various plot commands do not actually have to plot anything, instead they can optionally only print their raw metrics or other text output. This is mostly useful with large numbers of input files, since the resulting plots can become quickly crowded. (issue #571)
186
+ * Expanded the metrics output by `bamPEFragmentSize` such that it now fully replaces Picard CollectInsertSizeMetrics (issue #577).
187
+ * "plotly" is now available as an output image format for all tools. Note that this is not really an image format, but rather an interactive webpage that you can open in your browser. The resulting webpages can be VERY large (especially for `plotHeatmap`), so please keep that in mind. Further, plotly does not currently have the capabilities to support all of deepTools' features, so note that some options will be ignored. For privacy reasons, all plotly files are saved locally and not uploaded to the public plot.ly site. You can click on the "Export to plot.ly" link on the bottom right of plotly output if you would like to modify the resulting files.
188
+ * `bamCoverage` no longer prints `normalization: depth` by default, but rather a more accurate message indicating that the scaling is performed according to the percentage of alignments kept after filtering. This was originally added in #366 (issue #590).
189
+ * The output of `plotFingerprint --outRawCounts` now has a header line to facilitate identification by MultiQC.
190
+ * `plotPCA` now has a `--log2` option, which log2 transforms the data before computing the PCA. Note that 0.01 is added to all values so that 0 doesn't become -infinity.
191
+ * `computeGCBias` no longer requires a fragment length for paired-end datasets. This was apparently always meant to be the case anyway. (issue #595)
192
+ * `computeMatrixOperations sort` can now properly perform filtering of individual regions, as was originally intended (issue #594)
193
+ * `plotCoverage --outRawCounts` now has another line in its header, which is meant to aid MultiQC.
194
+ * There is no longer a configuration file. The default number of threads for all tools is 1. See issue #613.
195
+ * `bamCoverage` and `bamCompare` have rewritten normalization functions. They have both added CPM and BPM normalization and, importantly, filtering is now done **before** computing scaling factors. A few of the options associated with this (e.g., `--normalizeUsingRPKM`) have been replaced with the `--normalizeUsing` option. This behavior represents a break from that seen in earlier versions but should be easier to follow and more in line with what users expect is happening. The syntax for normalization has been reworked multiple times (see #629).
196
+ * Fixed issue #631
197
+ * `computeMatrix` now repeats labels for each column in a plot. This is convenient if you later want to merge reference-point and scale-regions runs and still have correct tick marks and labels in plotHeatmap/plotProfile (issue #614). Note that the output of computeMatrix and computeMatrixOperations can not be used with older versions of deepTools (but output from previous versions can still be used).
198
+ * `plotHeatmap --sortRegions` now has a `keep` option. This is identical to `--sortRegions no`, but may be clearer (issue #621)
199
+ * `plotPCA --outFileNameData` and `plotCorrelation --outFileCorMatrix` now produce files with a single comment line (i.e., '#plotPCA --outFileNameData' and '#plotCorrelation --outFileCorMatrix'). These can then be more easily parsed by programs like MultiQC.
200
+ * All functions that accept file labels (e.g., via a `--samplesLabel` option) now also have a `--smartLabels` option. This will result in labels comprised of the file name, after stripping any path and the file extension. (issue #627)
201
+ * The `-o` option can now be universally used to indicate the file to save a tool's primary output. Previously, some tools use `-o`, some used `-out` and still others used things like `-hist` or `-freq`. This caused annoyance due to having to always remember the appropriate switch. Hopefully standardizing to `-o` will alleviate this. (issue #640)
202
+ * Using a --blackListFileName with overlapping regions will typically now cause the various deepTools programs to stop. This is to ensure that resulting scale factors are correct (issue #649)
203
+ * `bamCoverage` is a bit more efficient with small BAM files now due to underlying algorithmic changes. Relatedely, bamCoverage will skip some unnecessary estimation steps if you are not filtering reads, further speeding processing a bit. (issue #662)
204
+ * Added support for CRAM files. This requires pysam > 0.13.0 (issue #619).
205
+
206
+ 2.5.7
207
+
208
+ * Fixed a small bug that caused computation to stop. This was related to a change made for release 2.5.5.
209
+
210
+ 2.5.6
211
+
212
+ * Fixed a bug where deepTools in python3 can't handle npz file labels created under python 2.
213
+
214
+ 2.5.5
215
+
216
+ * Updated blacklist handling such that an error is thrown on overlapping regions.
217
+
218
+ 2.5.4
219
+
220
+ * Fixed issue #612, which only occurs when unaligned reads have a position assigned to them.
221
+ * Ticks in the profile plot at the top of the output of `plotHeatmap` should now always line up properly. (issue #616)
222
+
223
+ 2.5.3
224
+
225
+ * Fixed a bug in `plotEnrichment`, the `--keepExons` option with a BED12 file would cause an error. (issue #559)
226
+ * `bamCoverage` now doesn't cause an error to be thrown by `sort` if there are "/spaces in quoted path/". (issue #558)
227
+
228
+ 2.5.2
229
+
230
+ * Fixed a bug in `bamCoverage` that can cause crashes when python3 is used.
231
+ * Fixed a bug in the multiBigwigSummary Galaxy wrapper.
232
+ * A more reasonable exit code (not 0) is now returned if there's a mismatch in the label and file number.
233
+ * `plotFingerprint` no longer tries to use illegal line designators (issue #538)
234
+ * Various documentation fixes
235
+
236
+ 2.5.1
237
+
238
+ * Added universal new line support to deeptoolsintervals (issue #506).
239
+ * Fixed a few issues with correctGCBias under python 3.5 (thanks to @drakeeee)
240
+ * Setting `--minThreshold 0.0` or `--maxThreshold 0.0` now works properly. Previously, setting either of these to 0 was ignored. (issue #516)
241
+ * You can now specify the plot width and height in `plotPCA` and `plotCorrelation` (heatmap only) with the `--plotWidth` and `--plotHeight` parameters. (issue #507)
242
+ * plotCoverage no longer clips the top off of plots. Further, you can now set the plot width and height with `--plotWidth` and `--plotHeight`. (issue #508)
243
+ * In bamCoverage, specifying `--filterRNAstrand` no longer results in `--extendReads` being ignored. (issue #520)
244
+ * `plotFingerprint` and `plotEnrichment` no longer require producing a plot, which is useful if you only need QC metrics and are using a LOT of samples (such that matplotlib would crash anyway). This hasn't been implemented in Galaxy, but can if people would like it. (issues #519 and #526)
245
+ * `computeMatrix` now accepts a `--samplesLabel` option, which is useful in those cases when you aren't immediately running `plotHeatmap` and don't have terribly descriptive file names (issue #523)
246
+ * If you use `plotFingerprint` with the `--JSDsample` option and forget to list that file under `--bamfiles` it will be added automatically and the file name added to the labels if needed (issue #527)
247
+ * Various Galaxy wrapper fixes
248
+
249
+ 2.5.0
250
+
251
+ * Fix a bug where using regions with the same name in multiple BED files in computeMatrix caused downstream problems in plotHeatmap/plotProfile (issue #477).
252
+ * If computeMatrix/plotHeatmap/plotProfile is asked to sort the output matrix, it now does so by ignoring NaN values. Previously, any row with an NaN was placed at the top of the output (issue #447).
253
+ * Fixed issue #471
254
+ * Various Galaxy wrapper fixes
255
+ * There is now a `--rowCenter` option in `plotPCA`, which can be used to make each row of the matrix used in the PCA to have a mean of 0. This can be useful in cases where there's extreme region-based depth variation that is shared between all samples. This was issue #477.
256
+ * The --Offset option is now available in `plotEnrichment`. This was issue #481.
257
+ * The maximum coverage allowed while calculating the Jensen-Shannon distance in `plotFingerprint` has been increased to 2 million and an informational message containing the number of bins above this value is printed to the standard output.
258
+ * `bamCoverage` now respects the `--scaleFactor` argument even if no other normalization is performed (issue #482).
259
+ * The `--minFragmentLength` and `--maxFragmentLength` options now respect single-end reads. For SE reads, these parameters refer to the number of aligned bases (i.e., splicing is ignored). This was issue #489.
260
+ * `--yMin` and `--yMax` can now be lists of values in `plotHeatmap`. This was issue #487. Note that the plots are not perfectly aligned if you do this.
261
+
262
+ 2.4.3
263
+
264
+ * Fixed incorrect label ordering in the `plotCorrelation` command with the `--outFileCorMatrix` options.
265
+ * Fixed bug #491, which involved python 3 and bamCoverage.
266
+
267
+ 2.4.2
268
+
269
+ * Fixed an issue where `computeMatrix reference-point --referencePoint center` would break if 1-base regions were used. This was bug #456.
270
+ * `plotCorrelation` with `--outFileCorMatrix` now works with `--labels` again (thanks to @sklasfeld for supplying the patch).
271
+ * `bigwigCompare` and `bamCompare` can now return the average (mean) of two input files (issue #467).
272
+
273
+ 2.4.1
274
+
275
+ * Setting --zMin to the same value as --zMax, whether intentionally or because the --zMax value computed by deepTools happens to be no larger than the desired value, will result in the maximum value in the dataset being used (internally, --zMax gets set to None).
276
+ * Scale factor is now set to 1 in bamCoverage if no normalization is used. The fact that this wasn't being done previously was a bug.
277
+ * Fixed a bug (#451) affecting BED files with a `deepTools_group` column that caused a problem with `--sortRegions keep` in computeMatrix.
278
+ * Fixed a bug where some matrices produced with `computeMatrixOperations cbind` would result in the right-most samples sometimes getting squished due to having ticks outside of their graph bounds. Ticks are now scaled if they don't match the data range (issue #452).
279
+ * In plotFingerprint, the number of reads per-bin are no longer used. Instead, the sum of the per-base coverage (or signal if bigWig input is used) is used. This leads to more similar metrics produced by us and others regarding things like Jensen-Shannon metrics. For those just interested in the plots, there's little effective change here.
280
+
281
+ 2.4.0
282
+
283
+ * The --Offset option to bamCoverage can now take two values, which can be used to specify a range within each alignment of bases to use. As an example, `--Offset 5 -1` will ignore the first 4 bases of an alignment (accounting for orientation) and use only the 5th through last base. This can be useful for things like ATACseq (see #370).
284
+ * Read extension can now be used in conjunction with --Offset in bamCoverage.
285
+ * plotFingerprint can now output quality metrics, including the Jensen-Shannon distance if a reference sample is specified (see #328). Additionally, various statistics from CHANCE can be produced.
286
+ * Switched from using the 'twobitreader' python module to our new custom 'py2bit' module for accessing 2bit files. This fixes the performance regression seen in computeGCBias starting in version 2.3.0 (#383).
287
+ * `bigwigCompare`, `computeMatrix`, and `multiBigwigSummary` can read signal files hosted on [deepBlue](http://deepblue.mpi-inf.mpg.de/).
288
+ * Fixed a minor bug in `deeptools`, where the `--version` option was ignored (see #404).
289
+ * Text in SVG and PDF files is now actual text and not a path (see #403).
290
+ * The `--maxFragmentLength` option in bamCoverage now alters the `maxPairedFragmentLength` that is otherwise hard-coded (see #410).
291
+ * Added the `computeMatrixOperations` tools, which can be used to sort/reorder/subset/filter/combine the output of `computeMatrix`.
292
+ * `computeMatrix --sortRegions` has a new `keep` option, which is the default. This mimics the behavior in deepTools prior to 2.3.0 where the output order matched the input order. This is, of course, a bit slower, so if the order doesn't matter then use `no`.
293
+ * Fixed issue #435, where `plotHeatmap --sortRegions region_length` would crash with an error.
294
+ * Output bedGraph files are now sorted (#439).
295
+ * Values stored in bedGraph files (and therefore placed into bigWig files) now use python's "general" format with 6 digits of precision. This tends to produce slightly larger files, but with less loss for values near 0 (see #438).
296
+ * Corrected how computeGCBias determines the lambda parameter, which should only really affect very atypical experiments (i.e., correctGCBias would have crashed if this greatly affected you).
297
+
298
+ 2.3.6
299
+
300
+ * multiBamSummary will now not automatically append .npz to the output file name if it's not present. This was bug #436
301
+ * Fixed a bug with plotHeatmap where --yMin and --yMax didn't work
302
+
303
+ 2.3.5
304
+
305
+ * Various Galaxy wrapper fixes (e.g., issue #415 and #417)
306
+ * Fixed issue #413, wherein the --nanAfterEnd option sometimes causes computeMatrix to throw an error.
307
+ * Fixed issue #416, wherein --outRawCounts in multiBamSummary and multiBigwigSummary would cause an error if python3 was being used.
308
+
309
+ 2.3.4
310
+
311
+ * Fixed bug #405, which dealt with the SES normalization in bamCompare (it was producing an error and terminating the program).
312
+ * Fixed bug #407, which dealt with multiBamSummary or multiBigwigSummary bins and saving the raw data. This was causing an error and the program to terminate.
313
+
314
+ 2.3.3
315
+
316
+ * Fixed a bug wherein proper pairs were being incorrectly called improper pairs, thereby causing slightly incorrect read extension.
317
+
318
+ 2.3.2
319
+
320
+ * The deeptoolsinterval module was modified to speed up plotEnrichment, which was taking forever to finish.
321
+
322
+ 2.3.1
323
+
324
+ * This release has no real code changes, the 2.3.0 release on pypi was missing files.
325
+
326
+ 2.3.0
327
+
328
+ * Modified how normalization is done when filtering is used. Previously, the filtering wasn't taken into account when computing the total number of alignments. That is now being done. Note that this uses sampling and will try to sample at least 100000 alignments and see what fraction of them are filtered. The total number of aligned reads is then scaled accordingly (#309).
329
+ * Modified how normalization is done when a blacklist is used. Previously, the number of alignments overlapping a blacklisted region was subtracted from the total number of alignments in the file. This decreased things a bit too much, since only alignments falling completely within a blacklisted region are actually excluded completely (#312).
330
+ * BED12 and GTF files can now be used as input (issue #71). Additionally, multiBamSummary, multiBigwigSummary and computeMatrix now have a --metagene option, which allows summarization over concatenated exons, rather than include introns as well (this has always been the default). This was issue #76.
331
+ * Read extension is handled more accurately, such that if a read originates outside of a bin or BED/GTF region that it will typically be included if the --extendReads option is used and the extension would put it in a given bin/region.
332
+ * deepTools now uses a custom interval-tree implementation that allows including metadata, such as gene/transcript IDs, along with intervals. For those interested, the code for this available separately (https://github.com/dpryan79/deeptools_intervals) with the original C-only implementation here: https://github.com/dpryan79/libGTF.
333
+ * The API for the countReadsPerBin, getScorePerBigWigBin, and mapReduce modules has changed slightly (this was needed to support the --metagene option). Anyone using these in their own programs is encouraged to look at the modified API before upgrading.
334
+ * Added the `plotEnrichment` function (this was issue #329).
335
+ * There is now a `subsetMatrix` script available that can be used to subset the output of computeMatrix. This is useful for preparing plots that only contain a subset of samples/region groups. Note that this isn't installed by default.
336
+ * The Galaxy wrappers were updated to include the ability to exclude blacklisted regions.
337
+ * Most functions (both at the command line and within Galaxy) that process BAM files can now filter by fragment length (--minFragmentLength and --maxFragmentLength). By default there's no filtering performed. The primary purpose of this is to facilitate ATACseq analysis, where fragment length determines whether one is processing mono-/di-/poly-nucleosome fragments. This was issue #336.
338
+ * bamPEFragmentSize now has --logScale and --maxFragmentLength options, which allow you to plot frequencies on the log scale and set the max plotted fragment length, respectively. This was issue #337.
339
+ * --blackListFileName now accepts multiple files.
340
+ * bamPEFragmentSize now supports multiple input files.
341
+ * If the sequence has been removed from BAM files, SE reads no longer cause an error in bamCoverage if --normalizeTo1x is specified. In general, the code that looks at read length now checks the CIGAR string if there's no sequence available in a BAM file (for both PE and SE datasets). This was issue #369.
342
+ * bamCoverage now respects the --filterRNAstrand option when computing scaling factors. This was issue #353.
343
+ * computeMatrix and plotHeatmap can now sort using only a subset of samples
344
+ * There is now an --Offset option to bamCoverage, which allows having the signal at a single base. This is useful for things like RiboSeq or GROseq, where the goal is to get focal peaks at single bases/codons/etc.
345
+ * The --MNase option to `bamCoverage` now respects --minFragmentLength and --maxFragmentLength, with defaults set to 130 and 200.
346
+
347
+ 2.2.4
348
+
349
+ * Fix the incorrectly oriented dendrogram in plotCorrelation (issue #350). Relatedly, we're bumping the minimum version of scipy required to one where this is correct.
350
+
351
+ 2.2.3
352
+
353
+ * Fixed issue #334, where computeGCBias wasn't properly handling the black list option.
354
+
355
+ 2.2.2
356
+
357
+ * Fixed labels when hierarchical clustering is used (they were off by one previously).
358
+ * Fixed a bug wherein bamCompare couldn't work with a blacklist
359
+ * Fixed yet another change in pysam, though at least in this case it was fixing a previous problem
360
+
361
+ 2.2.1
362
+
363
+ * Fixed a bug introduced in version 2.2.0 wherein sometimes a pre-2.2.0 produced matrix file could no longer be used with plotHeatmap or plotProfile (this only happened when --outFileNameData was then used).
364
+ * Finally suppressed all of the runtime warnings that numpy likes to randomly throw.
365
+ * Worked around an undocumented change in pysam-0.9.0 that tended to break things.
366
+
367
+ 2.2.0
368
+
369
+ * plotFingerprint now iterates through line styles as well as colors. This allows up to 35 samples per plot without repeating (not that that many would ever be recommended). This was issue #80.
370
+ * Fixed a number of Galaxy wrappers, which were rendered incorrectly due to including a section title of "Background".
371
+ * A number of image file handles were previously not explicitly closed, which caused occasional completion of a plot* program but without the files actually being there. This only happened on some NFS mount points.
372
+ * The Galaxy wrappers now support the `--outFileNameData` option on plotProfile and plotHeatmap.
373
+ * Added support for blacklist regions. These can be supplied as a BED file and the regions will largely be skipped in processing (they'll also be ignored during normalization). This is very useful to skip regions known to attract excess signal. This was issue #101.
374
+ * Modified plotPCA to include the actual eigenvalues rather than rescaled ones. Also, plotPCA can now output the underlying values (issue #231).
375
+ * Regions within each feature body can now be unscaled when using `computeMatrix`. Thus, if you're interested in unscaled signal around the TSS/TES then you can now use the `--unscaled5prime` and `--unscaled3prime` options. This was issue #108.
376
+ * bamCoverage now has a `--filterRNAstrand` option, that will produce coverage for only a single strand. Note that the strand referred to is the DNA strand and not sense/anti-sense.
377
+ * Issues with plotHeatmap x-axis labels were fixed (issue #301).
378
+
379
+ 2.1.1
380
+
381
+ * Fixed a how the --hclust option was handled in plotHeatmap/plotProfile. This gets around a quirk in scipy.
382
+ * A bug involving processing comment lines in BED files was corrected (issue #288)
383
+ * The Galaxy wrappers are now automatically tested with each modification.
384
+ * plotCoverage and plotFingerprint in Galaxy now accept 1 or more BAM files rather than at least 2 files.
385
+
386
+ 2.1.0
387
+
388
+ * Updates to many of the Galaxy wrappers and associated documentation.
389
+ * A bug was fixed in how chromosome names were dealt with in bigWig files. If you ever received errors due to illegal intervals then that should now be fixed. This was issue #250
390
+ * plotProfile now has an --outFileNameData option for saving the underlying data in a text format.
391
+ * correctGCBias ensures that the resulting BAM file will pass picard/HTSJDK's validation if the input file did (issue #248)
392
+ * The default bin size was changed to 10, which is typically a bit more useful
393
+ * The --regionsLabel option to plotProfile and plotHeatmap now accepts a space-separated list, in line with --samplesLabel
394
+ * BAM files that have had their sequences stripped no longer cause an error
395
+ * bamPEFragmentSize now has -bs and -n options to allow adjusting the number of alignments sampled. Note that the default value is auto-adjusted if the sampling is too sparse.
396
+ * bamPEFragmentSize now accepts single-end files.
397
+ * The --hclust option to plotProfile and plotHeatmap continues even if one of the groups is too small for plotting (matplotlib will produce a warning that you can ignore). This was issue #280.
398
+
399
+ 2.0.1
400
+
401
+ * A critical bug that prevented plotPCA from running was fixed.
402
+ * multiBamCoverage was renamed to multiBamSummary, to be in better alignment with multiBigwigSummary.
403
+ * computeGCBias and correctGCBias are now more tolerant of chromosome name mismatches.
404
+ * multiBigwigSummary and multiBamSummary can accept a single bigWig/BAM input file, though one should use the
405
+ --outRawCounts argument.
406
+
407
+ 2.0.0
408
+
409
+ * Documentation improved and migrated to http://deeptools.readthedocs.org The API to use deepTools modules is now
410
+ part of the documentation and includes a tutorial.
411
+ * Allow multiple bigwig files in computeMatrix that can be clustered together
412
+ * computeMatrix now accepts multiple bed files. Each bed file is considered as a group. Labels are automatically
413
+ added based on the file names.
414
+ * When computing read coverage now spliced reads are understood. This is convenient for computing the
415
+ coverage for RNA-seq data.
416
+ * New quality control tool 'plotCoverage' to plot the coverage over base pairs for multiple samples
417
+ * renaming of --missingDataAsZero to --skipNonCovered regions for clarity in bamCoverage and bamCompare
418
+ * New analysis tool plotPCA that visualizes the results from principal component analysis
419
+ * New option in bamCoverage `--MNase` that will compute the read coverage only considering 2 base pairs at the
420
+ center of the fragment.
421
+ * Make read extension optional. Remove the need to specify a default fragment length for most of the tools. Now, when
422
+ read extension is enabled and the bam files contain paired-end data, the mean fragment length is automatically
423
+ calculated by sampling the read pairs in the bam file. The --doNotExtendPairedEnds and --fragmentLength parameters
424
+ are no longer used and the new --extendReads parameter was added.
425
+ * Dramatically improved bigwig related tools by using the new pyBigWig module. Eliminated the requirement for the
426
+ UCSC program `bigWigInfo`
427
+ * renamed heatmapper to plotHeatmap and profiler to plotProfile
428
+ * added hierarchical clustering, besides k-means to plotProfile and plotHeatmap
429
+ * improved plotting features for plotProfile when using 'overlapped_lines' and 'heatmap' plot types
430
+ * Resolved an error introduced by numpy version 1.10 in computeMatrix
431
+ * plotting of correlations (from bamCorrelate or bigwigCorrelate) was separated from the computation of the
432
+ underlying data. A new tool, plotCorrelation was added. This tool can plot correlations as heatmaps or as scatter
433
+ plots and includes options to adjust a large array of visual features.
434
+ * Fixed issue with bed intervals in bigwigCorrelate and bamCorrelate and a user specified region.
435
+ * Correlation coefficients can be computed even if the data contains NaNs
436
+ * Allow computeMatrix to read files with DOS newline characters
437
+ * Added option --skipChromosomes to bigwigCorrelate, for example to skip all 'random' chromosomes. bigwigCorrelate
438
+ now also considers chromosomes as identical when their names between samples differ with the prefix 'chr'. E.g.
439
+ chr1 vs. 1
440
+ * For bamCoverage and bamCompare, behaviour of scaleFactor was updated such that now, if given in combination
441
+ with the normalization options (normalize to 1x or normalize using RPKM) the given scaleFactor
442
+ will multiply the scale factor computed for the normalization methods.
443
+ * Fixed problem with read pairs labelled as proper pairs by the aligner but that were actually not proper pairs, for
444
+ example because the mates did not face each other. deepTools adds further checks to determine if a read pair is a
445
+ proper pair.
446
+ * Added titles to QC plots (#74)
447
+ * Added --samFlagInclude and --samFlagExclude parameters. This is useful to for example only include forward reads
448
+ * In deeptools2 most of the core code was rewritten to facilitate API usage and for optimization.
deepTools/source/LICENSE.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ The file deeptools/cm.py is licensed under the BSD license, see a copy in that file. The remainder of the code is licensed under the MIT license:
2
+
3
+ Copyright 2019 Max Planck Institute for Immunobiology and Epigenetics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
deepTools/source/MANIFEST.in ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ include *.txt
2
+ include README.md
3
+ exclude examples/*
4
+ exclude deepTools.egg-info/*
5
+ include scripts/*
6
+ exclude deeptools/test/*
7
+ exclude galaxy/*
8
+ exclude gallery/*
deepTools/source/README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deepTools
2
+ [![Documentation Status](https://readthedocs.org/projects/deeptools/badge/)](http://deeptools.readthedocs.org/)
3
+ [![PyPI Version](https://img.shields.io/pypi/v/deeptools.svg?style=plastic)](https://pypi.org/project/deepTools/)
4
+ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/deeptools/README.html)
5
+ [![European Galaxy server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAASCAYAAABB7B6eAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAACXBIWXMAAAsTAAALEwEAmpwYAAACC2lUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNS40LjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyI+CiAgICAgICAgIDx0aWZmOlJlc29sdXRpb25Vbml0PjI8L3RpZmY6UmVzb2x1dGlvblVuaXQ+CiAgICAgICAgIDx0aWZmOkNvbXByZXNzaW9uPjE8L3RpZmY6Q29tcHJlc3Npb24+CiAgICAgICAgIDx0aWZmOk9yaWVudGF0aW9uPjE8L3RpZmY6T3JpZW50YXRpb24+CiAgICAgICAgIDx0aWZmOlBob3RvbWV0cmljSW50ZXJwcmV0YXRpb24+MjwvdGlmZjpQaG90b21ldHJpY0ludGVycHJldGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KD0UqkwAAAn9JREFUOBGlVEuLE0EQruqZiftwDz4QYT1IYM8eFkHFw/4HYX+GB3/B4l/YP+CP8OBNTwpCwFMQXAQPKtnsg5nJZpKdni6/6kzHvAYDFtRUT71f3UwAEbkLch9ogQxcBwRKMfAnM1/CBwgrbxkgPAYqlBOy1jfovlaPsEiWPROZmqmZKKzOYCJb/AbdYLso9/9B6GppBRqCrjSYYaquZq20EUKAzVpjo1FzWRDVrNay6C/HDxT92wXrAVCH3ASqq5VqEtv1WZ13Mdwf8LFyyKECNbgHHAObWhScf4Wnj9CbQpPzWYU3UFoX3qkhlG8AY2BTQt5/EA7qaEPQsgGLWied0A8VKrHAsCC1eJ6EFoUd1v6GoPOaRAtDPViUr/wPzkIFV9AaAZGtYB568VyJfijV+ZBzlVZJ3W7XHB2RESGe4opXIGzRTdjcAupOK09RA6kzr1NTrTj7V1ugM4VgPGWEw+e39CxO6JUw5XhhKihmaDacU2GiR0Ohcc4cZ+Kq3AjlEnEeRSazLs6/9b/kh4eTC+hngE3QQD7Yyclxsrf3cpxsPXn+cFdenF9aqlBXMXaDiEyfyfawBz2RqC/O9WF1ysacOpytlUSoqNrtfbS642+4D4CS9V3xb4u8P/ACI4O810efRu6KsC0QnjHJGaq4IOGUjWTo/YDZDB3xSIxcGyNlWcTucb4T3in/3IaueNrZyX0lGOrWndstOr+w21UlVFokILjJLFhPukbVY8OmwNQ3nZgNJNmKDccusSb4UIe+gtkI+9/bSLJDjqn763f5CQ5TLApmICkqwR0QnUPKZFIUnoozWcQuRbC0Km02knj0tPYx63furGs3x/iPnz83zJDVNtdP3QAAAABJRU5ErkJggg==)](https://usegalaxy.eu/root?tool_id=deeptools_compute_matrix)
6
+ ![test](https://github.com/deeptools/deepTools/actions/workflows/test.yml/badge.svg)
7
+
8
+
9
+ ## User-friendly tools for exploring deep-sequencing data
10
+
11
+ deepTools addresses the challenge of handling the large amounts of data that are now routinely generated from DNA sequencing centers. deepTools contains useful modules to process the mapped reads data for multiple quality checks, creating **normalized coverage files** in standard bedGraph and bigWig file formats, that allow comparison between different files (for example, treatment and control). Finally, using such normalized and standardized files, deepTools can create many publication-ready **visualizations** to identify enrichments and for functional annotations of the genome.
12
+
13
+ For support or questions please post to [Biostars](http://biostars.org). For bug reports and feature requests please open an issue [on github](http://github.com/deeptools/deeptools).
14
+
15
+
16
+ ### Citation:
17
+
18
+ Ramírez F, Ryan DP, Grüning B, Bhardwaj V, Kilpert F, Richter AS, Heyne S, Dündar F, Manke T. [deepTools2: a next generation web server for deep-sequencing data analysis.](https://nar.oxfordjournals.org/content/early/2016/04/12/nar.gkw257.abstract) Nucleic Acids Research. 2016 Apr 13:gkw257.
19
+
20
+ ### Documentation:
21
+
22
+ Our [documentation](http://deeptools.readthedocs.org/) contains more details on the [individual tool scopes and usages](http://deeptools.readthedocs.org/en/latest/content/list_of_tools.html) and an [introduction to our deepTools Galaxy web server](http://deeptools.readthedocs.org/en/latest/content/help_galaxy_intro.html) including [step-by-step protocols](http://deeptools.readthedocs.org/en/latest/content/example_usage.html).
23
+
24
+ >Please see also the [FAQ](http://deeptools.readthedocs.org/en/latest/content/help_faq.html), which we update regularly.
25
+ Our [Gallery](http://deeptools.readthedocs.org/en/latest/content/example_gallery.html) may give you some more ideas about the scope of deepTools.
26
+
27
+ >For more specific **troubleshooting, feedback, and tool suggestions**, please post [to Biostars](http://biostars.org).
28
+
29
+
30
+ -------------------------------------------------------------------------------------------------------------------
31
+
32
+ ### Installation
33
+
34
+ deepTools are available for:
35
+
36
+ * Command line usage (via pip / conda / github)
37
+ * Integration into Galaxy servers (via toolshed/API/web-browser)
38
+
39
+ There are many easy ways to install deepTools. More details can be found [here](https://deeptools.readthedocs.io/en/latest/content/installation.html).
40
+
41
+ In Brief:
42
+
43
+ **Install through pypi**
44
+
45
+ $ pip install deeptools
46
+
47
+ **Install via conda**
48
+
49
+ $ conda install -c bioconda deeptools
50
+
51
+ **Install by cloning the repository**
52
+
53
+ $ git clone https://github.com/deeptools/deepTools
54
+ $ cd deepTools
55
+ $ pip install .
56
+
57
+ <a name="galaxy"/></a>
58
+ ### Galaxy Installation
59
+
60
+ deepTools can be easily integrated into [Galaxy](http://galaxyproject.org). Please see the [installation instructions in our documentation](http://deeptools.readthedocs.io/en/latest/content/installation.html#galaxy-installation) for further details.
61
+
62
+ **Note:** From version 2.3 onwards, deepTools support **python3**.
63
+
64
+ ------------------------------------
65
+
66
+ This tool suite is developed by the [Bioinformatics Facility](http://www1.ie-freiburg.mpg.de/bioinformaticsfac) at the [Max Planck Institute for Immunobiology and Epigenetics, Freiburg](http://www1.ie-freiburg.mpg.de/).
67
+
68
+ [Documentation](http://deeptools.readthedocs.org/en/latest/index.html) | [deepTools Galaxy](http://deeptools.ie-freiburg.mpg.de) | [FAQ](http://deeptools.readthedocs.org/en/latest/content/help_faq.html)
deepTools/source/README.rst ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ======================================================================
2
+ deepTools
3
+ ======================================================================
4
+
5
+ User-friendly tools for exploring deep-sequencing data
6
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
7
+
8
+ deepTools addresses the challenge of handling the large amounts of data
9
+ that are now routinely generated from DNA sequencing centers. deepTools
10
+ contains useful modules to process the mapped reads data for multiple
11
+ quality checks, creating **normalized coverage files** in standard
12
+ bedGraph and bigWig file formats, that allow comparison between
13
+ different files (for example, treatment and control). Finally, using
14
+ such normalized and standardized files, deepTools can create many
15
+ publication-ready **visualizations** to identify enrichments and for
16
+ functional annotations of the genome.
17
+
18
+ For support or questions please make a post on `Biostars <http://biostars.org>`__. For feature requests, please open an issue on `github <http://github.com/deeptools/deeptools>`__.
19
+
20
+ For further documentation, please see our `read the docs page <http://deeptools.readthedocs.org/>`__.
21
+
22
+ Citation:
23
+ ^^^^^^^^^
24
+
25
+ Ramírez F, Ryan DP, Grüning B, Bhardwaj V, Kilpert F, Richter AS, Heyne
26
+ S, Dündar F, Manke T. `deepTools2: a next generation web server for
27
+ deep-sequencing data
28
+ analysis. <https://nar.oxfordjournals.org/content/early/2016/04/12/nar.gkw257.abstract>`__
29
+ Nucleic Acids Research. 2016 Apr 13:gkw257.
deepTools/source/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ deepTools Project Package Initialization File
4
+ """
deepTools/source/deeptools/SES_scaleFactor.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import numpy as np
6
+
7
+ # own packages
8
+ from deeptools import bamHandler
9
+ import deeptools.countReadsPerBin as countR
10
+
11
+ old_settings = np.seterr(all='ignore')
12
+ debug = 0
13
+
14
+
15
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        normalizationLength,
                        avg_method='median', blackListFileName=None, numberOfProcessors=1,
                        verbose=False, chrsToSkip=None, mappingStatsList=None):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collect and integrates the results.

    Parameters
    ----------
    bamFilesList : list
        list of bam files to normalize
    binLength : int
        the window size in bp, where reads are going to be
        counted.
    numberOfSamples : int
        number of sites to sample from the genome. For more info see
        the documentation of the CountReadsPerBin class
    normalizationLength : int
        length, in bp, to normalize the data.
        For a value of 1, on average
        1 read per base pair is found
    avg_method : str
        defines how the different values are to be summarized.
        The options are 'mean' and 'median'
    chrsToSkip : list
        name of the chromosomes to be excluded from the
        scale estimation. Usually the chrX is included.
    blackListFileName : str
        BED file containing blacklisted regions
    mappingStatsList : list
        List of the number of mapped reads per file

    Returns
    -------
    dict
        Dictionary with the following keys::
            'size_factors'
            'size_factors_based_on_mapped_reads'
            'size_factors_SES'
            'size_factors_based_on_mean'
            'size_factors_based_on_median'
            'mean'
            'meanSES'
            'median'
            'reads_per_bin'
            'std'
            'sites_sampled'


    Examples
    --------
    >>> test = Tester()
    >>> bin_length = 50
    >>> num_samples = 4
    >>> _dict = estimateScaleFactor([test.bamFile1, test.bamFile2], bin_length, num_samples, 1)
    >>> _dict['size_factors']
    array([1. , 0.5])
    >>> _dict['size_factors_based_on_mean']
    array([1. , 0.5])
    """

    # Avoid the mutable-default-argument pitfall: build fresh lists per call.
    if chrsToSkip is None:
        chrsToSkip = []
    if mappingStatsList is None:
        mappingStatsList = []

    assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"

    # Reuse caller-supplied mapping statistics when available to avoid
    # re-opening the BAM files just to count mapped reads.
    if len(mappingStatsList) == len(bamFilesList):
        mappedReads = mappingStatsList
    else:
        mappedReads = []
        for fname in bamFilesList:
            mappedReads.append(bamHandler.openBam(fname, returnStats=True, nThreads=numberOfProcessors)[1])

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    # Scale both samples relative to the smaller library.
    sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    cr = countR.CountReadsPerBin(bamFilesList,
                                 binLength=binLength,
                                 numberOfSamples=numberOfSamples,
                                 extendReads=False,
                                 blackListFileName=blackListFileName,
                                 numberOfProcessors=numberOfProcessors,
                                 verbose=verbose,
                                 chrsToSkip=chrsToSkip)

    try:
        num_reads_per_bin = cr.run()
    except Exception as detail:
        exit("*ERROR*: {}".format(detail))

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()
    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
               np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]

    # the maxIndex may be too close to the signal regions,
    # hence the conservative 0.8 down-scaling applied above

    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = (np.percentile(values, 90))
        if maxNumReads == 0:
            maxNumReads = (np.percentile(values, 99))
            if maxNumReads == 0:
                # Bug fix: this message was previously split across two
                # statements (a print plus a bare string literal), so only
                # its first half was ever shown.
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    if min(median) == 0:
        # 1-based sample indices for the error message
        idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
        exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
             "Try selecting a larger sample size or a region with coverage\n".format(idx_zero))

    sizeFactor = sizeFactorsSES
    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
185
+
186
+
187
class Tester(object):
    """Doctest fixture exposing the paths of the bundled test BAM files."""

    def __init__(self):
        global debug
        debug = 0
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.root = base_dir + "/test/test_data/"
        self.bamFile1 = "{}testA.bam".format(self.root)
        self.bamFile2 = "{}testB.bam".format(self.root)
        self.chrom = '3R'
deepTools/source/deeptools/__init__.py ADDED
File without changes
deepTools/source/deeptools/alignmentSieve.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import argparse
3
+ import pysam
4
+ import os
5
+ import sys
6
+
7
+ from deeptools import parserCommon
8
+ from deeptools.bamHandler import openBam
9
+ from deeptools.mapReduce import mapReduce
10
+ from deeptools.utilities import getTLen, smartLabels, getTempFileName
11
+ from importlib.metadata import version
12
+
13
+
14
+ def parseArguments():
15
+ parser = argparse.ArgumentParser(
16
+ formatter_class=argparse.RawDescriptionHelpFormatter,
17
+ description="This tool filters alignments in a BAM/CRAM file according the the specified parameters. It can optionally output to BEDPE format.",
18
+ usage='alignmentSieve -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt\n'
19
+ 'help: alignmentSieve -h / alignmentSieve --help')
20
+
21
+ required = parser.add_argument_group('Required arguments')
22
+ required.add_argument('--bam', '-b',
23
+ metavar='FILE1',
24
+ help='An indexed BAM file.',
25
+ required=True)
26
+
27
+ required.add_argument('--outFile', '-o',
28
+ help='The file to write results to. These are the alignments or fragments that pass the filtering criteria.')
29
+
30
+ general = parser.add_argument_group('General arguments')
31
+ general.add_argument('--numberOfProcessors', '-p',
32
+ help='Number of processors to use. Type "max/2" to '
33
+ 'use half the maximum number of processors or "max" '
34
+ 'to use all available processors. (Default: %(default)s)',
35
+ metavar="INT",
36
+ type=parserCommon.numberOfProcessors,
37
+ default=1,
38
+ required=False)
39
+
40
+ general.add_argument('--filterMetrics',
41
+ metavar="FILE.log",
42
+ help="The number of entries in total and filtered are saved to this file")
43
+
44
+ general.add_argument('--filteredOutReads',
45
+ metavar="filtered.bam",
46
+ help="If desired, all reads NOT passing the filtering criteria can be written to this file.")
47
+
48
+ general.add_argument('--label', '-l',
49
+ metavar='sample1',
50
+ help='User defined label instead of the default label '
51
+ '(file name).')
52
+
53
+ general.add_argument('--smartLabels',
54
+ action='store_true',
55
+ help='Instead of manually specifying a labels for the input '
56
+ 'file, this causes deepTools to use the file name '
57
+ 'after removing the path and extension.')
58
+
59
+ general.add_argument('--verbose', '-v',
60
+ help='Set to see processing messages.',
61
+ action='store_true')
62
+
63
+ general.add_argument('--version', action='version',
64
+ version='%(prog)s {}'.format(version('deeptools')))
65
+
66
+ general.add_argument('--shift',
67
+ nargs='+',
68
+ type=int,
69
+ help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.')
70
+
71
+ general.add_argument('--ATACshift',
72
+ action='store_true',
73
+ help='Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.')
74
+
75
+ general.add_argument('--genomeChunkLength',
76
+ type=int,
77
+ default=int(1e6),
78
+ help='Size of the genome (in bps) to be processed per thread. (Default: %(default)s)')
79
+
80
+ output = parser.add_argument_group('Output arguments')
81
+ output.add_argument('--BED',
82
+ action='store_true',
83
+ help='Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.')
84
+
85
+ filtering = parser.add_argument_group('Optional arguments')
86
+
87
+ filtering.add_argument('--filterRNAstrand',
88
+ help='Selects RNA-seq reads (single-end or paired-end) in '
89
+ 'the given strand. (Default: %(default)s)',
90
+ choices=['forward', 'reverse'],
91
+ default=None)
92
+
93
+ filtering.add_argument('--ignoreDuplicates',
94
+ help='If set, reads that have the same orientation '
95
+ 'and start position will be considered only '
96
+ 'once. If reads are paired, the mate\'s position '
97
+ 'also has to coincide to ignore a read.',
98
+ action='store_true')
99
+
100
+ filtering.add_argument('--minMappingQuality',
101
+ metavar='INT',
102
+ help='If set, only reads that have a mapping '
103
+ 'quality score of at least this are '
104
+ 'considered.',
105
+ type=int)
106
+
107
+ filtering.add_argument('--samFlagInclude',
108
+ help='Include reads based on the SAM flag. For example, '
109
+ 'to get only reads that are the first mate, use a flag of 64. '
110
+ 'This is useful to count properly paired reads only once, '
111
+ 'as otherwise the second mate will be also considered for the '
112
+ 'coverage.',
113
+ metavar='INT',
114
+ default=None,
115
+ type=int,
116
+ required=False)
117
+
118
+ filtering.add_argument('--samFlagExclude',
119
+ help='Exclude reads based on the SAM flag. For example, '
120
+ 'to get only reads that map to the forward strand, use '
121
+ '--samFlagExclude 16, where 16 is the SAM flag for reads '
122
+ 'that map to the reverse strand.',
123
+ metavar='INT',
124
+ default=None,
125
+ type=int,
126
+ required=False)
127
+
128
+ filtering.add_argument('--blackListFileName', '-bl',
129
+ help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.",
130
+ metavar="BED file",
131
+ nargs="+",
132
+ required=False)
133
+
134
+ filtering.add_argument('--minFragmentLength',
135
+ help='The minimum fragment length needed for read/pair '
136
+ 'inclusion. This option is primarily useful '
137
+ 'in ATACseq experiments, for filtering mono- or '
138
+ 'di-nucleosome fragments. (Default: %(default)s)',
139
+ metavar='INT',
140
+ default=0,
141
+ type=int,
142
+ required=False)
143
+
144
+ filtering.add_argument('--maxFragmentLength',
145
+ help='The maximum fragment length needed for read/pair '
146
+ 'inclusion. A value of 0 indicates no limit. (Default: %(default)s)',
147
+ metavar='INT',
148
+ default=0,
149
+ type=int,
150
+ required=False)
151
+
152
+ return parser
153
+
154
+
155
def shiftRead(b, chromDict, args):
    """
    Return a copy of read ``b`` with its coordinates shifted by ``args.shift``.

    ``args.shift`` holds four offsets; which pair applies depends on the
    read's strand and read1/read2 status (offsets [0]/[1] vs. [2]/[3]).
    Non-properly-paired reads are rejected (returns None), as are reads whose
    shifted span collapses below 1 bp. The returned read carries a simplified
    CIGAR (one match run covering the shifted span) and no sequence/qualities.

    Parameters
    ----------
    b : pysam.AlignedSegment
        The read to shift.
    chromDict : dict
        Mapping of reference name -> chromosome length, used to clip ends.
    args : argparse.Namespace
        Must provide ``shift``, a list of 4 integers.

    Returns
    -------
    pysam.AlignedSegment or None
    """
    if not b.is_proper_pair:
        return None
    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end
    # Pick the shift offsets according to strand and mate number.
    if b.is_reverse and not b.is_read2:
        end -= args.shift[2]
        deltaTLen = args.shift[3] - args.shift[2]
    elif b.is_reverse and b.is_read2:
        end += args.shift[1]
        deltaTLen = args.shift[1] - args.shift[0]
    elif not b.is_reverse and not b.is_read2:
        start += args.shift[0]
        deltaTLen = args.shift[1] - args.shift[0]
    else:
        start -= args.shift[3]
        deltaTLen = args.shift[3] - args.shift[2]

    # Sanity check: keep at least a 1 bp span, clipped to [0, chrom length].
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    if start < 0:
        start = 0
    if end > chromDict[b.reference_name]:
        end = chromDict[b.reference_name]
    if end - start < 1:
        return None

    # create a new read rather than mutating the input
    b2 = pysam.AlignedSegment()
    b2.query_name = b.query_name
    b2.flag = b.flag
    b2.reference_id = b.reference_id
    b2.reference_start = start
    b2.mapping_quality = b.mapping_quality
    b2.cigar = ((0, end - start),)  # Returned cigar is only matches
    # Template length grows/shrinks by the net shift, keeping its sign.
    if tLen < 0:
        b2.template_length = tLen - deltaTLen
    else:
        b2.template_length = tLen + deltaTLen
    b2.next_reference_id = b.next_reference_id
    b2.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        # Keep the recorded mate start consistent with the shift that will be
        # applied to the mate itself when it is processed.
        if b2.is_read2 and b2.is_reverse:
            b2.next_reference_start += args.shift[0]
        elif not b2.is_read2 and b2.is_reverse:
            b2.next_reference_start -= args.shift[3]

    return b2
208
+
209
+
210
def filterWorker(arglist):
    """
    Filter the reads of one genomic chunk (``chrom:start-end``) of ``args.bam``.

    Reads failing any enabled criterion (unmapped, MAPQ, SAM flag include/
    exclude, fragment length, duplicate, RNA strand) are counted and, when
    ``args.filteredOutReads`` is set, written to a separate temporary BAM.
    Surviving reads (optionally shifted via ``shiftRead``) go to another
    temporary BAM.

    Returns
    -------
    tuple
        (tid, start, total, nFiltered, oname, onameFiltered) so the caller
        can sort the per-chunk outputs by genomic position and concatenate
        the temporary files.
    """
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bam)
    mode = 'wb'
    oname = getTempFileName(suffix='.bam')
    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix='.bam')
    else:
        onameFiltered = None
    ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
    if onameFiltered:
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh)
    else:
        ofiltered = None

    # Duplicate-detection state: fragment signatures seen at the current
    # reference_start (lpos).
    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0
    for read in fh.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue

        total += 1
        if read.flag & 4:
            # Ignore unmapped reads, they were counted already
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        # keep only reads carrying ALL bits of samFlagInclude
        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        # drop reads carrying ANY bit of samFlagExclude
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.ignoreDuplicates:
            # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
            if tLen >= 0:
                s = read.pos
                e = s + tLen
            else:
                s = read.pnext
                e = s - tLen
            if read.reference_id != read.next_reference_id:
                e = read.pnext
            if lpos is not None and lpos == read.reference_start \
                    and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue
            if lpos != read.reference_start:
                # new start position: forget signatures of the previous one
                prev_pos.clear()
            lpos = read.reference_start
            prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

        # filterRNAstrand
        # Flag masks: 144 = 16|128 (reverse|read2), 96 = 32|64 (mate-reverse|read1),
        # 16 = reverse for single-end reads.
        if args.filterRNAstrand:
            if read.is_paired:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 144 == 128 or read.flag & 96 == 64:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 144 == 144 or read.flag & 96 == 96:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
            else:
                if args.filterRNAstrand == 'forward':
                    if read.flag & 16 == 16:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == 'reverse':
                    if read.flag & 16 == 0:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue

        if args.shift:
            # shiftRead may return None (e.g. collapsed span); such reads are
            # silently dropped without being counted as filtered.
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = fh.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    fh.close()
    return tid, start, total, nFiltered, oname, onameFiltered
346
+
347
+
348
def convertBED(oname, tmpFiles, chromDict):
    """
    Store results in BEDPE format, which is:
    chromosome frag_leftend frag_rightend

    Only reads with a positive template length are emitted (one line per
    fragment); ends are clipped to the chromosome length from chromDict.
    Each temporary BAM file is deleted after conversion.

    Parameters
    ----------
    oname : str
        Output file name.
    tmpFiles : list
        Temporary BAM files produced by the filter workers.
    chromDict : dict
        Mapping of reference name -> chromosome length.
    """
    with open(oname, "w") as ofile:
        for tmpFile in tmpFiles:
            # Silence pysam/htslib so the missing-index warning for the
            # unindexed temp files is not printed, then restore the caller's
            # verbosity instead of hard-coding a value (set_verbosity returns
            # the previous level).
            previous_verbosity = pysam.set_verbosity(0)
            fh = pysam.AlignmentFile(tmpFile)
            pysam.set_verbosity(previous_verbosity)
            for b in fh.fetch(until_eof=True):
                tLen = getTLen(b, notAbs=True)
                if tLen > 0:
                    start = b.pos
                    end = start + tLen
                    if end > chromDict[b.reference_name]:
                        end = chromDict[b.reference_name]
                    if end - start < 1:
                        continue
                    ofile.write("{}\t{}\t{}\n".format(b.reference_name, start, end))
            fh.close()
            os.unlink(tmpFile)
375
+
376
+
377
def main(args=None):
    """
    Entry point for alignmentSieve: filter a BAM file in parallel chunks and
    write the surviving alignments to a BAM (or BEDPE) file.

    Steps: normalize the --shift/--ATACshift options, run filterWorker over
    genome chunks via mapReduce, concatenate the per-chunk temporary files in
    genomic order, optionally write filtered-out reads and a metrics file.

    Returns
    -------
    int
        0 on success (sys.exit is called on invalid --shift input).
    """
    args = parseArguments().parse_args(args)
    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            # Mirror the two offsets for fragments whose read 1 is right-most.
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    genomeChunkLength=args.genomeChunkLength,
                    verbose=args.verbose)

    res = sorted(res)  # The temp files are now in (tid, start) order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        # convertBED removes the temp files itself
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            # Bug fix: previously this called convertBED(args.outFile, ...,
            # args) — a 4th argument convertBED does not accept (TypeError)
            # that would also have clobbered the main output file. The
            # filtered-out reads belong in args.filteredOutReads.
            convertBED(args.filteredOutReads, tmpFiles, chromDict)

    if args.filterMetrics:
        # Label resolution priority: --smartLabels > --label > file name.
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]

        with open(args.filterMetrics, "w") as of:
            of.write("#bamFilterReads --filterMetrics\n")
            of.write("#File\tReads Remaining\tTotal Initial Reads\n")
            of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))

    return 0
deepTools/source/deeptools/bamCompare.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse # to parse command line arguments
5
+ import numpy as np
6
+ import sys
7
+
8
+ # my packages
9
+ from deeptools import writeBedGraph
10
+ from deeptools.SES_scaleFactor import estimateScaleFactor
11
+ from deeptools import parserCommon
12
+ from deeptools import bamHandler
13
+ from deeptools.getRatio import getRatio
14
+ from deeptools.getScaleFactor import get_num_kept_reads
15
+ from deeptools.getScaleFactor import get_scale_factor
16
+ debug = 0
17
+ old_settings = np.seterr(all='ignore')
18
+
19
+
20
+ def parseArguments():
21
+ parentParser = parserCommon.getParentArgParse()
22
+ bamParser = parserCommon.read_options()
23
+ normalizationParser = parserCommon.normalization_options()
24
+ requiredArgs = getRequiredArgs()
25
+ optionalArgs = getOptionalArgs()
26
+ outputParser = parserCommon.output()
27
+ parser = argparse.ArgumentParser(
28
+ parents=[requiredArgs, outputParser, optionalArgs,
29
+ parentParser, normalizationParser, bamParser],
30
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
31
+ description='This tool compares two BAM files based on the number of '
32
+ 'mapped reads. To compare the BAM files, the genome is partitioned '
33
+ 'into bins of equal size, then the number of reads found in each bin'
34
+ ' is counted per file, and finally a summary value is '
35
+ 'reported. This value can be the ratio of the number of reads per '
36
+ 'bin, the log2 of the ratio, or the difference. This tool can '
37
+ 'normalize the number of reads in each BAM file using the SES method '
38
+ 'proposed by Diaz et al. (2012) "Normalization, bias correction, and '
39
+ 'peak calling for ChIP-seq". Statistical Applications in Genetics '
40
+ 'and Molecular Biology, 11(3). Normalization based on read counts '
41
+ 'is also available. The output is either a bedgraph or bigWig file '
42
+ 'containing the bin location and the resulting comparison value. '
43
+ 'Note that *each end* in a pair (for paired-end reads) is treated '
44
+ 'independently. If this is undesirable, then use the --samFlagInclude '
45
+ 'or --samFlagExclude options.',
46
+
47
+ usage='bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw\n'
48
+ 'help: bamCompare -h / bamCompare --help',
49
+
50
+ add_help=False)
51
+
52
+ return parser
53
+
54
+
55
def getRequiredArgs():
    """
    Build the argparse parent parser holding bamCompare's two mandatory
    options (--bamfile1/-b1 and --bamfile2/-b2). Created with add_help=False
    so it can be composed into the main parser via `parents=`.
    """
    parser = argparse.ArgumentParser(add_help=False)

    required = parser.add_argument_group('Required arguments')

    # define the arguments
    required.add_argument('--bamfile1', '-b1',
                          metavar='BAM file',
                          help='Sorted BAM file 1. Usually the BAM file '
                          'for the treatment.',
                          required=True)

    required.add_argument('--bamfile2', '-b2',
                          metavar='BAM file',
                          help='Sorted BAM file 2. Usually the BAM '
                          'file for the control.',
                          required=True)

    return parser
74
+
75
+
76
+ def getOptionalArgs():
77
+
78
+ parser = argparse.ArgumentParser(add_help=False)
79
+ optional = parser.add_argument_group('Optional arguments')
80
+
81
+ optional.add_argument("--help", "-h", action="help",
82
+ help="show this help message and exit")
83
+
84
+ optional.add_argument('--scaleFactorsMethod',
85
+ help='Method to use to scale the samples. '
86
+ 'If a method is specified, then it will be used to compensate '
87
+ 'for sequencing depth differences between the samples. '
88
+ 'As an alternative, this can be set to None and an option from '
89
+ '--normalizeUsing <method> can be used. (Default: %(default)s)',
90
+ choices=['readCount', 'SES', 'None'],
91
+ default='readCount')
92
+
93
+ optional.add_argument('--sampleLength', '-l',
94
+ help='*Only relevant when SES is chosen for the '
95
+ 'scaleFactorsMethod.* To compute the SES, specify '
96
+ 'the length (in bases) of the regions (see --numberOfSamples) '
97
+ 'that will be randomly sampled to calculate the scaling factors. '
98
+ 'If you do not have a good sequencing depth for '
99
+ 'your samples consider increasing the sampling '
100
+ 'regions\' size to minimize the probability '
101
+ 'that zero-coverage regions are used. (Default: %(default)s)',
102
+ default=1000,
103
+ type=int)
104
+
105
+ optional.add_argument('--numberOfSamples', '-n',
106
+ help='*Only relevant when SES is chosen for the '
107
+ 'scaleFactorsMethod.* Number of samplings taken '
108
+ 'from the genome to compute the scaling factors. (Default: %(default)s)',
109
+ default=1e5,
110
+ type=int)
111
+
112
+ optional.add_argument('--scaleFactors',
113
+ help='Set this parameter manually to avoid the computation of '
114
+ 'scaleFactors. The format is scaleFactor1:scaleFactor2.'
115
+ 'For example, --scaleFactor 0.7:1 will cause the first BAM file to'
116
+ 'be multiplied by 0.7, while not scaling '
117
+ 'the second BAM file (multiplication with 1).',
118
+ default=None,
119
+ required=False)
120
+
121
+ optional.add_argument('--operation',
122
+ help='The default is to output the log2 ratio of the '
123
+ 'two samples. The reciprocal ratio returns the '
124
+ 'the negative of the inverse of the ratio '
125
+ 'if the ratio is less than 0. The resulting '
126
+ 'values are interpreted as negative fold changes. '
127
+ 'Instead of performing a computation using both files, the scaled signal can '
128
+ 'alternatively be output for the first or second file using '
129
+ 'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)',
130
+ default='log2',
131
+ choices=['log2', 'ratio', 'subtract', 'add', 'mean',
132
+ 'reciprocal_ratio', 'first', 'second'],
133
+ required=False)
134
+
135
+ optional.add_argument('--pseudocount',
136
+ help='A small number to avoid x/0. Only useful '
137
+ 'together with --operation log2 or --operation ratio. '
138
+ 'You can specify different values as pseudocounts for '
139
+ 'the numerator and the denominator by providing two '
140
+ 'values (the first value is used as the numerator '
141
+ 'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
142
+ default=[1],
143
+ type=float,
144
+ nargs='+',
145
+ action=parserCommon.requiredLength(1, 2),
146
+ required=False)
147
+
148
+ optional.add_argument('--skipZeroOverZero',
149
+ help='Skip bins where BOTH BAM files lack coverage. '
150
+ 'This is determined BEFORE any applicable pseudocount '
151
+ 'is added.',
152
+ action='store_true')
153
+
154
+ return parser
155
+
156
+
157
def process_args(args=None):
    """
    Parse the command line and normalize derived option values.

    Disables smoothing when the smoothing window is not larger than the bin
    size, defaults --ignoreForNormalization to an empty list, and expands
    --pseudocount into a two-element [numerator, denominator] list.
    """
    args = parseArguments().parse_args(args)

    # A smoothing window no larger than the bin size has no effect:
    # warn the user and turn smoothing off.
    if args.smoothLength and args.smoothLength <= args.binSize:
        warning = ("Warning: the smooth length given ({}) is smaller than the bin "
                   "size ({}).\n\n No smoothing will be "
                   "done")
        print(warning.format(args.smoothLength, args.binSize))
        args.smoothLength = None

    if not args.ignoreForNormalization:
        args.ignoreForNormalization = []

    # Downstream code expects a two-element pseudocount list.
    if not isinstance(args.pseudocount, list):
        args.pseudocount = [args.pseudocount]
    if len(args.pseudocount) == 1:
        args.pseudocount = args.pseudocount * 2

    return args
177
+
178
+ # get_scale_factors function is used for scaling in bamCompare
179
+ # while get_scale_factor is used for depth normalization
180
+
181
+
182
def get_scale_factors(args, statsList, mappedList):
    """
    Compute per-sample scale factors for bamCompare.

    Resolution order: an explicit --scaleFactors string wins; otherwise the
    method named by --scaleFactorsMethod is used ('SES', 'readCount', or
    'None'). Returns a 2-element sequence of factors (smaller sample scaled
    to 1.0 relative to the other), or None when the method is 'None'.

    NOTE(review): this function mutates `args` as a side channel for
    get_num_kept_reads (sets args.scaleFactor and reassigns args.bam to each
    input file in turn) — callers should not rely on args.bam afterwards.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed bamCompare options.
    statsList : list
        Per-file statistics passed through to get_num_kept_reads.
    mappedList : list
        Number of mapped reads per file.
    """
    if args.scaleFactors:
        # user-provided "a:b" string overrides any computed method
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    return scale_factors
232
+
233
+
234
def main(args=None):
    """
    The algorithm is composed of two steps.


    1. Per-sample scaling / depth Normalization:
    + If scaling is used (using the SES or read counts method), appropriate scaling
    factors are determined to account for sequencing depth differences.
    + Optionally scaling can be turned off and individual samples could be depth normalized using
    RPKM, BPM or CPM methods

    2. Ratio calculation between two bam files:
    + The genome is transversed and computing
    the log ratio/ratio/difference etc. for bins of fixed width
    given by the user.

    """
    args = process_args(args)

    # RPGC is rejected up front (the error message explains it is not
    # supported for two-sample comparison).
    if args.normalizeUsing == "RPGC":
        sys.exit("RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!")
    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    # Per-sample normalization and between-sample scaling are mutually exclusive.
    if args.scaleFactorsMethod != 'None' and args.normalizeUsing:
        sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing))

    # Get mapping statistics; the handles are closed immediately — only the
    # counts/stats are needed here, the files are re-opened by the workers.
    bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors)
    bam1.close()
    bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors)
    bam2.close()

    scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2])
    if scale_factors is None:
        # check whether one of the depth norm methods are selected
        if args.normalizeUsing is not None:
            args.scaleFactor = 1.0
            # if a normalization is required then compute the scale factors
            # (get_scale_factor reads the file name from args.bam)
            args.bam = args.bamfile1
            scale_factor_bam1 = get_scale_factor(args, stats1)
            args.bam = args.bamfile2
            scale_factor_bam2 = get_scale_factor(args, stats2)
            scale_factors = [scale_factor_bam1, scale_factor_bam2]
        else:
            scale_factors = [1, 1]

    if args.verbose:
        print("Individual scale factors are {0}".format(scale_factors))

    # the getRatio function is called and receives
    # the func_args per each tile that is considered
    FUNC = getRatio
    func_args = {'valueType': args.operation,
                 'scaleFactors': scale_factors,
                 'pseudocount': args.pseudocount
                 }

    # Writer that traverses the genome in binSize windows over both BAM files
    # and applies FUNC (the ratio/difference operation) per bin.
    wr = writeBedGraph.WriteBedGraph([args.bamfile1, args.bamfile2], args.binSize, 0,
                                     stepSize=args.binSize,
                                     region=args.region,
                                     numberOfProcessors=args.numberOfProcessors,
                                     extendReads=args.extendReads,
                                     blackListFileName=args.blackListFileName,
                                     minMappingQuality=args.minMappingQuality,
                                     ignoreDuplicates=args.ignoreDuplicates,
                                     center_read=args.centerReads,
                                     zerosToNans=args.skipNonCoveredRegions,
                                     skipZeroOverZero=args.skipZeroOverZero,
                                     samFlag_include=args.samFlagInclude,
                                     samFlag_exclude=args.samFlagExclude,
                                     minFragmentLength=args.minFragmentLength,
                                     maxFragmentLength=args.maxFragmentLength,
                                     chrsToSkip=args.ignoreForNormalization,
                                     verbose=args.verbose
                                     )

    wr.run(FUNC, func_args, args.outFileName, blackListFileName=args.blackListFileName, format=args.outFileFormat, smoothLength=args.smoothLength)
311
+
312
+
313
if __name__ == "__main__":
    # Entry point when executed as a script.
    main()
deepTools/source/deeptools/bamCoverage.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # own tools
5
+ import argparse
6
+ import sys
7
+ import numpy as np
8
+ from deeptools import writeBedGraph # This should be made directly into a bigWig
9
+ from deeptools import parserCommon
10
+ from deeptools.getScaleFactor import get_scale_factor
11
+ from deeptools.bamHandler import openBam
12
+
13
+ debug = 0
14
+
15
+
16
def parseArguments():
    """
    Assemble the complete bamCoverage argument parser from its parent
    parsers (required/output/optional plus the shared deepTools groups).
    """
    description = (
        'This tool takes an alignment of reads or fragments '
        'as input (BAM file) and generates a coverage track (bigWig or '
        'bedGraph) as output. '
        'The coverage is calculated as the number of reads per bin, '
        'where bins are short consecutive counting windows of a defined '
        'size. It is possible to extended the length of the reads '
        'to better reflect the actual fragment length. *bamCoverage* '
        'offers normalization by scaling factor, Reads Per Kilobase per '
        'Million mapped reads (RPKM), counts per million (CPM), bins per '
        'million mapped reads (BPM) and 1x depth (reads per genome '
        'coverage, RPGC).\n')
    usage = ('bamCoverage -b reads.bam -o coverage.bw\n'
             'help: bamCoverage -h / bamCoverage --help')

    # Parent order determines help-section order; keep it stable.
    parents = [get_required_args(),
               parserCommon.output(),
               get_optional_args(),
               parserCommon.getParentArgParse(),
               parserCommon.normalization_options(),
               parserCommon.read_options()]

    return argparse.ArgumentParser(
        parents=parents,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=description,
        usage=usage,
        add_help=False)
44
+
45
+
46
def get_required_args():
    """Return a parser fragment holding bamCoverage's mandatory options."""
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_argument_group('Required arguments')

    # The input alignment file; everything else has a usable default.
    group.add_argument('--bam', '-b',
                       required=True,
                       metavar='BAM file',
                       help='BAM file to process')

    return parser
58
+
59
+
60
def get_optional_args():
    """Return a parser fragment holding bamCoverage's optional options."""
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_argument_group('Optional arguments')

    group.add_argument("--help", "-h", action="help",
                       help="show this help message and exit")

    # Extra multiplier applied on top of any computed normalization factor.
    group.add_argument('--scaleFactor',
                       type=float,
                       default=1.0,
                       required=False,
                       help='The computed scaling factor (or 1, if not applicable) will '
                            'be multiplied by this. (Default: %(default)s)')

    group.add_argument('--MNase',
                       action='store_true',
                       help='Determine nucleosome positions from MNase-seq data. '
                            'Only 3 nucleotides at the center of each fragment are counted. '
                            'The fragment ends are defined by the two mate reads. Only fragment lengths'
                            'between 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. '
                            'By default, any fragments smaller or larger than this are ignored. To '
                            'over-ride this, use the --minFragmentLength and --maxFragmentLength options, '
                            'which will default to 130 and 200 if not otherwise specified in the presence '
                            'of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.')

    group.add_argument('--Offset',
                       metavar='INT',
                       type=int,
                       nargs='+',
                       required=False,
                       help='Uses this offset inside of each read as the signal. This is useful in '
                            'cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the '
                            'start of the read. This can be paired with the --filterRNAstrand option. '
                            'Note that negative values indicate offsets from the end of each read. A value '
                            'of 1 indicates the first base of the alignment (taking alignment orientation '
                            'into account). Likewise, a value of -1 is the last base of the alignment. An '
                            'offset of 0 is not permitted. If two values are specified, then they will be '
                            'used to specify a range of positions. Note that specifying something like '
                            '--Offset 5 -1 will result in the 5th through last position being used, which '
                            'is equivalent to trimming 4 bases from the 5-prime end of alignments. Note '
                            'that if you specify --centerReads, the centering will be performed before the '
                            'offset.')

    group.add_argument('--filterRNAstrand',
                       choices=['forward', 'reverse'],
                       default=None,
                       help='Selects RNA-seq reads (single-end or paired-end) originating from genes '
                            'on the given strand. This option assumes a standard dUTP-based library '
                            'preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, '
                            'which originally came from genes on the forward strand using a dUTP-based '
                            'method). Consider using --samExcludeFlag instead for filtering by strand in '
                            'other contexts.')

    return parser
115
+
116
+
117
def scaleFactor(string):
    """
    argparse type-checker for a "factor1:factor2" scale-factor pair.

    Parameters
    ----------
    string : str
        Raw command-line value, e.g. "0.7:1".

    Returns
    -------
    tuple of float
        (factor1, factor2)

    Raises
    ------
    argparse.ArgumentTypeError
        If the value is not exactly two ':'-separated numbers.
    """
    try:
        scalefactor1, scalefactor2 = string.split(":")
        scalefactors = (float(scalefactor1), float(scalefactor2))
    except ValueError:
        # Only ValueError can occur here (wrong field count on unpack, or a
        # non-numeric field in float()). The original bare `except:` also
        # swallowed KeyboardInterrupt/SystemExit, which is never wanted.
        raise argparse.ArgumentTypeError(
            "Format of scaleFactors is factor1:factor2. "
            "The value given ( {} ) is not valid".format(string))

    return scalefactors
127
+
128
+
129
def process_args(args=None):
    """
    Parse the bamCoverage command line and normalize derived options.

    Disables smoothing when the window is not wider than a bin and
    guarantees ``ignoreForNormalization`` is a list.
    """
    args = parseArguments().parse_args(args)

    # A smoothing window no wider than one bin would be a no-op; warn and disable.
    if args.smoothLength and args.smoothLength <= args.binSize:
        warning = ("Warning: the smooth length given ({}) is smaller than the bin "
                   "size ({}).\n\n No smoothing will be done".format(args.smoothLength, args.binSize))
        print(warning)
        args.smoothLength = None

    # Downstream code iterates over this, so replace None/empty with a list.
    args.ignoreForNormalization = args.ignoreForNormalization or []

    return args
141
+
142
+
143
def main(args=None):
    """
    Compute a (optionally normalized) coverage track from a BAM/CRAM file.

    Chooses one of three coverage writers based on the options —
    CenterFragment for --MNase, OffsetFragment for --Offset (and
    --filterRNAstrand), or the plain WriteBedGraph otherwise — then runs it
    with the computed scale factor to produce a bigWig/bedGraph file.
    """
    args = process_args(args)

    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNAse function a paired end library is required. ")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose,
                            )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                # BUG FIX: this previously called the non-existent sys.exir(),
                # which aborted with an AttributeError traceback instead of
                # printing the intended error message ('Error* was also a
                # typo for *Error*).
                sys.exit("*Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
        # NOTE(review): unlike the other two writers, no blackListFileName is
        # passed to the constructor here; the blacklist is still applied via
        # wr.run() below — confirm this asymmetry is intentional.
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         chrsToSkip=args.ignoreForNormalization,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
259
+
260
+
261
class OffsetFragment(writeBedGraph.WriteBedGraph):
    """
    Class to redefine the get_fragment_from_read for the --Offset case
    """
    def filterStrand(self, read, rv):
        """
        A generic read filtering function that gets used by everything in this class.

        rv is returned if the strand is correct, otherwise [(None, None)]
        """
        # Filter by RNA strand, if desired
        # SAM flag bit masks used below: 16 = reverse strand, 64 = first in
        # pair, 128 = second in pair, 32 = mate on reverse strand; hence
        # 144 = 16|128 and 96 = 32|64.
        if read.is_paired:
            if self.filter_strand == 'forward':
                # second-in-pair forward, or first-in-pair with forward mate
                if read.flag & 144 == 128 or read.flag & 96 == 64:
                    return rv
            elif self.filter_strand == 'reverse':
                # second-in-pair reverse, or first-in-pair with reverse mate
                if read.flag & 144 == 144 or read.flag & 96 == 96:
                    return rv
            else:
                # no strand filtering requested
                return rv
        else:
            if self.filter_strand == 'forward':
                if read.flag & 16 == 16:
                    return rv
            elif self.filter_strand == 'reverse':
                if read.flag & 16 == 0:
                    return rv
            else:
                return rv

        # read failed the strand filter
        return [(None, None)]

    def get_fragment_from_read_list(self, read, offset):
        """
        Return the range of exons from the 0th through 1st bases, inclusive. Positions are 1-based
        """
        rv = [(None, None)]
        blocks = read.get_blocks()
        # total aligned length (sum of block sizes), used for --centerReads
        blockLen = sum([x[1] - x[0] for x in blocks])

        if self.defaultFragmentLength != 'read length':
            # Extend the aligned blocks toward the mate (proper pairs) ...
            if self.is_proper_pair(read, self.maxPairedFragmentLength):
                if read.is_reverse:
                    foo = (read.next_reference_start, read.reference_start)
                    if foo[0] < foo[1]:
                        blocks.insert(0, foo)
                else:
                    foo = (read.reference_end, read.reference_end + abs(read.template_length) - read.infer_query_length())
                    if foo[0] < foo[1]:
                        blocks.append(foo)

            # Extend using the default fragment length
            else:
                if read.is_reverse:
                    foo = (read.reference_start - self.defaultFragmentLength + read.infer_query_length(), read.reference_start)
                    if foo[0] < 0:
                        # clamp the extension at the chromosome start
                        foo = (0, foo[1])
                    if foo[0] < foo[1]:
                        blocks.insert(0, foo)
                else:
                    foo = (read.reference_end, read.reference_end + self.defaultFragmentLength - read.infer_query_length())
                    if foo[0] < foo[1]:
                        blocks.append(foo)

        stretch = []
        # For the sake of simplicity, convert [(10, 20), (30, 40)] to [10, 11, 12, 13, ..., 40]
        # Then subset accordingly
        for block in blocks:
            stretch.extend(range(block[0], block[1]))
        if read.is_reverse:
            # orient positions 5'->3' so offsets count from the read start
            stretch = stretch[::-1]

        # Handle --centerReads
        if self.center_read:
            _ = (len(stretch) - blockLen) // 2
            stretch = stretch[_:_ + blockLen]

        # Subset by --Offset
        try:
            foo = stretch[offset[0]:offset[1]]
        except:
            return rv

        if len(foo) == 0:
            return rv
        if read.is_reverse:
            # restore genomic (ascending) order before rebuilding intervals
            foo = foo[::-1]

        # Convert the stretch back to a list of tuples
        foo = np.array(foo)
        d = foo[1:] - foo[:-1]
        idx = np.argwhere(d > 1).flatten().tolist()  # This now holds the interval bounds as a list
        idx.append(-1)
        last = 0
        rv = []
        for i in idx:
            # each (start, end) is a half-open genomic interval
            rv.append((foo[last].astype("int"), foo[i].astype("int") + 1))
            last = i + 1

        # Handle strand filtering, if needed
        return self.filterStrand(read, rv)

    def get_fragment_from_read(self, read):
        """
        This is mostly a wrapper for self.get_fragment_from_read_list(),
        which needs a list and for the offsets to be tweaked by 1.
        """
        # Translate the user's 1-based --Offset values into Python slice
        # bounds (0-based start, exclusive end).
        offset = [x for x in self.Offset]
        if len(offset) > 1:
            if offset[0] > 0:
                offset[0] -= 1
            if offset[1] < 0:
                offset[1] += 1
        else:
            if offset[0] > 0:
                offset[0] -= 1
                offset = [offset[0], offset[0] + 1]
            else:
                if offset[0] < -1:
                    offset = [offset[0], offset[0] + 1]
                else:
                    offset = [offset[0], None]
        if offset[1] == 0:
            # -1 gets switched to 0, which screws things up
            offset = (offset[0], None)
        return self.get_fragment_from_read_list(read, offset)
387
+
388
+
389
class CenterFragment(writeBedGraph.WriteBedGraph):
    """
    Class to redefine the get_fragment_from_read for the --MNase case

    The coverage of the fragment is defined as the 2 or 3 basepairs at the
    center of the fragment length.
    """
    def get_fragment_from_read(self, read):
        """
        Takes a proper pair fragment of high quality and limited
        to a certain length and outputs the center

        Returns a single-element list [(start, end)]; both are None when the
        read is not a usable forward proper-pair read.
        """
        fragment_start = fragment_end = None

        # only paired forward reads are considered
        # Fragments have already been filtered according to length
        if read.is_proper_pair and not read.is_reverse and 1 < abs(read.tlen):
            # BUG FIX: use floor division — under Python 3, `read.tlen / 2`
            # is true division and produced *float* genomic coordinates
            # (a Python 2 leftover); `//` keeps them integral.
            if read.tlen % 2 == 0:
                # even template length: return the two central bases
                fragment_start = read.pos + read.tlen // 2 - 1
                fragment_end = fragment_start + 2
            else:
                # odd template length: return the three central bases
                fragment_start = read.pos + read.tlen // 2 - 1
                fragment_end = fragment_start + 3

        return [(fragment_start, fragment_end)]
deepTools/source/deeptools/bamHandler.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import pysam
3
+ from deeptools.mapReduce import mapReduce
4
+
5
+
6
def countReadsInInterval(args):
    """
    Count mapped and unmapped reads overlapping one genomic interval.

    ``args`` is a tuple (chrom, start, end, fname, toEOF) as dispatched by
    mapReduce. Returns (nMapped, nUnmapped, chrom).
    """
    chrom, start, end, fname, toEOF = args

    mapped = unmapped = 0
    bam = openBam(fname)
    for read in bam.fetch(chrom, start, end):
        if chrom == "*":
            # the unplaced-read pseudo-contig: everything here is unmapped
            unmapped += 1
            continue
        if read.pos < start:
            # skip reads starting before this chunk to avoid double counting
            # across chunk boundaries
            continue
        if read.is_unmapped:
            unmapped += 1
        else:
            mapped += 1
    return mapped, unmapped, chrom
23
+
24
+
25
def getMappingStats(bam, nThreads):
    """
    This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless

    This requires pysam > 0.13.0
    """
    # (contig, length) pairs drive the per-interval counting jobs
    chromSizes = list(zip(bam.references, bam.lengths))
    res = mapReduce([bam.filename, False], countReadsInInterval, chromSizes, numberOfProcessors=nThreads)

    mapped = sum(r[0] for r in res)
    unmapped = sum(r[1] for r in res)

    # Aggregate per-contig [mapped, unmapped] counts
    stats = {contig: [0, 0] for contig, _ in chromSizes}
    for nMapped, nUnmapped, contig in res:
        stats[contig][0] += nMapped
        stats[contig][1] += nUnmapped

    # We need to count the number of unmapped reads as well
    unmapped += bam.count("*")

    return mapped, unmapped, stats
45
+
46
+
47
def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True):
    """
    A wrapper for opening BAM/CRAM files.

    bamFile: str
        A BAM/CRAM file name

    returnStats: bool
        Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict).
        These additional values are needed by some downstream functions, since one
        can't use file_handle.mapped on CRAM files (or idxstats())

    nThreads: int
        If returnStats is True, number of threads to use for computing statistics

    minimalDecoding: Bool
        For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions).

    Returns either the file handle or a tuple as described in returnStats
    """
    # htslib option restricting which CRAM fields get decoded; must be bytes
    # under Python 3.
    format_options = ["required_fields=0x1FF"]
    if sys.version_info.major >= 3:
        format_options = [b"required_fields=0x1FF"]
    if not minimalDecoding:
        format_options = None
    try:
        bam = pysam.Samfile(bamFile, 'rb', format_options=format_options)
    except IOError:
        sys.exit("The file '{}' does not exist".format(bamFile))
    except:
        # deliberately broad: any other pysam failure means the file isn't a
        # readable BAM/CRAM
        sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile))

    try:
        # check_index() may return False or raise, depending on file type
        assert bam.check_index() is not False
    except:
        sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile))

    if bam.is_cram and returnStats:
        # CRAM: counts must be computed by scanning (see getMappingStats docstring)
        mapped, unmapped, stats = getMappingStats(bam, nThreads)
    elif bam.is_bam:
        # BAM: counts come straight from the index
        mapped = bam.mapped
        unmapped = bam.unmapped

        # Make the dictionary to hold the stats
        if returnStats:
            stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()}

    # NOTE(review): for a CRAM with returnStats=False, `mapped` is never set,
    # but this branch is skipped in that case so no NameError occurs.
    if bam.is_bam or (bam.is_cram and returnStats):
        if mapped == 0:
            sys.stderr.write("WARNING! '{}' does not have any mapped reads. Please "
                             "check that the file is properly indexed and "
                             "that it contains mapped reads.\n".format(bamFile))

    if returnStats:
        return bam, mapped, unmapped, stats
    else:
        return bam
deepTools/source/deeptools/bamPEFragmentSize.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ import numpy as np
7
+
8
+ import matplotlib
9
+ matplotlib.use('Agg')
10
+ matplotlib.rcParams['pdf.fonttype'] = 42
11
+ matplotlib.rcParams['svg.fonttype'] = 'none'
12
+ from deeptools import cm # noqa: F401
13
+ import matplotlib.pyplot as plt
14
+
15
+ import plotly.offline as py
16
+ import plotly.graph_objs as go
17
+
18
+ # own tools
19
+ from deeptools.parserCommon import writableFile
20
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
21
+ from importlib.metadata import version
22
+
23
+
24
def parse_arguments():
    """Build and return the bamPEFragmentSize command-line parser."""
    parser = argparse.ArgumentParser(
        description='This tool calculates the fragment sizes for read pairs given a BAM file from paired-end sequencing.'
        'Several regions are sampled depending on the '
        'size of the genome and number of processors to estimate the'
        'summary statistics on the fragment lengths. '
        'Properly paired reads are preferred for computation, i.e., '
        'it will only use discordant pairs if no concordant alignments '
        'overlap with a given region. '
        'The default setting simply prints the summary statistics to the screen.',
        usage='bamPEFragmentSize -b sample1.bam sample2.bam -o hist.png\n'
        'help: bamPEFragmentSize -h / bamPEFragmentSize --help'
    )
    parser.add_argument('--bamfiles', '-b',
                        help='List of BAM files to process',
                        nargs='+',
                        metavar='bam files')

    # '-o' is an alias here because the histogram is this tool's only output file
    parser.add_argument('--histogram', '-hist', '-o',
                        help='Save a .png file with a histogram '
                        'of the fragment length distribution.',
                        metavar='FILE')

    parser.add_argument('--plotFileFormat',
                        metavar='FILETYPE',
                        help='Image format type. If given, this option '
                        'overrides the image format based on the plotFile '
                        'ending. The available options are: png, '
                        'eps, pdf, svg and plotly.',
                        default=None,
                        choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    parser.add_argument('--numberOfProcessors', '-p',
                        help='Number of processors to use. The default is '
                        'to use 1. (Default: %(default)s)',
                        metavar="INT",
                        type=int,
                        default=1,
                        required=False)
    parser.add_argument('--samplesLabel',
                        help='Labels for the samples plotted. The '
                        'default is to use the file name of the '
                        'sample. The sample labels should be separated '
                        'by spaces and quoted if a label itself'
                        'contains a space E.g. --samplesLabel label-1 "label 2" ',
                        nargs='+')
    parser.add_argument('--plotTitle', '-T',
                        help='Title of the plot, to be printed on top of '
                        'the generated image. Leave blank for no title. (Default: %(default)s)',
                        default='')
    parser.add_argument('--maxFragmentLength',
                        help='The maximum fragment length in the histogram. A value of 0 (the default) indicates to use twice the mean fragment length. (Default: %(default)s)',
                        default=0,
                        type=int)
    parser.add_argument('--logScale',
                        help='Plot on the log scale',
                        action='store_true')
    parser.add_argument('--binSize', '-bs',
                        metavar='INT',
                        help='Length in bases of the window used to sample the genome. (Default: %(default)s)',
                        default=1000,
                        type=int)
    parser.add_argument('--distanceBetweenBins', '-n',
                        metavar='INT',
                        help='To reduce the computation time, not every possible genomic '
                        'bin is sampled. This option allows you to set the distance '
                        'between bins actually sampled from. Larger numbers are sufficient '
                        'for high coverage samples, while smaller values are useful for '
                        'lower coverage samples. Note that if you specify a value that '
                        'results in too few (<1000) reads sampled, the value will be '
                        'decreased. (Default: %(default)s)',
                        default=1000000,
                        type=int)
    parser.add_argument('--blackListFileName', '-bl',
                        help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
                        metavar="BED file",
                        required=False)
    parser.add_argument('--table',
                        metavar='FILE',
                        help='In addition to printing read and fragment length metrics to the screen, write them to the given file in tabular format.',
                        required=False)
    parser.add_argument('--outRawFragmentLengths',
                        metavar='FILE',
                        required=False,
                        type=writableFile,
                        help='Save the fragment (or read if the input is single-end) length and their associated number of occurrences to a tab-separated file. Columns are length, number of occurrences, and the sample label.')
    parser.add_argument('--verbose',
                        help='Set if processing data messages are wanted.',
                        action='store_true',
                        required=False)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    return parser
118
+
119
+
120
def getDensity(lengths, minVal, maxVal):
    """
    This is essentially computing what hist() in matplotlib is doing and returning the results.
    This then allows us to free up the memory consumed by each sample rather than returning it all back to main() for plotting.
    """
    # 100 equal-width bins over [minVal, maxVal], normalized to a density.
    counts, edges, _patches = plt.hist(lengths, bins=100, range=(minVal, maxVal), density=True)
    # Discard the figure immediately; only the numbers are needed.
    plt.clf()
    return (counts, edges)
128
+
129
+
130
def getFragSize(bam, args, idx, outRawFrags):
    """
    Compute and print fragment- and read-length statistics for one BAM file.

    :param bam: path to the BAM file to process
    :param args: parsed command line options (blacklist, processors,
                 binSize, distanceBetweenBins, samplesLabel, histogram, ...)
    :param idx: index of this BAM file in args.bamfiles; used to look up
                the matching entry of args.samplesLabel
    :param outRawFrags: open file handle to which the raw length/occurrence
                        table is written, or None to skip that output
    :return: tuple (fragment_len_dict, read_len_dict). The potentially huge
             'lengths' entries are reduced to histogram densities when a
             histogram is requested, or deleted otherwise, to free memory.
    """
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam, return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)

    if outRawFrags:
        label = bam
        if args.samplesLabel and idx < len(args.samplesLabel):
            label = args.samplesLabel[idx]
        if fragment_len_dict:
            fragment_len_dict['lengths'] = [int(x) for x in fragment_len_dict['lengths']]
            cnts = np.bincount(fragment_len_dict['lengths'], minlength=int(fragment_len_dict['max']) + 1)
        else:
            read_len_dict['lengths'] = [int(x) for x in read_len_dict['lengths']]
            cnts = np.bincount(read_len_dict['lengths'], minlength=int(read_len_dict['max']) + 1)
        # BUG FIX: the original loop reused 'idx' as its loop variable,
        # clobbering the sample-index parameter and thereby breaking the
        # samplesLabel lookups below whenever --outRawFragmentLengths was
        # given. A dedicated name avoids the shadowing.
        for size, occurrences in enumerate(cnts):
            if occurrences > 0:
                outRawFrags.write("{}\t{}\t{}\n".format(size, occurrences, label))

    if args.samplesLabel and idx < len(args.samplesLabel):
        print("\n\nSample label: {}".format(args.samplesLabel[idx]))
    else:
        print("\n\nBAM file : {}".format(bam))

    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")

        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))

        print("Fragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std']))
        print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(fragment_len_dict['mad'],
                                                                                                                                                               fragment_len_dict['qtile10'],
                                                                                                                                                               fragment_len_dict['qtile20'],
                                                                                                                                                               fragment_len_dict['qtile30'],
                                                                                                                                                               fragment_len_dict['qtile40'],
                                                                                                                                                               fragment_len_dict['qtile60'],
                                                                                                                                                               fragment_len_dict['qtile70'],
                                                                                                                                                               fragment_len_dict['qtile80'],
                                                                                                                                                               fragment_len_dict['qtile90'],
                                                                                                                                                               fragment_len_dict['qtile99']))
    else:
        print("No pairs were found. Is the data from a paired-end sequencing experiment?")

    print("\nRead lengths:")
    print("Sample size: {}\n".format(read_len_dict['sample_size']))
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std']))
    print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(read_len_dict['mad'],
                                                                                                                                                           read_len_dict['qtile10'],
                                                                                                                                                           read_len_dict['qtile20'],
                                                                                                                                                           read_len_dict['qtile30'],
                                                                                                                                                           read_len_dict['qtile40'],
                                                                                                                                                           read_len_dict['qtile60'],
                                                                                                                                                           read_len_dict['qtile70'],
                                                                                                                                                           read_len_dict['qtile80'],
                                                                                                                                                           read_len_dict['qtile90'],
                                                                                                                                                           read_len_dict['qtile99']))

    # The read and fragment lists will just eat up memory if not removed!
    if args.histogram:
        if fragment_len_dict:
            maxVal = fragment_len_dict['mean'] * 2
            minVal = fragment_len_dict['min']
        else:
            maxVal = read_len_dict['mean'] * 2
            minVal = read_len_dict['min']
        if args.maxFragmentLength > 0:
            maxVal = args.maxFragmentLength

        if fragment_len_dict:
            fragment_len_dict['lengths'] = getDensity(fragment_len_dict['lengths'], minVal, maxVal)
        if read_len_dict:
            read_len_dict['lengths'] = getDensity(read_len_dict['lengths'], minVal, maxVal)
    else:
        if fragment_len_dict:
            del fragment_len_dict['lengths']
        if read_len_dict:
            del read_len_dict['lengths']

    return (fragment_len_dict, read_len_dict)
228
+
229
+
230
def printTable(args, fragDict, readDict):
    """
    Write the read and fragment metrics in easily parsable tabular (TSV)
    format to the file named by args.table.

    :param args: parsed options; uses args.table, args.bamfiles and
                 args.samplesLabel
    :param fragDict: {bam filename: fragment metrics dict or None}
    :param readDict: {bam filename: read metrics dict}

    One header line is written, then one row per BAM file. Rows for files
    without fragment metrics get zeros in the fragment columns.
    """
    # Column order shared by the fragment and read sections of each row.
    summary_keys = ['min', 'qtile25', 'mean', 'median', 'qtile75', 'max', 'std']
    spread_keys = ['mad', 'qtile10', 'qtile20', 'qtile30', 'qtile40',
                   'qtile60', 'qtile70', 'qtile80', 'qtile90', 'qtile99']

    # A context manager guarantees the handle is closed even if a metrics
    # dict is malformed (the original code leaked the handle on error).
    with open(args.table, "w") as of:
        of.write("\tFrag. Sampled")
        of.write("\tFrag. Len. Min.\tFrag. Len. 1st. Qu.\tFrag. Len. Mean\tFrag. Len. Median\tFrag. Len. 3rd Qu.\tFrag. Len. Max\tFrag. Len. Std.")
        of.write("\tFrag. Med. Abs. Dev.\tFrag. Len. 10%\tFrag. Len. 20%\tFrag. Len. 30%\tFrag. Len. 40%\tFrag. Len. 60%\tFrag. Len. 70%\tFrag. Len. 80%\tFrag. Len. 90%\tFrag. Len. 99%")
        of.write("\tReads Sampled")
        of.write("\tRead Len. Min.\tRead Len. 1st. Qu.\tRead Len. Mean\tRead Len. Median\tRead Len. 3rd Qu.\tRead Len. Max\tRead Len. Std.")
        of.write("\tRead Med. Abs. Dev.\tRead Len. 10%\tRead Len. 20%\tRead Len. 30%\tRead Len. 40%\tRead Len. 60%\tRead Len. 70%\tRead Len. 80%\tRead Len. 90%\tRead Len. 99%\n")

        for idx, bam in enumerate(args.bamfiles):
            if args.samplesLabel and idx < len(args.samplesLabel):
                of.write(args.samplesLabel[idx])
            else:
                of.write(bam)
            if fragDict is not None and fragDict[bam] is not None:
                d = fragDict[bam]
                of.write("\t{}".format(d['sample_size']))
                of.write("".join("\t{}".format(d[k]) for k in summary_keys))
                of.write("".join("\t{}".format(d[k]) for k in spread_keys))
            else:
                # 1 sample-size column + 7 summary + 10 spread columns
                of.write("\t0" * 18)
            d = readDict[bam]
            of.write("\t{}".format(d['sample_size']))
            of.write("".join("\t{}".format(d[k]) for k in summary_keys))
            of.write("".join("\t{}".format(d[k]) for k in spread_keys))
            of.write("\n")
291
+
292
+
293
def main(args=None):
    """
    Entry point for bamPEFragmentSize.

    Collects fragment/read length metrics for every BAM file, optionally
    writes the raw length counts (--outRawFragmentLengths) and a tabular
    summary (--table), and optionally plots a length histogram
    (--histogram) with either matplotlib or plotly.

    :param args: optional list of command line arguments (defaults to
                 sys.argv when None)
    """
    args = parse_arguments().parse_args(args)

    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    fraglengths = {}
    readlengths = {}
    of = None
    if args.outRawFragmentLengths is not None:
        of = open(args.outRawFragmentLengths, "w")
        of.write("#bamPEFragmentSize\nSize\tOccurrences\tSample\n")
    for idx, bam in enumerate(args.bamfiles):
        f, r = getFragSize(bam, args, idx, of)
        fraglengths[bam] = f
        readlengths[bam] = r
    # FIX: close the raw fragment-length file once all samples are
    # processed; the original code never closed this handle.
    if of is not None:
        of.close()

    if args.table is not None:
        printTable(args, fraglengths, readlengths)

    if args.histogram:
        if args.samplesLabel:
            if len(args.bamfiles) != len(args.samplesLabel):
                sys.exit("The number of labels does not match the number of BAM files.")
            else:
                labels = args.samplesLabel
        else:
            labels = list(fraglengths.keys())

        i = 0
        data = []
        for bam in fraglengths.keys():
            # Fall back to read lengths for samples without paired reads.
            d = fraglengths[bam]
            if d is None:
                d = readlengths[bam]
            if args.maxFragmentLength > 0:
                maxVal = args.maxFragmentLength
            else:
                maxVal = d['mean'] * 2

            if args.plotFileFormat == 'plotly':
                trace = go.Histogram(x=d['lengths'],
                                     histnorm='probability',
                                     opacity=0.5,
                                     name=labels[i],
                                     nbinsx=100,
                                     xbins=dict(start=d['min'], end=maxVal))
                data.append(trace)
            else:
                # d['lengths'] is the (densities, bin_edges) pair produced
                # by getDensity(); draw it as pre-binned bars.
                plt.bar(d['lengths'][1][:-1], height=d['lengths'][0],
                        width=d['lengths'][1][1:] - d['lengths'][1][:-1],
                        align='edge', log=args.logScale,
                        alpha=0.5, label=labels[i])
            i += 1

        if args.plotFileFormat == 'plotly':
            fig = go.Figure()
            fig.add_traces(data)
            fig['layout']['yaxis1'].update(title='Frequency')
            fig['layout']['xaxis1'].update(title='Fragment Length')
            fig['layout'].update(title=args.plotTitle)
            fig['layout'].update(showlegend=True)
            if args.logScale:
                fig['layout']['yaxis1'].update(type='log')
            py.plot(fig, filename=args.histogram, auto_open=False)
        else:
            plt.xlabel('Fragment Length')
            plt.ylabel('Frequency')
            plt.legend(loc='upper right')
            plt.title(args.plotTitle)
            plt.savefig(args.histogram, bbox_inches=0, format=args.plotFileFormat)
            plt.close()
366
+
367
+
368
# Allow the tool to be run directly as a script.
if __name__ == "__main__":
    main()
deepTools/source/deeptools/bigwigAverage.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import sys
5
+ import numpy as np
6
+ from deeptools import parserCommon
7
+ from deeptools import writeBedGraph_bam_and_bw
8
+
9
+ debug = 0
10
+
11
+
12
def parse_arguments(args=None):
    """
    Build the argument parser for bigwigAverage.

    The parser inherits the shared deepTools parent options (processors,
    region, blacklist, verbosity, ...) and the common output options.

    :param args: unused; kept for interface compatibility
    :return: configured argparse.ArgumentParser
    """
    parentParser = parserCommon.getParentArgParse()
    outputParser = parserCommon.output()
    parser = argparse.ArgumentParser(
        parents=[parentParser, outputParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # FIX: grammar/typo corrections in the user-facing help text
        # ("average" -> "averages", missing space before "These scores").
        description='This tool averages multiple bigWig files based on the number '
        'of mapped reads. To average the bigWig files, the genome is '
        'partitioned into bins of equal size, then the scores '
        'in each bigwig file are computed per bin. '
        'These scores are averaged and scaleFactors can be applied before the average.',
        usage='bigwigAverage -b sample1.bw sample2.bw -o outfile.bw\n'
        'help: bigwigAverage -h / bigwigAverage --help')

    # define the arguments
    parser.add_argument('--bigwigs', '-b',
                        metavar='Bigwig files',
                        help='Bigwig files separated by space.',
                        nargs='+',
                        required=True)

    parser.add_argument('--scaleFactors',
                        help='Set this parameter to multiply the bigwig values '
                             'by a constant. The format is '
                             'scaleFactor1:scaleFactor2:scaleFactor3 etc. '
                             'For example 0.7:1 to scale the first bigwig file '
                             'by 0.7 while not scaling the second bigwig file',
                        default=None,
                        required=False)

    parser.add_argument('--skipNonCoveredRegions', '--skipNAs',
                        help='This parameter determines if non-covered regions (regions without a score) '
                             'in the bigWig files should be skipped. The default is to treat those '
                             'regions as having a value of zero. '
                             'The decision to skip non-covered regions '
                             'depends on the interpretation of the data. Non-covered regions '
                             'in a bigWig file may represent repetitive regions that should '
                             'be skipped. Alternatively, the interpretation of non-covered regions as '
                             'zeros may be wrong and this option should be used ',
                        action='store_true')

    return parser
54
+
55
+
56
def getType(fname):
    """
    Guess a signal file's format from its file name.

    Returns "wiggle" for .wig/.wiggle suffixes, "bedgraph" for
    .bedgraph (any case) or .bdg suffixes, and "bigwig" otherwise.
    """
    if fname.endswith((".wig", ".wiggle")):
        return "wiggle"
    # Only the .bedgraph check is case-insensitive, matching historical
    # behavior; .bdg is matched exactly.
    if fname.lower().endswith(".bedgraph") or fname.endswith(".bdg"):
        return "bedgraph"
    return "bigwig"
66
+
67
+
68
def average(tileCoverage, args):
    r"""
    Return the mean of the per-file coverage values for one tile, after
    applying the per-file scale factors.

    The mapreduce machinery calls this once per tile; ``args`` is fixed in
    main() and must contain a 'scaleFactors' sequence with one factor per
    input file.

    >>> funcArgs= {'scaleFactors': (1,1)}
    >>> average([1, 2], funcArgs)
    1.5
    >>> funcArgs= {'scaleFactors': (1,0.5)}
    >>> average([1, 2], funcArgs)
    1.0
    >>> funcArgs= {'scaleFactors': (1,0.5,0.1,0.2)}
    >>> average([1, 2, 3, 12], funcArgs)
    1.175
    >>> average([1, 2, 3, np.nan], funcArgs)
    nan
    """
    scaled = []
    for position, coverage in enumerate(tileCoverage):
        scaled.append(args['scaleFactors'][position] * coverage)
    # np.mean propagates NaN, so an uncovered file yields NaN for the tile.
    return np.mean(scaled)
90
+
91
+
92
def main(args=None):
    """
    Entry point for bigwigAverage.

    Parses the command line, normalizes the per-file scale factors, and
    streams the per-bin averages to the output file via writeBedGraph.

    :param args: optional list of command line arguments (defaults to
                 sys.argv when None)
    :raises argparse.ArgumentTypeError: when the number of scale factors
            matches neither 1 nor the number of bigwig files
    """
    args = parse_arguments().parse_args(args)
    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    nFiles = len(args.bigwigs)

    if args.scaleFactors:
        scaleFactors = [float(x) for x in args.scaleFactors.split(":")]
        if len(scaleFactors) == 1:
            # A single factor applies to every file.
            scaleFactors = scaleFactors * nFiles
        elif len(scaleFactors) != nFiles:
            # FIX: added the missing space between sentences in the
            # original error message ("factors.The value").
            raise argparse.ArgumentTypeError(
                "Format of scaleFactors is factor or factor1:factor2... as many as bigwig files. "
                "There are {} bigwigs and {} factors. "
                "The value given ( {} ) is not valid".format(nFiles, len(scaleFactors), args.scaleFactors))
    else:
        scaleFactors = [1] * nFiles

    # the average function is called and receives
    # the function_args per each tile that is considered
    FUNC = average
    function_args = {'scaleFactors': scaleFactors}

    writeBedGraph_bam_and_bw.writeBedGraph(
        [(b, getType(b)) for b in args.bigwigs],
        args.outFileName, 0, FUNC,
        function_args, tileSize=args.binSize, region=args.region,
        blackListFileName=args.blackListFileName,
        verbose=args.verbose,
        numberOfProcessors=args.numberOfProcessors,
        skipZeroOverZero=False,
        format=args.outFileFormat,
        smoothLength=False,
        missingDataAsZero=not args.skipNonCoveredRegions,
        extendPairedEnds=False)
deepTools/source/deeptools/bigwigCompare.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from deeptools import parserCommon
5
+ from deeptools.getRatio import getRatio
6
+ from deeptools import writeBedGraph_bam_and_bw
7
+
8
+ debug = 0
9
+
10
+
11
def parse_arguments(args=None):
    """
    Build the argument parser for bigwigCompare.

    The parser inherits the shared deepTools parent options (processors,
    region, blacklist, verbosity, ...) and the common output options.

    :param args: unused; kept for interface compatibility
    :return: configured argparse.ArgumentParser
    """
    parentParser = parserCommon.getParentArgParse()
    outputParser = parserCommon.output()
    parser = argparse.ArgumentParser(
        parents=[parentParser, outputParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # FIX: restored the missing space in "reads per bin" in the
        # user-facing description.
        description='This tool compares two bigWig files based on the number '
        'of mapped reads. To compare the bigWig files, the genome is '
        'partitioned into bins of equal size, then the number of reads found '
        'in each BAM file are counted per bin and finally a summary '
        'value is reported. This value can be the ratio of the number of reads '
        'per bin, the log2 of the ratio, the sum or the difference.',
        usage='bigwigCompare -b1 sample1.bw -b2 sample2.bw -o log2.bw\n'
        'help: bigwigCompare -h / bigwigCompare --help')

    # define the arguments
    parser.add_argument('--bigwig1', '-b1',
                        metavar='Bigwig file',
                        help='Bigwig file 1. Usually the file for the '
                        'treatment.',
                        required=True)

    parser.add_argument('--bigwig2', '-b2',
                        metavar='Bigwig file',
                        help='Bigwig file 2. Usually the file for the '
                        'control.',
                        required=True)

    parser.add_argument('--scaleFactors',
                        help='Set this parameter to multiply the bigwig values '
                        'by a constant. The format is '
                        'scaleFactor1:scaleFactor2. '
                        'For example 0.7:1 to scale the first bigwig file '
                        'by 0.7 while not scaling the second bigwig file',
                        default=None,
                        required=False)

    parser.add_argument('--pseudocount',
                        help='A small number to avoid x/0. Only useful '
                        'together with --operation log2 or --operation ratio. '
                        'You can specify different values as pseudocounts for '
                        'the numerator and the denominator by providing two '
                        'values (the first value is used as the numerator '
                        'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
                        default=1,
                        nargs='+',
                        action=parserCommon.requiredLength(1, 2),
                        type=float,
                        required=False)

    parser.add_argument('--skipZeroOverZero',
                        help='Skip bins where BOTH BAM files lack coverage. '
                        'This is determined BEFORE any applicable pseudocount '
                        'is added.',
                        action='store_true')

    parser.add_argument('--operation',
                        # FIX: removed the duplicated "the the" in this help.
                        help='The default is to output the log2ratio of the '
                        'two samples. The reciprocal ratio returns '
                        'the negative of the inverse of the ratio '
                        'if the ratio is less than 0. The resulting '
                        'values are interpreted as negative fold changes. '
                        'Instead of performing a '
                        'computation using both files, the scaled signal can '
                        'alternatively be output for the first or second file using '
                        'the \'--operation first\' or \'--operation second\' (Default: %(default)s)',
                        default='log2',
                        choices=['log2', 'ratio', 'subtract', 'add', 'mean',
                                 'reciprocal_ratio', 'first', 'second'],
                        required=False)

    parser.add_argument('--skipNonCoveredRegions', '--skipNAs',
                        help='This parameter determines if non-covered regions (regions without a score) '
                        'in the bigWig files should be skipped. The default is to treat those '
                        'regions as having a value of zero. '
                        'The decision to skip non-covered regions '
                        'depends on the interpretation of the data. Non-covered regions '
                        'in a bigWig file may represent repetitive regions that should '
                        'be skipped. Alternatively, the interpretation of non-covered regions as '
                        'zeros may be wrong and this option should be used ',
                        action='store_true')

    parser.add_argument('--fixedStep',
                        help='Write out all bins (of size --binSize) '
                        'instead of merging neighbouring bins with equal values.',
                        action='store_true')
    return parser
98
+
99
+
100
def getType(fname):
    """
    Guess a signal file's format from its file name suffix.

    Returns "wiggle" for .wig/.wiggle, "bedgraph" for a (lower-case)
    .bedgraph suffix, and "bigwig" for anything else.
    """
    if fname.endswith((".wig", ".wiggle")):
        return "wiggle"
    if fname.endswith(".bedgraph"):
        return "bedgraph"
    return "bigwig"
110
+
111
+
112
def main(args=None):
    """
    Entry point for bigwigCompare.

    Parses the command line, normalizes scale factors and pseudocounts,
    and streams the per-bin comparison values to the output file via
    writeBedGraph using getRatio as the per-tile function.

    :param args: optional list of command line arguments (defaults to
                 sys.argv when None)
    :raises argparse.ArgumentTypeError: when --scaleFactors does not
            contain one or two colon-separated values
    """
    args = parse_arguments().parse_args(args)

    if args.scaleFactors:
        scaleFactors = [float(x) for x in args.scaleFactors.split(":")]
        # FIX: validate the factor count up front (mirroring bigwigAverage).
        # The original code accepted any count and could later fail with an
        # obscure IndexError inside getRatio when fewer than two factors
        # were given.
        if len(scaleFactors) == 1:
            scaleFactors = scaleFactors * 2
        elif len(scaleFactors) != 2:
            raise argparse.ArgumentTypeError(
                "Format of scaleFactors is factor or factor1:factor2. "
                "The value given ( {} ) is not valid".format(args.scaleFactors))
    else:
        scaleFactors = [1, 1]

    if not isinstance(args.pseudocount, list):
        args.pseudocount = [args.pseudocount]

    # A single pseudocount applies to both numerator and denominator.
    if len(args.pseudocount) == 1:
        args.pseudocount *= 2

    # the getRatio function is called and receives
    # the function_args per each tile that is considered
    FUNC = getRatio
    function_args = {'valueType': args.operation,
                     'scaleFactors': scaleFactors,
                     'pseudocount': args.pseudocount}

    writeBedGraph_bam_and_bw.writeBedGraph(
        [(args.bigwig1, getType(args.bigwig1)),
         (args.bigwig2, getType(args.bigwig2))],
        args.outFileName, 0, FUNC,
        function_args, tileSize=args.binSize, region=args.region,
        blackListFileName=args.blackListFileName,
        verbose=args.verbose,
        numberOfProcessors=args.numberOfProcessors,
        skipZeroOverZero=args.skipZeroOverZero,
        format=args.outFileFormat,
        smoothLength=False,
        missingDataAsZero=not args.skipNonCoveredRegions,
        extendPairedEnds=False,
        fixedStep=args.fixedStep)
deepTools/source/deeptools/cm.py ADDED
@@ -0,0 +1,1088 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # This file comes from the seaborn project and is under a BSD license:
4
+
5
+ # Copyright (c) 2012-2019, Michael L. Waskom
6
+ # All rights reserved.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without
9
+ # modification, are permitted provided that the following conditions are met:
10
+ #
11
+ # * Redistributions of source code must retain the above copyright notice, this
12
+ # list of conditions and the following disclaimer.
13
+ #
14
+ # * Redistributions in binary form must reproduce the above copyright notice,
15
+ # this list of conditions and the following disclaimer in the documentation
16
+ # and/or other materials provided with the distribution.
17
+ #
18
+ # * Neither the name of the project nor the names of its
19
+ # contributors may be used to endorse or promote products derived from
20
+ # this software without specific prior written permission.
21
+ #
22
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
+
33
+ from matplotlib import colors, colormaps as mpl_cm
34
+
35
+
36
+ _rocket_lut = [
37
+ [0.01060815, 0.01808215, 0.10018654],
38
+ [0.01428972, 0.02048237, 0.10374486],
39
+ [0.01831941, 0.0229766, 0.10738511],
40
+ [0.02275049, 0.02554464, 0.11108639],
41
+ [0.02759119, 0.02818316, 0.11483751],
42
+ [0.03285175, 0.03088792, 0.11863035],
43
+ [0.03853466, 0.03365771, 0.12245873],
44
+ [0.04447016, 0.03648425, 0.12631831],
45
+ [0.05032105, 0.03936808, 0.13020508],
46
+ [0.05611171, 0.04224835, 0.13411624],
47
+ [0.0618531, 0.04504866, 0.13804929],
48
+ [0.06755457, 0.04778179, 0.14200206],
49
+ [0.0732236, 0.05045047, 0.14597263],
50
+ [0.0788708, 0.05305461, 0.14995981],
51
+ [0.08450105, 0.05559631, 0.15396203],
52
+ [0.09011319, 0.05808059, 0.15797687],
53
+ [0.09572396, 0.06050127, 0.16200507],
54
+ [0.10132312, 0.06286782, 0.16604287],
55
+ [0.10692823, 0.06517224, 0.17009175],
56
+ [0.1125315, 0.06742194, 0.17414848],
57
+ [0.11813947, 0.06961499, 0.17821272],
58
+ [0.12375803, 0.07174938, 0.18228425],
59
+ [0.12938228, 0.07383015, 0.18636053],
60
+ [0.13501631, 0.07585609, 0.19044109],
61
+ [0.14066867, 0.0778224, 0.19452676],
62
+ [0.14633406, 0.07973393, 0.1986151],
63
+ [0.15201338, 0.08159108, 0.20270523],
64
+ [0.15770877, 0.08339312, 0.20679668],
65
+ [0.16342174, 0.0851396, 0.21088893],
66
+ [0.16915387, 0.08682996, 0.21498104],
67
+ [0.17489524, 0.08848235, 0.2190294],
68
+ [0.18065495, 0.09009031, 0.22303512],
69
+ [0.18643324, 0.09165431, 0.22699705],
70
+ [0.19223028, 0.09317479, 0.23091409],
71
+ [0.19804623, 0.09465217, 0.23478512],
72
+ [0.20388117, 0.09608689, 0.23860907],
73
+ [0.20973515, 0.09747934, 0.24238489],
74
+ [0.21560818, 0.09882993, 0.24611154],
75
+ [0.22150014, 0.10013944, 0.2497868],
76
+ [0.22741085, 0.10140876, 0.25340813],
77
+ [0.23334047, 0.10263737, 0.25697736],
78
+ [0.23928891, 0.10382562, 0.2604936],
79
+ [0.24525608, 0.10497384, 0.26395596],
80
+ [0.25124182, 0.10608236, 0.26736359],
81
+ [0.25724602, 0.10715148, 0.27071569],
82
+ [0.26326851, 0.1081815, 0.27401148],
83
+ [0.26930915, 0.1091727, 0.2772502],
84
+ [0.27536766, 0.11012568, 0.28043021],
85
+ [0.28144375, 0.11104133, 0.2835489],
86
+ [0.2875374, 0.11191896, 0.28660853],
87
+ [0.29364846, 0.11275876, 0.2896085],
88
+ [0.29977678, 0.11356089, 0.29254823],
89
+ [0.30592213, 0.11432553, 0.29542718],
90
+ [0.31208435, 0.11505284, 0.29824485],
91
+ [0.31826327, 0.1157429, 0.30100076],
92
+ [0.32445869, 0.11639585, 0.30369448],
93
+ [0.33067031, 0.11701189, 0.30632563],
94
+ [0.33689808, 0.11759095, 0.3088938],
95
+ [0.34314168, 0.11813362, 0.31139721],
96
+ [0.34940101, 0.11863987, 0.3138355],
97
+ [0.355676, 0.11910909, 0.31620996],
98
+ [0.36196644, 0.1195413, 0.31852037],
99
+ [0.36827206, 0.11993653, 0.32076656],
100
+ [0.37459292, 0.12029443, 0.32294825],
101
+ [0.38092887, 0.12061482, 0.32506528],
102
+ [0.38727975, 0.12089756, 0.3271175],
103
+ [0.39364518, 0.12114272, 0.32910494],
104
+ [0.40002537, 0.12134964, 0.33102734],
105
+ [0.40642019, 0.12151801, 0.33288464],
106
+ [0.41282936, 0.12164769, 0.33467689],
107
+ [0.41925278, 0.12173833, 0.33640407],
108
+ [0.42569057, 0.12178916, 0.33806605],
109
+ [0.43214263, 0.12179973, 0.33966284],
110
+ [0.43860848, 0.12177004, 0.34119475],
111
+ [0.44508855, 0.12169883, 0.34266151],
112
+ [0.45158266, 0.12158557, 0.34406324],
113
+ [0.45809049, 0.12142996, 0.34540024],
114
+ [0.46461238, 0.12123063, 0.34667231],
115
+ [0.47114798, 0.12098721, 0.34787978],
116
+ [0.47769736, 0.12069864, 0.34902273],
117
+ [0.48426077, 0.12036349, 0.35010104],
118
+ [0.49083761, 0.11998161, 0.35111537],
119
+ [0.49742847, 0.11955087, 0.35206533],
120
+ [0.50403286, 0.11907081, 0.35295152],
121
+ [0.51065109, 0.11853959, 0.35377385],
122
+ [0.51728314, 0.1179558, 0.35453252],
123
+ [0.52392883, 0.11731817, 0.35522789],
124
+ [0.53058853, 0.11662445, 0.35585982],
125
+ [0.53726173, 0.11587369, 0.35642903],
126
+ [0.54394898, 0.11506307, 0.35693521],
127
+ [0.5506426, 0.11420757, 0.35737863],
128
+ [0.55734473, 0.11330456, 0.35775059],
129
+ [0.56405586, 0.11235265, 0.35804813],
130
+ [0.57077365, 0.11135597, 0.35827146],
131
+ [0.5774991, 0.11031233, 0.35841679],
132
+ [0.58422945, 0.10922707, 0.35848469],
133
+ [0.59096382, 0.10810205, 0.35847347],
134
+ [0.59770215, 0.10693774, 0.35838029],
135
+ [0.60444226, 0.10573912, 0.35820487],
136
+ [0.61118304, 0.10450943, 0.35794557],
137
+ [0.61792306, 0.10325288, 0.35760108],
138
+ [0.62466162, 0.10197244, 0.35716891],
139
+ [0.63139686, 0.10067417, 0.35664819],
140
+ [0.63812122, 0.09938212, 0.35603757],
141
+ [0.64483795, 0.0980891, 0.35533555],
142
+ [0.65154562, 0.09680192, 0.35454107],
143
+ [0.65824241, 0.09552918, 0.3536529],
144
+ [0.66492652, 0.09428017, 0.3526697],
145
+ [0.67159578, 0.09306598, 0.35159077],
146
+ [0.67824099, 0.09192342, 0.3504148],
147
+ [0.684863, 0.09085633, 0.34914061],
148
+ [0.69146268, 0.0898675, 0.34776864],
149
+ [0.69803757, 0.08897226, 0.3462986],
150
+ [0.70457834, 0.0882129, 0.34473046],
151
+ [0.71108138, 0.08761223, 0.3430635],
152
+ [0.7175507, 0.08716212, 0.34129974],
153
+ [0.72398193, 0.08688725, 0.33943958],
154
+ [0.73035829, 0.0868623, 0.33748452],
155
+ [0.73669146, 0.08704683, 0.33543669],
156
+ [0.74297501, 0.08747196, 0.33329799],
157
+ [0.74919318, 0.08820542, 0.33107204],
158
+ [0.75535825, 0.08919792, 0.32876184],
159
+ [0.76145589, 0.09050716, 0.32637117],
160
+ [0.76748424, 0.09213602, 0.32390525],
161
+ [0.77344838, 0.09405684, 0.32136808],
162
+ [0.77932641, 0.09634794, 0.31876642],
163
+ [0.78513609, 0.09892473, 0.31610488],
164
+ [0.79085854, 0.10184672, 0.313391],
165
+ [0.7965014, 0.10506637, 0.31063031],
166
+ [0.80205987, 0.10858333, 0.30783],
167
+ [0.80752799, 0.11239964, 0.30499738],
168
+ [0.81291606, 0.11645784, 0.30213802],
169
+ [0.81820481, 0.12080606, 0.29926105],
170
+ [0.82341472, 0.12535343, 0.2963705],
171
+ [0.82852822, 0.13014118, 0.29347474],
172
+ [0.83355779, 0.13511035, 0.29057852],
173
+ [0.83850183, 0.14025098, 0.2876878],
174
+ [0.84335441, 0.14556683, 0.28480819],
175
+ [0.84813096, 0.15099892, 0.281943],
176
+ [0.85281737, 0.15657772, 0.27909826],
177
+ [0.85742602, 0.1622583, 0.27627462],
178
+ [0.86196552, 0.16801239, 0.27346473],
179
+ [0.86641628, 0.17387796, 0.27070818],
180
+ [0.87079129, 0.17982114, 0.26797378],
181
+ [0.87507281, 0.18587368, 0.26529697],
182
+ [0.87925878, 0.19203259, 0.26268136],
183
+ [0.8833417, 0.19830556, 0.26014181],
184
+ [0.88731387, 0.20469941, 0.25769539],
185
+ [0.89116859, 0.21121788, 0.2553592],
186
+ [0.89490337, 0.21785614, 0.25314362],
187
+ [0.8985026, 0.22463251, 0.25108745],
188
+ [0.90197527, 0.23152063, 0.24918223],
189
+ [0.90530097, 0.23854541, 0.24748098],
190
+ [0.90848638, 0.24568473, 0.24598324],
191
+ [0.911533, 0.25292623, 0.24470258],
192
+ [0.9144225, 0.26028902, 0.24369359],
193
+ [0.91717106, 0.26773821, 0.24294137],
194
+ [0.91978131, 0.27526191, 0.24245973],
195
+ [0.92223947, 0.28287251, 0.24229568],
196
+ [0.92456587, 0.29053388, 0.24242622],
197
+ [0.92676657, 0.29823282, 0.24285536],
198
+ [0.92882964, 0.30598085, 0.24362274],
199
+ [0.93078135, 0.31373977, 0.24468803],
200
+ [0.93262051, 0.3215093, 0.24606461],
201
+ [0.93435067, 0.32928362, 0.24775328],
202
+ [0.93599076, 0.33703942, 0.24972157],
203
+ [0.93752831, 0.34479177, 0.25199928],
204
+ [0.93899289, 0.35250734, 0.25452808],
205
+ [0.94036561, 0.36020899, 0.25734661],
206
+ [0.94167588, 0.36786594, 0.2603949],
207
+ [0.94291042, 0.37549479, 0.26369821],
208
+ [0.94408513, 0.3830811, 0.26722004],
209
+ [0.94520419, 0.39062329, 0.27094924],
210
+ [0.94625977, 0.39813168, 0.27489742],
211
+ [0.94727016, 0.4055909, 0.27902322],
212
+ [0.94823505, 0.41300424, 0.28332283],
213
+ [0.94914549, 0.42038251, 0.28780969],
214
+ [0.95001704, 0.42771398, 0.29244728],
215
+ [0.95085121, 0.43500005, 0.29722817],
216
+ [0.95165009, 0.44224144, 0.30214494],
217
+ [0.9524044, 0.44944853, 0.3072105],
218
+ [0.95312556, 0.45661389, 0.31239776],
219
+ [0.95381595, 0.46373781, 0.31769923],
220
+ [0.95447591, 0.47082238, 0.32310953],
221
+ [0.95510255, 0.47787236, 0.32862553],
222
+ [0.95569679, 0.48489115, 0.33421404],
223
+ [0.95626788, 0.49187351, 0.33985601],
224
+ [0.95681685, 0.49882008, 0.34555431],
225
+ [0.9573439, 0.50573243, 0.35130912],
226
+ [0.95784842, 0.51261283, 0.35711942],
227
+ [0.95833051, 0.51946267, 0.36298589],
228
+ [0.95879054, 0.52628305, 0.36890904],
229
+ [0.95922872, 0.53307513, 0.3748895],
230
+ [0.95964538, 0.53983991, 0.38092784],
231
+ [0.96004345, 0.54657593, 0.3870292],
232
+ [0.96042097, 0.55328624, 0.39319057],
233
+ [0.96077819, 0.55997184, 0.39941173],
234
+ [0.9611152, 0.5666337, 0.40569343],
235
+ [0.96143273, 0.57327231, 0.41203603],
236
+ [0.96173392, 0.57988594, 0.41844491],
237
+ [0.96201757, 0.58647675, 0.42491751],
238
+ [0.96228344, 0.59304598, 0.43145271],
239
+ [0.96253168, 0.5995944, 0.43805131],
240
+ [0.96276513, 0.60612062, 0.44471698],
241
+ [0.96298491, 0.6126247, 0.45145074],
242
+ [0.96318967, 0.61910879, 0.45824902],
243
+ [0.96337949, 0.6255736, 0.46511271],
244
+ [0.96355923, 0.63201624, 0.47204746],
245
+ [0.96372785, 0.63843852, 0.47905028],
246
+ [0.96388426, 0.64484214, 0.4861196],
247
+ [0.96403203, 0.65122535, 0.4932578],
248
+ [0.96417332, 0.65758729, 0.50046894],
249
+ [0.9643063, 0.66393045, 0.5077467],
250
+ [0.96443322, 0.67025402, 0.51509334],
251
+ [0.96455845, 0.67655564, 0.52251447],
252
+ [0.96467922, 0.68283846, 0.53000231],
253
+ [0.96479861, 0.68910113, 0.53756026],
254
+ [0.96492035, 0.69534192, 0.5451917],
255
+ [0.96504223, 0.7015636, 0.5528892],
256
+ [0.96516917, 0.70776351, 0.5606593],
257
+ [0.96530224, 0.71394212, 0.56849894],
258
+ [0.96544032, 0.72010124, 0.57640375],
259
+ [0.96559206, 0.72623592, 0.58438387],
260
+ [0.96575293, 0.73235058, 0.59242739],
261
+ [0.96592829, 0.73844258, 0.60053991],
262
+ [0.96612013, 0.74451182, 0.60871954],
263
+ [0.96632832, 0.75055966, 0.61696136],
264
+ [0.96656022, 0.75658231, 0.62527295],
265
+ [0.96681185, 0.76258381, 0.63364277],
266
+ [0.96709183, 0.76855969, 0.64207921],
267
+ [0.96739773, 0.77451297, 0.65057302],
268
+ [0.96773482, 0.78044149, 0.65912731],
269
+ [0.96810471, 0.78634563, 0.66773889],
270
+ [0.96850919, 0.79222565, 0.6764046],
271
+ [0.96893132, 0.79809112, 0.68512266],
272
+ [0.96935926, 0.80395415, 0.69383201],
273
+ [0.9698028, 0.80981139, 0.70252255],
274
+ [0.97025511, 0.81566605, 0.71120296],
275
+ [0.97071849, 0.82151775, 0.71987163],
276
+ [0.97120159, 0.82736371, 0.72851999],
277
+ [0.97169389, 0.83320847, 0.73716071],
278
+ [0.97220061, 0.83905052, 0.74578903],
279
+ [0.97272597, 0.84488881, 0.75440141],
280
+ [0.97327085, 0.85072354, 0.76299805],
281
+ [0.97383206, 0.85655639, 0.77158353],
282
+ [0.97441222, 0.86238689, 0.78015619],
283
+ [0.97501782, 0.86821321, 0.78871034],
284
+ [0.97564391, 0.87403763, 0.79725261],
285
+ [0.97628674, 0.87986189, 0.8057883],
286
+ [0.97696114, 0.88568129, 0.81430324],
287
+ [0.97765722, 0.89149971, 0.82280948],
288
+ [0.97837585, 0.89731727, 0.83130786],
289
+ [0.97912374, 0.90313207, 0.83979337],
290
+ [0.979891, 0.90894778, 0.84827858],
291
+ [0.98067764, 0.91476465, 0.85676611],
292
+ [0.98137749, 0.92061729, 0.86536915]
293
+ ]
294
+
295
+
296
+ _mako_lut = [
297
+ [0.04503935, 0.01482344, 0.02092227],
298
+ [0.04933018, 0.01709292, 0.02535719],
299
+ [0.05356262, 0.01950702, 0.03018802],
300
+ [0.05774337, 0.02205989, 0.03545515],
301
+ [0.06188095, 0.02474764, 0.04115287],
302
+ [0.06598247, 0.0275665, 0.04691409],
303
+ [0.07005374, 0.03051278, 0.05264306],
304
+ [0.07409947, 0.03358324, 0.05834631],
305
+ [0.07812339, 0.03677446, 0.06403249],
306
+ [0.08212852, 0.0400833, 0.06970862],
307
+ [0.08611731, 0.04339148, 0.07538208],
308
+ [0.09009161, 0.04664706, 0.08105568],
309
+ [0.09405308, 0.04985685, 0.08673591],
310
+ [0.09800301, 0.05302279, 0.09242646],
311
+ [0.10194255, 0.05614641, 0.09813162],
312
+ [0.10587261, 0.05922941, 0.103854],
313
+ [0.1097942, 0.06227277, 0.10959847],
314
+ [0.11370826, 0.06527747, 0.11536893],
315
+ [0.11761516, 0.06824548, 0.12116393],
316
+ [0.12151575, 0.07117741, 0.12698763],
317
+ [0.12541095, 0.07407363, 0.1328442],
318
+ [0.12930083, 0.07693611, 0.13873064],
319
+ [0.13317849, 0.07976988, 0.14465095],
320
+ [0.13701138, 0.08259683, 0.15060265],
321
+ [0.14079223, 0.08542126, 0.15659379],
322
+ [0.14452486, 0.08824175, 0.16262484],
323
+ [0.14820351, 0.09106304, 0.16869476],
324
+ [0.15183185, 0.09388372, 0.17480366],
325
+ [0.15540398, 0.09670855, 0.18094993],
326
+ [0.15892417, 0.09953561, 0.18713384],
327
+ [0.16238588, 0.10236998, 0.19335329],
328
+ [0.16579435, 0.10520905, 0.19960847],
329
+ [0.16914226, 0.10805832, 0.20589698],
330
+ [0.17243586, 0.11091443, 0.21221911],
331
+ [0.17566717, 0.11378321, 0.21857219],
332
+ [0.17884322, 0.11666074, 0.2249565],
333
+ [0.18195582, 0.11955283, 0.23136943],
334
+ [0.18501213, 0.12245547, 0.23781116],
335
+ [0.18800459, 0.12537395, 0.24427914],
336
+ [0.19093944, 0.1283047, 0.25077369],
337
+ [0.19381092, 0.13125179, 0.25729255],
338
+ [0.19662307, 0.13421303, 0.26383543],
339
+ [0.19937337, 0.13719028, 0.27040111],
340
+ [0.20206187, 0.14018372, 0.27698891],
341
+ [0.20469116, 0.14319196, 0.28359861],
342
+ [0.20725547, 0.14621882, 0.29022775],
343
+ [0.20976258, 0.14925954, 0.29687795],
344
+ [0.21220409, 0.15231929, 0.30354703],
345
+ [0.21458611, 0.15539445, 0.31023563],
346
+ [0.21690827, 0.15848519, 0.31694355],
347
+ [0.21916481, 0.16159489, 0.32366939],
348
+ [0.2213631, 0.16471913, 0.33041431],
349
+ [0.22349947, 0.1678599, 0.33717781],
350
+ [0.2255714, 0.1710185, 0.34395925],
351
+ [0.22758415, 0.17419169, 0.35075983],
352
+ [0.22953569, 0.17738041, 0.35757941],
353
+ [0.23142077, 0.18058733, 0.3644173],
354
+ [0.2332454, 0.18380872, 0.37127514],
355
+ [0.2350092, 0.18704459, 0.3781528],
356
+ [0.23670785, 0.190297, 0.38504973],
357
+ [0.23834119, 0.19356547, 0.39196711],
358
+ [0.23991189, 0.19684817, 0.39890581],
359
+ [0.24141903, 0.20014508, 0.4058667],
360
+ [0.24286214, 0.20345642, 0.4128484],
361
+ [0.24423453, 0.20678459, 0.41985299],
362
+ [0.24554109, 0.21012669, 0.42688124],
363
+ [0.2467815, 0.21348266, 0.43393244],
364
+ [0.24795393, 0.21685249, 0.4410088],
365
+ [0.24905614, 0.22023618, 0.448113],
366
+ [0.25007383, 0.22365053, 0.45519562],
367
+ [0.25098926, 0.22710664, 0.46223892],
368
+ [0.25179696, 0.23060342, 0.46925447],
369
+ [0.25249346, 0.23414353, 0.47623196],
370
+ [0.25307401, 0.23772973, 0.48316271],
371
+ [0.25353152, 0.24136961, 0.49001976],
372
+ [0.25386167, 0.24506548, 0.49679407],
373
+ [0.25406082, 0.2488164, 0.50348932],
374
+ [0.25412435, 0.25262843, 0.51007843],
375
+ [0.25404842, 0.25650743, 0.51653282],
376
+ [0.25383134, 0.26044852, 0.52286845],
377
+ [0.2534705, 0.26446165, 0.52903422],
378
+ [0.25296722, 0.2685428, 0.53503572],
379
+ [0.2523226, 0.27269346, 0.54085315],
380
+ [0.25153974, 0.27691629, 0.54645752],
381
+ [0.25062402, 0.28120467, 0.55185939],
382
+ [0.24958205, 0.28556371, 0.55701246],
383
+ [0.24842386, 0.28998148, 0.56194601],
384
+ [0.24715928, 0.29446327, 0.56660884],
385
+ [0.24580099, 0.29899398, 0.57104399],
386
+ [0.24436202, 0.30357852, 0.57519929],
387
+ [0.24285591, 0.30819938, 0.57913247],
388
+ [0.24129828, 0.31286235, 0.58278615],
389
+ [0.23970131, 0.3175495, 0.5862272],
390
+ [0.23807973, 0.32226344, 0.58941872],
391
+ [0.23644557, 0.32699241, 0.59240198],
392
+ [0.2348113, 0.33173196, 0.59518282],
393
+ [0.23318874, 0.33648036, 0.59775543],
394
+ [0.2315855, 0.34122763, 0.60016456],
395
+ [0.23001121, 0.34597357, 0.60240251],
396
+ [0.2284748, 0.35071512, 0.6044784],
397
+ [0.22698081, 0.35544612, 0.60642528],
398
+ [0.22553305, 0.36016515, 0.60825252],
399
+ [0.22413977, 0.36487341, 0.60994938],
400
+ [0.22280246, 0.36956728, 0.61154118],
401
+ [0.22152555, 0.37424409, 0.61304472],
402
+ [0.22030752, 0.37890437, 0.61446646],
403
+ [0.2191538, 0.38354668, 0.61581561],
404
+ [0.21806257, 0.38817169, 0.61709794],
405
+ [0.21703799, 0.39277882, 0.61831922],
406
+ [0.21607792, 0.39736958, 0.61948028],
407
+ [0.21518463, 0.40194196, 0.62059763],
408
+ [0.21435467, 0.40649717, 0.62167507],
409
+ [0.21358663, 0.41103579, 0.62271724],
410
+ [0.21288172, 0.41555771, 0.62373011],
411
+ [0.21223835, 0.42006355, 0.62471794],
412
+ [0.21165312, 0.42455441, 0.62568371],
413
+ [0.21112526, 0.42903064, 0.6266318],
414
+ [0.21065161, 0.43349321, 0.62756504],
415
+ [0.21023306, 0.43794288, 0.62848279],
416
+ [0.20985996, 0.44238227, 0.62938329],
417
+ [0.20951045, 0.44680966, 0.63030696],
418
+ [0.20916709, 0.45122981, 0.63124483],
419
+ [0.20882976, 0.45564335, 0.63219599],
420
+ [0.20849798, 0.46005094, 0.63315928],
421
+ [0.20817199, 0.46445309, 0.63413391],
422
+ [0.20785149, 0.46885041, 0.63511876],
423
+ [0.20753716, 0.47324327, 0.63611321],
424
+ [0.20722876, 0.47763224, 0.63711608],
425
+ [0.20692679, 0.48201774, 0.63812656],
426
+ [0.20663156, 0.48640018, 0.63914367],
427
+ [0.20634336, 0.49078002, 0.64016638],
428
+ [0.20606303, 0.49515755, 0.6411939],
429
+ [0.20578999, 0.49953341, 0.64222457],
430
+ [0.20552612, 0.50390766, 0.64325811],
431
+ [0.20527189, 0.50828072, 0.64429331],
432
+ [0.20502868, 0.51265277, 0.64532947],
433
+ [0.20479718, 0.51702417, 0.64636539],
434
+ [0.20457804, 0.52139527, 0.64739979],
435
+ [0.20437304, 0.52576622, 0.64843198],
436
+ [0.20418396, 0.53013715, 0.64946117],
437
+ [0.20401238, 0.53450825, 0.65048638],
438
+ [0.20385896, 0.53887991, 0.65150606],
439
+ [0.20372653, 0.54325208, 0.65251978],
440
+ [0.20361709, 0.5476249, 0.6535266],
441
+ [0.20353258, 0.55199854, 0.65452542],
442
+ [0.20347472, 0.55637318, 0.655515],
443
+ [0.20344718, 0.56074869, 0.65649508],
444
+ [0.20345161, 0.56512531, 0.65746419],
445
+ [0.20349089, 0.56950304, 0.65842151],
446
+ [0.20356842, 0.57388184, 0.65936642],
447
+ [0.20368663, 0.57826181, 0.66029768],
448
+ [0.20384884, 0.58264293, 0.6612145],
449
+ [0.20405904, 0.58702506, 0.66211645],
450
+ [0.20431921, 0.59140842, 0.66300179],
451
+ [0.20463464, 0.59579264, 0.66387079],
452
+ [0.20500731, 0.60017798, 0.66472159],
453
+ [0.20544449, 0.60456387, 0.66555409],
454
+ [0.20596097, 0.60894927, 0.66636568],
455
+ [0.20654832, 0.61333521, 0.66715744],
456
+ [0.20721003, 0.61772167, 0.66792838],
457
+ [0.20795035, 0.62210845, 0.66867802],
458
+ [0.20877302, 0.62649546, 0.66940555],
459
+ [0.20968223, 0.63088252, 0.6701105],
460
+ [0.21068163, 0.63526951, 0.67079211],
461
+ [0.21177544, 0.63965621, 0.67145005],
462
+ [0.21298582, 0.64404072, 0.67208182],
463
+ [0.21430361, 0.64842404, 0.67268861],
464
+ [0.21572716, 0.65280655, 0.67326978],
465
+ [0.21726052, 0.65718791, 0.6738255],
466
+ [0.21890636, 0.66156803, 0.67435491],
467
+ [0.220668, 0.66594665, 0.67485792],
468
+ [0.22255447, 0.67032297, 0.67533374],
469
+ [0.22458372, 0.67469531, 0.67578061],
470
+ [0.22673713, 0.67906542, 0.67620044],
471
+ [0.22901625, 0.6834332, 0.67659251],
472
+ [0.23142316, 0.68779836, 0.67695703],
473
+ [0.23395924, 0.69216072, 0.67729378],
474
+ [0.23663857, 0.69651881, 0.67760151],
475
+ [0.23946645, 0.70087194, 0.67788018],
476
+ [0.24242624, 0.70522162, 0.67813088],
477
+ [0.24549008, 0.70957083, 0.67835215],
478
+ [0.24863372, 0.71392166, 0.67854868],
479
+ [0.25187832, 0.71827158, 0.67872193],
480
+ [0.25524083, 0.72261873, 0.67887024],
481
+ [0.25870947, 0.72696469, 0.67898912],
482
+ [0.26229238, 0.73130855, 0.67907645],
483
+ [0.26604085, 0.73564353, 0.67914062],
484
+ [0.26993099, 0.73997282, 0.67917264],
485
+ [0.27397488, 0.74429484, 0.67917096],
486
+ [0.27822463, 0.74860229, 0.67914468],
487
+ [0.28264201, 0.75290034, 0.67907959],
488
+ [0.2873016, 0.75717817, 0.67899164],
489
+ [0.29215894, 0.76144162, 0.67886578],
490
+ [0.29729823, 0.76567816, 0.67871894],
491
+ [0.30268199, 0.76989232, 0.67853896],
492
+ [0.30835665, 0.77407636, 0.67833512],
493
+ [0.31435139, 0.77822478, 0.67811118],
494
+ [0.3206671, 0.78233575, 0.67786729],
495
+ [0.32733158, 0.78640315, 0.67761027],
496
+ [0.33437168, 0.79042043, 0.67734882],
497
+ [0.34182112, 0.79437948, 0.67709394],
498
+ [0.34968889, 0.79827511, 0.67685638],
499
+ [0.35799244, 0.80210037, 0.67664969],
500
+ [0.36675371, 0.80584651, 0.67649539],
501
+ [0.3759816, 0.80950627, 0.67641393],
502
+ [0.38566792, 0.81307432, 0.67642947],
503
+ [0.39579804, 0.81654592, 0.67656899],
504
+ [0.40634556, 0.81991799, 0.67686215],
505
+ [0.41730243, 0.82318339, 0.67735255],
506
+ [0.4285828, 0.82635051, 0.6780564],
507
+ [0.44012728, 0.82942353, 0.67900049],
508
+ [0.45189421, 0.83240398, 0.68021733],
509
+ [0.46378379, 0.83530763, 0.6817062],
510
+ [0.47573199, 0.83814472, 0.68347352],
511
+ [0.48769865, 0.84092197, 0.68552698],
512
+ [0.49962354, 0.84365379, 0.68783929],
513
+ [0.5114027, 0.8463718, 0.69029789],
514
+ [0.52301693, 0.84908401, 0.69288545],
515
+ [0.53447549, 0.85179048, 0.69561066],
516
+ [0.54578602, 0.8544913, 0.69848331],
517
+ [0.55695565, 0.85718723, 0.70150427],
518
+ [0.56798832, 0.85987893, 0.70468261],
519
+ [0.57888639, 0.86256715, 0.70802931],
520
+ [0.5896541, 0.8652532, 0.71154204],
521
+ [0.60028928, 0.86793835, 0.71523675],
522
+ [0.61079441, 0.87062438, 0.71910895],
523
+ [0.62116633, 0.87331311, 0.72317003],
524
+ [0.63140509, 0.87600675, 0.72741689],
525
+ [0.64150735, 0.87870746, 0.73185717],
526
+ [0.65147219, 0.8814179, 0.73648495],
527
+ [0.66129632, 0.8841403, 0.74130658],
528
+ [0.67097934, 0.88687758, 0.74631123],
529
+ [0.68051833, 0.88963189, 0.75150483],
530
+ [0.68991419, 0.89240612, 0.75687187],
531
+ [0.69916533, 0.89520211, 0.76241714],
532
+ [0.70827373, 0.89802257, 0.76812286],
533
+ [0.71723995, 0.90086891, 0.77399039],
534
+ [0.72606665, 0.90374337, 0.7800041],
535
+ [0.73475675, 0.90664718, 0.78615802],
536
+ [0.74331358, 0.90958151, 0.79244474],
537
+ [0.75174143, 0.91254787, 0.79884925],
538
+ [0.76004473, 0.91554656, 0.80536823],
539
+ [0.76827704, 0.91856549, 0.81196513],
540
+ [0.77647029, 0.921603, 0.81855729],
541
+ [0.78462009, 0.92466151, 0.82514119],
542
+ [0.79273542, 0.92773848, 0.83172131],
543
+ [0.8008109, 0.93083672, 0.83829355],
544
+ [0.80885107, 0.93395528, 0.84485982],
545
+ [0.81685878, 0.9370938, 0.85142101],
546
+ [0.82483206, 0.94025378, 0.8579751],
547
+ [0.83277661, 0.94343371, 0.86452477],
548
+ [0.84069127, 0.94663473, 0.87106853],
549
+ [0.84857662, 0.9498573, 0.8776059],
550
+ [0.8564431, 0.95309792, 0.88414253],
551
+ [0.86429066, 0.95635719, 0.89067759],
552
+ [0.87218969, 0.95960708, 0.89725384]
553
+ ]
554
+
555
+
556
+ _vlag_lut = [
557
+ [0.13850039, 0.41331206, 0.74052025],
558
+ [0.15077609, 0.41762684, 0.73970427],
559
+ [0.16235219, 0.4219191, 0.7389667],
560
+ [0.1733322, 0.42619024, 0.73832537],
561
+ [0.18382538, 0.43044226, 0.73776764],
562
+ [0.19394034, 0.4346772, 0.73725867],
563
+ [0.20367115, 0.43889576, 0.73685314],
564
+ [0.21313625, 0.44310003, 0.73648045],
565
+ [0.22231173, 0.44729079, 0.73619681],
566
+ [0.23125148, 0.45146945, 0.73597803],
567
+ [0.23998101, 0.45563715, 0.7358223],
568
+ [0.24853358, 0.45979489, 0.73571524],
569
+ [0.25691416, 0.4639437, 0.73566943],
570
+ [0.26513894, 0.46808455, 0.73568319],
571
+ [0.27322194, 0.47221835, 0.73575497],
572
+ [0.28117543, 0.47634598, 0.73588332],
573
+ [0.28901021, 0.48046826, 0.73606686],
574
+ [0.2967358, 0.48458597, 0.73630433],
575
+ [0.30436071, 0.48869986, 0.73659451],
576
+ [0.3118955, 0.49281055, 0.73693255],
577
+ [0.31935389, 0.49691847, 0.73730851],
578
+ [0.32672701, 0.5010247, 0.73774013],
579
+ [0.33402607, 0.50512971, 0.73821941],
580
+ [0.34125337, 0.50923419, 0.73874905],
581
+ [0.34840921, 0.51333892, 0.73933402],
582
+ [0.35551826, 0.51744353, 0.73994642],
583
+ [0.3625676, 0.52154929, 0.74060763],
584
+ [0.36956356, 0.52565656, 0.74131327],
585
+ [0.37649902, 0.52976642, 0.74207698],
586
+ [0.38340273, 0.53387791, 0.74286286],
587
+ [0.39025859, 0.53799253, 0.7436962],
588
+ [0.39706821, 0.54211081, 0.744578],
589
+ [0.40384046, 0.54623277, 0.74549872],
590
+ [0.41058241, 0.55035849, 0.74645094],
591
+ [0.41728385, 0.55448919, 0.74745174],
592
+ [0.42395178, 0.55862494, 0.74849357],
593
+ [0.4305964, 0.56276546, 0.74956387],
594
+ [0.4372044, 0.56691228, 0.75068412],
595
+ [0.4437909, 0.57106468, 0.75183427],
596
+ [0.45035117, 0.5752235, 0.75302312],
597
+ [0.45687824, 0.57938983, 0.75426297],
598
+ [0.46339713, 0.58356191, 0.75551816],
599
+ [0.46988778, 0.58774195, 0.75682037],
600
+ [0.47635605, 0.59192986, 0.75816245],
601
+ [0.48281101, 0.5961252, 0.75953212],
602
+ [0.4892374, 0.60032986, 0.76095418],
603
+ [0.49566225, 0.60454154, 0.76238852],
604
+ [0.50206137, 0.60876307, 0.76387371],
605
+ [0.50845128, 0.61299312, 0.76538551],
606
+ [0.5148258, 0.61723272, 0.76693475],
607
+ [0.52118385, 0.62148236, 0.76852436],
608
+ [0.52753571, 0.62574126, 0.77013939],
609
+ [0.53386831, 0.63001125, 0.77180152],
610
+ [0.54020159, 0.63429038, 0.7734803],
611
+ [0.54651272, 0.63858165, 0.77521306],
612
+ [0.55282975, 0.64288207, 0.77695608],
613
+ [0.55912585, 0.64719519, 0.77875327],
614
+ [0.56542599, 0.65151828, 0.78056551],
615
+ [0.57170924, 0.65585426, 0.78242747],
616
+ [0.57799572, 0.6602009, 0.78430751],
617
+ [0.58426817, 0.66456073, 0.78623458],
618
+ [0.590544, 0.66893178, 0.78818117],
619
+ [0.59680758, 0.67331643, 0.79017369],
620
+ [0.60307553, 0.67771273, 0.79218572],
621
+ [0.60934065, 0.68212194, 0.79422987],
622
+ [0.61559495, 0.68654548, 0.7963202],
623
+ [0.62185554, 0.69098125, 0.79842918],
624
+ [0.62810662, 0.69543176, 0.80058381],
625
+ [0.63436425, 0.69989499, 0.80275812],
626
+ [0.64061445, 0.70437326, 0.80497621],
627
+ [0.6468706, 0.70886488, 0.80721641],
628
+ [0.65312213, 0.7133717, 0.80949719],
629
+ [0.65937818, 0.71789261, 0.81180392],
630
+ [0.66563334, 0.72242871, 0.81414642],
631
+ [0.67189155, 0.72697967, 0.81651872],
632
+ [0.67815314, 0.73154569, 0.81892097],
633
+ [0.68441395, 0.73612771, 0.82136094],
634
+ [0.69068321, 0.74072452, 0.82382353],
635
+ [0.69694776, 0.7453385, 0.82633199],
636
+ [0.70322431, 0.74996721, 0.8288583],
637
+ [0.70949595, 0.75461368, 0.83143221],
638
+ [0.7157774, 0.75927574, 0.83402904],
639
+ [0.72206299, 0.76395461, 0.83665922],
640
+ [0.72835227, 0.76865061, 0.8393242],
641
+ [0.73465238, 0.7733628, 0.84201224],
642
+ [0.74094862, 0.77809393, 0.84474951],
643
+ [0.74725683, 0.78284158, 0.84750915],
644
+ [0.75357103, 0.78760701, 0.85030217],
645
+ [0.75988961, 0.79239077, 0.85313207],
646
+ [0.76621987, 0.79719185, 0.85598668],
647
+ [0.77255045, 0.8020125, 0.85888658],
648
+ [0.77889241, 0.80685102, 0.86181298],
649
+ [0.78524572, 0.81170768, 0.86476656],
650
+ [0.79159841, 0.81658489, 0.86776906],
651
+ [0.79796459, 0.82148036, 0.8707962],
652
+ [0.80434168, 0.82639479, 0.87385315],
653
+ [0.8107221, 0.83132983, 0.87695392],
654
+ [0.81711301, 0.8362844, 0.88008641],
655
+ [0.82351479, 0.84125863, 0.88325045],
656
+ [0.82992772, 0.84625263, 0.88644594],
657
+ [0.83634359, 0.85126806, 0.8896878],
658
+ [0.84277295, 0.85630293, 0.89295721],
659
+ [0.84921192, 0.86135782, 0.89626076],
660
+ [0.85566206, 0.866432, 0.89959467],
661
+ [0.86211514, 0.87152627, 0.90297183],
662
+ [0.86857483, 0.87663856, 0.90638248],
663
+ [0.87504231, 0.88176648, 0.90981938],
664
+ [0.88151194, 0.88690782, 0.91328493],
665
+ [0.88797938, 0.89205857, 0.91677544],
666
+ [0.89443865, 0.89721298, 0.9202854],
667
+ [0.90088204, 0.90236294, 0.92380601],
668
+ [0.90729768, 0.90749778, 0.92732797],
669
+ [0.91367037, 0.91260329, 0.93083814],
670
+ [0.91998105, 0.91766106, 0.93431861],
671
+ [0.92620596, 0.92264789, 0.93774647],
672
+ [0.93231683, 0.9275351, 0.94109192],
673
+ [0.93827772, 0.9322888, 0.94432312],
674
+ [0.94404755, 0.93686925, 0.94740137],
675
+ [0.94958284, 0.94123072, 0.95027696],
676
+ [0.95482682, 0.9453245, 0.95291103],
677
+ [0.9597248, 0.94909728, 0.95525103],
678
+ [0.96422552, 0.95249273, 0.95723271],
679
+ [0.96826161, 0.95545812, 0.95882188],
680
+ [0.97178458, 0.95793984, 0.95995705],
681
+ [0.97474105, 0.95989142, 0.96059997],
682
+ [0.97708604, 0.96127366, 0.96071853],
683
+ [0.97877855, 0.96205832, 0.96030095],
684
+ [0.97978484, 0.96222949, 0.95935496],
685
+ [0.9805997, 0.96155216, 0.95813083],
686
+ [0.98152619, 0.95993719, 0.95639322],
687
+ [0.9819726, 0.95766608, 0.95399269],
688
+ [0.98191855, 0.9547873, 0.95098107],
689
+ [0.98138514, 0.95134771, 0.94740644],
690
+ [0.98040845, 0.94739906, 0.94332125],
691
+ [0.97902107, 0.94300131, 0.93878672],
692
+ [0.97729348, 0.93820409, 0.93385135],
693
+ [0.9752533, 0.933073, 0.92858252],
694
+ [0.97297834, 0.92765261, 0.92302309],
695
+ [0.97049104, 0.92200317, 0.91723505],
696
+ [0.96784372, 0.91616744, 0.91126063],
697
+ [0.96507281, 0.91018664, 0.90514124],
698
+ [0.96222034, 0.90409203, 0.89890756],
699
+ [0.9593079, 0.89791478, 0.89259122],
700
+ [0.95635626, 0.89167908, 0.88621654],
701
+ [0.95338303, 0.88540373, 0.87980238],
702
+ [0.95040174, 0.87910333, 0.87336339],
703
+ [0.94742246, 0.87278899, 0.86691076],
704
+ [0.94445249, 0.86646893, 0.86045277],
705
+ [0.94150476, 0.86014606, 0.85399191],
706
+ [0.93857394, 0.85382798, 0.84753642],
707
+ [0.93566206, 0.84751766, 0.84108935],
708
+ [0.93277194, 0.8412164, 0.83465197],
709
+ [0.92990106, 0.83492672, 0.82822708],
710
+ [0.92704736, 0.82865028, 0.82181656],
711
+ [0.92422703, 0.82238092, 0.81541333],
712
+ [0.92142581, 0.81612448, 0.80902415],
713
+ [0.91864501, 0.80988032, 0.80264838],
714
+ [0.91587578, 0.80365187, 0.79629001],
715
+ [0.9131367, 0.79743115, 0.78994],
716
+ [0.91041602, 0.79122265, 0.78360361],
717
+ [0.90771071, 0.78502727, 0.77728196],
718
+ [0.90501581, 0.77884674, 0.7709771],
719
+ [0.90235365, 0.77267117, 0.76467793],
720
+ [0.8997019, 0.76650962, 0.75839484],
721
+ [0.89705346, 0.76036481, 0.752131],
722
+ [0.89444021, 0.75422253, 0.74587047],
723
+ [0.89183355, 0.74809474, 0.73962689],
724
+ [0.88923216, 0.74198168, 0.73340061],
725
+ [0.88665892, 0.73587283, 0.72717995],
726
+ [0.88408839, 0.72977904, 0.72097718],
727
+ [0.88153537, 0.72369332, 0.71478461],
728
+ [0.87899389, 0.7176179, 0.70860487],
729
+ [0.87645157, 0.71155805, 0.7024439],
730
+ [0.8739399, 0.70549893, 0.6962854],
731
+ [0.87142626, 0.6994551, 0.69014561],
732
+ [0.8689268, 0.69341868, 0.68401597],
733
+ [0.86643562, 0.687392, 0.67789917],
734
+ [0.86394434, 0.68137863, 0.67179927],
735
+ [0.86147586, 0.67536728, 0.665704],
736
+ [0.85899928, 0.66937226, 0.6596292],
737
+ [0.85654668, 0.66337773, 0.6535577],
738
+ [0.85408818, 0.65739772, 0.64750494],
739
+ [0.85164413, 0.65142189, 0.64145983],
740
+ [0.84920091, 0.6454565, 0.63542932],
741
+ [0.84676427, 0.63949827, 0.62941],
742
+ [0.84433231, 0.63354773, 0.62340261],
743
+ [0.84190106, 0.62760645, 0.61740899],
744
+ [0.83947935, 0.62166951, 0.61142404],
745
+ [0.8370538, 0.61574332, 0.60545478],
746
+ [0.83463975, 0.60981951, 0.59949247],
747
+ [0.83221877, 0.60390724, 0.593547],
748
+ [0.82980985, 0.59799607, 0.58760751],
749
+ [0.82740268, 0.59209095, 0.58167944],
750
+ [0.82498638, 0.5861973, 0.57576866],
751
+ [0.82258181, 0.5803034, 0.56986307],
752
+ [0.82016611, 0.57442123, 0.56397539],
753
+ [0.81776305, 0.56853725, 0.55809173],
754
+ [0.81534551, 0.56266602, 0.55222741],
755
+ [0.81294293, 0.55679056, 0.5463651],
756
+ [0.81052113, 0.55092973, 0.54052443],
757
+ [0.80811509, 0.54506305, 0.53468464],
758
+ [0.80568952, 0.53921036, 0.52886622],
759
+ [0.80327506, 0.53335335, 0.52305077],
760
+ [0.80084727, 0.52750583, 0.51725256],
761
+ [0.79842217, 0.5216578, 0.51146173],
762
+ [0.79599382, 0.51581223, 0.50568155],
763
+ [0.79355781, 0.50997127, 0.49991444],
764
+ [0.79112596, 0.50412707, 0.49415289],
765
+ [0.78867442, 0.49829386, 0.48841129],
766
+ [0.7862306, 0.49245398, 0.48267247],
767
+ [0.7837687, 0.48662309, 0.47695216],
768
+ [0.78130809, 0.4807883, 0.47123805],
769
+ [0.77884467, 0.47495151, 0.46553236],
770
+ [0.77636283, 0.46912235, 0.45984473],
771
+ [0.77388383, 0.46328617, 0.45416141],
772
+ [0.77138912, 0.45745466, 0.44849398],
773
+ [0.76888874, 0.45162042, 0.44283573],
774
+ [0.76638802, 0.44577901, 0.43718292],
775
+ [0.76386116, 0.43994762, 0.43155211],
776
+ [0.76133542, 0.43410655, 0.42592523],
777
+ [0.75880631, 0.42825801, 0.42030488],
778
+ [0.75624913, 0.42241905, 0.41470727],
779
+ [0.7536919, 0.41656866, 0.40911347],
780
+ [0.75112748, 0.41071104, 0.40352792],
781
+ [0.74854331, 0.40485474, 0.3979589],
782
+ [0.74594723, 0.39899309, 0.39240088],
783
+ [0.74334332, 0.39312199, 0.38685075],
784
+ [0.74073277, 0.38723941, 0.3813074],
785
+ [0.73809409, 0.38136133, 0.37578553],
786
+ [0.73544692, 0.37547129, 0.37027123],
787
+ [0.73278943, 0.36956954, 0.36476549],
788
+ [0.73011829, 0.36365761, 0.35927038],
789
+ [0.72743485, 0.35773314, 0.35378465],
790
+ [0.72472722, 0.35180504, 0.34831662],
791
+ [0.72200473, 0.34586421, 0.34285937],
792
+ [0.71927052, 0.33990649, 0.33741033],
793
+ [0.71652049, 0.33393396, 0.33197219],
794
+ [0.71375362, 0.32794602, 0.32654545],
795
+ [0.71096951, 0.32194148, 0.32113016],
796
+ [0.70816772, 0.31591904, 0.31572637],
797
+ [0.70534784, 0.30987734, 0.31033414],
798
+ [0.70250944, 0.30381489, 0.30495353],
799
+ [0.69965211, 0.2977301, 0.2995846],
800
+ [0.6967754, 0.29162126, 0.29422741],
801
+ [0.69388446, 0.28548074, 0.28887769],
802
+ [0.69097561, 0.2793096, 0.28353795],
803
+ [0.68803513, 0.27311993, 0.27821876],
804
+ [0.6850794, 0.26689144, 0.27290694],
805
+ [0.682108, 0.26062114, 0.26760246],
806
+ [0.67911013, 0.2543177, 0.26231367],
807
+ [0.67609393, 0.24796818, 0.25703372],
808
+ [0.67305921, 0.24156846, 0.25176238],
809
+ [0.67000176, 0.23511902, 0.24650278],
810
+ [0.66693423, 0.22859879, 0.24124404],
811
+ [0.6638441, 0.22201742, 0.2359961],
812
+ [0.66080672, 0.21526712, 0.23069468]
813
+ ]
814
+
815
+
816
+ _icefire_lut = [
817
+ [0.73936227, 0.90443867, 0.85757238],
818
+ [0.72888063, 0.89639109, 0.85488394],
819
+ [0.71834255, 0.88842162, 0.8521605],
820
+ [0.70773866, 0.88052939, 0.849422],
821
+ [0.69706215, 0.87271313, 0.84668315],
822
+ [0.68629021, 0.86497329, 0.84398721],
823
+ [0.67543654, 0.85730617, 0.84130969],
824
+ [0.66448539, 0.84971123, 0.83868005],
825
+ [0.65342679, 0.84218728, 0.83611512],
826
+ [0.64231804, 0.83471867, 0.83358584],
827
+ [0.63117745, 0.827294, 0.83113431],
828
+ [0.62000484, 0.81991069, 0.82876741],
829
+ [0.60879435, 0.81256797, 0.82648905],
830
+ [0.59754118, 0.80526458, 0.82430414],
831
+ [0.58624247, 0.79799884, 0.82221573],
832
+ [0.57489525, 0.7907688, 0.82022901],
833
+ [0.56349779, 0.78357215, 0.81834861],
834
+ [0.55204294, 0.77640827, 0.81657563],
835
+ [0.54052516, 0.76927562, 0.81491462],
836
+ [0.52894085, 0.76217215, 0.81336913],
837
+ [0.51728854, 0.75509528, 0.81194156],
838
+ [0.50555676, 0.74804469, 0.81063503],
839
+ [0.49373871, 0.7410187, 0.80945242],
840
+ [0.48183174, 0.73401449, 0.80839675],
841
+ [0.46982587, 0.72703075, 0.80747097],
842
+ [0.45770893, 0.72006648, 0.80667756],
843
+ [0.44547249, 0.71311941, 0.80601991],
844
+ [0.43318643, 0.70617126, 0.80549278],
845
+ [0.42110294, 0.69916972, 0.80506683],
846
+ [0.40925101, 0.69211059, 0.80473246],
847
+ [0.3976693, 0.68498786, 0.80448272],
848
+ [0.38632002, 0.67781125, 0.80431024],
849
+ [0.37523981, 0.67057537, 0.80420832],
850
+ [0.36442578, 0.66328229, 0.80417474],
851
+ [0.35385939, 0.65593699, 0.80420591],
852
+ [0.34358916, 0.64853177, 0.8043],
853
+ [0.33355526, 0.64107876, 0.80445484],
854
+ [0.32383062, 0.63356578, 0.80467091],
855
+ [0.31434372, 0.62600624, 0.8049475],
856
+ [0.30516161, 0.618389, 0.80528692],
857
+ [0.29623491, 0.61072284, 0.80569021],
858
+ [0.28759072, 0.60300319, 0.80616055],
859
+ [0.27923924, 0.59522877, 0.80669803],
860
+ [0.27114651, 0.5874047, 0.80730545],
861
+ [0.26337153, 0.57952055, 0.80799113],
862
+ [0.25588696, 0.57157984, 0.80875922],
863
+ [0.248686, 0.56358255, 0.80961366],
864
+ [0.24180668, 0.55552289, 0.81055123],
865
+ [0.23526251, 0.54739477, 0.8115939],
866
+ [0.22921445, 0.53918506, 0.81267292],
867
+ [0.22397687, 0.53086094, 0.8137141],
868
+ [0.21977058, 0.52241482, 0.81457651],
869
+ [0.21658989, 0.51384321, 0.81528511],
870
+ [0.21452772, 0.50514155, 0.81577278],
871
+ [0.21372783, 0.49630865, 0.81589566],
872
+ [0.21409503, 0.48734861, 0.81566163],
873
+ [0.2157176, 0.47827123, 0.81487615],
874
+ [0.21842857, 0.46909168, 0.81351614],
875
+ [0.22211705, 0.45983212, 0.81146983],
876
+ [0.22665681, 0.45052233, 0.80860217],
877
+ [0.23176013, 0.44119137, 0.80494325],
878
+ [0.23727775, 0.43187704, 0.80038017],
879
+ [0.24298285, 0.42261123, 0.79493267],
880
+ [0.24865068, 0.41341842, 0.78869164],
881
+ [0.25423116, 0.40433127, 0.78155831],
882
+ [0.25950239, 0.39535521, 0.77376848],
883
+ [0.2644736, 0.38651212, 0.76524809],
884
+ [0.26901584, 0.37779582, 0.75621942],
885
+ [0.27318141, 0.36922056, 0.746605],
886
+ [0.27690355, 0.3607736, 0.73659374],
887
+ [0.28023585, 0.35244234, 0.72622103],
888
+ [0.28306009, 0.34438449, 0.71500731],
889
+ [0.28535896, 0.33660243, 0.70303975],
890
+ [0.28708711, 0.32912157, 0.69034504],
891
+ [0.28816354, 0.32200604, 0.67684067],
892
+ [0.28862749, 0.31519824, 0.66278813],
893
+ [0.28847904, 0.30869064, 0.6482815],
894
+ [0.28770912, 0.30250126, 0.63331265],
895
+ [0.28640325, 0.29655509, 0.61811374],
896
+ [0.28458943, 0.29082155, 0.60280913],
897
+ [0.28233561, 0.28527482, 0.58742866],
898
+ [0.27967038, 0.2798938, 0.57204225],
899
+ [0.27665361, 0.27465357, 0.55667809],
900
+ [0.27332564, 0.2695165, 0.54145387],
901
+ [0.26973851, 0.26447054, 0.52634916],
902
+ [0.2659204, 0.25949691, 0.511417],
903
+ [0.26190145, 0.25458123, 0.49668768],
904
+ [0.2577151, 0.24971691, 0.48214874],
905
+ [0.25337618, 0.24490494, 0.46778758],
906
+ [0.24890842, 0.24013332, 0.45363816],
907
+ [0.24433654, 0.23539226, 0.4397245],
908
+ [0.23967922, 0.23067729, 0.4260591],
909
+ [0.23495608, 0.22598894, 0.41262952],
910
+ [0.23018113, 0.22132414, 0.39945577],
911
+ [0.22534609, 0.21670847, 0.38645794],
912
+ [0.22048761, 0.21211723, 0.37372555],
913
+ [0.2156198, 0.20755389, 0.36125301],
914
+ [0.21074637, 0.20302717, 0.34903192],
915
+ [0.20586893, 0.19855368, 0.33701661],
916
+ [0.20101757, 0.19411573, 0.32529173],
917
+ [0.19619947, 0.18972425, 0.31383846],
918
+ [0.19140726, 0.18540157, 0.30260777],
919
+ [0.1866769, 0.1811332, 0.29166583],
920
+ [0.18201285, 0.17694992, 0.28088776],
921
+ [0.17745228, 0.17282141, 0.27044211],
922
+ [0.17300684, 0.16876921, 0.26024893],
923
+ [0.16868273, 0.16479861, 0.25034479],
924
+ [0.16448691, 0.16091728, 0.24075373],
925
+ [0.16043195, 0.15714351, 0.23141745],
926
+ [0.15652427, 0.15348248, 0.22238175],
927
+ [0.15277065, 0.14994111, 0.21368395],
928
+ [0.14918274, 0.14653431, 0.20529486],
929
+ [0.14577095, 0.14327403, 0.19720829],
930
+ [0.14254381, 0.14016944, 0.18944326],
931
+ [0.13951035, 0.13723063, 0.18201072],
932
+ [0.13667798, 0.13446606, 0.17493774],
933
+ [0.13405762, 0.13188822, 0.16820842],
934
+ [0.13165767, 0.12950667, 0.16183275],
935
+ [0.12948748, 0.12733187, 0.15580631],
936
+ [0.12755435, 0.1253723, 0.15014098],
937
+ [0.12586516, 0.12363617, 0.1448459],
938
+ [0.12442647, 0.12213143, 0.13992571],
939
+ [0.12324241, 0.12086419, 0.13539995],
940
+ [0.12232067, 0.11984278, 0.13124644],
941
+ [0.12166209, 0.11907077, 0.12749671],
942
+ [0.12126982, 0.11855309, 0.12415079],
943
+ [0.12114244, 0.11829179, 0.1212385],
944
+ [0.12127766, 0.11828837, 0.11878534],
945
+ [0.12284806, 0.1179729, 0.11772022],
946
+ [0.12619498, 0.11721796, 0.11770203],
947
+ [0.129968, 0.11663788, 0.11792377],
948
+ [0.13410011, 0.11625146, 0.11839138],
949
+ [0.13855459, 0.11606618, 0.11910584],
950
+ [0.14333775, 0.11607038, 0.1200606],
951
+ [0.148417, 0.11626929, 0.12125453],
952
+ [0.15377389, 0.11666192, 0.12268364],
953
+ [0.15941427, 0.11723486, 0.12433911],
954
+ [0.16533376, 0.11797856, 0.12621303],
955
+ [0.17152547, 0.11888403, 0.12829735],
956
+ [0.17797765, 0.11994436, 0.13058435],
957
+ [0.18468769, 0.12114722, 0.13306426],
958
+ [0.19165663, 0.12247737, 0.13572616],
959
+ [0.19884415, 0.12394381, 0.1385669],
960
+ [0.20627181, 0.12551883, 0.14157124],
961
+ [0.21394877, 0.12718055, 0.14472604],
962
+ [0.22184572, 0.12893119, 0.14802579],
963
+ [0.22994394, 0.13076731, 0.15146314],
964
+ [0.23823937, 0.13267611, 0.15502793],
965
+ [0.24676041, 0.13462172, 0.15870321],
966
+ [0.25546457, 0.13661751, 0.16248722],
967
+ [0.26433628, 0.13865956, 0.16637301],
968
+ [0.27341345, 0.14070412, 0.17034221],
969
+ [0.28264773, 0.14277192, 0.1743957],
970
+ [0.29202272, 0.14486161, 0.17852793],
971
+ [0.30159648, 0.14691224, 0.1827169],
972
+ [0.31129002, 0.14897583, 0.18695213],
973
+ [0.32111555, 0.15103351, 0.19119629],
974
+ [0.33107961, 0.1530674, 0.19543758],
975
+ [0.34119892, 0.15504762, 0.1996803],
976
+ [0.35142388, 0.15701131, 0.20389086],
977
+ [0.36178937, 0.1589124, 0.20807639],
978
+ [0.37229381, 0.16073993, 0.21223189],
979
+ [0.38288348, 0.16254006, 0.2163249],
980
+ [0.39359592, 0.16426336, 0.22036577],
981
+ [0.40444332, 0.16588767, 0.22434027],
982
+ [0.41537995, 0.16745325, 0.2282297],
983
+ [0.42640867, 0.16894939, 0.23202755],
984
+ [0.43754706, 0.17034847, 0.23572899],
985
+ [0.44878564, 0.1716535, 0.23932344],
986
+ [0.4601126, 0.17287365, 0.24278607],
987
+ [0.47151732, 0.17401641, 0.24610337],
988
+ [0.48300689, 0.17506676, 0.2492737],
989
+ [0.49458302, 0.17601892, 0.25227688],
990
+ [0.50623876, 0.17687777, 0.255096],
991
+ [0.5179623, 0.17765528, 0.2577162],
992
+ [0.52975234, 0.17835232, 0.2601134],
993
+ [0.54159776, 0.17898292, 0.26226847],
994
+ [0.55348804, 0.17956232, 0.26416003],
995
+ [0.56541729, 0.18010175, 0.26575971],
996
+ [0.57736669, 0.180631, 0.26704888],
997
+ [0.58932081, 0.18117827, 0.26800409],
998
+ [0.60127582, 0.18175888, 0.26858488],
999
+ [0.61319563, 0.1824336, 0.2687872],
1000
+ [0.62506376, 0.18324015, 0.26858301],
1001
+ [0.63681202, 0.18430173, 0.26795276],
1002
+ [0.64842603, 0.18565472, 0.26689463],
1003
+ [0.65988195, 0.18734638, 0.26543435],
1004
+ [0.67111966, 0.18948885, 0.26357955],
1005
+ [0.68209194, 0.19216636, 0.26137175],
1006
+ [0.69281185, 0.19535326, 0.25887063],
1007
+ [0.70335022, 0.19891271, 0.25617971],
1008
+ [0.71375229, 0.20276438, 0.25331365],
1009
+ [0.72401436, 0.20691287, 0.25027366],
1010
+ [0.73407638, 0.21145051, 0.24710661],
1011
+ [0.74396983, 0.21631913, 0.24380715],
1012
+ [0.75361506, 0.22163653, 0.24043996],
1013
+ [0.7630579, 0.22731637, 0.23700095],
1014
+ [0.77222228, 0.23346231, 0.23356628],
1015
+ [0.78115441, 0.23998404, 0.23013825],
1016
+ [0.78979746, 0.24694858, 0.22678822],
1017
+ [0.79819286, 0.25427223, 0.22352658],
1018
+ [0.80630444, 0.26198807, 0.22040877],
1019
+ [0.81417437, 0.27001406, 0.21744645],
1020
+ [0.82177364, 0.27837336, 0.21468316],
1021
+ [0.82915955, 0.28696963, 0.21210766],
1022
+ [0.83628628, 0.2958499, 0.20977813],
1023
+ [0.84322168, 0.30491136, 0.20766435],
1024
+ [0.84995458, 0.31415945, 0.2057863],
1025
+ [0.85648867, 0.32358058, 0.20415327],
1026
+ [0.86286243, 0.33312058, 0.20274969],
1027
+ [0.86908321, 0.34276705, 0.20157271],
1028
+ [0.87512876, 0.3525416, 0.20064949],
1029
+ [0.88100349, 0.36243385, 0.19999078],
1030
+ [0.8866469, 0.37249496, 0.1997976],
1031
+ [0.89203964, 0.38273475, 0.20013431],
1032
+ [0.89713496, 0.39318156, 0.20121514],
1033
+ [0.90195099, 0.40380687, 0.20301555],
1034
+ [0.90648379, 0.41460191, 0.20558847],
1035
+ [0.9106967, 0.42557857, 0.20918529],
1036
+ [0.91463791, 0.43668557, 0.21367954],
1037
+ [0.91830723, 0.44790913, 0.21916352],
1038
+ [0.92171507, 0.45922856, 0.22568002],
1039
+ [0.92491786, 0.4705936, 0.23308207],
1040
+ [0.92790792, 0.48200153, 0.24145932],
1041
+ [0.93073701, 0.49341219, 0.25065486],
1042
+ [0.93343918, 0.5048017, 0.26056148],
1043
+ [0.93602064, 0.51616486, 0.27118485],
1044
+ [0.93850535, 0.52748892, 0.28242464],
1045
+ [0.94092933, 0.53875462, 0.29416042],
1046
+ [0.94330011, 0.5499628, 0.30634189],
1047
+ [0.94563159, 0.56110987, 0.31891624],
1048
+ [0.94792955, 0.57219822, 0.33184256],
1049
+ [0.95020929, 0.5832232, 0.34508419],
1050
+ [0.95247324, 0.59419035, 0.35859866],
1051
+ [0.95471709, 0.60510869, 0.37236035],
1052
+ [0.95698411, 0.61595766, 0.38629631],
1053
+ [0.95923863, 0.62676473, 0.40043317],
1054
+ [0.9615041, 0.6375203, 0.41474106],
1055
+ [0.96371553, 0.64826619, 0.42928335],
1056
+ [0.96591497, 0.65899621, 0.44380444],
1057
+ [0.96809871, 0.66971662, 0.45830232],
1058
+ [0.9702495, 0.6804394, 0.47280492],
1059
+ [0.9723881, 0.69115622, 0.48729272],
1060
+ [0.97450723, 0.70187358, 0.50178034],
1061
+ [0.9766108, 0.712592, 0.51626837],
1062
+ [0.97871716, 0.72330511, 0.53074053],
1063
+ [0.98082222, 0.73401769, 0.54520694],
1064
+ [0.9829001, 0.74474445, 0.5597019],
1065
+ [0.98497466, 0.75547635, 0.57420239],
1066
+ [0.98705581, 0.76621129, 0.58870185],
1067
+ [0.98913325, 0.77695637, 0.60321626],
1068
+ [0.99119918, 0.78771716, 0.61775821],
1069
+ [0.9932672, 0.79848979, 0.63231691],
1070
+ [0.99535958, 0.80926704, 0.64687278],
1071
+ [0.99740544, 0.82008078, 0.66150571],
1072
+ [0.9992197, 0.83100723, 0.6764127]
1073
+ ]
1074
+
1075
+
1076
# Expose each palette as a ListedColormap (plus its reversed "_r" variant)
# as module-level attributes and register both with matplotlib's registry.
_luts = [_rocket_lut, _mako_lut, _vlag_lut, _icefire_lut]
_names = ["rocket", "mako", "vlag", "icefire"]

for _lut, _name in zip(_luts, _names):

    _fwd = colors.ListedColormap(_lut, _name)
    _rev = colors.ListedColormap(_lut[::-1], _name + "_r")

    # at module scope locals() is the module namespace, so this makes the
    # colormaps importable from this module (e.g. `rocket`, `rocket_r`)
    locals()[_name] = _fwd
    locals()[_name + "_r"] = _rev

    mpl_cm.register(_fwd, name=_name)
    mpl_cm.register(_rev, name=_name + "_r")
deepTools/source/deeptools/computeGCBias.py ADDED
@@ -0,0 +1,800 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import time
5
+
6
+ import multiprocessing
7
+ import numpy as np
8
+ import argparse
9
+ from scipy.stats import poisson
10
+ import py2bit
11
+ import sys
12
+
13
+ from deeptoolsintervals import GTF
14
+ from deeptools.utilities import tbitToBamChrName, getGC_content
15
+ from deeptools import parserCommon, mapReduce
16
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
17
+ from deeptools import bamHandler
18
+
19
+ debug = 0
20
+ old_settings = np.seterr(all='ignore')
21
+
22
+
23
def parse_arguments(args=None):
    """Build the top-level argument parser for computeGCBias.

    Combines the shared parent parser (processors / region / blacklist
    options) with the tool-specific required and optional arguments.
    """
    parentParser = parserCommon.getParentArgParse(binSize=False, blackList=True)
    requiredArgs = getRequiredArgs()
    parser = argparse.ArgumentParser(
        parents=[requiredArgs, parentParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # FIX: added the missing space between the concatenated string
        # fragments ("used to" + "correct" rendered as "used tocorrect")
        description='Computes the GC-bias using Benjamini\'s method '
        '[Benjamini & Speed (2012). Nucleic Acids Research, 40(10). doi: 10.1093/nar/gks001]. '
        'The GC-bias is visualized and the resulting table can be used to '
        'correct the bias with `correctGCBias`.',
        usage='computeGCBias '
        '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit -l 200 --GCbiasFrequenciesFile freq.txt\n'
        'help: computeGCBias -h / computeGCBias --help',
        conflict_handler='resolve',
        add_help=False)

    return parser
40
+
41
+
42
def getRequiredArgs():
    """Build the computeGCBias-specific argument groups (required,
    optional and diagnostic-plot arguments) on a bare parser that is
    used as a parent by :func:`parse_arguments`.
    """
    parser = argparse.ArgumentParser(add_help=False)

    required = parser.add_argument_group('Required arguments')

    required.add_argument('--bamfile', '-b',
                          metavar='bam file',
                          help='Sorted BAM file. ',
                          required=True)

    required.add_argument('--effectiveGenomeSize',
                          help='The effective genome size is the portion '
                          'of the genome that is mappable. Large fractions of '
                          'the genome are stretches of NNNN that should be '
                          'discarded. Also, if repetitive regions were not '
                          'included in the mapping of reads, the effective '
                          'genome size needs to be adjusted accordingly. '
                          'A table of values is available here: '
                          'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .',
                          default=None,
                          type=int,
                          required=True)

    required.add_argument('--genome', '-g',
                          help='Genome in two bit format. Most genomes can be '
                          'found here: http://hgdownload.cse.ucsc.edu/gbdb/ '
                          'Search for the .2bit ending. Otherwise, fasta '
                          'files can be converted to 2bit using the UCSC '
                          'programm called faToTwoBit available for different '
                          'plattforms at '
                          'http://hgdownload.cse.ucsc.edu/admin/exe/',
                          metavar='2bit FILE',
                          required=True)

    required.add_argument('--GCbiasFrequenciesFile', '-freq', '-o',
                          help='Path to save the file containing '
                          'the observed and expected read frequencies per %%GC-'
                          'content. This file is needed to run the '
                          'correctGCBias tool. This is a text file.',
                          type=argparse.FileType('w'),
                          metavar='FILE',
                          required=True)

    # define the optional arguments
    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--fragmentLength', '-l',
                          help='Fragment length used for the sequencing. If '
                          'paired-end reads are used, the fragment length is '
                          'computed based from the bam file',
                          type=int)

    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")

    optional.add_argument('--sampleSize',
                          default=5e7,
                          help='Number of sampling points to be considered. (Default: %(default)s)',
                          type=int)

    optional.add_argument('--extraSampling',
                          help='BED file containing genomic regions for which '
                          'extra sampling is required because they are '
                          'underrepresented in the genome.',
                          type=argparse.FileType('r'),
                          metavar='BED file')

    plot = parser.add_argument_group('Diagnostic plot options')

    plot.add_argument('--biasPlot',
                      metavar='FILE NAME',
                      help='If given, a diagnostic image summarizing '
                      'the GC-bias will be saved.')

    plot.add_argument('--plotFileFormat',
                      metavar='',
                      help='image format type. If given, this '
                      'option overrides the '
                      'image format based on the plotFile ending. '
                      'The available options are: "png", '
                      '"eps", "pdf", "plotly" and "svg"',
                      choices=['png', 'pdf', 'svg', 'eps', 'plotly'])

    # FIX: added the missing space between the concatenated help string
    # fragments ("over a region" + "the size" rendered as "regionthe size")
    plot.add_argument('--regionSize',
                      metavar='INT',
                      type=int,
                      default=300,
                      help='To plot the reads per %%GC over a region '
                      'the size of the region is required. By default, '
                      'the bin size is set to 300 bases, which is close to the '
                      'standard fragment size for Illumina machines. However, '
                      'if the depth of sequencing is low, a larger bin size '
                      'will be required, otherwise many bins will not '
                      'overlap with any read (Default: %(default)s)')

    return parser
137
+
138
+
139
def getPositionsToSample(chrom, start, end, stepSize):
    """
    Return a numpy array of genomic positions to sample in [start, end)
    at intervals of `stepSize`.

    If an extra-sampling BED file is configured
    (global_vars['extra_sampling_file']), positions inside its intervals
    are added at the same stepSize. If a blacklist
    (global_vars['filter_out']) is configured, positions falling inside
    those intervals are removed.
    """
    sample_pos = np.arange(start, end, stepSize)

    blacklist_tree = GTF(global_vars['filter_out']) if global_vars['filter_out'] else None
    extra_tree = GTF(global_vars['extra_sampling_file']) if global_vars['extra_sampling_file'] else None

    if extra_tree:
        n_before = len(sample_pos)
        try:
            hits = extra_tree.findOverlaps(chrom, start, end)
        except KeyError:
            hits = []

        if len(hits) > 0:
            # densify sampling inside each extra-effort interval
            for iv in hits:
                sample_pos = np.append(sample_pos,
                                       list(range(iv[0], iv[1], stepSize)))
            # remove duplicates
            sample_pos = np.unique(np.sort(sample_pos))
            if debug:
                print("sampling increased to {} from {}".format(
                    len(sample_pos),
                    n_before))

    # skip regions that are filtered out
    if blacklist_tree:
        try:
            hits = blacklist_tree.findOverlaps(chrom, start, end)
        except KeyError:
            hits = []

        if len(hits) > 0:
            for iv in hits:
                sample_pos = sample_pos[(sample_pos < iv[0]) | (sample_pos >= iv[1])]
    return sample_pos
192
+
193
+
194
def countReadsPerGC_wrapper(args):
    """Adapter for multiprocessing: unpack the packed argument tuple."""
    return countReadsPerGC_worker(*args)
196
+
197
+
198
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """For windows of `regionSize` bp sampled every `stepSize` bp within
    (start, end), return a list of (read count, GC fraction) tuples.
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    reads_and_gc = []

    for pos in getPositionsToSample(chromNameBit, start, end, stepSize):
        # stop once the window would run past the chromosome end
        if tbit.chroms(chromNameBit) < pos + regionSize:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(pos), int(pos + regionSize))
        except Exception as detail:
            if verbose:
                print("{}:{}-{}".format(chromNameBit, pos, pos + regionSize))
                print(detail)
            continue
        reads_and_gc.append((bam.count(chromNameBam, pos, pos + regionSize), gc))

    return reads_and_gc
232
+
233
+
234
def tabulateGCcontent_wrapper(args):
    """Adapter for multiprocessing: unpack the packed argument tuple."""
    return tabulateGCcontent_worker(*args)
236
+
237
+
238
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1, 4, 10, 10, 16, 18]
    which correspond to a GC of
    [1, 1, 1, 1, 2, 1]

    The evaluated position are
    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2, 1, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0  # count of positions skipped because of excessive coverage
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # if the sample regions are far apart from each
    # other is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions that lie close together
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and not r.is_unmapped and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    # FIX: define `index` before the loop so the verbose summary below
    # cannot raise a NameError when positions_to_sample is empty
    index = -1
    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit.chroms(chromNameBit):
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False)
        except Exception as detail:
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                             if x.is_reverse is False and x.pos == i])
        else:
            num_reads = read_counts[index]

        # skip positions whose coverage exceeds the poisson-derived
        # threshold (likely artifacts / peaks, see main())
        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose and index % 50000 == 0:
            endTime = time.time()
            print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                  (multiprocessing.current_process().name,
                   index, index / (endTime - countTime),
                   chromNameBit, start, end, stepSize))

    if verbose and index >= 0:
        endTime = time.time()
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name,
                                                    (endTime - startTime), chromNameBit, start, end, stepSize))

    return subN_gc, subF_gc
395
+
396
+
397
def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize,
                      chromSizes, numberOfProcessors=None, verbose=False,
                      region=None):
    r"""
    Subdivides the genome into chunks that are processed in parallel by
    :func:`tabulateGCcontent_worker` and integrates the per-chunk results.

    Returns an array with one row per GC value containing
    (observed read count F_gc, expected fragment count N_gc,
    scaled observed/expected ratio R_gc).

    Exits the program if no fragments were sampled at all.
    """
    global global_vars

    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    # keep only chromosomes that exist in the 2bit genome
    chromSizes = [(name, size) for name, size in chromSizes
                  if name in chrNameBamToBit]

    imap_res = mapReduce.mapReduce((stepSize,
                                    fragmentLength, chrNameBamToBit,
                                    verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    # accumulate the per-chunk N_gc / F_gc histograms
    N_gc = None
    F_gc = None
    for subN_gc, subF_gc in imap_res:
        if N_gc is None:
            N_gc = subN_gc
            F_gc = subF_gc
        else:
            N_gc += subN_gc
            F_gc += subF_gc

    if sum(F_gc) == 0:
        sys.exit("No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter")
    scaling = float(sum(N_gc)) / float(sum(F_gc))

    # observed/expected ratio per GC value; 1 when a GC value was never seen
    R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling
                     if N_gc[x] and F_gc[x] > 0 else 1
                     for x in range(len(F_gc))])

    return np.transpose(np.vstack((F_gc, N_gc, R_gc)))
454
+
455
+
456
def countReadsPerGC(regionSize, chrNameBitToBam, stepSize,
                    chromSizes, numberOfProcessors=None, verbose=False,
                    region=None):
    r"""
    For windows of size `regionSize` sampled every `stepSize` bp,
    computes the GC fraction of the window and the number of reads
    overlapping it, in parallel via :func:`countReadsPerGC_worker`.

    Returns a numpy array with one (read count, GC fraction) row per window.
    """
    global global_vars

    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))

    imap_res = mapReduce.mapReduce((stepSize,
                                    regionSize, chrNameBamToBit,
                                    verbose),
                                   countReadsPerGC_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    # flatten the per-chunk lists into a single array
    collected = []
    for chunk in imap_res:
        collected.extend(chunk)

    return np.asarray(collected)
492
+
493
+
494
def smooth(x, window_len=3):
    """
    *CURRENTLY* not being used
    Smooths the values by replacing each one with the average of the
    'window_len' values centred on it. window_len has to be an odd number.

    Values closer than (window_len - 1) // 2 to either end are left
    unchanged; arrays shorter than 2 * window_len are returned as-is.
    """
    # do not smooth small arrays
    if len(x) < window_len * 2:
        return x
    y = x[:]
    # FIX: use integer division; "/" produces a float in Python 3, which
    # then raises a TypeError when used as a slice index below
    half_width = (window_len - 1) // 2
    for i in range(0, len(x)):
        if i < half_width or i + half_width + 1 > len(x):
            # too close to either boundary to centre a full window
            continue
        y[i] = np.mean(x[i - half_width:i + half_width + 1])
    # clip low values, this avoid problems with zeros
    return y
513
+
514
+
515
def bin_by(x, y, nbins=10):
    """
    Bin the values in `x` according to their corresponding `y` values
    (assumed to lie in [0, 1]).

    Returns (binned_x, left_edges), where binned_x[i] holds the x values
    whose y falls into bin i and left_edges are the bins' left borders.
    """
    edges = np.linspace(0, 1, nbins + 1)
    # widen the last bin so y == 1 does not fall into an extra bin
    edges[-1] += 1

    bin_index = np.digitize(y, edges)
    grouped = [x[bin_index == b] for b in range(1, len(edges))]

    # report only the left edge of each bin
    return grouped, edges[:-1]
534
+
535
+
536
def plotlyGCbias(file_name, frequencies, reads_per_gc, region_size):
    """Save an interactive (plotly) diagnostic figure of the GC bias.

    Top panel: boxplots of read counts per region, binned by GC fraction.
    Bottom panel: log2(observed/expected) read counts vs GC fraction.

    file_name : output HTML file path.
    frequencies : array produced by tabulateGCcontent (F_gc, N_gc, R_gc columns).
    reads_per_gc : array produced by countReadsPerGC (read count, GC columns).
    region_size : window size used for reads_per_gc, shown in the title.
    """
    import plotly.offline as py
    import plotly.graph_objs as go
    import matplotlib.cbook as cbook

    # two stacked panels sharing the figure: axis1 on top, axis2 below
    fig = go.Figure()
    fig['layout']['xaxis1'] = dict(domain=[0.0, 1.0], anchor="y1", title="GC fraction")
    fig['layout']['yaxis1'] = dict(domain=[0.55, 1.0], anchor="x1", title="Number of reads")
    fig['layout']['xaxis2'] = dict(domain=[0.0, 1.0], anchor="y2", title="GC fraction", range=[0.2, 0.7])
    fig['layout']['yaxis2'] = dict(domain=[0.0, 0.45], anchor="x2", title="log2(observed/expected)")
    text = "reads per {} base region".format(region_size)
    annos = [{'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 1.0, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False}]
    text = "normalized observed/expected read counts"
    annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 0.5, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False})

    # prepare data for boxplot: bin read counts into 100 GC bins and keep
    # only bins with a GC fraction in [0.2, 0.7]
    reads, GC = reads_per_gc.T
    reads_per_gc, bin_labels = bin_by(reads, GC, nbins=100)
    to_keep = [idx for idx, x in enumerate(bin_labels) if 0.2 <= x <= 0.7]
    reads_per_gc = [reads_per_gc[x] for x in to_keep]
    bin_labels = [bin_labels[x] for x in to_keep]

    # precompute the boxplot statistics (same as matplotlib would) to
    # vastly reduce the output file size vs embedding all raw points
    bins = []
    for b in reads_per_gc:
        s = cbook.boxplot_stats(b)[0]
        bins.append([s['whislo'], s['q1'], s['q1'], s['med'], s['med'], s['med'], s['q3'], s['q3'], s['whishi']])

    data = []

    # top plot: one box trace per GC bin
    for x, y in zip(bin_labels, bins):
        trace = go.Box(x=x, y=y, xaxis='x1', yaxis='y1', boxpoints='outliers', showlegend=False, name="{}".format(x), line=dict(color='rgb(107,174,214)'))
        data.append(trace)

    # bottom plot: log2 of the observed/expected ratio column (R_gc)
    x = np.linspace(0, 1, frequencies.shape[0])
    trace = go.Scatter(x=x, y=np.log2(frequencies[:, 2]), xaxis='x2', yaxis='y2', showlegend=False, line=dict(color='rgb(107,174,214)'))
    data.append(trace)
    fig.add_traces(data)
    fig['layout']['annotations'] = annos
    py.plot(fig, filename=file_name, auto_open=False)
578
+
579
+
580
def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=None):
    """Save a static (matplotlib) diagnostic figure of the GC bias.

    Top panel: boxplots of read counts per region, binned by GC fraction.
    Bottom panel: log2(observed/expected) read counts vs GC fraction.

    file_name : output image file path.
    frequencies : array produced by tabulateGCcontent (F_gc, N_gc, R_gc columns).
    reads_per_gc : array produced by countReadsPerGC (read count, GC columns).
    region_size : window size used for reads_per_gc, shown in the title.
    image_format : if given, overrides the format inferred from file_name.
    """
    import matplotlib
    matplotlib.use('Agg')
    # keep text editable in pdf/svg output
    matplotlib.rcParams['pdf.fonttype'] = 42
    matplotlib.rcParams['svg.fonttype'] = 'none'
    import matplotlib.pyplot as plt

    # prepare data for boxplot: bin read counts into 100 GC bins and keep
    # only bins with a GC fraction in [0.2, 0.7]
    reads, GC = reads_per_gc.T
    reads_per_gc, bin_labels = bin_by(reads, GC, nbins=100)
    to_keep = [idx for idx, x in enumerate(bin_labels) if 0.2 <= x <= 0.7]
    reads_per_gc = [reads_per_gc[x] for x in to_keep]
    bin_labels = [bin_labels[x] for x in to_keep]

    title = "reads per regions of {} bp".format(region_size)
    fig = plt.figure(figsize=(6, 8))
    ax1 = fig.add_subplot(211, title=title)
    ax2 = fig.add_subplot(212,
                          title='normalized observed/expected read counts')

    # make boxplot

    bp = ax1.boxplot(reads_per_gc, notch=0, patch_artist=True)
    plt.setp(bp['boxes'], color='black', facecolor='LightGreen')
    plt.setp(bp['medians'], color='black')
    plt.setp(bp['whiskers'], color='black', linestyle='dashed')
    plt.setp(bp['fliers'], marker='None')
    # get the whisker that spans the most, to set sensible y-axis limits
    y_max = np.nanmax([x.get_data()[1][1] for x in bp['whiskers']])
    ax1.set_ylim(0 - (y_max * 0.05), y_max * 1.05)
    ax1.set_ylabel('Number of reads')
    ax1.set_xlabel('GC fraction')

    # label only every 10th GC percentage
    xticks = [idx for idx, x in enumerate(bin_labels) if int(x * 100) % 10 == 0]

    ax1.set_xticks(xticks)
    ax1.set_xticklabels(["{:.1f}".format(bin_labels[x]) for x in xticks])

    # bottom panel: log2 of the observed/expected ratio column (R_gc)
    x = np.linspace(0, 1, frequencies.shape[0])
    y = np.log2(frequencies[:, 2])
    ax2.plot(x, y, color='#8c96f0')
    ax2.set_xlabel('GC fraction')
    ax2.set_ylabel('log2ratio observed/expected')
    ax2.set_xlim(0.2, 0.7)
    # scale the y axis to the data inside the displayed x range, with a
    # 10% margin on whichever side of zero each extreme lies
    y_max = max(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1])
    y_min = min(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1])
    if y_max > 0:
        y_max *= 1.1
    else:
        y_max *= 0.9
    if y_min < 0:
        y_min *= 1.1
    else:
        y_min *= 0.9
    ax2.set_ylim(y_min, y_max)
    plt.tight_layout()
    plt.savefig(file_name, bbox_inches='tight', dpi=100, format=image_format)
    plt.close()
638
+
639
+
640
def main(args=None):
    """Entry point for computeGCBias.

    Samples the genome, tabulates observed vs expected read frequencies
    per GC content, writes the frequency table, and optionally saves a
    diagnostic plot.
    """
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors)

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        # estimate the fragment length from the (paired-end) BAM file
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            # FIX: use sys.exit() instead of the interactive-only exit()
            # builtin, which is provided by the site module and may be
            # missing (e.g. `python -S` or frozen executables)
            sys.exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]
    chromSizes = [x for x in chromSizes if x[0] in tbit.chroms()]

    # use poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # vary depending on the gc content
    # and the global number of reads per bp may a be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of not use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        if args.plotFileFormat == "plotly":
            plotlyGCbias(args.biasPlot, data, reads_per_gc, args.regionSize)
        else:
            plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
728
+
729
+
730
class Tester():
    """Helper used by the doctests: points at the small test data set
    shipped with deeptools and installs a matching `global_vars` dict.
    """

    def __init__(self):
        import os
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
        self.tbitFile = self.root + "sequence.2bit"
        self.bamFile = self.root + "test.bam"
        self.mappability = self.root + "mappability.bw"
        self.chrNameBam = '2L'
        self.chrNameBit = 'chr2L'
        bam, mapped, unmapped, stats = bamHandler.openBam(self.bamFile, returnStats=True)
        tbit = py2bit.open(self.tbitFile)
        global debug
        debug = 0
        global global_vars
        # FIX: the original dict literal listed 'min_reads' twice; the
        # duplicate key (with the identical value 0) has been removed
        global_vars = {'2bit': self.tbitFile,
                       'bam': self.bamFile,
                       'filter_out': None,
                       'mappability': self.mappability,
                       'extra_sampling_file': None,
                       'max_reads': 5,
                       'min_reads': 0,
                       'reads_per_bp': 0.3,
                       'total_reads': mapped,
                       'genome_size': sum(tbit.chroms().values())
                       }

    def testTabulateGCcontentWorker(self):
        # argument tuple for tabulateGCcontent_worker over a tiny region
        stepSize = 2
        fragmentLength = {'min': 1, 'median': 3, 'max': 5}
        start = 0
        end = 20
        chrNameBam2bit = {'2L': 'chr2L'}
        return (self.chrNameBam,
                start, end, stepSize, fragmentLength, chrNameBam2bit)

    def set_filter_out_file(self):
        # enable the blacklist used by the filter-out doctest
        global global_vars
        global_vars['filter_out'] = self.root + "filter_out.bed"

    def unset_filter_out_file(self):
        global global_vars
        global_vars['filter_out'] = None

    def set_extra_sampling_file(self):
        # enable the extra-sampling BED used by the extra-sampling doctest
        global global_vars
        global_vars['extra_sampling_file'] = self.root + "extra_sampling.bed"

    def testTabulateGCcontent(self):
        # argument tuple for tabulateGCcontent on the test BAM
        fragmentLength = {'median': 10}
        chrNameBitToBam = {'chr2L': '2L'}
        stepSize = 1
        bam = bamHandler.openBam(global_vars['bam'])
        chromSizes = [(bam.references[i], bam.lengths[i])
                      for i in range(len(bam.references))]
        return (fragmentLength,
                chrNameBitToBam, stepSize, chromSizes, 1)

    def testCountReadsPerGC(self):
        # argument tuple for countReadsPerGC on the test BAM
        regionSize = 300
        chrNameBitToBam = {'chr2L': '2L'}
        stepSize = 1
        bam = bamHandler.openBam(global_vars['bam'])
        chromSizes = [(bam.references[i], bam.lengths[i])
                      for i in range(len(bam.references))]
        return (regionSize,
                chrNameBitToBam, stepSize, chromSizes, 1)
797
+
798
+
799
+ if __name__ == "__main__":
800
+ main()
deepTools/source/deeptools/computeMatrix.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ from deeptools.parserCommon import writableFile, numberOfProcessors
7
+ from deeptools import parserCommon
8
+ from deeptools import heatmapper
9
+ import deeptools.computeMatrixOperations as cmo
10
+ from importlib.metadata import version
11
+
12
+
13
def parse_arguments(args=None):
    """Construct the top-level computeMatrix argument parser.

    Two sub-commands are exposed, ``scale-regions`` and ``reference-point``,
    which share the required/output/optional argument groups defined in this
    module plus the common GTF options from parserCommon.

    Returns:
        argparse.ArgumentParser: the fully configured parser.
    """
    parser = \
        argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="""

This tool calculates scores per genome regions and prepares an intermediate file that can be used with ``plotHeatmap`` and ``plotProfiles``.
Typically, the genome regions are genes, but any other regions defined in a BED file can be used.
computeMatrix accepts multiple score files (bigWig format) and multiple regions files (BED format).
This tool can also be used to filter and sort regions according
to their score.

To learn more about the specific parameters, type:

$ computeMatrix reference-point --help or

$ computeMatrix scale-regions --help

""",
            epilog='An example usage is:\n computeMatrix reference-point -S '
            '<bigwig file(s)> -R <bed file(s)> -b 1000\n \n')

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    subparsers = parser.add_subparsers(
        title='Commands',
        dest='command',
        metavar='')

    # scale-regions mode options
    # (typo fix: usage previously read "<biwig file(s)>")
    subparsers.add_parser(
        'scale-regions',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[
            computeMatrixRequiredArgs(),
            computeMatrixOutputArgs(),
            computeMatrixOptArgs(case='scale-regions'),
            parserCommon.gtf_options()
        ],
        help="In the scale-regions mode, all regions in the BED file are "
        "stretched or shrunken to the length (in bases) indicated by the user.",
        usage='An example usage is:\n computeMatrix scale-regions -S '
        '<bigwig file(s)> -R <bed file> -b 1000\n\n')

    # reference point arguments
    # (typo fix: usage previously read "<biwig file(s)>")
    subparsers.add_parser(
        'reference-point',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[computeMatrixRequiredArgs(),
                 computeMatrixOutputArgs(),
                 computeMatrixOptArgs(case='reference-point'),
                 parserCommon.gtf_options()
                 ],
        help="Reference-point refers to a position within a BED region "
        "(e.g., the starting point). In this mode, only those genomic"
        "positions before (upstream) and/or after (downstream) of the "
        "reference point will be plotted.",
        usage='An example usage is:\n computeMatrix reference-point -S '
        '<bigwig file(s)> -R <bed file> -a 3000 -b 3000\n\n')

    return parser
75
+
76
+
77
def computeMatrixRequiredArgs(args=None):
    """Return a parent parser holding the mandatory -R/-S arguments.

    Both sub-commands of computeMatrix inherit this group, so it is built
    with ``add_help=False`` for use via argparse ``parents``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    required = parser.add_argument_group('Required arguments')
    required.add_argument('--regionsFileName', '-R',
                          metavar='File',
                          help='File name or names, in BED or GTF format, containing '
                          'the regions to plot. If multiple bed files are given, each one is considered a '
                          'group that can be plotted separately. Also, adding a "#" symbol in the bed file '
                          'causes all the regions until the previous "#" to be considered one group.',
                          nargs='+',
                          required=True)
    # typo fix: help text previously read "separated by spaced"
    required.add_argument('--scoreFileName', '-S',
                          help='bigWig file(s) containing '
                          'the scores to be plotted. Multiple files should be separated by spaces. BigWig '
                          'files can be obtained by using the bamCoverage '
                          'or bamCompare tools. More information about '
                          'the bigWig file format can be found at '
                          'http://genome.ucsc.edu/goldenPath/help/bigWig.html ',
                          metavar='File',
                          nargs='+',
                          required=True)
    return parser
99
+
100
+
101
def computeMatrixOutputArgs(args=None):
    """Return a parent parser holding the output-file arguments.

    Built with ``add_help=False`` for use via argparse ``parents``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    output = parser.add_argument_group('Output options')
    output.add_argument('--outFileName', '-out', '-o',
                        help='File name to save the gzipped matrix file '
                        'needed by the "plotHeatmap" and "plotProfile" tools.',
                        type=writableFile,
                        required=True)

    output.add_argument('--outFileNameMatrix',
                        help='If this option is given, then the matrix '
                        'of values underlying the heatmap will be saved '
                        'using the indicated name, e.g. IndividualValues.tab.'
                        'This matrix can easily be loaded into R or '
                        'other programs.',
                        metavar='FILE',
                        type=writableFile)
    # typo fix: help text previously read "skiping"
    output.add_argument('--outFileSortedRegions',
                        help='File name in which the regions are saved '
                        'after skipping zeros or min/max threshold values. The '
                        'order of the regions in the file follows the sorting '
                        'order selected. This is useful, for example, to '
                        'generate other heatmaps keeping the sorting of the '
                        'first heatmap. Example: Heatmap1sortedRegions.bed',
                        metavar='BED file',
                        type=argparse.FileType('w'))
    return parser
128
+
129
+
130
def computeMatrixOptArgs(case='scale-regions'):
    """Return a parent parser holding the optional arguments for one mode.

    Args:
        case: either 'scale-regions' or 'reference-point'; selects which
            mode-specific options are exposed. (The previous default,
            ``['scale-regions', 'reference-point'][0]``, evaluated to the
            same value; it is now written as a plain literal.)

    Returns:
        argparse.ArgumentParser built with ``add_help=False`` for use via
        argparse ``parents``.
    """
    parser = argparse.ArgumentParser(add_help=False)
    optional = parser.add_argument_group('Optional arguments')
    optional.add_argument('--version', action='version',
                          version='%(prog)s {}'.format(version('deeptools')))

    if case == 'scale-regions':
        optional.add_argument('--regionBodyLength', '-m',
                              default=1000,
                              type=int,
                              help='Distance in bases to which all regions will '
                              'be fit. (Default: %(default)s)')
        optional.add_argument('--startLabel',
                              default='TSS',
                              help='Label shown in the plot for the start of '
                              'the region. Default is TSS (transcription '
                              'start site), but could be changed to anything, '
                              'e.g. "peak start". Note that this is only '
                              'useful if you plan to plot the results yourself '
                              'and not, for example, with plotHeatmap, which '
                              'will override this. (Default: %(default)s)')
        optional.add_argument('--endLabel',
                              default='TES',
                              help='Label shown in the plot for the region '
                              'end. Default is TES (transcription end site). '
                              'See the --startLabel option for more '
                              'information. (Default: %(default)s) ')
        optional.add_argument('--beforeRegionStartLength', '-b', '--upstream',
                              default=0,
                              type=int,
                              help='Distance upstream of the start site of '
                              'the regions defined in the region file. If the '
                              'regions are genes, this would be the distance '
                              'upstream of the transcription start site. (Default: %(default)s)')
        optional.add_argument('--afterRegionStartLength', '-a', '--downstream',
                              default=0,
                              type=int,
                              help='Distance downstream of the end site '
                              'of the given regions. If the '
                              'regions are genes, this would be the distance '
                              'downstream of the transcription end site. (Default: %(default)s)')
        optional.add_argument("--unscaled5prime",
                              default=0,
                              type=int,
                              help='Number of bases at the 5-prime end of the '
                              'region to exclude from scaling. By default, '
                              'each region is scaled to a given length (see the --regionBodyLength option). In some cases it is useful to look at unscaled signals around region boundaries, so this setting specifies the number of unscaled bases on the 5-prime end of each boundary. (Default: %(default)s)')
        optional.add_argument("--unscaled3prime",
                              default=0,
                              type=int,
                              help='Like --unscaled5prime, but for the 3-prime '
                              'end. (Default: %(default)s)')

    elif case == 'reference-point':
        optional.add_argument('--referencePoint',
                              default='TSS',
                              choices=['TSS', 'TES', 'center'],
                              help='The reference point for the plotting '
                              'could be either the region start (TSS), the '
                              'region end (TES) or the center of the region. '
                              'Note that regardless of what you specify, '
                              'plotHeatmap/plotProfile will default to using "TSS" as the '
                              'label. (Default: %(default)s)')

        # set region body length to zero for reference point mode
        optional.add_argument('--regionBodyLength', help=argparse.SUPPRESS,
                              default=0, type=int)
        optional.add_argument('--unscaled5prime', default=0, type=int, help=argparse.SUPPRESS)
        optional.add_argument('--unscaled3prime', default=0, type=int, help=argparse.SUPPRESS)
        optional.add_argument('--beforeRegionStartLength', '-b', '--upstream',
                              default=500,
                              type=int,
                              metavar='INT bp',
                              help='Distance upstream of the reference-point '
                              'selected. (Default: %(default)s)')
        optional.add_argument('--afterRegionStartLength', '-a', '--downstream',
                              default=1500,
                              metavar='INT bp',
                              type=int,
                              help='Distance downstream of the '
                              'reference-point selected. (Default: %(default)s)')
        optional.add_argument('--nanAfterEnd',
                              action='store_true',
                              help='If set, any values after the region end '
                              'are discarded. This is useful to visualize '
                              'the region end when not using the '
                              'scale-regions mode and when the reference-'
                              'point is set to the TSS.')

    optional.add_argument('--binSize', '-bs',
                          help='Length, in bases, of the non-overlapping '
                          'bins for averaging the score over the '
                          'regions length. (Default: %(default)s)',
                          type=int,
                          default=10)

    optional.add_argument('--sortRegions',
                          help='Whether the output file should present the '
                          'regions sorted. The default is to not sort the regions. '
                          'Note that this is only useful if you plan to plot '
                          'the results yourself and not, for example, with '
                          'plotHeatmap, which will override this. Note also that '
                          'unsorted output will be in whatever order the regions '
                          'happen to be processed in and not match the order in '
                          'the input files. If you require the output order to '
                          'match that of the input regions, then either specify '
                          '"keep" or use computeMatrixOperations to resort the '
                          'results file. (Default: %(default)s)',
                          choices=["descend", "ascend", "no", "keep"],
                          default='keep')

    optional.add_argument('--sortUsing',
                          help='Indicate which method should be used for '
                          'sorting. The value is computed for each row.'
                          'Note that the region_length option will lead '
                          'to a dotted line within the heatmap that indicates '
                          'the end of the regions. (Default: %(default)s)',
                          choices=["mean", "median", "max", "min", "sum",
                                   "region_length"],
                          default='mean')

    optional.add_argument('--sortUsingSamples',
                          help='List of sample numbers (order as in matrix), '
                          'that are used for sorting by --sortUsing, '
                          'no value uses all samples, '
                          'example: --sortUsingSamples 1 3',
                          type=int, nargs='+')

    optional.add_argument('--averageTypeBins',
                          default='mean',
                          choices=["mean", "median", "min",
                                   "max", "std", "sum"],
                          help='Define the type of statistic that should be '
                          'used over the bin size range. The '
                          'options are: "mean", "median", "min", "max", "sum" '
                          'and "std". The default is "mean". (Default: %(default)s)')

    optional.add_argument('--missingDataAsZero',
                          help='If set, missing data (NAs) will be treated as zeros. '
                          'The default is to ignore such cases, which will be depicted as black areas in '
                          'a heatmap. (see the --missingDataColor argument '
                          'of the plotHeatmap command for additional options).',
                          action='store_true')

    optional.add_argument('--skipZeros',
                          help='Whether regions with only scores of zero '
                          'should be included or not. Default is to include '
                          'them.',
                          action='store_true')

    optional.add_argument('--minThreshold',
                          default=None,
                          type=float,
                          help='Numeric value. Any region containing a '
                          'value that is less than or equal to this '
                          'will be skipped. This is useful to skip, '
                          'for example, genes where the read count is zero '
                          'for any of the bins. This could be the result of '
                          'unmappable areas and can bias the overall results. (Default: %(default)s)')

    optional.add_argument('--maxThreshold',
                          default=None,
                          type=float,
                          help='Numeric value. Any region containing a value '
                          'greater than or equal to this '
                          'will be skipped. The maxThreshold is useful to '
                          'skip those few regions with very high read counts '
                          '(e.g. micro satellites) that may bias the average '
                          'values. (Default: %(default)s)')

    optional.add_argument('--blackListFileName', '-bl',
                          help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
                          metavar="BED file",
                          required=False)

    # typo fix: help text previously ran the words "itself" and "contains" together
    optional.add_argument('--samplesLabel',
                          help='Labels for the samples. This will then be passed to plotHeatmap and plotProfile. The '
                          'default is to use the file name of the '
                          'sample. The sample labels should be separated '
                          'by spaces and quoted if a label itself '
                          'contains a space E.g. --samplesLabel label-1 "label 2" ',
                          nargs='+')

    optional.add_argument('--smartLabels',
                          action='store_true',
                          help='Instead of manually specifying labels for the input '
                          'bigWig and BED/GTF files, this causes deepTools to use the file name '
                          'after removing the path and extension.')

    # in contrast to other tools,
    # computeMatrix by default outputs
    # messages and the --quiet flag suppresses them
    optional.add_argument('--quiet', '-q',
                          help='Set to remove any warning or processing '
                          'messages.',
                          action='store_true')

    optional.add_argument('--verbose',
                          help='Being VERY verbose in the status messages. --quiet will disable this.',
                          action='store_true')

    optional.add_argument('--scale',
                          help='If set, all values are multiplied by '
                          'this number. (Default: %(default)s)',
                          type=float,
                          default=1)
    optional.add_argument('--numberOfProcessors', '-p',
                          help='Number of processors to use. Type "max/2" to '
                          'use half the maximum number of processors or "max" '
                          'to use all available processors. (Default: %(default)s)',
                          metavar="INT",
                          type=numberOfProcessors,
                          default=1,
                          required=False)
    return parser
346
+
347
+
348
def process_args(args=None):
    """Parse and normalize computeMatrix command-line arguments.

    Negative flank lengths are converted to their absolute value (with a
    notice printed), and mode-specific attributes are filled in so that
    downstream code can rely on them being present regardless of the
    sub-command used.

    Args:
        args: optional argument list (defaults to sys.argv).

    Returns:
        argparse.Namespace with the processed arguments.
    """
    args = parse_arguments().parse_args(args)

    # With no arguments at all, show the help instead of failing later.
    if len(sys.argv) == 1:
        parse_arguments().print_help()
        sys.exit()

    if args.quiet is True:
        args.verbose = False

    # Ensure before and after region length is positive
    if args.beforeRegionStartLength < 0:
        print(f"beforeRegionStartLength changed from {args.beforeRegionStartLength} into {abs(args.beforeRegionStartLength)}")
        args.beforeRegionStartLength = abs(args.beforeRegionStartLength)
    if args.afterRegionStartLength < 0:
        print(f"afterRegionStartLength changed from {args.afterRegionStartLength} into {abs(args.afterRegionStartLength)}")
        args.afterRegionStartLength = abs(args.afterRegionStartLength)

    if args.command == 'scale-regions':
        # These options only exist in reference-point mode; give them
        # neutral values so later code can read them unconditionally.
        args.nanAfterEnd = False
        args.referencePoint = None
    elif args.command == 'reference-point':
        if args.beforeRegionStartLength == 0 and \
                args.afterRegionStartLength == 0:
            # typo fix: message previously read "Upstrean"
            sys.exit("\nUpstream and downstream regions are both "
                     "set to 0. Nothing to output. Maybe you want to "
                     "use the scale-regions mode?\n")

    return args
377
+
378
+
379
def main(args=None):
    """computeMatrix entry point: compute the matrix, optionally sort it, and save it."""

    args = process_args(args)

    # Per-run settings consumed by heatmapper.computeMatrix; keys follow the
    # names expected by the heatmapper module.
    parameters = {'upstream': args.beforeRegionStartLength,
                  'downstream': args.afterRegionStartLength,
                  'body': args.regionBodyLength,
                  'bin size': args.binSize,
                  'ref point': args.referencePoint,
                  'verbose': args.verbose,
                  'bin avg type': args.averageTypeBins,
                  'missing data as zero': args.missingDataAsZero,
                  'min threshold': args.minThreshold,
                  'max threshold': args.maxThreshold,
                  'scale': args.scale,
                  'skip zeros': args.skipZeros,
                  'nan after end': args.nanAfterEnd,
                  'proc number': args.numberOfProcessors,
                  'sort regions': args.sortRegions,
                  'sort using': args.sortUsing,
                  'unscaled 5 prime': args.unscaled5prime,
                  'unscaled 3 prime': args.unscaled3prime
                  }

    hm = heatmapper.heatmapper()

    scores_file_list = args.scoreFileName
    hm.computeMatrix(scores_file_list, args.regionsFileName, parameters, blackListFileName=args.blackListFileName, verbose=args.verbose, allArgs=args)
    if args.sortRegions not in ['no', 'keep']:
        # Convert the 1-based sample numbers from --sortUsingSamples into
        # 0-based indices, rejecting anything out of range.
        sortUsingSamples = []
        if args.sortUsingSamples is not None:
            for i in args.sortUsingSamples:
                if (i > 0 and i <= hm.matrix.get_num_samples()):
                    sortUsingSamples.append(i - 1)
                else:
                    exit("The value {0} for --sortUsingSamples is not valid. Only values from 1 to {1} are allowed.".format(args.sortUsingSamples, hm.matrix.get_num_samples()))
            print('Samples used for ordering within each group: ', sortUsingSamples)

        hm.matrix.sort_groups(sort_using=args.sortUsing, sort_method=args.sortRegions, sample_list=sortUsingSamples)
    elif args.sortRegions == 'keep':
        # Preserve the input region order by resorting the matrix to match
        # the regions files.
        # NOTE(review): args.transcriptID / args.transcript_id_designator are
        # not defined by the parsers in this file; presumably they come from
        # parserCommon.gtf_options() — confirm.
        hm.parameters['group_labels'] = hm.matrix.group_labels
        hm.parameters["group_boundaries"] = hm.matrix.group_boundaries
        cmo.sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator, verbose=not args.quiet)

    hm.save_matrix(args.outFileName)

    # Optional plain-text dump of the underlying values.
    if args.outFileNameMatrix:
        hm.save_matrix_values(args.outFileNameMatrix)

    # Optional BED output of the (possibly filtered/sorted) regions.
    if args.outFileSortedRegions:
        hm.save_BED(args.outFileSortedRegions)
deepTools/source/deeptools/computeMatrixOperations.py ADDED
@@ -0,0 +1,852 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import deeptools.heatmapper as heatmapper
3
+ import deeptoolsintervals.parse as dti
4
+ import numpy as np
5
+ import argparse
6
+ import sys
7
+ import os
8
+ import csv
9
+ from importlib.metadata import version
10
+
11
+
12
def parse_arguments():
    """Build the computeMatrixOperations argument parser.

    One sub-command is registered per supported operation; each borrows its
    argument groups from the small parser-fragment helpers in this module.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
This tool performs a variety of operations on files produced by computeMatrix.

detailed help:

  computeMatrixOperations info -h

or

  computeMatrixOperations relabel -h

or

  computeMatrixOperations subset -h

or

  computeMatrixOperations filterStrand -h

or

  computeMatrixOperations filterValues -h

or

  computeMatrixOperations rbind -h

or

  computeMatrixOperations cbind -h

or
  computeMatrixOperations sort -h

or
  computeMatrixOperations dataRange -h

""",
        epilog='example usages:\n'
               'computeMatrixOperations subset -m input.mat.gz -o output.mat.gz --group "group 1" "group 2" --samples "sample 3" "sample 10"\n\n'
               ' \n\n')

    sub = parser.add_subparsers(
        title='Commands',
        dest='command',
        metavar='')

    # info: read-only inspection of the matrix metadata
    sub.add_parser(
        'info',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs()],
        help="Print group and sample information",
        usage='An example usage is:\n computeMatrixOperations info -m input.mat.gz\n\n')

    # relabel: rewrite sample/group labels
    sub.add_parser(
        'relabel',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), relabelArgs()],
        help="Change sample and/or group label information",
        usage='An example usage is:\n computeMatrixOperations relabel -m input.mat.gz -o output.mat.gz --sampleLabels "sample 1" "sample 2"\n\n')

    # subset: select (and thereby reorder) groups/samples
    sub.add_parser(
        'subset',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), subsetArgs()],
        help="Actually subset the matrix. The group and sample orders are honored, so one can also reorder files.",
        usage='An example usage is:\n computeMatrixOperations subset -m '
              'input.mat.gz -o output.mat.gz --groups "group 1" "group 2" '
              '--samples "sample 3" "sample 10"\n\n')

    # filterStrand: keep only entries on a given strand
    sub.add_parser(
        'filterStrand',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), filterStrandArgs()],
        help="Filter entries by strand.",
        usage='Example usage:\n computeMatrixOperations filterStrand -m '
              'input.mat.gz -o output.mat.gz --strand +\n\n')

    # filterValues: keep only rows within a value range
    sub.add_parser(
        'filterValues',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs(), filterValuesArgs()],
        help="Filter entries by min/max value.",
        usage='Example usage:\n computeMatrixOperations filterValues -m '
              'input.mat.gz -o output.mat.gz --min 10 --max 1000\n\n')

    # rbind: stack matrices vertically
    sub.add_parser(
        'rbind',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[bindArgs()],
        help="merge multiple matrices by concatenating them head to tail. This assumes that the same samples are present in each in the same order.",
        usage='Example usage:\n computeMatrixOperations rbind -m '
              'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n')

    # cbind: join matrices side by side, matched on region coordinates
    sub.add_parser(
        'cbind',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[bindArgs()],
        help="merge multiple matrices by concatenating them left to right. No assumptions are made about the row order. Regions not present in the first file specified are ignored. Regions missing in subsequent files will result in NAs. Regions are matches based on the first 6 columns of the computeMatrix output (essentially the columns in a BED file).",
        usage='Example usage:\n computeMatrixOperations cbind -m '
              'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n')

    # sort: reorder rows to match the given region file(s)
    sub.add_parser(
        'sort',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[sortArgs()],
        help='Sort a matrix file to correspond to the order of entries in the desired input file(s). The groups of regions designated by the files must be present in the order found in the output of computeMatrix (otherwise, use the subset command first). Note that this subcommand can also be used to remove unwanted regions, since regions not present in the input file(s) will be omitted from the output.',
        usage='Example usage:\n computeMatrixOperations sort -m input.mat.gz -R regions1.bed regions2.bed regions3.gtf -o input.sorted.mat.gz\n\n')

    # dataRange: per-sample summary statistics
    sub.add_parser(
        'dataRange',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        parents=[infoArgs()],
        help='Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.',
        usage='Example usage:\n computeMatrixOperations dataRange -m input.mat.gz\n\n')

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    return parser
144
+
145
+
146
def bindArgs():
    """Parser fragment for rbind/cbind: several input matrices, one output file."""
    fragment = argparse.ArgumentParser(add_help=False)
    group = fragment.add_argument_group('Required arguments')

    group.add_argument('--matrixFile', '-m',
                       help='Matrix files from the computeMatrix tool.',
                       nargs='+',
                       required=True)
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    return fragment
160
+
161
+
162
def infoArgs():
    """Parser fragment shared by most sub-commands: a single input matrix."""
    fragment = argparse.ArgumentParser(add_help=False)
    group = fragment.add_argument_group('Required arguments')

    group.add_argument('--matrixFile', '-m',
                       help='Matrix file from the computeMatrix tool.',
                       required=True)

    return fragment
171
+
172
+
173
def relabelArgs():
    """Parser fragment for the relabel sub-command: output plus new labels."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--groupLabels',
                        nargs='+',
                        help="Groups labels. If none are specified then the current labels will be kept.")
    extras.add_argument('--sampleLabels',
                        nargs='+',
                        help="Sample labels. If none are specified then the current labels will be kept.")

    return fragment
192
+
193
+
194
def subsetArgs():
    """Parser fragment for the subset sub-command: output plus group/sample selections."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--groups',
                        nargs='+',
                        help="Groups to include. If none are specified then all will be included.")
    extras.add_argument('--samples',
                        nargs='+',
                        help="Samples to include. If none are specified then all will be included.")

    return fragment
213
+
214
+
215
def filterStrandArgs():
    """Parser fragment for filterStrand: output file and the strand to keep."""
    fragment = argparse.ArgumentParser(add_help=False)
    group = fragment.add_argument_group('Required arguments')

    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)
    group.add_argument('--strand', '-s',
                       help='Strand',
                       choices=['+', '-', '.'],
                       required=True)

    return fragment
229
+
230
+
231
def filterValuesArgs():
    """Parser fragment for filterValues: output file plus optional min/max bounds."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--min',
                        help='Minimum value. Any row having a single entry less than this will be excluded. The default is no minimum.',
                        type=float,
                        default=None)
    extras.add_argument('--max',
                        help='Maximum value. Any row having a single entry more than this will be excluded. The default is no maximum.',
                        type=float,
                        default=None)

    return fragment
251
+
252
+
253
def sortArgs():
    """Parser fragment for the sort sub-command: matrix, output, regions, GTF knobs."""
    fragment = argparse.ArgumentParser(add_help=False)

    group = fragment.add_argument_group('Required arguments')
    group.add_argument('--matrixFile', '-m',
                       help='Matrix file from the computeMatrix tool.',
                       required=True)
    group.add_argument('--outFileName', '-o',
                       help='Output file name',
                       required=True)
    group.add_argument('--regionsFileName', '-R',
                       help='File name(s), in BED or GTF format, containing the regions. '
                       'If multiple bed files are given, each one is '
                       'considered a group that can be plotted separately. '
                       'Also, adding a "#" symbol in the bed file causes all '
                       'the regions until the previous "#" to be considered '
                       'one group. Alternatively for BED files, putting '
                       'deepTools_group in the header can be used to indicate a '
                       'column with group labels. Note that these should be '
                       'sorted such that all group entries are together.',
                       required=True,
                       nargs='+')

    extras = fragment.add_argument_group('Optional arguments')
    extras.add_argument('--transcriptID',
                        default='transcript',
                        help='When a GTF file is used to provide regions, only '
                        'entries with this value as their feature (column 3) '
                        'will be processed as transcripts. (Default: %(default)s)')
    extras.add_argument('--transcript_id_designator',
                        default='transcript_id',
                        help='Each region has an ID (e.g., ACTB) assigned to it, '
                        'which for BED files is either column 4 (if it exists) '
                        'or the interval bounds. For GTF files this is instead '
                        'stored in the last column as a key:value pair (e.g., as '
                        '\'transcript_id "ACTB"\', for a key of transcript_id '
                        'and a value of ACTB). In some cases it can be '
                        'convenient to use a different identifier. To do so, set '
                        'this to the desired key. (Default: %(default)s)')

    return fragment
298
+
299
+
300
def printInfo(matrix):
    """
    Print the group and sample labels stored in the matrix, one per line.
    """
    for heading, labels in (("Groups:", matrix.matrix.group_labels),
                            ("Samples:", matrix.matrix.sample_labels)):
        print(heading)
        for label in labels:
            print("\t{0}".format(label))
312
+
313
+
314
def printDataRange(matrix):
    """
    Print a TSV with min, max, median, 10th and 90th percentile per sample.
    """
    print("Samples\tMin\tMax\tMedian\t10th\t90th")
    for idx, name in enumerate(matrix.matrix.sample_labels):
        # Each sample occupies a contiguous slice of the last matrix axis.
        lo = matrix.matrix.sample_boundaries[idx]
        hi = matrix.matrix.sample_boundaries[idx + 1]
        sub = matrix.matrix.matrix[..., lo:hi]
        stats = (name,
                 np.amin(sub),
                 np.amax(sub),
                 np.ma.median(sub),
                 np.percentile(sub, 10),
                 np.percentile(sub, 90))
        print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(*stats))
328
+
329
+
330
def relabelMatrix(matrix, args):
    """
    Replace the group and/or sample labels of a matrix with those supplied
    on the command line. Exits if the counts do not match.
    """
    new_groups = args.groupLabels
    if new_groups:
        expected = len(matrix.matrix.group_labels)
        if len(new_groups) != expected:
            sys.exit("You specified {} group labels, but {} are required.\n".format(len(new_groups), expected))
        matrix.matrix.group_labels = new_groups

    new_samples = args.sampleLabels
    if new_samples:
        expected = len(matrix.matrix.sample_labels)
        if len(new_samples) != expected:
            sys.exit("You specified {} sample labels, but {} are required.\n".format(len(new_samples), expected))
        matrix.matrix.sample_labels = new_samples
342
+
343
+
344
def getGroupBounds(args, matrix):
    """
    Given the requested group labels, return (row indices, new boundaries).

    Without --groups all rows are kept and the stored boundaries returned
    unchanged; otherwise rows are gathered group by group, in the order the
    groups were requested. Exits on an unknown group name.
    """
    bounds = matrix.parameters['group_boundaries']
    if args.groups is None:
        return range(0, matrix.matrix.matrix.shape[0]), np.array(bounds)

    indices = list()
    sizes = [0]
    for group in args.groups:
        if group not in matrix.matrix.group_labels:
            sys.exit("Error: '{0}' is not a valid group\n".format(group))
        pos = matrix.matrix.group_labels.index(group)
        indices.extend(range(bounds[pos], bounds[pos + 1]))
        sizes.append(bounds[pos + 1] - bounds[pos])
    # Cumulative sizes are the new group boundaries
    return indices, np.cumsum(sizes)
361
+
362
+
363
def getSampleBounds(args, matrix):
    """
    Given the requested sample labels, return the matching column indices.

    Without --samples every column is kept. Exits on an unknown sample name.
    """
    bounds = matrix.parameters['sample_boundaries']
    if args.samples is None:
        return np.arange(0, matrix.matrix.matrix.shape[1])

    cols = list()
    for sample in args.samples:
        if sample not in matrix.matrix.sample_labels:
            sys.exit("Error: '{0}' is not a valid sample\n".format(sample))
        pos = matrix.matrix.sample_labels.index(sample)
        cols.extend(range(bounds[pos], bounds[pos + 1]))
    return cols
378
+
379
+
380
def subsetRegions(hm, bounds):
    """
    Collect the regions at the given row indices, normalizing dict-style
    entries into the list form:
    [chrom, [(start, end), ...], name, 0, strand, score]
    """
    out = []
    for i in bounds:
        reg = hm.matrix.regions[i]
        if isinstance(reg, dict):
            # Regions are occasionally stored as dicts with comma-separated
            # coordinate strings; convert to the list representation.
            starts = [int(v) for v in reg["start"].split(",")]
            ends = [int(v) for v in reg["end"].split(",")]
            out.append([reg["chrom"], list(zip(starts, ends)), reg["name"],
                        0, reg["strand"], reg["score"]])
        else:
            out.append(reg)
    return out
396
+
397
+
398
def filterHeatmap(hm, args):
    """
    Keep only regions whose strand (field 4) equals args.strand, updating
    the matrix rows, region list and group boundaries in place.
    """
    mask = []
    kept_regions = []
    for region in hm.matrix.regions:
        matches = (region[4] == args.strand)
        mask.append(matches)
        if matches:
            kept_regions.append(region)
    mask = np.array(mask)

    # Recompute each group boundary from the number of surviving rows
    new_bounds = [0]
    for idx in range(1, len(hm.matrix.group_boundaries)):
        survivors = int(np.sum(mask[hm.matrix.group_boundaries[idx - 1]:hm.matrix.group_boundaries[idx]]))
        new_bounds.append(new_bounds[idx - 1] + survivors)
    hm.matrix.group_boundaries = new_bounds

    # Subset the matrix and region list
    hm.matrix.matrix = hm.matrix.matrix[mask, :]
    hm.matrix.regions = kept_regions
420
+
421
+
422
def filterHeatmapValues(hm, minVal, maxVal):
    """
    Drop rows whose values fall outside [minVal, maxVal], updating the
    matrix, regions and group boundaries in place.

    minVal / maxVal may be None, meaning unbounded on that side. Rows that
    are entirely NaN are never filtered, since no value violates the bounds.
    """
    import warnings

    bounds = [0]
    regions = []
    keep = []
    if minVal is None:
        minVal = -np.inf
    if maxVal is None:
        maxVal = np.inf
    # Bug fix: the original called np.warnings.filterwarnings('ignore');
    # the np.warnings alias was removed in NumPy 1.24, so this raised
    # AttributeError. Use the stdlib warnings module instead, scoped to
    # the nanmin/nanmax calls that warn on all-NaN rows.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        rowMins = np.nanmin(hm.matrix.matrix, axis=1)
        rowMaxs = np.nanmax(hm.matrix.matrix, axis=1)
    for i, (x, y) in enumerate(zip(rowMins, rowMaxs)):
        # x/y will be nan iff a row is entirely nan. Don't filter.
        if np.isnan(x) or (x >= minVal and y <= maxVal):
            keep.append(True)
            regions.append(hm.matrix.regions[i])
        else:
            keep.append(False)
    keep = np.array(keep)

    # Get the new bounds
    for idx in range(1, len(hm.matrix.group_boundaries)):
        i = int(np.sum(keep[hm.matrix.group_boundaries[idx - 1]:hm.matrix.group_boundaries[idx]]))
        bounds.append(bounds[idx - 1] + i)

    hm.matrix.group_boundaries = bounds

    # subset the matrix
    hm.matrix.matrix = hm.matrix.matrix[keep, :]
    hm.matrix.regions = regions
450
+
451
+
452
def insertMatrix(hm, hm2, groupName):
    """
    Splice the rows of `groupName` from hm2 into hm, directly after hm's
    existing rows for that group. Rows, regions and group boundaries of hm
    are all updated in place.
    """
    # Destination: the end of this group's span in hm
    gIdx = hm.parameters["group_labels"].index(groupName)
    insertAt = hm.parameters["group_boundaries"][gIdx + 1]
    # Source: this group's row span in hm2
    gIdx2 = hm2.parameters["group_labels"].index(groupName)
    srcStart = hm2.parameters["group_boundaries"][gIdx2]
    srcEnd = hm2.parameters["group_boundaries"][gIdx2 + 1]

    # Rows first, then the region list at the same offset
    hm.matrix.matrix = np.insert(hm.matrix.matrix, insertAt,
                                 hm2.matrix.matrix[srcStart:srcEnd, :], axis=0)
    hm.matrix.regions[insertAt:insertAt] = hm2.matrix.regions[srcStart:srcEnd]

    # Every boundary after the insertion point shifts by the number of new rows
    added = srcEnd - srcStart
    hm.parameters["group_boundaries"] = [
        bound + added if pos > gIdx else bound
        for pos, bound in enumerate(hm.parameters["group_boundaries"])
    ]
478
+
479
+
480
def appendMatrix(hm, hm2, groupName):
    """
    Append the rows of `groupName` from hm2 to the bottom of hm, adding a
    new group boundary for them. hm is modified in place.
    """
    # Locate this group's row span in hm2
    gIdx2 = hm2.parameters["group_labels"].index(groupName)
    srcStart = hm2.parameters["group_boundaries"][gIdx2]
    srcEnd = hm2.parameters["group_boundaries"][gIdx2 + 1]

    # Rows, then the new boundary, then the region list
    hm.matrix.matrix = np.concatenate(
        [hm.matrix.matrix, hm2.matrix.matrix[srcStart:srcEnd, :]], axis=0)
    hm.parameters["group_boundaries"].append(
        hm.parameters["group_boundaries"][-1] + srcEnd - srcStart)
    hm.matrix.regions.extend(hm2.matrix.regions[srcStart:srcEnd])
496
+
497
+
498
def rbindMatrices(hm, args):
    """
    Stack matrices top to bottom, merging rows group-wise.

    Groups already present in hm are spliced together with the incoming
    rows; new groups are appended at the bottom. It's assumed that the same
    samples are present in every file, in the exact same order.
    """
    other = heatmapper.heatmapper()
    hm.read_matrix_file(args.matrixFile[0])
    for fname in args.matrixFile[1:]:
        other.read_matrix_file(fname)
        for group in other.parameters["group_labels"]:
            if group in hm.parameters["group_labels"]:
                insertMatrix(hm, other, group)
            else:
                appendMatrix(hm, other, group)
                hm.parameters["group_labels"].append(group)

    # Mirror the merged group parameters onto the matrix object
    hm.matrix.group_labels = hm.parameters['group_labels']
    hm.matrix.group_boundaries = hm.parameters['group_boundaries']
518
+
519
+
520
def cbindMatrices(hm, args):
    """
    Bind columns from different matrices according to the group and region names

    Missing regions are left as NA. The first file in args.matrixFile is
    loaded into hm; each subsequent file contributes additional sample
    columns, matched row-by-row on (group label, region name).
    """
    hm2 = heatmapper.heatmapper()

    # Make a dict of region name:row associations
    hm.read_matrix_file(args.matrixFile[0])
    d = dict({x: dict() for x in hm.parameters["group_labels"]})
    for idx, group in enumerate(hm.parameters["group_labels"]):
        s = hm.parameters["group_boundaries"][idx]
        e = hm.parameters["group_boundaries"][idx + 1]
        for idx2, reg in enumerate(hm.matrix.regions[s:e]):
            # reg[2] is the region name; map it to its absolute row index
            d[group][reg[2]] = idx2 + s

    # Iterate through the other matrices
    for idx in range(1, len(args.matrixFile)):
        hm2.read_matrix_file(args.matrixFile[idx])
        # Add the sample labels
        hm.parameters['sample_labels'].extend(hm2.parameters['sample_labels'])
        # Add the sample boundaries, shifted by the current rightmost column
        lens = [x + hm.parameters['sample_boundaries'][-1] for x in hm2.parameters['sample_boundaries']][1:]
        hm.parameters['sample_boundaries'].extend(lens)

        # Add on additional NA initialized columns
        # NOTE(review): np.hstack assumes hm2 has the same number of rows
        # as hm — confirm upstream guarantees this for cbind inputs.
        ncol = hm.matrix.matrix.shape[1]
        hm.matrix.matrix = np.hstack((hm.matrix.matrix, np.empty(hm2.matrix.matrix.shape)))
        hm.matrix.matrix[:, ncol:] = np.nan

        # Update the values: copy each matching region's row from hm2 into
        # the freshly added columns of hm; unmatched rows stay NaN
        for idx2, group in enumerate(hm2.parameters["group_labels"]):
            if group not in d:
                continue
            s = hm2.parameters["group_boundaries"][idx2]
            e = hm2.parameters["group_boundaries"][idx2 + 1]
            for idx3, reg in enumerate(hm2.matrix.regions[s:e]):
                if reg[2] not in d[group]:
                    continue
                hm.matrix.matrix[d[group][reg[2]], ncol:] = hm2.matrix.matrix[s + idx3, :]

        # Append the special params (per-sample settings) from this file
        for s in hm.special_params:
            hm.parameters[s].extend(hm2.parameters[s])

    # Update the sample parameters on the matrix object
    hm.matrix.sample_labels = hm.parameters['sample_labels']
    hm.matrix.sample_boundaries = hm.parameters['sample_boundaries']
569
+
570
+
571
def loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup):
    """
    Given a first line, possibly a label column and a list of labels and regions, add the labels and regions in the file to them

    Arguments:
        line: the first data line (already consumed from fp by the caller)
        fp: open file handle positioned after `line`
        fname: the file name, used as a fallback group label
        labelColumn: index of a column holding group labels, or None
        labels: dict mapping group label -> group index (updated in place)
        regions: list of per-group {region name: position} dicts (updated in place)
        defaultGroup: label for the final unlabeled group, or None
    """

    # This is largely parseBED from deeptoolsintervals
    labelIdx = None
    localRegions = {}

    # Handle the first line, which the caller has already read
    cols = line.strip().split("\t")
    if labelColumn is not None:
        label = cols.pop(labelColumn)
        if label not in labels:
            labels[label] = len(labels)
        labelIdx = labels[label]
        if labelIdx >= len(regions):
            regions.append(localRegions)
        else:
            localRegions = regions[labelIdx]

    if len(cols) >= 6:
        # BED6+: use the name column
        name = cols[3]
    else:
        # No usable name column; synthesize one from the coordinates
        name = "{0}:{1}-{2}".format(cols[0], cols[1], cols[2])
    localRegions[name] = len(localRegions)

    for line in fp:
        if line.startswith("#") and labelColumn is None:
            # A "#" line closes the current group (when groups are not
            # taken from a label column)
            if len(localRegions) > 0:
                label = line[1:].strip()
                if len(label):
                    labels[dti.findRandomLabel(labels, label)] = len(labels)
                else:
                    # Empty label after "#": fall back to the file name
                    labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len(labels)
                regions.append(localRegions)
                localRegions = dict()
            continue
        elif line.startswith("#") and labelColumn is not None:
            continue

        cols = line.strip().split("\t")
        if len(cols) < 3:
            continue
        if labelColumn is not None:
            label = cols.pop(labelColumn)
            if label not in labels:
                labels[label] = len(labels)
            labelIdx = labels[label]
            if labelIdx >= len(regions):
                regions.append({})
            localRegions = regions[labelIdx]

        if len(cols) >= 6:
            name = cols[3]
        else:
            name = "{0}:{1}-{2}".format(cols[0], cols[1], cols[2])
        # Deduplicate region names within the group
        name = dti.findRandomLabel(localRegions, name)
        localRegions[name] = len(localRegions)

    # Handle the last group if there is no label
    if labelIdx is None and len(localRegions) > 0:
        if defaultGroup is not None:
            labels[dti.findRandomLabel(labels, defaultGroup)] = len(labels)
        else:
            labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len(labels)
        regions.append(localRegions)
637
+
638
+
639
def loadGTFtranscript(cols, label, defaultGroup, transcript_id_designator):
    """
    Extract a (group label, transcript name) pair from a GTF line.

    The attribute column (cols[8]) is split on spaces; a 'deepTools_group'
    attribute overrides the supplied label, otherwise defaultGroup (if not
    None) does. Returns (None, None), with a warning on stderr, when the
    transcript ID attribute is missing or has no value.
    """
    attrs = next(csv.reader([cols[8]], delimiter=' '))
    if "deepTools_group" in attrs and attrs[-1] != "deepTools_group":
        label = attrs[attrs.index("deepTools_group") + 1].rstrip(";")
    elif defaultGroup is not None:
        label = defaultGroup

    if transcript_id_designator not in attrs or attrs[-1] == transcript_id_designator:
        sys.stderr.write("Warning: {0} is malformed!\n".format("\t".join(cols)))
        return None, None

    name = attrs[attrs.index(transcript_id_designator) + 1].rstrip(";")
    return label, name
652
+
653
+
654
def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup):
    """
    Like loadBED, but for a GTF file

    This is largely a copy of what's in deeptoolsintervals. `labels`
    (group label -> index) and `regions` (list of per-group
    {name: position} dicts) are updated in place. Only lines whose feature
    column matches transcriptID (case-insensitively) are processed.
    """
    file_label = dti.findRandomLabel(labels, os.path.basename(fname))

    def _register(cols):
        # Register one transcript line under its group label
        label, name = loadGTFtranscript(cols, file_label, defaultGroup, transcript_id_designator)
        if label is None:
            return
        if label not in labels:
            labels[label] = len(labels)
            regions.append(dict())
        labelIdx = labels[label]
        regions[labelIdx][name] = len(regions[labelIdx])

    # handle the first line (already consumed by the caller)
    cols = line.split("\t")
    if cols[2].lower() == transcriptID.lower():
        _register(cols)

    for line in fp:
        if not isinstance(line, str):
            line = line.decode('ascii')
        if line.startswith('#'):
            continue
        cols = line.strip().split('\t')
        if len(cols) == 0:
            continue
        # Bug fix: the original compared cols[2].lower() against the raw
        # transcriptID here (no .lower()), so any non-lowercase
        # --transcriptID silently skipped every line after the first.
        if cols[2].lower() == transcriptID.lower():
            _register(cols)
689
+
690
+
691
def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator, verbose=True):
    """
    Iterate through the files noted by regionsFileName and sort hm accordingly

    The BED/GTF files define the desired group order and, within each group,
    the desired region order. hm is reordered in place; regions present in
    the files but absent from the matrix are skipped (with a warning when
    verbose).
    """

    labels = dict()    # group label -> group index
    regions = []       # one {region name: position} dict per group
    defaultGroup = None
    if len(regionsFileName) == 1:
        defaultGroup = "genes"
    for fname in regionsFileName:
        fp = dti.openPossiblyCompressed(fname)
        line = dti.getNext(fp)
        labelColumn = None
        # Skip comment lines, remembering a declared label column if any
        while line.startswith("#"):
            if not labelColumn:
                labelColumn = dti.getLabel(line)
            line = dti.getNext(fp)
        while line.startswith("track "):
            line = dti.getNext(fp)

        # Find the label column
        subtract = 0
        if labelColumn is not None:
            subtract = 1

        # Determine the file type and load into a list (or list of lists)
        cols = line.strip().split("\t")
        if len(cols) - subtract < 3:
            raise RuntimeError('{0} does not seem to be a recognized file type!'.format(fname))
        elif len(cols) - subtract <= 6:
            loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
        elif len(cols) and dti.seemsLikeGTF(cols):
            loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup)
        else:
            loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
        fp.close()

    # Do some sanity checking on the group labels and region names within them
    s1 = set(hm.parameters['group_labels'])
    if verbose:
        for e in labels:
            if e not in s1:
                sys.exit("The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format(e, s1, labels.keys()))

    # Make a dictionary out of current labels and regions
    # NOTE(review): `pos` counts rows only over groups that appear in
    # `labels`; if the matrix contains extra groups they are skipped here,
    # which would shift the row indices collected below — confirm inputs
    # always cover every matrix group.
    d = dict()
    pos = 0
    groupSizes = dict()
    for idx, label in enumerate(hm.parameters['group_labels']):
        s = hm.parameters['group_boundaries'][idx]
        e = hm.parameters['group_boundaries'][idx + 1]
        if label not in labels:
            continue
        d[label] = dict()
        groupSize = 0
        for reg in hm.matrix.regions[s:e]:
            d[label][reg[2]] = pos
            pos += 1
            groupSize += 1
        groupSizes[label] = groupSize

    # Convert labels to an ordered list
    labelsList = [""] * len(labels)
    for k, v in labels.items():
        labelsList[v] = k

    # Reorder
    order = []
    boundaries = [0]
    for idx, label in enumerate(labelsList):
        # Make an ordered list out of the region names in this region group
        _ = [""] * len(regions[idx])
        for k, v in regions[idx].items():
            _[v] = k
        sz = 0  # Track the number of entries actually matched
        for name in _:
            if name not in d[label]:
                if verbose:
                    sys.stderr.write("Skipping {}, due to being absent in the computeMatrix output.\n".format(name))
                continue
            sz += 1
            order.append(d[label][name])
        if sz == 0 and verbose:
            sys.exit("The region group {} had no matching entries!\n".format(label))
        boundaries.append(sz + boundaries[-1])
    hm.matrix.regions = [hm.matrix.regions[i] for i in order]
    order = np.array(order)
    hm.matrix.matrix = hm.matrix.matrix[order, :]

    # Update the parameters
    hm.parameters["group_labels"] = labelsList
    hm.matrix.group_labels = labelsList
    hm.parameters["group_boundaries"] = boundaries
    hm.matrix.group_boundaries = boundaries
786
+
787
+
788
def main(args=None):
    """
    Entry point: parse the command line and dispatch to the requested
    subcommand (info, dataRange, subset, filterStrand, filterValues,
    rbind, cbind, sort, relabel).
    """
    # if args none is need since otherwise pytest passes 'pytest' as sys.argv
    if args is None:
        if len(sys.argv) == 1:
            args = ["-h"]
        if len(sys.argv) == 2:
            args = [sys.argv[1], "-h"]

    args = parse_arguments().parse_args(args)

    hm = heatmapper.heatmapper()
    # rbind/cbind take a list of matrix files and load them themselves;
    # every other command loads the single input matrix here.
    if not isinstance(args.matrixFile, list):
        hm.read_matrix_file(args.matrixFile)
    if args.command == 'info':
        printInfo(hm)
    elif args.command == 'dataRange':
        printDataRange(hm)
    elif args.command == 'subset':
        sIdx = getSampleBounds(args, hm)
        gIdx, gBounds = getGroupBounds(args, hm)

        # groups
        hm.matrix.regions = subsetRegions(hm, gIdx)
        # matrix: subset rows (groups) then columns (samples)
        hm.matrix.matrix = hm.matrix.matrix[gIdx, :]
        hm.matrix.matrix = hm.matrix.matrix[:, sIdx]
        # boundaries
        if args.samples is None:
            args.samples = hm.matrix.sample_labels
        hm.matrix.sample_boundaries = hm.matrix.sample_boundaries[0:len(args.samples) + 1]
        hm.matrix.group_boundaries = gBounds.tolist()
        # special params: keep only entries for the retained samples
        keepIdx = set()
        for _, sample in enumerate(hm.matrix.sample_labels):
            if sample in args.samples:
                keepIdx.add(_)
        for param in hm.special_params:
            hm.parameters[param] = [v for k, v in enumerate(hm.parameters[param]) if k in keepIdx]
        # labels
        hm.matrix.sample_labels = args.samples
        if args.groups is None:
            args.groups = hm.matrix.group_labels
        hm.matrix.group_labels = args.groups
        # save
        hm.save_matrix(args.outFileName)
    elif args.command == 'filterStrand':
        filterHeatmap(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'filterValues':
        filterHeatmapValues(hm, args.min, args.max)
        hm.save_matrix(args.outFileName)
    elif args.command == 'rbind':
        rbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'cbind':
        cbindMatrices(hm, args)
        hm.save_matrix(args.outFileName)
    elif args.command == 'sort':
        sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator)
        hm.save_matrix(args.outFileName)
    elif args.command == 'relabel':
        relabelMatrix(hm, args)
        hm.save_matrix(args.outFileName)
    else:
        sys.exit("Unknown command {0}!\n".format(args.command))
deepTools/source/deeptools/correctGCBias.py ADDED
@@ -0,0 +1,746 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import shutil
6
+ import time
7
+ import subprocess
8
+ import sys
9
+
10
+ import py2bit
11
+ import pysam
12
+ import multiprocessing
13
+ import numpy as np
14
+ import argparse
15
+
16
+ from scipy.stats import binom
17
+
18
+ from deeptools.utilities import tbitToBamChrName, getGC_content
19
+ from deeptools import writeBedGraph, parserCommon, mapReduce
20
+ from deeptools import utilities
21
+ from deeptools.bamHandler import openBam
22
+
23
+ old_settings = np.seterr(all='ignore')
24
+
25
+
26
def parse_arguments(args=None):
    """
    Assemble the correctGCBias command-line parser from the tool-specific
    required arguments and the shared parent parser (bin size, processors,
    verbosity).
    """
    common = parserCommon.getParentArgParse(binSize=True, blackList=False)
    tool_specific = getRequiredArgs()
    return argparse.ArgumentParser(
        parents=[tool_specific, common],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='This tool corrects the GC-bias using the'
        ' method proposed by [Benjamini & Speed (2012). '
        'Nucleic Acids Research, 40(10)]. It will remove reads'
        ' from regions with too high coverage compared to the'
        ' expected values (typically GC-rich regions) and will'
        ' add reads to regions where too few reads are seen '
        '(typically AT-rich regions). '
        'The tool ``computeGCBias`` needs to be run first to generate the '
        'frequency table needed here.',
        usage='correctGCBias '
        '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit '
        '--GCbiasFrequenciesFile freq.txt -o gc_corrected.bam\n'
        'help: correctGCBias -h / correctGCBias --help',
        conflict_handler='resolve',
        add_help=False)
48
+
49
+
50
def process_args(args=None):
    """Parse the command line (or the given argument list) and return the namespace."""
    return parse_arguments().parse_args(args)
54
+
55
+
56
def getRequiredArgs():
    """
    Build the ArgumentParser holding correctGCBias's tool-specific
    arguments (required inputs, output, and the help flag).
    """
    argp = argparse.ArgumentParser(add_help=False)

    grp_required = argp.add_argument_group('Required arguments')

    # Input BAM and the genome parameters needed to interpret it
    grp_required.add_argument('--bamfile', '-b',
                              metavar='BAM file',
                              help='Sorted BAM file to correct.',
                              required=True)
    grp_required.add_argument('--effectiveGenomeSize',
                              help='The effective genome size is the portion '
                              'of the genome that is mappable. Large fractions of '
                              'the genome are stretches of NNNN that should be '
                              'discarded. Also, if repetitive regions were not '
                              'included in the mapping of reads, the effective '
                              'genome size needs to be adjusted accordingly. '
                              'A table of values is available here: '
                              'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .',
                              default=None,
                              type=int,
                              required=True)

    grp_required.add_argument('--genome', '-g',
                              help='Genome in two bit format. Most genomes can be '
                              'found here: http://hgdownload.cse.ucsc.edu/gbdb/ '
                              'Search for the .2bit ending. Otherwise, fasta '
                              'files can be converted to 2bit using faToTwoBit '
                              'available here: '
                              'http://hgdownload.cse.ucsc.edu/admin/exe/',
                              metavar='two bit file',
                              required=True)

    grp_required.add_argument('--GCbiasFrequenciesFile', '-freq',
                              help='Indicate the output file from '
                              'computeGCBias containing '
                              'the observed and expected read frequencies per GC-'
                              'content.',
                              type=argparse.FileType('r'),
                              metavar='FILE',
                              required=True)

    grp_output = argp.add_argument_group('Output options')
    grp_output.add_argument('--correctedFile', '-o',
                            help='Name of the corrected file. The ending will '
                            'be used to decide the output file format. The options '
                            'are ".bam", ".bw" for a bigWig file, ".bg" for a '
                            'bedGraph file.',
                            metavar='FILE',
                            type=argparse.FileType('w'),
                            required=True)

    # add_help=False above, so help must be declared explicitly
    grp_optional = argp.add_argument_group('Optional arguments')
    grp_optional.add_argument("--help", "-h", action="help",
                              help="show this help message and exit")

    return argp
114
+
115
+
116
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment a read belongs to, scaled to an
    integer in [0, fragmentLength], or None when it cannot be computed.

    The fragments for forward and reverse reads are defined as follows::

               |- read.pos |- read.aend
        ---+=================>-----------------------+--------- Forward strand

           |-fragStart                         |-fragEnd

        ---+-----------------------<=================+--------- Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                             read.tlen

    """
    fragStart = None
    fragEnd = None

    # For proper pairs with a sane insert size, use the actual template span
    if read.is_paired and read.is_proper_pair and abs(read.tlen) < 2 * fragmentLength:
        if read.is_reverse and read.tlen < 0:
            fragEnd = read.reference_end
            fragStart = read.reference_end + read.template_length
        elif read.template_length >= read.query_alignment_length:
            fragStart = read.pos
            fragEnd = read.pos + read.template_length

    # Otherwise, extend the read to the expected fragment length.
    # Bug fix: the original tested `if not fragStart:`, which also triggered
    # for a legitimate proper-pair fragment starting at position 0 and
    # re-estimated it from the read alone; test for None explicitly.
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.reference_end
            fragStart = read.reference_end - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit, chrNameBit, fragStart, fragEnd)
    except Exception:
        # e.g. coordinates past the chromosome end
        return None
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    gc = int(np.round(gc * fragmentLength))
    return gc
161
+
162
+
163
def writeCorrected_wrapper(args):
    """Adapter so a multiprocessing map can call writeCorrected_worker with a tuple."""
    return writeCorrected_worker(*args)
165
+
166
+
167
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Relies on the module-level globals R_gc (per-GC correction factors),
    global_vars ('2bit', 'bam', 'max_dup_gc') and, optionally, debug.
    Returns the temp file name, or None if no reads were used.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    # Corrected coverage accumulator, one cell per base in [start, end)
    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are asigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    r_index = -1
    for read in reads:
        if read.is_unmapped:
            continue
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            print(detail)
            """ this exception happens when the end of a
            chromosome is reached """
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            # Cap duplicates at the GC-dependent maximum
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
            # Clip the fragment to the window and convert to vector offsets
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        # Each read contributes 1/R_gc[gc], i.e. down-weighted where the
        # GC bias over-represents reads and up-weighted where it doesn't
        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            print("{}, processing {} ({:.1f} per sec) "
                  "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
                                            i, i / (endTime - startTime),
                                            chrNameBit, start, end))
    except NameError:
        # debug is only defined when debugging is enabled
        pass

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in range(0, len(cvg_corr), step):
        # NOTE(review): min(bin + step, end) mixes a vector index with the
        # genomic end coordinate; slicing clamps at len(cvg_corr) anyway,
        # so this appears harmless but misleading — confirm.
        value = np.mean(cvg_corr[bin:min(bin + step, end)])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
265
+
266
+
267
def numCopiesOfRead(value):
    """
    Based on the R_gc value, decides
    whether to keep, duplicate, triplicate or delete the read.
    It returns an integer, that tells the number of copies of the read
    that should be kept. (Docstring typos fixed; behavior unchanged.)
    >>> np.random.seed(1)
    >>> numCopiesOfRead(0.8)
    1
    >>> numCopiesOfRead(2.5)
    2
    >>> numCopiesOfRead(None)
    1
    """
    copies = 1
    if value:
        # Keep floor(value) copies, plus one more with probability frac(value)
        copies = int(value) + (1 if np.random.rand() < value % 1 else 0)
    return copies
285
+
286
+
287
def writeCorrectedSam_wrapper(args):
    """Adapter so a multiprocessing map can call writeCorrectedSam_worker with a tuple."""
    return writeCorrectedSam_worker(*args)
289
+
290
+
291
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # FIX: was a bare `except:`. Only KeyError is expected here:
            # the mate is not present, e.g. because it was removed
            # by some earlier filtering.
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # FIX: was `readTag[2]`, which inspected the third *tag*
                # (and could raise IndexError with fewer than three tags)
                # instead of the type field of the first tag.
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            # YG: GC percentage of the fragment; YC: correction factor;
            # YN: number of copies written for this read.
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the correction applied to the forward mate so the
        # reverse mate gets the identical treatment
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
462
+
463
+
464
def getFragmentFromRead(read, defaultFragmentLength, extendPairedEnds=True):
    """
    The read has to be pysam object.

    The following values are defined (for forward reads)::


             |-- -- read.tlen -- --|
             |-- read.alen --|
        -----|===============>------------<==============|----
             |               |            |
          read.pos      read.aend      read.pnext


    and for reverse reads


             |-- -- read.tlen -- --|
                               |-- read.alen --|
        -----|===============>-----------<===============|----
             |               |                           |
          read.pnext     read.pos                    read.aend

    this is a sketch of a pair-end reads

    The function returns the fragment start and end, either
    using the paired end information (if available) or
    extending the read in the appropriate direction if this
    is single-end.

    Parameters
    ----------
    read : pysam read object


    Returns
    -------
    tuple
        (fragment start, fragment end)

    """
    # Paired-end case: the two mates delimit the fragment. The
    # 0 < |tlen| < 1000 guard skips pairs spanning implausibly long
    # distances (thousands of base pairs).
    if extendPairedEnds is True and read.is_paired and 0 < abs(read.tlen) < 1000:
        if read.is_reverse:
            return read.pnext, read.aend
        # forward read: the fragment runs from the read start over
        # the full insert length
        return read.pos, read.pos + read.tlen

    # Single-end (or unusable pair): when the alignment already covers
    # at least defaultFragmentLength, use it directly; otherwise extend
    # the read to defaultFragmentLength in its own direction.
    if defaultFragmentLength <= read.aend - read.pos:
        return read.pos, read.aend
    if read.is_reverse:
        return read.aend - defaultFragmentLength, read.aend
    return read.pos, read.pos + defaultFragmentLength
534
+
535
+
536
def run_shell_command(command):
    """
    Runs the given shell command. Reports
    any errors found and terminates the program on failure.
    """
    # NOTE(security): shell=True executes `command` through the shell;
    # only internally-built command strings should be passed here.
    try:
        subprocess.check_call(command, shell=True)

    except subprocess.CalledProcessError as error:
        # FIX: message previously read 'Error{}' — missing the ': '
        # separator used by the generic handler below.
        sys.stderr.write('Error: {}\n'.format(error))
        exit(1)
    except Exception as error:
        sys.stderr.write('Error: {}\n'.format(error))
        exit(1)
550
+
551
+
552
def main(args=None):
    """
    Entry point: loads the GC bias frequencies, partitions the genome into
    chunks, applies the GC correction in parallel and writes the corrected
    output as BAM, bedGraph or bigWig (chosen by the extension of
    args.correctedFile).
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    # columns of the frequencies file: observed reads (F), expected
    # positions (N) and the observed/expected ratio (R) per GC value
    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                # FIX: this message was previously split across two
                # statements, so the chromosome name was never printed.
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(
                writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        # clean up the per-chunk temporary BAMs
        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            # plain bedGraph: concatenate the per-chunk files
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
679
+
680
+
681
class Tester():
    """
    Builds the fixture state (test BAM/2bit files and the module-level
    global_vars) used by the doctests in this module.
    """
    def __init__(self):
        import os
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
        self.tbitFile = self.root + "sequence.2bit"
        self.bamFile = self.root + "test.bam"
        self.chrNameBam = '2L'
        self.chrNameBit = 'chr2L'
        bam, mapped, unmapped, stats = openBam(self.bamFile, returnStats=True)
        tbit = py2bit.open(self.tbitFile)
        global debug
        debug = 0
        global global_vars
        # FIX: the 'min_reads' key was duplicated in this literal.
        global_vars = {'2bit': self.tbitFile,
                       'bam': self.bamFile,
                       'filter_out': None,
                       'extra_sampling_file': None,
                       'max_reads': 5,
                       'min_reads': 0,
                       'reads_per_bp': 0.3,
                       'total_reads': mapped,
                       'genome_size': sum(tbit.chroms().values())}

    def testWriteCorrectedChunk(self):
        """ prepare arguments for test
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        global_vars['max_dup_gc'] = np.ones(301)

        start = 200
        end = 300
        bedGraphStep = 25
        return (self.chrNameBam,
                self.chrNameBit, start, end, bedGraphStep)

    def testWriteCorrectedSam(self):
        """ prepare arguments for test
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        global_vars['max_dup_gc'] = np.ones(301)

        start = 200
        end = 250
        return (self.chrNameBam,
                self.chrNameBit, start, end)

    def testWriteCorrectedSam_paired(self):
        """ prepare arguments for test.
        """
        global R_gc, R_gc_min, R_gc_max
        R_gc = np.loadtxt(self.root + "R_gc_paired.txt")

        start = 0
        end = 500
        global global_vars
        global_vars['bam'] = self.root + "paired.bam"
        return 'chr2L', 'chr2L', start, end
743
+
744
+
745
# Allow running this module directly as a command-line script.
if __name__ == "__main__":
    main()
deepTools/source/deeptools/correlation.py ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import itertools
3
+ import copy
4
+ import numpy as np
5
+ import scipy.cluster.hierarchy as sch
6
+ import scipy.stats
7
+ import matplotlib as mpl
8
+ mpl.use('Agg')
9
+ mpl.rcParams['pdf.fonttype'] = 42
10
+ mpl.rcParams['svg.fonttype'] = 'none'
11
+ from deeptools import cm # noqa: F401
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.gridspec as gridspec
14
+ import matplotlib.ticker
15
+ import matplotlib.mlab
16
+ import matplotlib.markers
17
+ import matplotlib.colors as pltcolors
18
+ from deeptools.utilities import toString, convertCmap
19
+
20
+ import plotly.offline as offline
21
+ import plotly.graph_objs as go
22
+ import plotly.figure_factory as ff
23
+
24
+
25
# Silence all numpy floating-point warnings (divide/overflow/invalid);
# the previous error settings are kept in old_settings.
old_settings = np.seterr(all='ignore')
26
+
27
+
28
+ class Correlation:
29
+ """
30
+ class to work with matrices
31
+ having sample data
32
+ to compute correlations, plot
33
+ them and make scatter plots
34
+ """
35
+
36
    def __init__(self, matrix_file,
                 corr_method=None,
                 labels=None,
                 remove_outliers=False,
                 skip_zeros=False,
                 log1p=False):
        """
        Load a sample matrix and optionally filter/transform it.

        Parameters
        ----------
        matrix_file : str
            npz file containing 'matrix' (one sample per column) and
            'labels' (see load_matrix).
        corr_method : str or None
            'pearson' or 'spearman'; when given, the correlation matrix
            is computed immediately.
        labels : list or None
            replaces the labels stored in the matrix file.
        remove_outliers : bool
            drop rows flagged as outliers in every column.
        skip_zeros : bool
            drop rows that are entirely zeros/NaNs.
        log1p : bool
            apply log(1 + x) to the matrix before any correlation.
        """
        # NOTE: the order below matters — rows are filtered before the
        # log1p transform and before the correlation is computed.
        self.load_matrix(matrix_file)
        self.skip_zeros = skip_zeros
        self.corr_method = corr_method
        self.corr_matrix = None  # correlation matrix, computed lazily
        self.column_order = None
        self.rowCenter = False
        if labels is not None:
            # test that the length of labels
            # corresponds to the length of
            # samples

            self.labels = labels
            self.labels = [toString(x) for x in self.labels]

        if self.matrix.shape[1] == 1:
            # There's nothing that can be done with a single sample
            sys.exit("\nPlease use a matrix with more than one sample\n")

        if skip_zeros is True:
            # remove rows containing only nans or zeros
            # that could be unmappable regions.
            self.remove_rows_of_zeros()

        if remove_outliers is True:
            # remove outliers, otherwise outliers will produce a very
            # high pearson correlation. Unnecessary for spearman correlation
            self.remove_outliers()

        if log1p is True:
            self.matrix = np.log1p(self.matrix)

        if corr_method:
            self.compute_correlation()
76
+
77
+ def load_matrix(self, matrix_file):
78
+ """
79
+ loads a matrix file saved using the numpy
80
+ savez method. Two keys are expected:
81
+ 'matrix' and 'labels'. The matrix should
82
+ contain one sample per row
83
+ """
84
+
85
+ _ma = np.load(matrix_file)
86
+ # matrix: cols correspond to samples
87
+ self.matrix = np.asarray(_ma['matrix'].tolist())
88
+ if np.any(np.isnan(self.matrix)):
89
+ num_nam = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
90
+ sys.stderr.write("*Warning*. {} NaN values were found. They will be removed along with the "
91
+ "corresponding bins in other samples for the computation "
92
+ "and plotting\n".format(num_nam))
93
+
94
+ self.matrix = np.ma.compress_rows(np.ma.masked_invalid(self.matrix))
95
+
96
+ self.labels = list(map(toString, _ma['labels']))
97
+
98
+ assert len(self.labels) == self.matrix.shape[1], "ERROR, length of labels is not equal " \
99
+ "to length of matrix samples"
100
+
101
+ @staticmethod
102
+ def get_outlier_indices(data, max_deviation=200):
103
+ """
104
+ The method is based on the median absolute deviation. See
105
+ Boris Iglewicz and David Hoaglin (1993),
106
+ "Volume 16: How to Detect and Handle Outliers",
107
+ The ASQC Basic References in Quality Control:
108
+ Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
109
+
110
+ returns the list, without the outliers
111
+
112
+ The max_deviation=200 is like selecting a z-score
113
+ larger than 200, just that it is based on the median
114
+ and the median absolute deviation instead of the
115
+ mean and the standard deviation.
116
+ """
117
+ median = np.median(data)
118
+ b_value = 1.4826 # value set for a normal distribution
119
+ mad = b_value * np.median(np.abs(data))
120
+ outliers = []
121
+ if mad > 0:
122
+ deviation = abs(data - median) / mad
123
+ """
124
+ outliers = data[deviation > max_deviation]
125
+ print "outliers removed {}".format(len(outliers))
126
+ print outliers
127
+ """
128
+ outliers = np.flatnonzero(deviation > max_deviation)
129
+ return outliers
130
+
131
+ def remove_outliers(self, verbose=True):
132
+ """
133
+ get the outliers *per column* using the median absolute
134
+ deviation method
135
+
136
+ Returns the filtered matrix
137
+ """
138
+
139
+ unfiltered = len(self.matrix)
140
+ to_remove = None
141
+ for col in self.matrix.T:
142
+ outliers = self.get_outlier_indices(col)
143
+ if to_remove is None:
144
+ to_remove = set(outliers)
145
+ else:
146
+ # only set to remove those bins in which
147
+ # the outliers are present in all cases (colums)
148
+ # that's why the intersection is used
149
+ to_remove = to_remove.intersection(outliers)
150
+ if len(to_remove):
151
+ to_keep = [x for x in range(self.matrix.shape[0])
152
+ if x not in to_remove]
153
+ self.matrix = self.matrix[to_keep, :]
154
+ if verbose:
155
+ sys.stderr.write(
156
+ "total/filtered/left: "
157
+ "{}/{}/{}\n".format(unfiltered,
158
+ unfiltered - len(to_keep),
159
+ len(to_keep)))
160
+
161
+ return self.matrix
162
+
163
+ def remove_rows_of_zeros(self):
164
+ # remove rows containing all zeros or all nans
165
+ _mat = np.nan_to_num(self.matrix)
166
+ to_keep = _mat.sum(1) != 0
167
+
168
+ self.matrix = self.matrix[to_keep, :]
169
+
170
+ def save_corr_matrix(self, file_handle):
171
+ """
172
+ saves the correlation matrix
173
+ """
174
+ if self.column_order:
175
+ self.corr_matrix = self.corr_matrix[:, self.column_order][self.column_order]
176
+ self.labels = [self.labels[i] for i in self.column_order]
177
+
178
+ self.labels = [toString(x) for x in self.labels]
179
+ file_handle.write("\t'" + "'\t'".join(self.labels) + "'\n")
180
+ fmt = "\t".join(np.repeat('%.4f', self.corr_matrix.shape[1])) + "\n"
181
+ i = 0
182
+ for row in self.corr_matrix:
183
+ file_handle.write(
184
+ "'%s'\t" % self.labels[i] + fmt % tuple(row))
185
+ i += 1
186
+
187
    def compute_correlation(self):
        """
        computes spearman or pearson
        correlation for the samples in the matrix

        The matrix should contain the values of each sample per column
        that's why the transpose is used.

        >>> matrix = np.array([[1, 2, 3, np.nan],
        ...                    [1, 2, 3, 4],
        ...                    [6, 4, 3, 1]]).T
        >>> np.savez_compressed("/tmp/test_matrix.npz", matrix=matrix, labels=['a', 'b', 'c'])

        >>> c = Correlation("/tmp/test_matrix.npz", corr_method='pearson')

        the results should be as in R

        >>> c.compute_correlation().filled(np.nan)
        array([[ 1.        ,  1.        , -0.98198051],
               [ 1.        ,  1.        , -0.98198051],
               [-0.98198051, -0.98198051,  1.        ]])
        >>> c.corr_method = 'spearman'
        >>> c.corr_matrix = None
        >>> c.compute_correlation()
        array([[ 1.,  1., -1.],
               [ 1.,  1., -1.],
               [-1., -1.,  1.]])
        """
        # memoized: reuse a previously computed matrix
        if self.corr_matrix is not None:
            return self.corr_matrix

        num_samples = len(self.labels)
        # initialize correlation matrix

        if self.corr_method == 'pearson':
            # masked corrcoef tolerates masked (NaN-derived) entries
            self.corr_matrix = np.ma.corrcoef(self.matrix.T, allow_masked=True)

        else:
            corr_matrix = np.zeros((num_samples, num_samples), dtype='float')
            # do an all vs all correlation using the
            # indices of the upper triangle
            rows, cols = np.triu_indices(num_samples)

            for index in range(len(rows)):
                row = rows[index]
                col = cols[index]
                corr_matrix[row, col] = scipy.stats.spearmanr(self.matrix[:, row], self.matrix[:, col])[0]
            # make the matrix symmetric
            self.corr_matrix = corr_matrix + np.triu(corr_matrix, 1).T

        return self.corr_matrix
238
+
239
+ def plotly_correlation(self, corr_matrix, plot_filename, labels, plot_title='',
240
+ vmax=None, vmin=None, plot_numbers=True,
241
+ colormap='jet'):
242
+ """plot_correlation, but using plotly"""
243
+ textElement = []
244
+ for row in range(corr_matrix.shape[0]):
245
+ trow = []
246
+ for col in range(corr_matrix.shape[0]):
247
+ if plot_numbers:
248
+ trow.append("{:0.2f}".format(corr_matrix[row, col]))
249
+ else:
250
+ trow.append('')
251
+ textElement.append(trow)
252
+
253
+ zauto = True
254
+ if vmax is not None or vmin is not None:
255
+ zauto = False
256
+
257
+ convertedCmap = convertCmap(colormap)
258
+ fig = ff.create_annotated_heatmap(corr_matrix, x=labels, y=labels, colorscale=convertedCmap, showscale=True, zauto=zauto, zmin=vmin, zmax=vmax, annotation_text=textElement)
259
+ fig.layout['title'] = plot_title
260
+ offline.plot(fig, filename=plot_filename, auto_open=False)
261
+
262
    def plot_correlation(self, plot_filename, plot_title='', vmax=None,
                         vmin=None, colormap='jet', image_format=None,
                         plot_numbers=False, plotWidth=11, plotHeight=9.5):
        """
        plots a correlation using a symmetric heatmap

        A dendrogram (centroid linkage over the correlation matrix) is
        drawn on the left, and rows/columns of the heatmap are reordered
        to follow the dendrogram leaves. The resulting order is stored in
        self.column_order so save_corr_matrix can reuse it.
        When image_format == "plotly" the drawing is delegated to
        plotly_correlation.
        """
        num_rows = len(self.labels)
        corr_matrix = self.compute_correlation()
        # set a font size according to figure length
        if num_rows < 6:
            font_size = 14
        elif num_rows > 40:
            font_size = 5
        else:
            font_size = int(14 - 0.25 * num_rows)
        mpl.rcParams.update({'font.size': font_size})
        # set the minimum and maximum values
        if vmax is None:
            vmax = 1
        if vmin is None:
            vmin = 0 if corr_matrix .min() >= 0 else -1

        # Compute and plot dendrogram.
        fig = plt.figure(figsize=(plotWidth, plotHeight))
        plt.suptitle(plot_title)

        axdendro = fig.add_axes([0.015, 0.1, 0.1, 0.7])
        axdendro.set_axis_off()
        y_var = sch.linkage(corr_matrix, method='centroid')
        z_var = sch.dendrogram(y_var, orientation='left',
                               link_color_func=lambda k: 'darkred')
        axdendro.set_xticks([])
        axdendro.set_yticks([])
        cmap = copy.copy(plt.get_cmap(colormap))

        # this line simply makes a new cmap, based on the original
        # colormap that goes from 0.0 to 0.9
        # This is done to avoid colors that
        # are too dark at the end of the range that do not offer
        # a good contrast between the correlation numbers that are
        # plotted on black.
        if plot_numbers:
            cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped",
                                                               cmap(np.linspace(0, 0.9, 10)))

        cmap.set_under((0., 0., 1.))
        # Plot distance matrix.
        axmatrix = fig.add_axes([0.12, 0.1, 0.6, 0.7])
        # reorder the matrix to match the dendrogram leaf order
        index = z_var['leaves']
        corr_matrix = corr_matrix[index, :]
        corr_matrix = corr_matrix[:, index]
        if corr_matrix.shape[0] > 30:
            # when there are too many rows it is better to remove
            # the black lines surrounding the boxes in the heatmap
            edge_color = 'none'
        else:
            edge_color = 'black'

        if image_format == "plotly":
            self.plotly_correlation(corr_matrix,
                                    plot_filename,
                                    self.labels,
                                    plot_title=plot_title,
                                    vmax=vmax,
                                    vmin=vmin,
                                    colormap=colormap,
                                    plot_numbers=plot_numbers)
            return

        img_mat = axmatrix.pcolormesh(corr_matrix,
                                      edgecolors=edge_color,
                                      cmap=cmap,
                                      vmax=vmax,
                                      vmin=vmin)
        axmatrix.set_xlim(0, num_rows)
        axmatrix.set_ylim(0, num_rows)

        axmatrix.yaxis.tick_right()
        axmatrix.set_yticks(np.arange(corr_matrix .shape[0]) + 0.5)
        axmatrix.set_yticklabels(np.array(self.labels).astype('str')[index])

        axmatrix.xaxis.set_tick_params(labeltop=True)
        axmatrix.xaxis.set_tick_params(labelbottom=False)
        axmatrix.set_xticks(np.arange(corr_matrix .shape[0]) + 0.5)
        axmatrix.set_xticklabels(np.array(self.labels).astype('str')[index], rotation=45, ha='left')

        axmatrix.tick_params(
            axis='x',
            which='both',
            bottom=False,
            top=False)

        axmatrix.tick_params(
            axis='y',
            which='both',
            left=False,
            right=False)

        # Plot colorbar
        axcolor = fig.add_axes([0.12, 0.065, 0.6, 0.02])
        cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal')
        cobar.solids.set_edgecolor("face")
        if plot_numbers:
            for row in range(num_rows):
                for col in range(num_rows):
                    axmatrix.text(row + 0.5, col + 0.5,
                                  "{:.2f}".format(corr_matrix[row, col]),
                                  ha='center', va='center')

        # remember the dendrogram order for save_corr_matrix
        self.column_order = index
        fig.savefig(plot_filename, format=image_format)
        plt.close()
374
+
375
    def plotly_scatter(self, plot_filename, corr_matrix, plot_title='', minXVal=None, maxXVal=None, minYVal=None, maxYVal=None):
        """Make the scatter plot of a matrix with plotly

        Draws an n-by-n grid of 2D-histogram heatmaps (one per sample
        pair, lower triangle only) and writes an offline plotly HTML file.
        """
        n = self.matrix.shape[1]
        self.matrix = self.matrix  # NOTE(review): no-op assignment, likely leftover
        fig = go.Figure()
        # each sample gets an equal slice of the paper coordinate system
        domainWidth = 1. / n

        # diagonal labels, positioned in paper coordinates
        annos = []
        for i in range(n):
            x = domainWidth * (i + 1)
            y = 1 - (domainWidth * i + 0.5 * domainWidth)
            anno = dict(text=self.labels[i], showarrow=False, xref='paper', yref='paper', x=x, y=y, xanchor='right', yanchor='middle')
            annos.append(anno)

        data = []
        # track the global z-range so all heatmaps share one color scale
        zMin = np.inf
        zMax = -np.inf
        for x in range(n):
            xanchor = 'x{}'.format(x + 1)
            base = x * domainWidth
            domain = [base, base + domainWidth]
            if x > 0:
                base = 1 - base
            fig['layout']['xaxis{}'.format(x + 1)] = dict(domain=domain, range=[minXVal, maxXVal], anchor='free', position=base)
            for y in range(0, n):
                yanchor = 'y{}'.format(y + 1)
                # NOTE(review): y-axes are only defined on the x == 1 pass;
                # presumably intentional so each is created exactly once — confirm
                if x == 1:
                    base = 1 - y * domainWidth
                    domain = [base - domainWidth, base]
                    fig['layout']['yaxis{}'.format(y + 1)] = dict(domain=domain, range=[minYVal, maxYVal], side='right', anchor='free', position=1.0)

                if x > y:
                    # lower-triangle cell: 2D histogram of the sample pair
                    vector1 = self.matrix[:, x]
                    vector2 = self.matrix[:, y]
                    Z, xEdges, yEdges = np.histogram2d(vector1, vector2, bins=50)
                    # log scale; empty bins become -inf and are clamped below
                    Z = np.log10(Z)
                    if np.min(Z) < zMin:
                        zMin = np.min(Z)
                    if np.max(Z) > zMax:
                        zMax = np.max(Z)
                    name = '{}={:.2f}'.format(self.corr_method, corr_matrix[x, y])
                    trace = go.Heatmap(z=Z, x=xEdges, y=yEdges, showlegend=False, xaxis=xanchor, yaxis=yanchor, name=name, showscale=False)
                    data.append(trace)

        # Fix the colorbar bounds
        for trace in data:
            trace.update(zmin=zMin, zmax=zMax)
        # only the last trace carries the (shared) colorbar
        data[-1]['colorbar'].update(title="log10(instances per bin)", titleside="right")
        data[-1].update(showscale=True)

        fig.add_traces(data)
        fig['layout'].update(title=plot_title, showlegend=False, annotations=annos)

        offline.plot(fig, filename=plot_filename, auto_open=False)
429
+
430
    def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=False, xRange=None, yRange=None):
        """
        Plot the scatter plots of a matrix
        in which each row is a sample

        Draws a grid of pairwise 2D histograms (upper triangle) with the
        sample names on the diagonal; delegates to plotly_scatter when
        image_format == 'plotly'.
        """

        num_samples = self.matrix.shape[1]
        corr_matrix = self.compute_correlation()
        grids = gridspec.GridSpec(num_samples, num_samples)
        grids.update(wspace=0, hspace=0)
        fig = plt.figure(figsize=(2 * num_samples, 2 * num_samples))
        plt.rcParams['font.size'] = 8.0
        plt.suptitle(plot_title)
        if log1p is True:
            self.matrix = np.log1p(self.matrix)
        min_xvalue = self.matrix.min()
        max_xvalue = self.matrix.max()
        min_yvalue = min_xvalue
        max_yvalue = max_xvalue
        if xRange is not None:
            min_xvalue = xRange[0]
            max_xvalue = xRange[1]
        if yRange is not None:
            min_yvalue = yRange[0]
            max_yvalue = yRange[1]
        # NOTE(review): the second condition mixes % 1 and % 2 — looks
        # like a typo for (min % 2 == 0 and max % 2 == 1); confirm intent
        if (min_xvalue % 2 == 0 and max_xvalue % 2 == 0) or \
                (min_xvalue % 1 == 0 and max_xvalue % 2 == 1):
            # make one value odd and the other even
            max_xvalue += 1
        if (min_yvalue % 2 == 0 and max_yvalue % 2 == 0) or \
                (min_yvalue % 1 == 0 and max_yvalue % 2 == 1):
            # make one value odd and the other even
            max_yvalue += 1

        # plotly output
        if image_format == 'plotly':
            self.plotly_scatter(plot_filename, corr_matrix, plot_title=plot_title, minXVal=min_xvalue, maxXVal=max_xvalue, minYVal=min_yvalue, maxYVal=max_yvalue)
            return

        rows, cols = np.triu_indices(num_samples)

        for index in range(len(rows)):
            row = rows[index]
            col = cols[index]
            if row == col:
                # add titles as
                # empty plot in the diagonal
                ax = fig.add_subplot(grids[row, col])
                ax.text(0.5, 0.5, self.labels[row],
                        verticalalignment='center',
                        horizontalalignment='center',
                        fontsize=10, fontweight='bold',
                        transform=ax.transAxes)
                ax.set_axis_off()
                continue

            ax = fig.add_subplot(grids[row, col])

            vector1 = self.matrix[:, row]
            vector2 = self.matrix[:, col]

            # annotate the panel with the correlation coefficient
            ax.text(0.2, 0.8, "{}={:.2f}".format(self.corr_method,
                                                 corr_matrix[row, col]),
                    horizontalalignment='left',
                    transform=ax.transAxes)
            ax.get_yaxis().set_tick_params(
                which='both',
                left=False,
                right=False,
                direction='out')

            ax.get_xaxis().set_tick_params(
                which='both',
                top=False,
                bottom=False,
                direction='out')
            ax.get_xaxis().set_tick_params(
                which='major',
                labelrotation=45)

            # only the rightmost column keeps y tick labels
            if col != num_samples - 1:
                ax.set_yticklabels([])
            else:
                ax.yaxis.tick_right()
                ax.get_yaxis().set_tick_params(
                    which='both',
                    left=False,
                    right=True,
                    direction='out')
            # only panels adjacent to the diagonal keep x tick labels
            if col - row == 1:
                ax.xaxis.tick_bottom()
                ax.get_xaxis().set_tick_params(
                    which='both',
                    top=False,
                    bottom=True,
                    direction='out')
                ax.get_xaxis().set_tick_params(
                    which='major',
                    labelrotation=45)

            else:
                ax.set_xticklabels([])

            ax.set_xlim(min_xvalue, max_xvalue)
            ax.set_ylim(min_yvalue, max_yvalue)
            ax.hist2d(vector2, vector1, bins=200, cmin=0.1)

        plt.savefig(plot_filename, format=image_format)
        plt.close()
539
+
540
+ def plotly_pca(self, plotFile, Wt, pvar, PCs, eigenvalues, cols, plotTitle):
541
+ """
542
+ A plotly version of plot_pca, that's called by it to do the actual plotting
543
+ """
544
+ fig = go.Figure()
545
+ fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1])}
546
+ fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1])}
547
+ fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'title': 'Principal Component'}
548
+ fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Eigenvalue', 'rangemode': 'tozero', 'showgrid': False}
549
+ fig['layout']['yaxis3'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Cumulative variability', 'rangemode': 'tozero', 'side': 'right', 'overlaying': 'y2'}
550
+ fig['layout'].update(title=plotTitle)
551
+
552
+ # PCA
553
+ if cols is not None:
554
+ colors = itertools.cycle(cols)
555
+ n = len(self.labels)
556
+ data = []
557
+ for i in range(n):
558
+ trace = go.Scatter(x=[Wt[PCs[0] - 1, i]],
559
+ y=[Wt[PCs[1] - 1, i]],
560
+ mode='marker',
561
+ xaxis='x1',
562
+ yaxis='y1',
563
+ name=self.labels[i])
564
+ trace['marker'].update(size=20)
565
+ if cols is not None:
566
+ trace['marker'].update(color=next(colors))
567
+ data.append(trace)
568
+
569
+ # Scree plot
570
+ trace = go.Bar(showlegend=False,
571
+ name='Eigenvalues',
572
+ x=range(1, n + 1),
573
+ y=eigenvalues[:n],
574
+ xaxis='x2',
575
+ yaxis='y2')
576
+ data.append(trace)
577
+
578
+ # Cumulative variability
579
+ trace = go.Scatter(showlegend=False,
580
+ x=range(1, n + 1),
581
+ y=pvar.cumsum()[:n],
582
+ mode='lines+markers',
583
+ name='Cumulative variability',
584
+ xaxis='x2',
585
+ yaxis='y3',
586
+ line={'color': 'red'},
587
+ marker={'symbol': 'circle-open-dot', 'color': 'black'})
588
+ data.append(trace)
589
+
590
+ annos = []
591
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'PCA', 'y': 1.0, 'x': 0.25, 'font': {'size': 16}, 'showarrow': False})
592
+ annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'Scree plot', 'y': 1.0, 'x': 0.75, 'font': {'size': 16}, 'showarrow': False})
593
+
594
+ fig.add_traces(data)
595
+ fig['layout']['annotations'] = annos
596
+ offline.plot(fig, filename=plotFile, auto_open=False)
597
+
598
    def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=None, log1p=False, plotWidth=5, plotHeight=10, cols=None, marks=None):
        """
        Plot the PCA of a matrix

        Returns the matrix of plotted values.

        Parameters
        ----------
        plot_filename : str or None
            Output file; when None no plot is produced and only the
            projections and eigenvalues are returned.
        PCs : list of int
            1-based indices of the two components to display.
        plot_title : str
            Title of the PCA panel ('PCA' when empty).
        image_format : str or None
            Output format; the special value 'plotly' dispatches to
            ``plotly_pca`` instead of matplotlib.
        log1p : bool
            NOTE(review): accepted but never used inside this method —
            confirm whether it should transform the matrix as elsewhere.
        plotWidth, plotHeight : int
            Figure size in inches.
        cols : list or None
            Optional per-sample colors.
        marks : list or None
            Optional per-sample marker symbols.

        Returns
        -------
        tuple
            (Wt, eigenvalues): the per-sample component coordinates and the
            eigenvalues of the decomposition.
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(plotWidth, plotHeight))

        # Filter: in sample-space (transpose) mode, drop zero-variance rows
        # and optionally keep only the ntop most variable rows.
        m = self.matrix
        rvs = m.var(axis=1)
        if self.transpose:
            m = m[np.nonzero(rvs)[0], :]
            rvs = rvs[np.nonzero(rvs)[0]]
            if self.ntop > 0 and m.shape[0] > self.ntop:
                m = m[np.argpartition(rvs, -self.ntop)[-self.ntop:], :]
                rvs = rvs[np.argpartition(rvs, -self.ntop)[-self.ntop:]]

        # log2 (if requested)
        # NOTE(review): this rebinds self.matrix, while 'm' may still refer
        # to the (filtered) pre-log data used below — confirm intended.
        if self.log2:
            self.matrix = np.log2(self.matrix + 0.01)

        # Row center / transpose
        if self.rowCenter and not self.transpose:
            _ = self.matrix.mean(axis=1)
            self.matrix -= _[:, None]
        if self.transpose:
            m = m.T

        # Center and scale each column before the decomposition.
        m2 = (m - np.mean(m, axis=0))
        m2 /= np.std(m2, axis=0, ddof=1)  # Use the unbiased std. dev.

        # SVD
        U, s, Vh = np.linalg.svd(m2, full_matrices=False, compute_uv=True)  # Is full_matrices ever needed?

        # % variance, eigenvalues
        eigenvalues = s**2
        variance = eigenvalues / float(np.max([1, m2.shape[1] - 1]))
        pvar = variance / variance.sum()

        # Weights/projections
        Wt = Vh
        if self.transpose:
            # Use the projected coordinates for the transposed matrix
            Wt = np.dot(m2, Vh.T).T

        if plot_filename is not None:
            # Cap the number of scree-plot bars at the number of eigenvalues.
            n = n_bars = len(self.labels)
            if eigenvalues.size < n:
                n_bars = eigenvalues.size
            markers = itertools.cycle(matplotlib.markers.MarkerStyle.filled_markers)
            if cols is not None:
                colors = itertools.cycle(cols)
            else:
                colors = itertools.cycle(plt.cm.gist_rainbow(np.linspace(0, 1, n)))

            if marks is not None:
                markers = itertools.cycle(marks)

            if image_format == 'plotly':
                self.plotly_pca(plot_filename, Wt, pvar, PCs, eigenvalues, cols, plot_title)
            else:
                # Dotted axes through the origin, drawn behind the points.
                ax1.axhline(y=0, color="black", linestyle="dotted", zorder=1)
                ax1.axvline(x=0, color="black", linestyle="dotted", zorder=2)
                for i in range(n):
                    color = next(colors)
                    marker = next(markers)
                    if isinstance(color, np.ndarray):
                        color = pltcolors.to_hex(color, keep_alpha=True)
                    ax1.scatter(Wt[PCs[0] - 1, i], Wt[PCs[1] - 1, i],
                                marker=marker, color=color, s=150, label=self.labels[i], zorder=i + 3)
                if plot_title == '':
                    ax1.set_title('PCA')
                else:
                    ax1.set_title(plot_title)
                ax1.set_xlabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1]))
                ax1.set_ylabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1]))
                # Legend outside the axes; passed to savefig so it is not clipped.
                lgd = ax1.legend(scatterpoints=1, loc='center left', borderaxespad=0.5,
                                 bbox_to_anchor=(1, 0.5),
                                 prop={'size': 12}, markerscale=0.9)

                # Scree plot
                ind = np.arange(n_bars)  # the x locations for the groups
                width = 0.35  # the width of the bars

                # matplotlib 2.x changed bar alignment, hence the version check
                # (string comparison of versions — works for these prefixes).
                if mpl.__version__ >= "2.0.0":
                    ax2.bar(2 * width + ind, eigenvalues[:n_bars], width * 2)
                else:
                    ax2.bar(width + ind, eigenvalues[:n_bars], width * 2)
                ax2.set_ylabel('Eigenvalue')
                ax2.set_xlabel('Principal Component')
                ax2.set_title('Scree plot')
                ax2.set_xticks(ind + width * 2)
                ax2.set_xticklabels(ind + 1)

                # Cumulative explained variance on a twin y-axis.
                ax3 = ax2.twinx()
                ax3.axhline(y=1, color="black", linestyle="dotted")
                ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "r-")
                ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "wo", markeredgecolor="black")
                ax3.set_ylim([0, 1.05])
                ax3.set_ylabel('Cumulative variability')

                plt.subplots_adjust(top=3.85)
                plt.tight_layout()
                plt.savefig(plot_filename, format=image_format, bbox_extra_artists=(lgd,), bbox_inches='tight')
                plt.close()

        return Wt, eigenvalues
deepTools/source/deeptools/correlation_heatmap.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from matplotlib import use as mplt_use
2
+ mplt_use('Agg')
3
+ from deeptools import cm # noqa: F401
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import scipy.cluster.hierarchy as sch
7
+ from matplotlib import rcParams
8
+ import matplotlib.colors as pltcolors
9
+ import copy
10
+
11
+ rcParams['pdf.fonttype'] = 42
12
+ rcParams['svg.fonttype'] = 'none'
13
+ old_settings = np.seterr(all='ignore')
14
+
15
+
16
def plot_correlation(corr_matrix, labels, plotFileName, vmax=None,
                     vmin=None, colormap='jet', image_format=None,
                     plot_numbers=False, plot_title=''):
    """
    Plot a hierarchically clustered heatmap of a correlation matrix.

    Parameters
    ----------
    corr_matrix : numpy array
        Square matrix of pairwise correlation values.
    labels : list
        One label per row/column of ``corr_matrix``.
    plotFileName : str
        Output image file name.
    vmax, vmin : float or None
        Color-scale limits. Default to 1 and 0 (or -1 when the matrix
        contains negative values).
    colormap : str
        Name of the matplotlib colormap to use.
    image_format : str or None
        Format passed to ``savefig`` (e.g. 'png', 'svg', 'pdf').
    plot_numbers : bool
        If True, write each correlation value inside its heatmap cell.
    plot_title : str
        Optional title placed above the figure.
    """
    num_rows = corr_matrix.shape[0]

    # set a font size according to figure length
    if num_rows < 6:
        font_size = 14
    elif num_rows > 40:
        font_size = 5
    else:
        font_size = int(14 - 0.25 * num_rows)
    rcParams.update({'font.size': font_size})
    # set the minimum and maximum values
    if vmax is None:
        vmax = 1
    if vmin is None:
        vmin = 0 if corr_matrix.min() >= 0 else -1

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=(11, 9.5))
    if plot_title:
        plt.suptitle(plot_title)
    axdendro = fig.add_axes([0.02, 0.12, 0.1, 0.66])
    axdendro.set_axis_off()
    y_var = sch.linkage(corr_matrix, method='complete')
    z_var = sch.dendrogram(y_var, orientation='right',
                           link_color_func=lambda k: 'darkred')
    axdendro.set_xticks([])
    axdendro.set_yticks([])
    cmap = copy.copy(plt.get_cmap(colormap))

    # this line simply makes a new cmap, based on the original
    # colormap that goes from 0.0 to 0.9
    # This is done to avoid colors that
    # are too dark at the end of the range that do not offer
    # a good contrast between the correlation numbers that are
    # plotted on black.
    if plot_numbers:
        cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped",
                                                           cmap(np.linspace(0, 0.9, 10)))

    cmap.set_under((0., 0., 1.))
    # Plot distance matrix, reordered to match the dendrogram leaves.
    axmatrix = fig.add_axes([0.13, 0.1, 0.6, 0.7])
    index = z_var['leaves']
    corr_matrix = corr_matrix[index, :]
    corr_matrix = corr_matrix[:, index]
    img_mat = axmatrix.pcolormesh(corr_matrix,
                                  edgecolors='black',
                                  cmap=cmap,
                                  vmax=vmax,
                                  vmin=vmin)
    axmatrix.set_xlim(0, num_rows)
    axmatrix.set_ylim(0, num_rows)

    axmatrix.yaxis.tick_right()
    axmatrix.set_yticks(np.arange(corr_matrix.shape[0]) + 0.5)
    axmatrix.set_yticklabels(np.array(labels).astype('str')[index])

    # axmatrix.xaxis.set_label_position('top')
    axmatrix.xaxis.set_tick_params(labeltop=True)
    axmatrix.xaxis.set_tick_params(labelbottom=False)
    axmatrix.set_xticks(np.arange(corr_matrix.shape[0]) + 0.5)
    axmatrix.set_xticklabels(np.array(labels).astype('str')[index],
                             rotation=45,
                             ha='left')

    axmatrix.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False)

    axmatrix.tick_params(
        axis='y',
        which='both',
        left=False,
        right=False)

    # axmatrix.set_xticks([])
    # Plot colorbar.
    axcolor = fig.add_axes([0.13, 0.065, 0.6, 0.02])
    cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal')
    cobar.solids.set_edgecolor("face")
    if plot_numbers:
        for row in range(num_rows):
            for col in range(num_rows):
                axmatrix.text(row + 0.5, col + 0.5,
                              "{:.2f}".format(corr_matrix[row, col]),
                              ha='center', va='center')

    fig.savefig(plotFileName, format=image_format)
    # BUG FIX: Figure objects have no close() method; fig.close() raised
    # AttributeError after saving. Close through pyplot instead so the
    # figure is released from pyplot's state.
    plt.close(fig)
deepTools/source/deeptools/countReadsPerBin.py ADDED
@@ -0,0 +1,1033 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ import os
3
+ import time
4
+ import sys
5
+ import multiprocessing
6
+ import numpy as np
7
+
8
+ # deepTools packages
9
+ import deeptools.utilities
10
+ from deeptools import bamHandler
11
+ from deeptools import mapReduce
12
+ from deeptoolsintervals import GTF
13
+ import pyBigWig
14
+
15
+ debug = 0
16
+ old_settings = np.seterr(all='ignore')
17
+
18
+
19
def countReadsInRegions_wrapper(args):
    """
    Unpack a single argument tuple and dispatch it to
    CountReadsPerBin.count_reads_in_region.

    The first element of ``args`` is the CountReadsPerBin instance
    itself (the 'self' value); this indirection is required because the
    multiprocessing module can only hand a single argument object to a
    worker function.
    """
    self_obj = args[0]
    return CountReadsPerBin.count_reads_in_region(self_obj, *args[1:])
29
+
30
+
31
+ class CountReadsPerBin(object):
32
+
33
+ r"""Collects coverage over multiple bam files using multiprocessing
34
+
35
+ This function collects read counts (coverage) from several bam files and returns
36
+ an numpy array with the results. This class uses multiprocessing to compute the coverage.
37
+
38
+ Parameters
39
+ ----------
40
+ bamFilesList : list
41
+ List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']
42
+
43
+ binLength : int
44
+ Length of the window/bin. This value is overruled by ``bedFile`` if present.
45
+
46
+ numberOfSamples : int
47
+ Total number of samples. The genome is divided into ``numberOfSamples``, each
48
+ with a window/bin length equal to ``binLength``. This value is overruled
49
+ by ``stepSize`` in case such value is present and by ``bedFile`` in which
50
+ case the number of samples and bins are defined in the bed file
51
+
52
+ numberOfProcessors : int
53
+ Number of processors to use. Default is 4
54
+
55
+ verbose : bool
56
+ Output messages. Default: False
57
+
58
+ region : str
59
+ Region to limit the computation in the form chrom:start:end.
60
+
61
+ bedFile : list of file_handles.
62
+ Each file handle corresponds to a bed file containing the regions for which to compute the coverage. This option
63
+ overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.
64
+
65
+ blackListFileName : str
66
+ A string containing a BED file with blacklist regions.
67
+
68
+ extendReads : bool, int
69
+
70
+ Whether coverage should be computed for the extended read length (i.e. the region covered
71
+ by the two mates or the regions expected to be covered by single-reads).
72
+ If the value is 'int', then then this is interpreted as the fragment length to extend reads
73
+ that are not paired. For Illumina reads, usual values are around 300.
74
+ This value can be determined using the peak caller MACS2 or can be
75
+ approximated by the fragment lengths computed when preparing the library for sequencing. If the value
76
+ is of the variable is true and not value is given, the fragment size is sampled from the library but
77
+ only if the library is paired-end. Default: False
78
+
79
+
80
+ minMappingQuality : int
81
+ Reads of a mapping quality less than the give value are not considered. Default: None
82
+
83
+ ignoreDuplicates : bool
84
+ Whether read duplicates (same start, end position. If paired-end, same start-end for mates) are
85
+ to be excluded. Default: false
86
+
87
+ chrToSkip: list
88
+ List with names of chromosomes that do not want to be included in the coverage computation.
89
+ This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').
90
+
91
+ stepSize : int
92
+ the positions for which the coverage is computed are defined as follows:
93
+ ``range(start, end, stepSize)``. Thus, a stepSize of 1, will compute
94
+ the coverage at each base pair. If the stepSize is equal to the
95
+ binLength then the coverage is computed for consecutive bins. If seepSize is
96
+ smaller than the binLength, then teh bins will overlap.
97
+
98
+ center_read : bool
99
+ Determines if reads should be centered with respect to the fragment length.
100
+
101
+ samFlag_include : int
102
+ Extracts only those reads having the SAM flag. For example, to get only
103
+ reads that are the first mates a samFlag of 64 could be used. Similarly, the
104
+ samFlag_include can be used to select only reads mapping on the reverse strand
105
+ or to get only properly paired reads.
106
+
107
+ samFlag_exclude : int
108
+ Removes reads that match the SAM flag. For example to get all reads
109
+ that map to the forward strand a samFlag_exlude 16 should be used. Which
110
+ translates into exclude all reads that map to the reverse strand.
111
+
112
+ zerosToNans : bool
113
+ If true, zero values encountered are transformed to Nans. Default false.
114
+
115
+ skipZeroOverZero : bool
116
+ If true, skip bins where all input BAM files have no coverage (only applicable to bamCompare).
117
+
118
+ minFragmentLength : int
119
+ If greater than 0, fragments below this size are excluded.
120
+
121
+ maxFragmentLength : int
122
+ If greater than 0, fragments above this size are excluded.
123
+
124
+ out_file_for_raw_data : str
125
+ File name to save the raw counts computed
126
+
127
+ statsList : list
128
+ For each BAM file in bamFilesList, the associated per-chromosome statistics returned by openBam
129
+
130
+ mappedList : list
131
+ For each BAM file in bamFilesList, the number of mapped reads in the file.
132
+
133
+ bed_and_bin : boolean
134
+ If true AND a bedFile is given, compute coverage of each bin of the given size in each region of bedFile
135
+
136
+ genomeChunkSize : int
137
+ If not None, the length of the genome used for multiprocessing.
138
+
139
+ Returns
140
+ -------
141
+ numpy array
142
+
143
+ Each row correspond to each bin/bed region and each column correspond to each of
144
+ the bamFiles.
145
+
146
+
147
+ Examples
148
+ --------
149
+
150
+ The test data contains reads for 200 bp.
151
+
152
+ >>> test = Tester()
153
+
154
+ The transpose function is used to get a nicer looking output.
155
+ The first line corresponds to the number of reads per bin in bam file 1
156
+
157
+ >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 50, 4)
158
+ >>> np.transpose(c.run())
159
+ array([[0., 0., 1., 1.],
160
+ [0., 1., 1., 2.]])
161
+ """
162
+
163
+ def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1,
164
+ verbose=False, region=None,
165
+ bedFile=None, extendReads=False,
166
+ genomeChunkSize=None,
167
+ blackListFileName=None,
168
+ minMappingQuality=None,
169
+ ignoreDuplicates=False,
170
+ chrsToSkip=[],
171
+ stepSize=None,
172
+ center_read=False,
173
+ samFlag_include=None,
174
+ samFlag_exclude=None,
175
+ zerosToNans=False,
176
+ skipZeroOverZero=False,
177
+ smoothLength=0,
178
+ minFragmentLength=0,
179
+ maxFragmentLength=0,
180
+ out_file_for_raw_data=None,
181
+ bed_and_bin=False,
182
+ statsList=[],
183
+ mappedList=[]):
184
+
185
+ self.bamFilesList = bamFilesList
186
+ self.binLength = binLength
187
+ self.numberOfSamples = numberOfSamples
188
+ self.blackListFileName = blackListFileName
189
+ self.statsList = statsList
190
+ self.mappedList = mappedList
191
+ self.skipZeroOverZero = skipZeroOverZero
192
+ self.bed_and_bin = bed_and_bin
193
+ self.genomeChunkSize = genomeChunkSize
194
+
195
+ if extendReads and len(bamFilesList):
196
+ from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
197
+ frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
198
+ return_lengths=False,
199
+ blackListFileName=blackListFileName,
200
+ numberOfProcessors=numberOfProcessors,
201
+ verbose=verbose)
202
+ if extendReads is True:
203
+ # try to guess fragment length if the bam file contains paired end reads
204
+ if frag_len_dict:
205
+ self.defaultFragmentLength = int(frag_len_dict['median'])
206
+ else:
207
+ exit("*ERROR*: library is not paired-end. Please provide an extension length.")
208
+ if verbose:
209
+ print(("Fragment length based on paired en data "
210
+ "estimated to be {}".format(frag_len_dict['median'])))
211
+
212
+ elif extendReads < read_len_dict['median']:
213
+ sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
214
+ "Reads will not be extended.\n".format(int(read_len_dict['median'])))
215
+ self.defaultFragmentLength = 'read length'
216
+
217
+ elif extendReads > 2000:
218
+ exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads))
219
+ else:
220
+ self.defaultFragmentLength = int(extendReads)
221
+
222
+ else:
223
+ self.defaultFragmentLength = 'read length'
224
+
225
+ self.numberOfProcessors = numberOfProcessors
226
+ self.verbose = verbose
227
+ self.region = region
228
+ self.bedFile = bedFile
229
+ self.minMappingQuality = minMappingQuality
230
+ self.ignoreDuplicates = ignoreDuplicates
231
+ self.chrsToSkip = chrsToSkip
232
+ self.stepSize = stepSize
233
+ self.center_read = center_read
234
+ self.samFlag_include = samFlag_include
235
+ self.samFlag_exclude = samFlag_exclude
236
+ self.minFragmentLength = minFragmentLength
237
+ self.maxFragmentLength = maxFragmentLength
238
+ self.zerosToNans = zerosToNans
239
+ self.smoothLength = smoothLength
240
+
241
+ if out_file_for_raw_data:
242
+ self.save_data = True
243
+ self.out_file_for_raw_data = out_file_for_raw_data
244
+ else:
245
+ self.save_data = False
246
+ self.out_file_for_raw_data = None
247
+
248
+ # check that wither numberOfSamples or stepSize are set
249
+ if numberOfSamples is None and stepSize is None and bedFile is None:
250
+ raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")
251
+
252
+ if self.defaultFragmentLength != 'read length':
253
+ self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
254
+ else:
255
+ self.maxPairedFragmentLength = 1000
256
+ if self.maxFragmentLength > 0:
257
+ self.maxPairedFragmentLength = self.maxFragmentLength
258
+
259
+ if len(self.mappedList) == 0:
260
+ try:
261
+ for fname in self.bamFilesList:
262
+ bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors)
263
+ self.mappedList.append(mapped)
264
+ self.statsList.append(stats)
265
+ bam.close()
266
+ except:
267
+ self.mappedList = []
268
+ self.statsList = []
269
+
270
+ def get_chunk_length(self, bamFilesHandles, genomeSize, chromSizes, chrLengths):
271
+ # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
272
+ # workers for analysis. If too short, too much time is spent loading the files
273
+ # if too long, some processors end up free.
274
+ # the following values are empirical
275
+ if self.stepSize is None:
276
+ if self.region is None:
277
+ self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
278
+ else:
279
+ # compute the step size, based on the number of samples
280
+ # and the length of the region studied
281
+ (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
282
+ self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)
283
+
284
+ # number of samples is better if large
285
+ if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
286
+ min_num_of_samples = int(genomeSize / np.mean(chrLengths))
287
+ raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))
288
+
289
+ max_mapped = 0
290
+ if len(self.mappedList) > 0:
291
+ max_mapped = max(self.mappedList)
292
+
293
+ # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
294
+ if max_mapped == 0:
295
+ chunkSize = 10000 * self.binLength
296
+ self.stepSize = self.binLength
297
+ else:
298
+ reads_per_bp = float(max_mapped) / genomeSize
299
+ chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandles)))
300
+
301
+ # Ensure that chunkSize is always at least self.stepSize
302
+ if chunkSize < self.stepSize:
303
+ chunkSize = self.stepSize
304
+
305
+ # Ensure that chunkSize is always at least self.binLength
306
+ if self.binLength and chunkSize < self.binLength:
307
+ chunkSize = self.binLength
308
+
309
+ return chunkSize
310
+
311
    def run(self, allArgs=None):
        """
        Open every input file, partition the genome into chunks and collect
        the per-bin read counts via mapReduce.

        Parameters
        ----------
        allArgs : argparse.Namespace or None
            Full command-line arguments; only GTF-related options are
            extracted from it (via ``deeptools.utilities.gtfOptions``).

        Returns
        -------
        numpy array
            One row per bin/region and one column per input file. Exits the
            program with an error message when no coverage could be computed.
        """
        # Inputs may be BAM or bigWig: try BAM first, fall back to bigWig.
        bamFilesHandles = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                y = pyBigWig.open(x)
            bamFilesHandles.append(y)

        chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

        # skip chromosome in the list. This is usually for the
        # X chromosome which may have either one copy in a male sample
        # or a mixture of male/female and is unreliable.
        # Also the skip may contain heterochromatic regions and
        # mitochondrial DNA
        if len(self.chrsToSkip):
            chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromsizes))

        genomeSize = sum(chrLengths)

        # Chunking only applies to genome-wide runs; a BED file defines its
        # own regions.
        chunkSize = None
        if self.bedFile is None:
            if self.genomeChunkSize is None:
                chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
            else:
                chunkSize = self.genomeChunkSize

        # Handles were only needed for chromosome info; workers reopen files.
        [bam_h.close() for bam_h in bamFilesHandles]

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromsizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors,
                                       transcriptID=transcriptID,
                                       exonID=exonID,
                                       keepExons=keepExons,
                                       transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        # Concatenation fails (ValueError) when every worker returned empty
        # results, i.e. no coverage could be computed at all.
        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
399
+
400
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of list of tuples of the form (start, end)
            corresponding to bed regions to be processed.
            If not bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array that as rows each bin
            and as columns each bam file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[0., 0.],
               [0., 1.],
               [1., 1.],
               [1., 2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None and bed_regions_list is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        # Each worker reopens the input files; BAM first, bigWig as fallback.
        bam_handles = []
        for fname in self.bamFilesList:
            try:
                bam_handles.append(bamHandler.openBam(fname))
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                bam_handles.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        # Build the list of intervals to quantify: either the BED regions
        # (optionally re-binned) or regularly spaced genome-wide bins.
        transcriptsToConsider = []
        if bed_regions_list is not None:
            if self.bed_and_bin:
                transcriptsToConsider.append([(x[1][0][0], x[1][0][1], self.binLength) for x in bed_regions_list])
            else:
                transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                # Non-overlapping bins: a single tiled interval suffices.
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        # Per-region totals when counting whole BED regions; per-bin
        # coverage vectors otherwise.
        for bam in bam_handles:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None and not self.bed_and_bin:
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        # Fortran order: counts were appended file-by-file, so reshape turns
        # them into one column per input file.
        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

        if self.save_data:
            # Write one BED-like line per region (or per bin when binning).
            idx = 0
            for i, trans in enumerate(transcriptsToConsider):
                if len(trans[0]) != 3:
                    starts = ",".join([str(x[0]) for x in trans])
                    ends = ",".join([str(x[1]) for x in trans])
                    _file.write("\t".join([chrom, starts, ends]) + "\t")
                    _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
                else:
                    for exon in trans:
                        for startPos in range(exon[0], exon[1], exon[2]):
                            if idx >= subnum_reads_per_bin.shape[0]:
                                # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                # Counts there are added to the bin before them, but range() will still try to include them.
                                break
                            _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, min(startPos + exon[2], exon[1])))
                            _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                            idx += 1
            _file.close()

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name
538
+
539
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested.

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following example a paired read is extended to the fragment length which is 100
        The first mate starts at 5000000 and the second at 5000064. Each mate is
        extended to the fragment length *independently*
        At position 500090-500100 one fragment of length 100 overlap, and after position 5000101
        there should be zero reads.

        >>> c.zerosToNans = True
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000090, 5000100), (5000100, 5000110)])
        array([ 1., nan])

        In the following case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([1., 2., 2.])


        """
        # Default to the standard fragment-resolution function unless the
        # caller supplied a replacement.
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        # Regions given as 3-tuples (start, end, tileSize) are themselves
        # subdivided into fixed-size bins; recompute the total bin count.
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
                if (reg[1] - reg[0]) % reg[2] > 0:
                    # A trailing partial bin still counts as a bin.
                    nbins += 1
        coverages = np.zeros(nbins, dtype='float64')

        # When counting at read length, no fetch-window extension is needed;
        # otherwise widen the window by the maximum fragment length so reads
        # starting upstream but extending into the region are not missed.
        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
                if (reg[1] - reg[0]) % tileSize > 0:
                    # Don't eliminate small bins! Issue 887
                    nRegBins += 1
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            if chrom not in bamHandle.references:
                raise NameError("chromosome {} not found in bam file".format(chrom))

            # prev_pos / lpos implement duplicate detection among reads
            # sharing the same start coordinate (see ignoreDuplicates below).
            prev_pos = set()
            lpos = None
            # of previous processed read pair
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = deeptools.utilities.getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    # Clip the fragment to the bounds of the coverage vector.
                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if last_eIdx is not None:
                        # Do not increment bins already covered by an earlier
                        # block of the same read.
                        sIdx = max(last_eIdx, sIdx)
                    if sIdx >= eIdx:
                        continue
                    sIdx = int(sIdx)
                    eIdx = int(eIdx)
                    coverages[sIdx:eIdx] += 1
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
722
+
723
    def getReadLength(self, read):
        # Length of the alignment object; for pysam reads len() is defined
        # by the object itself — TODO confirm this matches query length.
        return len(read)
725
+
726
+ @staticmethod
727
+ def is_proper_pair(read, maxPairedFragmentLength):
728
+ """
729
+ Checks if a read is proper pair meaning that both mates are facing each other and are in
730
+ the same chromosome and are not to far away. The sam flag for proper pair can not
731
+ always be trusted. Note that if the fragment size is > maxPairedFragmentLength (~2kb
732
+ usually) that False will be returned.
733
+ :return: bool
734
+
735
+ >>> import pysam
736
+ >>> import os
737
+ >>> from deeptools.countReadsPerBin import CountReadsPerBin as cr
738
+ >>> root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
739
+ >>> bam = pysam.AlignmentFile("{}/test_proper_pair_filtering.bam".format(root))
740
+ >>> iter = bam.fetch()
741
+ >>> read = next(iter)
742
+ >>> cr.is_proper_pair(read, 1000) # "keep" read
743
+ True
744
+ >>> cr.is_proper_pair(read, 200) # "keep" read, but maxPairedFragmentLength is too short
745
+ False
746
+ >>> read = next(iter)
747
+ >>> cr.is_proper_pair(read, 1000) # "improper pair"
748
+ False
749
+ >>> read = next(iter)
750
+ >>> cr.is_proper_pair(read, 1000) # "mismatch chr"
751
+ False
752
+ >>> read = next(iter)
753
+ >>> cr.is_proper_pair(read, 1000) # "same orientation1"
754
+ False
755
+ >>> read = next(iter)
756
+ >>> cr.is_proper_pair(read, 1000) # "same orientation2"
757
+ False
758
+ >>> read = next(iter)
759
+ >>> cr.is_proper_pair(read, 1000) # "rev first"
760
+ False
761
+ >>> read = next(iter)
762
+ >>> cr.is_proper_pair(read, 1000) # "rev first OK"
763
+ True
764
+ >>> read = next(iter)
765
+ >>> cr.is_proper_pair(read, 1000) # "for first"
766
+ False
767
+ >>> read = next(iter)
768
+ >>> cr.is_proper_pair(read, 1000) # "for first"
769
+ True
770
+ """
771
+ if not read.is_proper_pair:
772
+ return False
773
+ if read.reference_id != read.next_reference_id:
774
+ return False
775
+ if abs(read.template_length) > maxPairedFragmentLength:
776
+ return False
777
+ # check that the mates face each other (inward)
778
+ if read.is_reverse is read.mate_is_reverse:
779
+ return False
780
+ if read.is_reverse:
781
+ if read.reference_start >= read.next_reference_start:
782
+ return True
783
+ else:
784
+ if read.reference_start <= read.next_reference_start:
785
+ return True
786
+ return False
787
+
788
    def get_fragment_from_read(self, read):
        """Get read start and end position of a read.
        If given, the reads are extended as follows:
        If reads are paired end, each read mate is extended to match
        the fragment length, otherwise, a default fragment length
        is used. If reads are split (give by the CIGAR string) then
        the multiple positions of the read are returned.
        When reads are extended the cigar information is
        skipped.

        Parameters
        ----------
        read: pysam object.

        The following values are defined (for forward reads)::


                 |-- -- read.tlen -- --|
                 |-- read.alen --|
            -----|===============>------------<==============|----
                 |               |            |
            read.reference_start
                            read.reference_end read.pnext

            and for reverse reads


                 |-- -- read.tlen -- --|
                 |-- read.alen --|
            -----|===============>-----------<===============|----
                 |               |           |
            read.pnext   read.reference_start read.reference_end

        this is a sketch of a pair-end reads

        The function returns the fragment start and end, either
        using the paired end information (if available) or
        extending the read in the appropriate direction if this
        is single-end.

        Parameters
        ----------
        read : pysam read object


        Returns
        -------
        list of tuples
            [(fragment start, fragment end)]


        >>> test = Tester()
        >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True)
        >>> c.defaultFragmentLength=100
        >>> c.get_fragment_from_read(test.getRead("paired-forward"))
        [(5000000, 5000100)]
        >>> c.get_fragment_from_read(test.getRead("paired-reverse"))
        [(5000000, 5000100)]
        >>> c.defaultFragmentLength = 200
        >>> c.get_fragment_from_read(test.getRead("single-forward"))
        [(5001491, 5001691)]
        >>> c.get_fragment_from_read(test.getRead("single-reverse"))
        [(5001536, 5001736)]
        >>> c.defaultFragmentLength = 'read length'
        >>> c.get_fragment_from_read(test.getRead("single-forward"))
        [(5001491, 5001527)]
        >>> c.defaultFragmentLength = 'read length'
        >>> c.extendReads = False
        >>> c.get_fragment_from_read(test.getRead("paired-forward"))
        [(5000000, 5000036)]

        Tests for read centering.

        >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True, center_read=True)
        >>> c.defaultFragmentLength = 100
        >>> assert c.get_fragment_from_read(test.getRead("paired-forward")) == [(5000032, 5000068)]
        >>> c.defaultFragmentLength = 200
        >>> assert c.get_fragment_from_read(test.getRead("single-reverse")) == [(5001618, 5001654)]
        """
        # if no extension is needed, use pysam get_blocks
        # to identify start and end reference positions.
        # get_blocks return a list of start and end positions
        # based on the CIGAR if skipped regions are found.
        # E.g for a cigar of 40M260N22M
        # get blocks return two elements for the first 40 matches
        # and the for the last 22 matches.
        if self.defaultFragmentLength == 'read length':
            return read.get_blocks()

        else:
            # Prefer the mate information when the pair looks genuinely
            # proper; otherwise fall back to extending by the default length.
            if self.is_proper_pair(read, self.maxPairedFragmentLength):
                if read.is_reverse:
                    fragmentStart = read.next_reference_start
                    fragmentEnd = read.reference_end
                else:
                    fragmentStart = read.reference_start
                    # the end of the fragment is defined as
                    # the start of the forward read plus the insert length
                    fragmentEnd = read.reference_start + abs(read.template_length)

            # Extend using the default fragment length
            else:
                if read.is_reverse:
                    fragmentStart = read.reference_end - self.defaultFragmentLength
                    fragmentEnd = read.reference_end
                else:
                    fragmentStart = read.reference_start
                    fragmentEnd = read.reference_start + self.defaultFragmentLength

            if self.center_read:
                # Keep the read's own length but re-center it on the middle
                # of the (possibly extended) fragment.
                fragmentCenter = fragmentEnd - (fragmentEnd - fragmentStart) / 2
                fragmentStart = int(fragmentCenter - read.infer_query_length(always=False) / 2)
                fragmentEnd = fragmentStart + read.infer_query_length(always=False)

            assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \
                "end for read {}".format(read.query_name)
            return [(fragmentStart, fragmentEnd)]
905
+
906
+ def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition):
907
+ """
908
+ Given a tile index position and a tile size (length), return the a new indices
909
+ over a larger range, called the smoothRange.
910
+ This region is centered in the tileIndex an spans on both sizes
911
+ to cover the smoothRange. The smoothRange is trimmed in case it is less
912
+ than zero or greater than maxPosition ::
913
+
914
+
915
+ ---------------|==================|------------------
916
+ tileStart
917
+ |--------------------------------------|
918
+ | <-- smoothRange --> |
919
+ |
920
+ tileStart - (smoothRange-tileSize)/2
921
+
922
+ Test for a smooth range that spans 3 tiles.
923
+
924
+ Examples
925
+ --------
926
+
927
+ >>> c = CountReadsPerBin([], 1, 1, 1, 0)
928
+ >>> c.getSmoothRange(5, 1, 3, 10)
929
+ (4, 7)
930
+
931
+ Test smooth range truncated on start.
932
+
933
+ >>> c.getSmoothRange(0, 10, 30, 200)
934
+ (0, 2)
935
+
936
+ Test smooth range truncated on start.
937
+
938
+ >>> c.getSmoothRange(1, 10, 30, 4)
939
+ (0, 3)
940
+
941
+ Test smooth range truncated on end.
942
+
943
+ >>> c.getSmoothRange(5, 1, 3, 5)
944
+ (4, 5)
945
+
946
+ Test smooth range not multiple of tileSize.
947
+
948
+ >>> c.getSmoothRange(5, 10, 24, 10)
949
+ (4, 6)
950
+ """
951
+ smoothTiles = int(smoothRange / tileSize)
952
+ if smoothTiles == 1:
953
+ return (tileIndex, tileIndex + 1)
954
+
955
+ smoothTilesSide = float(smoothTiles - 1) / 2
956
+ smoothTilesLeft = int(np.ceil(smoothTilesSide))
957
+ smoothTilesRight = int(np.floor(smoothTilesSide)) + 1
958
+
959
+ indexStart = max(tileIndex - smoothTilesLeft, 0)
960
+ indexEnd = min(maxPosition, tileIndex + smoothTilesRight)
961
+ return (indexStart, indexEnd)
962
+
963
+
964
def remove_row_of_zeros(matrix):
    """Return *matrix* without the rows whose entries are all zero.

    NaNs are treated as zeros for the purpose of this test, so rows made up
    entirely of zeros and/or NaNs are dropped as well.
    """
    sanitized = np.nan_to_num(matrix)
    keep_mask = sanitized.sum(1) != 0
    return matrix[keep_mask, :]
969
+
970
+
971
def estimateSizeFactors(m):
    """
    Compute per-sample size factors with the DESeq2 median-of-ratios method
    and return their inverses (the inverse is what bamCoverage expects).

    m : a numpy ndarray, one row per feature and one column per sample.

    >>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4)
    >>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4)
    """
    # Per-row geometric mean in log space, computed BEFORE masking so that
    # rows containing zeros produce -inf and are excluded just below.
    log_geo_means = np.sum(np.log(m), axis=1) / m.shape[1]
    # Mask non-positive counts and the -inf geometric means.
    counts = np.ma.masked_where(m <= 0, m)
    log_geo_means = np.ma.masked_where(np.isinf(log_geo_means), log_geo_means)
    # Median of the per-sample log ratios, exponentiated back (DESeq2).
    size_factors = np.exp(np.ma.median((np.log(counts).T - log_geo_means).T, axis=0))
    return 1. / size_factors
992
+
993
+
994
class Tester(object):
    """Fixture pointing at the test BAM files bundled with the package.

    The two single-end files cover 200 bp with the following read layout::

        0                                  100                        200
        |------------------------------------------------------------|
        A                                  ===============
                                                          ===============

        B ===============                              ===============
                                             ===============
                                                          ===============

    ``bamFile_PE`` additionally provides paired-end reads on chr2.
    """

    def __init__(self):
        # Location of the test data shipped alongside this module.
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        # self.root = "./test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        self.bamFile_PE = self.root + "test_paired2.bam"
        self.chrom = '3R'
        global debug
        debug = 0

    def getRead(self, readType):
        """Return one alignment of the requested kind from the PE test file."""
        fetch_windows = {
            'paired-reverse': (5000081, 5000082),
            'single-forward': (5001491, 5001492),
            'single-reverse': (5001700, 5001701),
        }
        # Any unknown type falls back to a forward paired read.
        begin, stop = fetch_windows.get(readType, (5000027, 5000028))
        bam = bamHandler.openBam(self.bamFile_PE)
        reads = [r for r in bam.fetch('chr2', begin, stop)]
        return reads[0]
deepTools/source/deeptools/deeptools_list_tools.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ from importlib.metadata import version
7
+
8
+
9
def parse_arguments(args=None):
    """Build the top-level parser that prints the deepTools tool listing,
    citation information and the package version.

    Parameters
    ----------
    args : unused; kept for signature symmetry with the other parsers.

    Returns
    -------
    argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
deepTools is a suite of python tools particularly developed for the efficient analysis of
high-throughput sequencing data, such as ChIP-seq, RNA-seq or MNase-seq.

Each tool should be called by its own name as in the following example:

 $ bamCoverage -b reads.bam -o coverage.bw

If you find deepTools useful for your research please cite as:

Ramírez, Fidel, Devon P. Ryan, Björn Grüning, Vivek Bhardwaj, Fabian Kilpert,
Andreas S. Richter, Steffen Heyne, Friederike Dündar,
and Thomas Manke. 2016. "deepTools2: A next Generation Web Server for Deep-Sequencing
Data Analysis." Nucleic Acids Research, April. doi:10.1093/nar/gkw257.



[ Tools for BAM and bigWig file processing ]
    multiBamSummary         compute read coverages over bam files. Output used for plotCorrelation or plotPCA
    multiBigwigSummary      extract scores from bigwig files. Output used for plotCorrelation or plotPCA
    correctGCBias           corrects GC bias from bam file. Don't use it with ChIP data
    bamCoverage             computes read coverage per bins or regions
    bamCompare              computes log2 ratio and other operations of read coverage of two samples per bins or regions
    bigwigCompare           computes log2 ratio and other operations from bigwig scores of two samples per bins or regions
    bigwigAverage           computes average from bigwig scores of multiple samples per bins or regions
    computeMatrix           prepares the data from bigwig scores for plotting with plotHeatmap or plotProfile
    alignmentSieve          filters BAM alignments according to specified parameters, optionally producing a BEDPE file


[ Tools for QC ]
    plotCorrelation         plots heatmaps or scatterplots of data correlation
    plotPCA                 plots PCA
    plotFingerprint         plots the distribution of enriched regions
    bamPEFragmentSize       returns the read length and paired-end distance from a bam file
    computeGCBias           computes and plots the GC bias of a sample
    plotCoverage            plots a histogram of read coverage
    estimateReadFiltering   estimates the number of reads that will be filtered from a BAM file or files given certain criteria


[Heatmaps and summary plots]
    plotHeatmap             plots one or multiple heatmaps of user selected regions over different genomic scores
    plotProfile             plots the average profile of user selected regions over different genomic scores
    plotEnrichment          plots the read/fragment coverage of one or more sets of regions

[Miscellaneous]
    computeMatrixOperations Modifies the output of computeMatrix in a variety of ways.


For more information visit: http://deeptools.readthedocs.org
""")

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    return parser
67
+
68
+
69
def process_args(args=None):
    """Parse and return the command-line arguments for the tool listing CLI."""
    return parse_arguments().parse_args(args)
73
+
74
+
75
def main(args=None):
    """Entry point: with no explicit args and an empty command line, show help."""
    invoked_bare = len(sys.argv) == 1
    if args is None and invoked_bare:
        args = ["--help"]
    process_args(args)
deepTools/source/deeptools/estimateReadFiltering.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import argparse
3
+ import sys
4
+
5
+ from deeptools import parserCommon, bamHandler, utilities
6
+ from deeptools.mapReduce import mapReduce
7
+ from deeptools.utilities import smartLabels
8
+ from importlib.metadata import version
9
+
10
+
11
def parseArguments():
    """Build the argument parser for estimateReadFiltering.

    Returns
    -------
    argparse.ArgumentParser
        Parser with required BAM inputs plus general and filtering options.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
This tool estimates the number of reads that would be filtered given a set of
settings and prints this to the terminal. Further, it tracks the number of singleton reads. The following metrics will always be tracked regardless of what you specify (the order output also matches this):

 * Total reads (including unmapped)
 * Mapped reads
 * Reads in blacklisted regions (--blackListFileName)

The following metrics are estimated according to the --binSize and --distanceBetweenBins parameters
 * Estimated mapped reads filtered (the total number of mapped reads filtered for any reason)
 * Alignments with a below threshold MAPQ (--minMappingQuality)
 * Alignments with at least one missing flag (--samFlagInclude)
 * Alignments with undesirable flags (--samFlagExclude)
 * Duplicates determined by deepTools (--ignoreDuplicates)
 * Duplicates marked externally (e.g., by picard)
 * Singletons (paired-end reads with only one mate aligning)
 * Wrong strand (due to --filterRNAstrand)

The sum of these may be more than the total number of reads. Note that alignments are sampled from bins of size --binSize spaced --distanceBetweenBins apart.
""",
        usage='estimateReadFiltering -b sample1.bam sample2.bam\n'
        'help: estimateReadFiltering -h / estimateReadFiltering --help'
    )

    required = parser.add_argument_group('Required arguments')
    required.add_argument('--bamfiles', '-b',
                          metavar='FILE1 FILE2',
                          help='List of indexed bam files separated by spaces.',
                          nargs='+',
                          required=True)

    general = parser.add_argument_group('General arguments')

    general.add_argument('--outFile', '-o',
                         type=parserCommon.writableFile,
                         help='The file to write results to. By default, results are printed to the console')

    general.add_argument('--sampleLabels',
                         help='Labels for the samples. The '
                         'default is to use the file name of the '
                         'sample. The sample labels should be separated '
                         'by spaces and quoted if a label itself'
                         'contains a space E.g. --sampleLabels label-1 "label 2" ',
                         nargs='+')

    general.add_argument('--smartLabels',
                         action='store_true',
                         help='Instead of manually specifying labels for the input '
                         'BAM files, this causes deepTools to use the '
                         'file name after removing the path and extension.')

    general.add_argument('--binSize', '-bs',
                         metavar='INT',
                         help='Length in bases of the window used to sample the genome. (Default: %(default)s)',
                         default=1000000,
                         type=int)

    general.add_argument('--distanceBetweenBins', '-n',
                         metavar='INT',
                         help='To reduce the computation time, not every possible genomic '
                         'bin is sampled. This option allows you to set the distance '
                         'between bins actually sampled from. Larger numbers are sufficient '
                         'for high coverage samples, while smaller values are useful for '
                         'lower coverage samples. Note that if you specify a value that '
                         'results in too few (<1000) reads sampled, the value will be '
                         'decreased. (Default: %(default)s)',
                         default=10000,
                         type=int)

    general.add_argument('--numberOfProcessors', '-p',
                         help='Number of processors to use. Type "max/2" to '
                         'use half the maximum number of processors or "max" '
                         'to use all available processors. (Default: %(default)s)',
                         metavar="INT",
                         type=parserCommon.numberOfProcessors,
                         default=1,
                         required=False)

    general.add_argument('--verbose', '-v',
                         help='Set to see processing messages.',
                         action='store_true')

    general.add_argument('--version', action='version',
                         version='%(prog)s {}'.format(version('deeptools')))

    filtering = parser.add_argument_group('Optional arguments')

    filtering.add_argument('--filterRNAstrand',
                           help='Selects RNA-seq reads (single-end or paired-end) in '
                           'the given strand. (Default: %(default)s)',
                           choices=['forward', 'reverse'],
                           default=None)

    filtering.add_argument('--ignoreDuplicates',
                           help='If set, reads that have the same orientation '
                           'and start position will be considered only '
                           'once. If reads are paired, the mate\'s position '
                           'also has to coincide to ignore a read.',
                           action='store_true')

    filtering.add_argument('--minMappingQuality',
                           metavar='INT',
                           help='If set, only reads that have a mapping '
                           'quality score of at least this are '
                           'considered.',
                           type=int)

    filtering.add_argument('--samFlagInclude',
                           help='Include reads based on the SAM flag. For example, '
                           'to get only reads that are the first mate, use a flag of 64. '
                           'This is useful to count properly paired reads only once, '
                           'as otherwise the second mate will be also considered for the '
                           'coverage. (Default: %(default)s)',
                           metavar='INT',
                           default=None,
                           type=int,
                           required=False)

    filtering.add_argument('--samFlagExclude',
                           help='Exclude reads based on the SAM flag. For example, '
                           'to get only reads that map to the forward strand, use '
                           '--samFlagExclude 16, where 16 is the SAM flag for reads '
                           'that map to the reverse strand. (Default: %(default)s)',
                           metavar='INT',
                           default=None,
                           type=int,
                           required=False)

    filtering.add_argument('--blackListFileName', '-bl',
                           help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.",
                           metavar="BED file",
                           nargs="+",
                           required=False)

    return parser
149
+
150
+
151
def getFiltered_worker(arglist):
    """Count, for a single genomic chunk, how many alignments each filter
    would remove from each BAM file.

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args)`` — the chunk coordinates plus the parsed
        command-line namespace (as produced by parseArguments).

    Returns
    -------
    list of tuples, one per BAM file, each being
        (total, nFiltered, minMapq, samFlagInclude, samFlagExclude,
         internalDupes, externalDupes, singletons, filterRNAstrand)

    Note that a single read may trip several filters, so the per-filter
    counts can sum to more than ``nFiltered``.
    """
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        # Translate the chunk's chromosome name into this file's naming
        # scheme (e.g. "chr1" vs "1").
        chromUse = utilities.mungeChromosome(chrom, fh.references)
        # prev_pos / lpos support deepTools-internal duplicate detection
        # among reads sharing a start coordinate.
        prev_pos = set()
        lpos = None

        # Per-filter counters for this BAM file.
        minMapq = 0
        samFlagInclude = 0
        samFlagExclude = 0
        internalDupes = 0
        externalDupes = 0
        singletons = 0
        filterRNAstrand = 0
        nFiltered = 0
        total = 0  # This is only used to estimate the percentage affected
        for read in fh.fetch(chromUse, start, end):
            filtered = 0
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered = 1
                minMapq += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered = 1
                samFlagInclude += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered = 1
                samFlagExclude += 1
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if read.tlen >= 0:
                    s = read.pos
                    e = s + read.tlen
                else:
                    s = read.pnext
                    e = s - read.tlen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered = 1
                    internalDupes += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            if read.is_duplicate:
                filtered = 1
                externalDupes += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered = 1
                singletons += 1

            # filterRNAstrand
            if args.filterRNAstrand:
                if read.is_paired:
                    # Flag masks test mate number + strand combinations;
                    # e.g. 144 == (reverse | second-in-pair).
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                else:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1

            total += 1
            nFiltered += filtered
        fh.close()

        # Append a tuple to the output
        tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude, internalDupes, externalDupes, singletons, filterRNAstrand)
        o.append(tup)
    return o
256
+
257
+
258
def _extrapolated_filter_count(count, sampled_total, mapped, baseline=0.0):
    """Extrapolate a sampled per-filter count to the whole BAM file.

    ``count`` alignments out of ``sampled_total`` sampled ones matched the
    filter; scale that fraction up to ``mapped`` alignments, add ``baseline``
    (used for the blacklisted-read count), round to one decimal and cap at
    ``mapped``. Returns 0.0 when nothing was sampled.
    """
    metric = 0.0
    if sampled_total > 0:
        metric = baseline + float(count) / float(sampled_total) * mapped
    return min(round(metric, 1), mapped)


def main(args=None):
    """Estimate, per BAM file, how many alignments each filter would remove.

    Samples the genome via mapReduce/getFiltered_worker and writes one
    tab-separated row per BAM file (to --outFile or stdout) with the total,
    mapped and blacklisted counts plus the extrapolated effect of each
    filtering option. Returns 0 on success.
    """
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    of = sys.stdout if args.outFile is None else open(args.outFile, "w")

    # openBam(returnStats=True) yields (handle, mapped, unmapped, stats)
    bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = [utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors)
                       for bh in bhs]
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [m + u for m, u in zip(mapped, unmappedList)]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for bh in bhs:
        bh.close()

    # Get the remaining metrics by sampling the genome
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # Accumulate the 9 per-chunk counters for each BAM file; the column
    # order matches the tuple produced by getFiltered_worker.
    nCols = 9
    counters = [[0] * len(args.bamfiles) for _ in range(nCols)]
    for chunk in res:
        for idx, r in enumerate(chunk):
            for col in range(nCols):
                counters[col][idx] += r[col]
    (totals, nFiltered, MAPQs, flagIncludes, flagExcludes,
     internalDupes, externalDupes, singletons, rnaStrand) = counters

    # Print some output
    of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
    for idx, fname in enumerate(args.bamfiles):
        of.write(args.sampleLabels[idx] if args.sampleLabels else fname)
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))
        # overall filtered estimate additionally counts the blacklisted reads
        of.write("\t{}".format(_extrapolated_filter_count(nFiltered[idx], totals[idx],
                                                          mapped[idx], baseline=blacklisted[idx])))
        # one column per individual filter, in the header's order
        for counts in (MAPQs, flagIncludes, flagExcludes, internalDupes,
                       externalDupes, singletons, rnaStrand):
            of.write("\t{}".format(_extrapolated_filter_count(counts[idx], totals[idx], mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
deepTools/source/deeptools/estimateScaleFactor.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+
7
+ from deeptools.SES_scaleFactor import estimateScaleFactor
8
+ from deeptools.parserCommon import numberOfProcessors
9
+ from importlib.metadata import version
10
+ debug = 0
11
+
12
+
13
def parseArguments(args=None):
    """Build the estimateScaleFactor command-line parser and parse ``args``.

    Returns the parsed :class:`argparse.Namespace`. As a post-processing
    step, ``ignoreForNormalization`` is converted from a comma-separated
    string into a list of stripped chromosome names (an empty list when the
    option was not given).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Given two BAM files, this estimates scaling factors '
        '(bigger to smaller).',
        usage='estimateScaleFactor -b sample1.bam sample2.bam\n'
        'help: estimateScaleFactor -h / estimateScaleFactor --help'
    )

    # define the arguments
    parser.add_argument('--bamfiles', '-b',
                        metavar='list of bam files',
                        help='List of indexed BAM files, space delineated',
                        nargs='+',
                        required=True)

    parser.add_argument('--ignoreForNormalization', '-ignore',
                        help='A comma-separated list of chromosome names, '
                        'limited by quotes, '
                        'containing those '
                        'chromosomes that should be excluded '
                        'during normalization computations. For example, '
                        '--ignoreForNormalization "chrX, chrM" ')

    parser.add_argument('--sampleWindowLength', '-l',
                        help='Length in bases for a window used to '
                        'sample the genome and compute the size or scaling '
                        'factors',
                        default=1000,
                        type=int)

    parser.add_argument('--numberOfSamples', '-n',
                        help='Number of samplings taken from the genome '
                        'to compute the scaling factors',
                        default=100000,
                        type=int)

    parser.add_argument('--normalizationLength', '-nl',
                        help='By default, data is normalized to 1 '
                        'fragment per 100 bases. The expected value is an '
                        'integer. For example, if normalizationLength '
                        'is 1000, then the resulting scaling factor '
                        'will cause the average coverage of the BAM file to '
                        'have on average 1 fragment per kilobase',
                        type=int,
                        default=10)

    parser.add_argument('--skipZeros',
                        help='If set, then zero counts that happen for *all* '
                        'BAM files given are ignored. This will result in a '
                        'reduced number of read counts than that specified '
                        'in --numberOfSamples',
                        action='store_true',
                        required=False)

    parser.add_argument('--numberOfProcessors', '-p',
                        help='Number of processors to use. The default is '
                        'to use half the maximum number of processors.',
                        metavar="INT",
                        type=numberOfProcessors,
                        default="max/2",
                        required=False)

    parser.add_argument('--verbose', '-v',
                        help='Set to see processing messages.',
                        action='store_true')

    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {}'.format(version('deeptools')))

    args = parser.parse_args(args)
    # normalize the ignore list to a plain Python list of chromosome names
    if args.ignoreForNormalization:
        args.ignoreForNormalization = [
            x.strip() for x in args.ignoreForNormalization.split(',')
        ]
    else:
        args.ignoreForNormalization = []
    return args
92
+
93
+
94
def main(args=None):
    """Estimate SES scale factors between two BAM files and print them.

    The genome is sampled --numberOfSamples times (see parseArguments) and
    the resulting per-sample scale factors are printed one per line as
    ``key: value``.
    """
    args = parseArguments(args)
    if len(args.bamfiles) > 2:
        # NOTE(review): a single BAM file also passes this check, and this
        # error path exits with status 0 — confirm whether that is intended.
        print("SES method to estimate scale factors only works for two samples")
        exit(0)

    sys.stderr.write("{:,} number of samples will be computed.\n".format(args.numberOfSamples))
    size_factors = estimateScaleFactor(args.bamfiles, args.sampleWindowLength,
                                       args.numberOfSamples,
                                       args.normalizationLength,
                                       numberOfProcessors=args.numberOfProcessors,
                                       chrsToSkip=args.ignoreForNormalization,
                                       verbose=args.verbose)

    for name, factor in size_factors.items():
        print("{}: {}".format(name, factor))
deepTools/source/deeptools/getFragmentAndReadSize.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ # own tools
4
+ from deeptools import bamHandler
5
+ from deeptools import mapReduce
6
+
7
+ old_settings = np.seterr(all='ignore')
8
+
9
+
10
def getFragmentLength_wrapper(args):
    """Adapter for mapReduce: expand the packed argument tuple for the worker."""
    return getFragmentLength_worker(*args)
12
+
13
+
14
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array, where first column is fragment length, the
        second is for read length

    Raises
    ------
    NameError
        if ``chrom`` is not a reference name in the BAM file
    """
    bam = bamHandler.openBam(bamFile)
    # trim the inter-bin spacing off the end, but always keep >= 1 bp
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        # prefer properly paired read1 alignments so each fragment is counted once
        reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1 and not r.is_unmapped])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired, then
            # we try with out filtering
            reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                              for r in bam.fetch(chrom, start, end) if not r.is_unmapped])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        # keep the two-column shape so callers can always concatenate
        reads = np.array([]).reshape(0, 2)

    return reads
57
+
58
+
59
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        if True, the raw sampled lengths are attached to the returned
        dictionaries under the key ``'lengths'``
    numberOfProcessors : int
    verbose : bool
    binSize : int
        size of the genomic windows that are sampled
    distanceBetweenBins : int
        spacing between sampled windows; halved repeatedly until enough
        reads are collected

    Returns
    -------
    d : dict
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc. values.
        Either entry may be None when no usable reads were sampled (the
        fragment dictionary is also None for single-end data, where the
        mean template length is 0).
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    # doubled up-front so the first halving inside the while loop restores
    # the caller's requested spacing
    distanceBetweenBins *= 2
    fl = []

    # Fix issue #522, allow distanceBetweenBins == 0
    if distanceBetweenBins == 0:
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=binSize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)
        fl = np.concatenate(imap_res)

    # Try to ensure we have at least 1000 regions from which to compute statistics, halving the intra-bin distance as needed
    # NOTE(review): true division turns distanceBetweenBins into a float after
    # the first pass; mapReduce presumably tolerates a float chunk length — confirm.
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        # column 0: |template length|, column 1: read length
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        # a zero mean template length indicates single-end data, for which
        # no fragment-length statistics can be reported
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length),
                                 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))),
                                 'qtile10': np.percentile(fragment_length, 10),
                                 'qtile20': np.percentile(fragment_length, 20),
                                 'qtile30': np.percentile(fragment_length, 30),
                                 'qtile40': np.percentile(fragment_length, 40),
                                 'qtile60': np.percentile(fragment_length, 60),
                                 'qtile70': np.percentile(fragment_length, 70),
                                 'qtile80': np.percentile(fragment_length, 80),
                                 'qtile90': np.percentile(fragment_length, 90),
                                 'qtile99': np.percentile(fragment_length, 99)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length),
                         'mad': np.median(np.abs(read_length - np.median(read_length))),
                         'qtile10': np.percentile(read_length, 10),
                         'qtile20': np.percentile(read_length, 20),
                         'qtile30': np.percentile(read_length, 30),
                         'qtile40': np.percentile(read_length, 40),
                         'qtile60': np.percentile(read_length, 60),
                         'qtile70': np.percentile(read_length, 70),
                         'qtile80': np.percentile(read_length, 80),
                         'qtile90': np.percentile(read_length, 90),
                         'qtile99': np.percentile(read_length, 99)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
deepTools/source/deeptools/getRatio.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ old_settings = np.seterr(all='ignore')
4
+
5
+
6
def compute_ratio(value1, value2, args):
    """Return the pseudocounted ratio of two values, optionally transformed.

    The per-sample pseudocounts from ``args['pseudocount']`` are added first.
    ``args['valueType']`` selects the transform: 'log2' returns the log2 of
    the ratio; 'reciprocal_ratio' returns a/b when a/b >= 1 and -b/a
    otherwise; anything else returns the plain ratio.
    """
    numerator = value1 + args['pseudocount'][0]
    denominator = value2 + args['pseudocount'][1]
    ratio = float(numerator) / denominator

    value_type = args['valueType']
    if value_type == 'log2':
        return np.log2(ratio)
    if value_type == 'reciprocal_ratio':
        # the reciprocal ratio of a/b
        # is a/b if a/b > 1 else -1* b/a
        return ratio if ratio >= 1 else -1.0 / ratio
    return ratio
20
+
21
+
22
def getRatio(tileCoverage, args):
    r"""
    Combine the two scaled coverage values of one tile into a single score.

    Invoked by mapReduce once per tile; ``args`` fixes the value type, the
    per-sample scale factors and the pseudocounts. NaN in either input
    yields NaN. Ratio-like value types are delegated to ``compute_ratio``.

    >>> funcArgs= {'valueType': 'ratio', 'scaleFactors': (1,1), 'pseudocount': [1, 1]}
    >>> getRatio([9, 19], funcArgs)
    0.5
    >>> getRatio([0, 0], funcArgs)
    1.0
    >>> getRatio([np.nan, np.nan], funcArgs)
    nan
    >>> getRatio([np.nan, 1.0], funcArgs)
    nan
    >>> funcArgs['valueType'] ='subtract'
    >>> getRatio([20, 10], funcArgs)
    10
    >>> funcArgs['scaleFactors'] = (1, 0.5)
    >>> getRatio([10, 20], funcArgs)
    0.0

    The reciprocal ratio is of a and b is:
    is a/b if a/b > 1 else -1* b/a
    >>> funcArgs['valueType'] ='reciprocal_ratio'
    >>> funcArgs['scaleFactors'] = (1, 1)
    >>> funcArgs['pseudocount'] = [0, 0]
    >>> getRatio([2, 1], funcArgs)
    2.0
    >>> getRatio([1, 2], funcArgs)
    -2.0
    >>> getRatio([1, 1], funcArgs)
    1.0
    """
    scaled_first = args['scaleFactors'][0] * tileCoverage[0]
    scaled_second = args['scaleFactors'][1] * tileCoverage[1]

    # NaN in either of the two values propagates directly
    if np.isnan(scaled_first) or np.isnan(scaled_second):
        return np.nan

    value_type = args['valueType']

    # ratio-like types share pseudocount handling in compute_ratio
    if value_type in ('ratio', 'log2', 'reciprocal_ratio'):
        return compute_ratio(scaled_first, scaled_second, args)

    # arithmetic combinations (diff, sum, selection, mean)
    if value_type == 'subtract':
        bin_value = scaled_first - scaled_second
    elif value_type == 'add':
        bin_value = scaled_first + scaled_second
    elif value_type == 'first':
        bin_value = scaled_first
    elif value_type == 'second':
        bin_value = scaled_second
    elif value_type == 'mean':
        bin_value = (scaled_first + scaled_second) / 2.0

    return bin_value
deepTools/source/deeptools/getScaleFactor.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import numpy as np
5
+ import deeptools.mapReduce as mapReduce
6
+ from deeptools import bamHandler
7
+ from deeptools import utilities
8
+ import sys
9
+
10
+ debug = 0
11
+
12
+
13
def getFractionKept_wrapper(args):
    """Unpack the argument tuple supplied by mapReduce and run the worker."""
    return getFractionKept_worker(*args)
15
+
16
+
17
def getFractionKept_worker(chrom, start, end, bamFile, args, offset):
    """
    Queries the BAM file and counts the number of alignments kept/found in
    a 50 kb window at ``start + offset * 50000``.

    Returns ``(filtered, tot)`` where ``tot`` is the number of alignments
    inspected and ``filtered`` is how many of them any of the active filters
    (mapping quality, SAM flags, fragment length, duplicates, RNA strand)
    would remove.
    """
    bam = bamHandler.openBam(bamFile)
    # each offset selects a distinct, non-overlapping 50 kb window of the bin
    start += offset * 50000
    end = min(end, start + 50000)
    tot = 0
    filtered = 0

    if end <= start:
        return (filtered, tot)

    # (frag start, frag end, mate chrom id, strand) tuples seen at the
    # current reference position; used for duplicate detection
    prev_pos = set()
    lpos = None
    if chrom in bam.references:
        for read in bam.fetch(chrom, start, end):
            # NOTE(review): tot is incremented before the unmapped check, so
            # unmapped reads count toward the total but can never be filtered.
            tot += 1
            if read.is_unmapped:
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered += 1
                continue

            # filter reads based on SAM flag
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered += 1
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered += 1
                continue

            # fragment length filtering
            tLen = utilities.getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                filtered += 1
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                filtered += 1
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered += 1
                    continue
                # only fragments anchored at the current position matter,
                # so reset the set when the position advances
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # If filterRNAstrand is in args, then filter accordingly
            # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class
            # SAM flag bits: 16 = read reverse, 32 = mate reverse,
            # 64 = first in pair, 128 = second in pair
            if hasattr(args, "filterRNAstrand"):
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)):
                            filtered += 1
                            continue
                    elif args.filterRNAstrand == 'reverse':
                        if not (read.flag & 144 == 144 or read.flag & 96 == 96):
                            filtered += 1
                            continue
                else:
                    if args.filterRNAstrand == 'forward' and read.flag & 16 == 0:
                        filtered += 1
                        continue
                    elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16:
                        filtered += 1
                        continue

    return (filtered, tot)
102
+
103
+
104
def fraction_kept(args, stats):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Black list regions are already accounted for. This works by sampling the
    genome: by default we iterate until we have sampled 10% of the mapped
    alignments or 1,000,000 of them, whichever is larger (unless there are
    fewer than 1,000,000 alignments, or --exactScaling is set, in which case
    everything is sampled).

    The sampling works by dividing the genome into bins and only looking at
    50 kb windows at increasing offsets within each bin until enough
    alignments have been seen.

    Returns the fraction (0..1] of alignments that survive the filters.
    """
    # Do we even need to proceed? With no active filters everything is kept.
    if (not args.minMappingQuality or args.minMappingQuality == 0) and \
            (not args.samFlagInclude or args.samFlagInclude == 0) and \
            (not args.samFlagExclude or args.samFlagExclude == 0) and \
            (not args.minFragmentLength or args.minFragmentLength == 0) and \
            (not args.maxFragmentLength or args.maxFragmentLength == 0):
        if hasattr(args, "filterRNAstrand"):
            if args.filterRNAstrand not in ["forward", "reverse"]:
                return 1.0
        else:
            return 1.0

    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
    # decide how many alignments must be sampled
    if bam_mapped < 1000000:
        num_needed_to_sample = bam_mapped
    else:
        if 0.1 * bam_mapped >= 1000000:
            num_needed_to_sample = 0.1 * bam_mapped
        else:
            num_needed_to_sample = 1000000
    if args.exactScaling:
        num_needed_to_sample = bam_mapped
    if num_needed_to_sample == bam_mapped:
        # sampling everything: shrink the bin size so the 50 kb windows
        # cover (almost) the whole genome
        distanceBetweenBins = 55000
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    offset = 0
    # Iterate over bins at various non-overlapping offsets until we have enough data
    while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000):
        res = mapReduce.mapReduce((bam_handle.filename, args, offset),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            # sum the per-chunk (filtered, total) pairs
            foo, bar = np.sum(res, axis=0)
            filtered += foo
            total += bar
        offset += 1

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
181
+
182
+
183
def get_num_kept_reads(args, stats):
    """Estimate how many mapped reads survive blacklisting and filtering.

    Starts from the total number of mapped reads in ``args.bam``, removes
    alignments fully inside blacklisted regions, then scales by the
    fraction kept after filtering (see :func:`fraction_kept`).

    :return: tuple of (estimated kept reads, total mapped reads)
    """
    # obtain index statistics on demand when the caller did not supply them
    if stats is None:
        bam_handle, mapped, unmapped, stats = bamHandler.openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    else:
        bam_handle = bamHandler.openBam(args.bam)
    bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)

    num_kept_reads = bam_mapped_total
    if args.blackListFileName:
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                      args.blackListFileName, args.numberOfProcessors)
        print("There are {0} alignments, of which {1} are completely "
              "within a blacklist region.".format(bam_mapped_total, blacklisted))
        num_kept_reads -= blacklisted

    ftk = fraction_kept(args, stats)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used {1}".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
211
+
212
+
213
def get_scale_factor(args, stats):
    """Compute the per-sample scaling factor implied by --normalizeUsing.

    Starts from ``args.scaleFactor`` and multiplies in the
    normalization-specific factor: RPGC (1x genome coverage), RPKM, CPM,
    BPM, or — when no normalization is requested — the fraction of
    alignments kept after filtering.

    Parameters
    ----------
    args : argparse.Namespace
        must provide scaleFactor, normalizeUsing, bam, binSize,
        effectiveGenomeSize, extendReads and the filtering options used by
        get_num_kept_reads.
    stats :
        pre-computed BAM index statistics (may be None), forwarded to
        get_num_kept_reads.

    Returns
    -------
    float
        the final scaling factor.
    """
    scale_factor = args.scaleFactor
    # bam_mapped estimates the post-filtering count; bam_mapped_total ignores filters
    bam_mapped, bam_mapped_total = get_num_kept_reads(args, stats)
    if args.normalizeUsing == 'RPGC':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.effectiveGenomeSize))

        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print(("Fragment length based on paired en data "
                           "estimated to be {}".format(frag_len_dict['median'])))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.effectiveGenomeSize
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            # report the computed factor, not the unmodified input value
            print("Scaling factor {}".format(scale_factor))

    elif args.normalizeUsing == 'RPKM':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")

        # the RPKM is the # reads per tile / \
        # ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(scale_factor))

    elif args.normalizeUsing == 'CPM':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: CPM\n")

        # the CPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        million_reads_mapped = float(bam_mapped) / 1e6
        scale_factor *= 1.0 / (million_reads_mapped)

        if debug:
            print("scale factor using CPM is {0}".format(scale_factor))

    elif args.normalizeUsing == 'BPM':
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: BPM\n")
        # the BPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        # sampled_bins_sum = getSampledSum(args.bam)
        tile_len_in_kb = float(args.binSize) / 1000
        tpm_scaleFactor = (bam_mapped / tile_len_in_kb) / 1e6

        scale_factor *= 1 / (tpm_scaleFactor * tile_len_in_kb)
        if debug:
            print("scale factor using BPM is {0}".format(scale_factor))

    else:
        # Print output, since normalzation stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: none (signal scaled by the fraction of alignments kept after filtering)\n")

        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
deepTools/source/deeptools/getScorePerBigWigBin.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pyBigWig
2
+ import numpy as np
3
+ import os
4
+ import sys
5
+ import shutil
6
+ import warnings
7
+
8
+ # deepTools packages
9
+ import deeptools.mapReduce as mapReduce
10
+ import deeptools.utilities
11
+ # debug = 0
12
+
13
+ old_settings = np.seterr(all='ignore')
14
+
15
+
16
def countReadsInRegions_wrapper(args):
    """Unpack a single tuple of arguments and forward it to
    countFragmentsInRegions_worker.

    Multiprocessing map-style APIs hand each task a single object, so the
    positional arguments travel bundled in one tuple.
    """
    packed = args
    return countFragmentsInRegions_worker(*packed)
19
+
20
+
21
def countFragmentsInRegions_worker(chrom, start, end,
                                   bigWigFiles,
                                   stepSize, binLength,
                                   save_data,
                                   bedRegions=None
                                   ):
    """ returns the average score in each bigwig file at each 'stepSize'
    position within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    and *not adjacent*.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Fragment coverage.
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
    array([[1., 1., 2., 2.],
           [1., 1., 1., 3.]])

    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
    array([[1.5],
           [1.5]])

    BED regions:
    >>> bedRegions = [[test.chrom, [(45, 55)]], [test.chrom, [(95, 105)]], [test.chrom, [(145, 155)]]]
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200,[test.bwFile1, test.bwFile2], 200, 200, False,
    ... bedRegions=bedRegions)[0])
    array([[1. , 1.5, 2. ],
           [1. , 1. , 2. ]])
    """
    assert start < end, "start {} bigger that end {}".format(start, end)

    # Flat list of per-region averages; reshaped into a (rows x files)
    # matrix just before returning.
    sub_score_per_bin = []

    rows = 0

    # Open one handle per bigWig file up front; they are reused for every
    # region and closed before returning (they were previously leaked).
    bigwig_handles = []
    for foo in bigWigFiles:
        bigwig_handles.append(pyBigWig.open(foo))

    # Build the list of regions to score: either the caller-supplied BED
    # regions (each possibly made of several exons) or evenly spaced,
    # possibly non-adjacent, fixed-size bins across [start, end).
    regions_to_consider = []
    if bedRegions:
        for reg in bedRegions:
            regs = []
            for exon in reg[1]:
                regs.append((exon[0], exon[1]))
            regions_to_consider.append(regs)
    else:
        for i in range(start, end, stepSize):
            if (i + binLength) > end:
                regions_to_consider.append([(i, end)])  # last bin (may be smaller)
            else:
                regions_to_consider.append([(i, i + binLength)])

    if save_data:
        # Per-worker temporary BED file holding the raw per-region scores;
        # the caller concatenates and removes these files later.
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''
    warnings.simplefilter("default")
    i = 0
    for reg in regions_to_consider:
        avgReadsArray = []
        i += 1

        for idx, bwh in enumerate(bigwig_handles):
            if chrom not in bwh.chroms():
                unmod_name = chrom
                if chrom.startswith('chr'):
                    # remove the chr part from chromosome name
                    chrom = chrom[3:]
                else:
                    # prefix with 'chr' the chromosome name
                    chrom = 'chr' + chrom
                # NOTE: the renamed `chrom` intentionally persists for the
                # remaining files and regions of this worker.
                if chrom not in bwh.chroms():
                    exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))

            # Average the per-exon bigWig means, weighting each exon by its
            # length, so multi-exon regions are scored per covered base.
            weights = []
            scores = []
            for exon in reg:
                weights.append(exon[1] - exon[0])
                score = bwh.stats(chrom, exon[0], exon[1])

                if score is None or score == [None] or np.isnan(score[0]):
                    score = [np.nan]
                scores.extend(score)
            avgReadsArray.append(np.average(scores, weights=weights))  # mean of fragment coverage for region

        sub_score_per_bin.extend(avgReadsArray)
        rows += 1
        if save_data:
            starts = []
            ends = []
            for exon in reg:
                starts.append(str(exon[0]))
                ends.append(str(exon[1]))
            starts = ",".join(starts)
            ends = ",".join(ends)
            _file.write("\t".join(map(str, [chrom, starts, ends])) + "\t")
            _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")

    if save_data:
        _file.close()
    warnings.resetwarnings()

    # Close the bigWig handles; previously they were left open, leaking one
    # file descriptor per input file per worker invocation.
    for bwh in bigwig_handles:
        bwh.close()

    # the output is a matrix having as many rows as the variable 'row'
    # and as many columns as bigwig files. The rows correspond to
    # each of the regions processed by the worker.
    # np.array([[score1_1, score1_2],
    #           [score2_1, score2_2]]
    return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name
137
+
138
+
139
def getChromSizes(bigwigFilesList):
    """
    Get chromosome sizes from bigWig file with pyBigWig

    Returns a tuple: (sorted list of (name, size) tuples common to all
    files, set of (name, size) tuples that did not match across files).
    Exits the process if any file shares no chromosomes with the others,
    even after toggling a 'chr' prefix.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Chromosome name(s) and size(s).
    >>> assert getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([]))
    """
    def print_chr_names_and_size(chr_set):
        # Helper: dump a (name, size) set to stderr as a small table.
        sys.stderr.write("chromosome\tlength\n")
        for name, size in chr_set:
            sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))

    # Work on a copy so the caller's list is never mutated.
    bigwigFilesList = bigwigFilesList[:]

    # First pass: union of every (name, size) pair across all files.
    common_chr = set()
    for fname in bigwigFilesList:
        fh = pyBigWig.open(fname)
        common_chr = common_chr.union(set(fh.chroms().items()))
        fh.close()

    # Second pass: intersect file by file, collecting mismatches.
    non_common_chr = set()
    for bw in bigwigFilesList:
        # Previously the handle opened here was never closed (fd leak).
        fh = pyBigWig.open(bw)
        _names_and_size = set(fh.chroms().items())
        fh.close()
        if len(common_chr & _names_and_size) == 0:
            # try to add remove 'chr' from the chromosme name
            _corr_names_size = set()
            for chrom_name, size in _names_and_size:
                if chrom_name.startswith('chr'):
                    _corr_names_size.add((chrom_name[3:], size))
                else:
                    _corr_names_size.add(('chr' + chrom_name, size))
            if len(common_chr & _corr_names_size) == 0:
                message = "No common chromosomes found. Are the bigwig files " \
                          "from the same species and same assemblies?\n"
                sys.stderr.write(message)
                print_chr_names_and_size(common_chr)

                sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n"
                                 "lengths from file\n{}\n".format(bw))
                print_chr_names_and_size(_names_and_size)
                exit(1)
            else:
                _names_and_size = _corr_names_size

        # Symmetric difference accumulates chromosomes seen in some files
        # but not all; the intersection shrinks common_chr accordingly.
        non_common_chr |= common_chr ^ _names_and_size
        common_chr = common_chr & _names_and_size

    if len(non_common_chr) > 0:
        sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n")
        print_chr_names_and_size(non_common_chr)

    # get the list of common chromosome names and sizes
    return sorted(common_chr), non_common_chr
195
+
196
+
197
def getScorePerBin(bigWigFiles, binLength,
                   numberOfProcessors=1,
                   verbose=False, region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=None,
                   out_file_for_raw_data=None,
                   allArgs=None):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()
    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[1., 1., 2., 2.],
           [1., 1., 1., 3.]])

    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files
    # if too long, some processors end up free.
    # the following is a heuristic

    # get list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)
    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    # NOTE: the default used to be a mutable [] (shared across calls);
    # None behaves identically in the truthiness test below.
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = list(zip(*chrom_sizes))
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunksize based on number of processors used.
    # Use floor division: '/' would yield a float in Python 3 and the
    # chunk length is used as a base-pair count downstream.
    chunkSize = max(sum(chrlengths) // numberOfProcessors, int(1e6))
    # make chunkSize multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    # Workers only write their raw per-bin scores to temp files when the
    # caller asked for them to be collected.
    save_file = bool(out_file_for_raw_data)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
                                   countReadsInRegions_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   blackListFileName=blackListFileName,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files, removing each temp file
        # once its content has been copied
        with open(out_file_for_raw_data, "w") as ofile:
            for _values, tempFileName in imap_res:
                if tempFileName:
                    with open(tempFileName, 'r') as f:
                        shutil.copyfileobj(f, ofile)
                    os.remove(tempFileName)

    # the matrix scores are in the first element of each of the entries in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin
290
+
291
+
292
class Tester(object):
    """Fixture holding paths to the bundled test bigWig files.

    The two bigWig files were built from these bedGraphs:

        $ cat /tmp/testA.bg
        3R 0 100 1
        3R 100 200 2

        $ cat /tmp/testB.bg
        3R 0 150 1
        3R 150 200 3

    Both cover 200 bp of chromosome 3R:

        0 50 100 150 200
        |------------------------------------------------------------|
        A 111111111111111111111111111111122222222222222222222222222222
        B 111111111111111111111111111111111111111111111333333333333333
    """

    def __init__(self):
        # Resolve the test-data directory relative to this module.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.root = module_dir + "/test/test_data/"
        self.bwFile1 = self.root + "testA.bw"
        self.bwFile2 = self.root + "testB.bw"
        self.bwFile_PE = self.root + "test_paired2.bw"
        self.chrom = '3R'
deepTools/source/deeptools/heatmapper.py ADDED
@@ -0,0 +1,1372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import gzip
3
+ from collections import OrderedDict
4
+ import numpy as np
5
+ from copy import deepcopy
6
+
7
+ import pyBigWig
8
+ from deeptools import getScorePerBigWigBin
9
+ from deeptools import mapReduce
10
+ from deeptools.utilities import toString, toBytes, smartLabels
11
+ from deeptools.heatmapper_utilities import getProfileTicks
12
+
13
+
14
+ old_settings = np.seterr(all='ignore')
15
+
16
+
17
def chopRegions(exonsInput, left=0, right=0):
    """Split a list of (start, end) exon tuples into unscaled flanks and a body.

    ``left`` and ``right`` give the number of bases to peel off the 5' and
    3' ends respectively (the "unscaled" regions); whatever remains in the
    middle is the body to be scaled.

    Returns a 5-tuple:
      leftBins  -- (start, end) tuples covering the 5' unscaled bases
      bodyBins  -- the remaining body tuples
      rightBins -- (start, end) tuples covering the 3' unscaled bases
      padLeft   -- bases of ``left`` that could not be satisfied
      padRight  -- bases of ``right`` that could not be satisfied
    """
    remaining = deepcopy(exonsInput)
    fivePrime = []
    threePrime = []
    needLeft = left
    needRight = right

    # Peel bases off the 5' end until `left` bases are collected or the
    # exons run out.
    while remaining and needLeft > 0:
        s, e = remaining[0]
        span = e - s
        if span <= needLeft:
            fivePrime.append(remaining.pop(0))
            needLeft -= span
        else:
            fivePrime.append((s, s + needLeft))
            remaining[0] = (s + needLeft, e)
            needLeft = 0
    padLeft = needLeft if needLeft > 0 else 0

    # Same from the 3' end, walking backwards.
    while remaining and needRight > 0:
        s, e = remaining[-1]
        span = e - s
        if span <= needRight:
            threePrime.append(remaining.pop())
            needRight -= span
        else:
            threePrime.append((e - needRight, e))
            remaining[-1] = (s, e - needRight)
            needRight = 0
    padRight = needRight if needRight > 0 else 0

    # threePrime was collected end-first; restore genomic order.
    return fivePrime, remaining, threePrime[::-1], padLeft, padRight
66
+
67
+
68
def chopRegionsFromMiddle(exonsInput, left=0, right=0):
    """
    Like chopRegions(), above, but returns two lists of tuples on each side of
    the center point of the exons.

    The steps are as follow:

    1) Find the center point of the set of exons (e.g., [(0, 200), (300, 400), (800, 900)] would be centered at 200)
       * If a given exon spans the center point then the exon is split
    2) The given number of bases at the end of the left-of-center list are extracted
       * If the set of exons don't contain enough bases, then padLeft is incremented accordingly
    3) As above but for the right-of-center list
    4) A tuple of (#2, #3, pading on the left, and padding on the right) is returned
    """
    leftBins = []
    rightBins = []
    # Total covered bases; `middle` is the base offset of the midpoint.
    size = sum([x[1] - x[0] for x in exonsInput])
    middle = size // 2
    cumulativeSum = 0
    padLeft = 0
    padRight = 0
    # Work on a copy so the caller's exon list is not modified.
    exons = deepcopy(exonsInput)

    # Split exons in half
    for exon in exons:
        size = exon[1] - exon[0]
        if cumulativeSum >= middle:
            rightBins.append(exon)
        elif cumulativeSum + size < middle:
            leftBins.append(exon)
        else:
            # This exon straddles the midpoint: split it in two.
            # Don't add 0-width exonic bins!
            if exon[0] < exon[1] - cumulativeSum - size + middle:
                leftBins.append((exon[0], exon[1] - cumulativeSum - size + middle))
            if exon[1] - cumulativeSum - size + middle < exon[1]:
                rightBins.append((exon[1] - cumulativeSum - size + middle, exon[1]))
        cumulativeSum += size

    # Trim leftBins/adjust padLeft
    lSum = sum([x[1] - x[0] for x in leftBins])
    if lSum > left:
        # Too many bases left of center: walk backwards from the midpoint,
        # keeping exons until `left` bases are covered, truncating the last.
        lSum = 0
        for i, exon in enumerate(leftBins[::-1]):
            size = exon[1] - exon[0]
            if lSum + size > left:
                leftBins[-i - 1] = (exon[1] + lSum - left, exon[1])
                break
            lSum += size
            if lSum == left:
                break
            i += 1
        # Keep only the last i exons (those closest to the midpoint).
        if i < len(leftBins):
            leftBins = leftBins[-i:]
    elif lSum < left:
        # Not enough bases: record the shortfall as left padding.
        padLeft = left - lSum

    # Trim rightBins/adjust padRight
    rSum = sum([x[1] - x[0] for x in rightBins])
    if rSum > right:
        # Mirror of the left-side trim, walking forward from the midpoint.
        rSum = 0
        for i, exon in enumerate(rightBins):
            size = exon[1] - exon[0]
            if rSum + size > right:
                rightBins[i] = (exon[0], exon[1] - rSum - size + right)
                break
            rSum += size
            if rSum == right:
                break
        rightBins = rightBins[:i + 1]
    elif rSum < right:
        padRight = right - rSum

    return leftBins, rightBins, padLeft, padRight
141
+
142
+
143
def trimZones(zones, maxLength, binSize, padRight):
    """Clip zone intervals that extend past ``maxLength`` (e.g. a chromosome end).

    ``zones`` is a list of (list of (start, end) tuples, nBins) pairs.
    Intervals entirely beyond ``maxLength`` are dropped; intervals crossing
    it are shortened. Every clipped base is added to ``padRight``. When a
    zone was altered, its bin count is recomputed from the surviving bases.

    Returns (trimmed zones, updated padRight).
    """
    trimmed = []
    for intervals, originalBins in zones:
        kept = []
        wasTrimmed = False
        for lo, hi in intervals:
            if lo >= maxLength:
                # Interval lies completely past the limit: drop it whole.
                wasTrimmed = True
                padRight += hi - lo
                continue
            if hi > maxLength:
                # Interval crosses the limit: clip the overhang.
                wasTrimmed = True
                padRight += hi - maxLength
                hi = maxLength
            if hi > lo:
                kept.append((lo, hi))
        if wasTrimmed:
            binCount = sum(e - s for s, e in kept) // binSize
        else:
            binCount = originalBins
        trimmed.append((kept, binCount))
    return trimmed, padRight
171
+
172
+
173
def compute_sub_matrix_wrapper(args):
    """Unpack a single tuple of arguments and forward it to
    heatmapper.compute_sub_matrix_worker (map-style APIs pass one object).
    """
    packed = args
    return heatmapper.compute_sub_matrix_worker(*packed)
175
+
176
+
177
+ class heatmapper(object):
178
+ """
179
+ Class to handle the reading and
180
+ plotting of matrices.
181
+ """
182
+
183
+ def __init__(self):
184
+ self.parameters = None
185
+ self.lengthDict = None
186
+ self.matrix = None
187
+ self.regions = None
188
+ self.blackList = None
189
+ self.quiet = True
190
+ # These are parameters that were single values in versions <3 but are now internally lists. See issue #614
191
+ self.special_params = set(['unscaled 5 prime', 'unscaled 3 prime', 'body', 'downstream', 'upstream', 'ref point', 'bin size'])
192
+
193
    def getTicks(self, idx):
        """Return (xticks, xtickslabel) for sample column ``idx``.

        Thin wrapper around getProfileTicks() to accommodate the fact that
        each column can have its own reference-point label and therefore
        its own ticks.
        """
        # NOTE(review): reads self.reference_point_label, self.startLabel and
        # self.endLabel, which are not set in __init__ — presumably assigned
        # by the caller before this is used; verify against call sites.
        xticks, xtickslabel = getProfileTicks(self, self.reference_point_label[idx], self.startLabel, self.endLabel, idx)
        return xticks, xtickslabel
199
+
200
    def computeMatrix(self, score_file_list, regions_file, parameters, blackListFileName=None, verbose=False, allArgs=None):
        """
        Splits into
        multiple cores the computation of the scores
        per bin for each region (defined by a hash '#'
        in the regions (BED/GFF) file.
        """
        # --- Validate that every length parameter is a multiple of the bin
        # size, since zones are later divided into whole bins.
        if parameters['body'] > 0 and \
                parameters['body'] % parameters['bin size'] > 0:
            exit("The --regionBodyLength has to be "
                 "a multiple of --binSize.\nCurrently the "
                 "values are {} {} for\nregionsBodyLength and "
                 "binSize respectively\n".format(parameters['body'],
                                                parameters['bin size']))

        # the beforeRegionStartLength is extended such that
        # length is a multiple of binSize
        if parameters['downstream'] % parameters['bin size'] > 0:
            exit("Length of region after the body has to be "
                 "a multiple of --binSize.\nCurrent value "
                 "is {}\n".format(parameters['downstream']))

        if parameters['upstream'] % parameters['bin size'] > 0:
            exit("Length of region before the body has to be a multiple of "
                 "--binSize\nCurrent value is {}\n".format(parameters['upstream']))

        if parameters['unscaled 5 prime'] % parameters['bin size'] > 0:
            exit("Length of the unscaled 5 prime region has to be a multiple of "
                 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 5 prime']))

        # NOTE(review): this message says "5 prime" but checks the 3 prime
        # value — looks like a copy-paste slip in the error text.
        if parameters['unscaled 3 prime'] % parameters['bin size'] > 0:
            exit("Length of the unscaled 5 prime region has to be a multiple of "
                 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 3 prime']))

        if parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] > 0 and parameters['body'] == 0:
            exit('Unscaled 5- and 3-prime regions only make sense with the scale-regions subcommand.\n')

        # Take care of GTF options: defaults first, then overrides from the
        # parsed argparse namespace (if one was supplied).
        transcriptID = "transcript"
        exonID = "exon"
        transcript_id_designator = "transcript_id"
        keepExons = False
        self.quiet = False
        if allArgs is not None:
            allArgs = vars(allArgs)
            transcriptID = allArgs.get("transcriptID", transcriptID)
            exonID = allArgs.get("exonID", exonID)
            transcript_id_designator = allArgs.get("transcript_id_designator", transcript_id_designator)
            keepExons = allArgs.get("keepExons", keepExons)
            self.quiet = allArgs.get("quiet", self.quiet)

        # Fan the per-chromosome work out to worker processes.
        chromSizes, _ = getScorePerBigWigBin.getChromSizes(score_file_list)
        res, labels = mapReduce.mapReduce([score_file_list, parameters],
                                          compute_sub_matrix_wrapper,
                                          chromSizes,
                                          self_=self,
                                          bedFile=regions_file,
                                          blackListFileName=blackListFileName,
                                          numberOfProcessors=parameters['proc number'],
                                          includeLabels=True,
                                          transcriptID=transcriptID,
                                          exonID=exonID,
                                          transcript_id_designator=transcript_id_designator,
                                          keepExons=keepExons,
                                          verbose=verbose)
        # each worker in the pool returns a tuple containing
        # the submatrix data, the regions that correspond to the
        # submatrix, and the number of regions lacking scores
        # Since this is largely unsorted, we need to sort by group

        # merge all the submatrices into matrix
        matrix = np.concatenate([r[0] for r in res], axis=0)
        regions = []
        regions_no_score = 0
        for idx in range(len(res)):
            if len(res[idx][1]):
                regions.extend(res[idx][1])
                regions_no_score += res[idx][2]
        # Sort rows by group index (regions[x][3]) while keeping the
        # original positions so the matrix rows can be reordered to match.
        groups = [x[3] for x in regions]
        foo = sorted(zip(groups, list(range(len(regions))), regions))
        sortIdx = [x[1] for x in foo]
        regions = [x[2] for x in foo]
        matrix = matrix[sortIdx]

        # mask invalid (nan) values
        matrix = np.ma.masked_invalid(matrix)

        assert matrix.shape[0] == len(regions), \
            "matrix length does not match regions length"

        if len(regions) == 0:
            sys.stderr.write("\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n")
            exit(1)
        if regions_no_score == len(regions):
            exit("\nERROR: None of the BED regions could be found in the bigWig"
                 "file.\nPlease check that the bigwig file is valid and "
                 "that the chromosome names between the BED file and "
                 "the bigWig file correspond to each other\n")

        # Warn (but continue) when most regions have no score at all —
        # usually a chromosome-naming mismatch between the inputs.
        if regions_no_score > len(regions) * 0.75:
            file_type = 'bigwig' if score_file_list[0].endswith(".bw") else "BAM"
            prcnt = 100 * float(regions_no_score) / len(regions)
            sys.stderr.write(
                "\n\nWarning: {0:.2f}% of regions are *not* associated\n"
                "to any score in the given {1} file. Check that the\n"
                "chromosome names from the BED file are consistent with\n"
                "the chromosome names in the given {2} file and that both\n"
                "files refer to the same species\n\n".format(prcnt,
                                                             file_type,
                                                             file_type))

        self.parameters = parameters

        # Column boundaries: one slice of num_ind_cols columns per sample.
        numcols = matrix.shape[1]
        num_ind_cols = self.get_num_individual_matrix_cols()
        sample_boundaries = list(range(0, numcols + num_ind_cols, num_ind_cols))
        if allArgs is not None and allArgs['samplesLabel'] is not None:
            sample_labels = allArgs['samplesLabel']
        else:
            sample_labels = smartLabels(score_file_list)

        # Determine the group boundaries: a new group starts wherever the
        # group index (regions[x][3]) changes in the sorted region list.
        group_boundaries = []
        group_labels_filtered = []
        last_idx = -1
        for x in range(len(regions)):
            if regions[x][3] != last_idx:
                last_idx = regions[x][3]
                group_boundaries.append(x)
                group_labels_filtered.append(labels[last_idx])
        group_boundaries.append(len(regions))

        # check if a given group is too small. Groups that
        # are too small can't be plotted and an exception is thrown.
        group_len = np.diff(group_boundaries)
        if len(group_len) > 1:
            sum_len = sum(group_len)
            group_frac = [float(x) / sum_len for x in group_len]
            if min(group_frac) <= 0.002:
                sys.stderr.write(
                    "One of the groups defined in the bed file is "
                    "too small.\nGroups that are too small can't be plotted. "
                    "\n")

        # Wrap everything in the internal _matrix container.
        self.matrix = _matrix(regions, matrix,
                              group_boundaries,
                              sample_boundaries,
                              group_labels_filtered,
                              sample_labels)

        if parameters['skip zeros']:
            self.matrix.removeempty()
352
+
353
+ @staticmethod
354
+ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, parameters, regions):
355
+ """
356
+ Returns
357
+ -------
358
+ numpy matrix
359
+ A numpy matrix that contains per each row the values found per each of the regions given
360
+ """
361
+ if parameters['verbose']:
362
+ sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))
363
+
364
+ # read BAM or scores file
365
+ score_file_handles = []
366
+ for sc_file in score_file_list:
367
+ score_file_handles.append(pyBigWig.open(sc_file))
368
+
369
+ # determine the number of matrix columns based on the lengths
370
+ # given by the user, times the number of score files
371
+ matrix_cols = len(score_file_list) * \
372
+ ((parameters['downstream'] +
373
+ parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] +
374
+ parameters['upstream'] + parameters['body']) //
375
+ parameters['bin size'])
376
+
377
+ # create an empty matrix to store the values
378
+ sub_matrix = np.zeros((len(regions), matrix_cols))
379
+ sub_matrix[:] = np.nan
380
+
381
+ j = 0
382
+ sub_regions = []
383
+ regions_no_score = 0
384
+ for transcript in regions:
385
+ feature_chrom = transcript[0]
386
+ exons = transcript[1]
387
+ feature_start = exons[0][0]
388
+ feature_end = exons[-1][1]
389
+ feature_name = transcript[2]
390
+ feature_strand = transcript[4]
391
+ padLeft = 0
392
+ padRight = 0
393
+ padLeftNaN = 0
394
+ padRightNaN = 0
395
+ upstream = []
396
+ downstream = []
397
+
398
+ # get the body length
399
+ body_length = np.sum([x[1] - x[0] for x in exons]) - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']
400
+
401
+ # print some information
402
+ if parameters['body'] > 0 and \
403
+ body_length < parameters['bin size']:
404
+ if not self.quiet:
405
+ sys.stderr.write("A region that is shorter than the bin size (possibly only after accounting for unscaled regions) was found: "
406
+ "({0}) {1} {2}:{3}:{4}. Skipping...\n".format((body_length - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']),
407
+ feature_name, feature_chrom,
408
+ feature_start, feature_end))
409
+ coverage = np.zeros(matrix_cols)
410
+ if not parameters['missing data as zero']:
411
+ coverage[:] = np.nan
412
+ else:
413
+ if feature_strand == '-':
414
+ if parameters['downstream'] > 0:
415
+ upstream = [(feature_start - parameters['downstream'], feature_start)]
416
+ if parameters['upstream'] > 0:
417
+ downstream = [(feature_end, feature_end + parameters['upstream'])]
418
+ unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 3 prime'], right=parameters['unscaled 5 prime'])
419
+ # bins per zone
420
+ a = parameters['downstream'] // parameters['bin size']
421
+ b = parameters['unscaled 3 prime'] // parameters['bin size']
422
+ d = parameters['unscaled 5 prime'] // parameters['bin size']
423
+ e = parameters['upstream'] // parameters['bin size']
424
+ else:
425
+ if parameters['upstream'] > 0:
426
+ upstream = [(feature_start - parameters['upstream'], feature_start)]
427
+ if parameters['downstream'] > 0:
428
+ downstream = [(feature_end, feature_end + parameters['downstream'])]
429
+ unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 5 prime'], right=parameters['unscaled 3 prime'])
430
+ a = parameters['upstream'] // parameters['bin size']
431
+ b = parameters['unscaled 5 prime'] // parameters['bin size']
432
+ d = parameters['unscaled 3 prime'] // parameters['bin size']
433
+ e = parameters['downstream'] // parameters['bin size']
434
+ c = parameters['body'] // parameters['bin size']
435
+
436
+ # build zones (each is a list of tuples)
437
+ # zone0: region before the region start,
438
+ # zone1: unscaled 5 prime region
439
+ # zone2: the body of the region
440
+ # zone3: unscaled 3 prime region
441
+ # zone4: the region from the end of the region downstream
442
+ # the format for each zone is: [(start, end), ...], number of bins
443
+ # Note that for "reference-point", upstream/downstream will go
444
+ # through the exons (if requested) and then possibly continue
445
+ # on the other side (unless parameters['nan after end'] is true)
446
+ if parameters['body'] > 0:
447
+ zones = [(upstream, a), (unscaled5prime, b), (body, c), (unscaled3prime, d), (downstream, e)]
448
+ elif parameters['ref point'] == 'TES': # around TES
449
+ if feature_strand == '-':
450
+ downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['upstream'])
451
+ if padRight > 0 and parameters['nan after end'] is True:
452
+ padRightNaN += padRight
453
+ elif padRight > 0:
454
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
455
+ padRight = 0
456
+ else:
457
+ unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['upstream'])
458
+ if padLeft > 0 and parameters['nan after end'] is True:
459
+ padLeftNaN += padLeft
460
+ elif padLeft > 0:
461
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
462
+ padLeft = 0
463
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
464
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
465
+ zones = [(upstream, a), (downstream, e)]
466
+ elif parameters['ref point'] == 'center': # at the region center
467
+ if feature_strand == '-':
468
+ upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['downstream'], right=parameters['upstream'])
469
+ else:
470
+ upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['upstream'], right=parameters['downstream'])
471
+ if padLeft > 0 and parameters['nan after end'] is True:
472
+ padLeftNaN += padLeft
473
+ elif padLeft > 0:
474
+ if len(upstream) > 0:
475
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
476
+ else:
477
+ upstream = [(downstream[0][0] - padLeft, downstream[0][0])]
478
+ padLeft = 0
479
+ if padRight > 0 and parameters['nan after end'] is True:
480
+ padRightNaN += padRight
481
+ elif padRight > 0:
482
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
483
+ padRight = 0
484
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
485
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
486
+ # It's possible for a/e to be floats or 0 yet upstream/downstream isn't empty
487
+ if a < 1:
488
+ upstream = []
489
+ a = 0
490
+ if e < 1:
491
+ downstream = []
492
+ e = 0
493
+ zones = [(upstream, a), (downstream, e)]
494
+ else: # around TSS
495
+ if feature_strand == '-':
496
+ unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['downstream'])
497
+ if padLeft > 0 and parameters['nan after end'] is True:
498
+ padLeftNaN += padLeft
499
+ elif padLeft > 0:
500
+ upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
501
+ padLeft = 0
502
+ else:
503
+ downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['downstream'])
504
+ if padRight > 0 and parameters['nan after end'] is True:
505
+ padRightNaN += padRight
506
+ elif padRight > 0:
507
+ downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
508
+ padRight = 0
509
+ a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
510
+ e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
511
+ zones = [(upstream, a), (downstream, e)]
512
+
513
+ foo = parameters['upstream']
514
+ bar = parameters['downstream']
515
+ if feature_strand == '-':
516
+ foo, bar = bar, foo
517
+ if padLeftNaN > 0:
518
+ expected = foo // parameters['bin size']
519
+ padLeftNaN = int(round(float(padLeftNaN) / parameters['bin size']))
520
+ if expected - padLeftNaN - a > 0:
521
+ padLeftNaN += 1
522
+ if padRightNaN > 0:
523
+ expected = bar // parameters['bin size']
524
+ padRightNaN = int(round(float(padRightNaN) / parameters['bin size']))
525
+ if expected - padRightNaN - e > 0:
526
+ padRightNaN += 1
527
+
528
+ coverage = []
529
+ # compute the values for each of the files being processed.
530
+ # "cov" is a numpy array of bins
531
+ for sc_handler in score_file_handles:
532
+ # We're only supporting bigWig files at this point
533
+ cov = heatmapper.coverage_from_big_wig(
534
+ sc_handler, feature_chrom, zones,
535
+ parameters['bin size'],
536
+ parameters['bin avg type'],
537
+ parameters['missing data as zero'],
538
+ not self.quiet)
539
+
540
+ if padLeftNaN > 0:
541
+ cov = np.concatenate([[np.nan] * padLeftNaN, cov])
542
+ if padRightNaN > 0:
543
+ cov = np.concatenate([cov, [np.nan] * padRightNaN])
544
+
545
+ if feature_strand == "-":
546
+ cov = cov[::-1]
547
+
548
+ coverage = np.hstack([coverage, cov])
549
+
550
+ if coverage is None:
551
+ regions_no_score += 1
552
+ if not self.quiet:
553
+ sys.stderr.write(
554
+ "No data was found for region "
555
+ "{0} {1}:{2}-{3}. Skipping...\n".format(
556
+ feature_name, feature_chrom,
557
+ feature_start, feature_end))
558
+
559
+ coverage = np.zeros(matrix_cols)
560
+ if not parameters['missing data as zero']:
561
+ coverage[:] = np.nan
562
+
563
+ try:
564
+ temp = coverage.copy()
565
+ temp[np.isnan(temp)] = 0
566
+ except:
567
+ if not self.quiet:
568
+ sys.stderr.write(
569
+ "No scores defined for region "
570
+ "{0} {1}:{2}-{3}. Skipping...\n".format(feature_name,
571
+ feature_chrom,
572
+ feature_start,
573
+ feature_end))
574
+ coverage = np.zeros(matrix_cols)
575
+ if not parameters['missing data as zero']:
576
+ coverage[:] = np.nan
577
+
578
+ if parameters['min threshold'] is not None and coverage.min() <= parameters['min threshold']:
579
+ continue
580
+ if parameters['max threshold'] is not None and coverage.max() >= parameters['max threshold']:
581
+ continue
582
+ if parameters['scale'] != 1:
583
+ coverage = parameters['scale'] * coverage
584
+
585
+ sub_matrix[j, :] = coverage
586
+
587
+ sub_regions.append(transcript)
588
+ j += 1
589
+
590
+ # remove empty rows
591
+ sub_matrix = sub_matrix[0:j, :]
592
+ if len(sub_regions) != len(sub_matrix[:, 0]):
593
+ sys.stderr.write("regions lengths do not match\n")
594
+ return sub_matrix, sub_regions, regions_no_score
595
+
596
+ @staticmethod
597
+ def coverage_from_array(valuesArray, zones, binSize, avgType):
598
+ try:
599
+ valuesArray[0]
600
+ except (IndexError, TypeError) as detail:
601
+ sys.stderr.write("{0}\nvalues array value: {1}, zones {2}\n".format(detail, valuesArray, zones))
602
+
603
+ cvglist = []
604
+ zoneEnd = 0
605
+ valStart = 0
606
+ valEnd = 0
607
+ for zone, nBins in zones:
608
+ if nBins:
609
+ # linspace is used to more or less evenly partition the data points into the given number of bins
610
+ zoneEnd += nBins
611
+ valStart = valEnd
612
+ valEnd += np.sum([x[1] - x[0] for x in zone])
613
+ counts_list = []
614
+
615
+ # Partition the space into bins
616
+ if nBins == 1:
617
+ pos_array = np.array([valStart])
618
+ else:
619
+ pos_array = np.linspace(valStart, valEnd, nBins, endpoint=False, dtype=int)
620
+ pos_array = np.append(pos_array, valEnd)
621
+
622
+ idx = 0
623
+ while idx < nBins:
624
+ idxStart = int(pos_array[idx])
625
+ idxEnd = max(int(pos_array[idx + 1]), idxStart + 1)
626
+ try:
627
+ counts_list.append(heatmapper.my_average(valuesArray[idxStart:idxEnd], avgType))
628
+ except Exception as detail:
629
+ sys.stderr.write("Exception found: {0}\n".format(detail))
630
+ idx += 1
631
+ cvglist.append(np.array(counts_list))
632
+
633
+ return np.concatenate(cvglist)
634
+
635
+ @staticmethod
636
+ def change_chrom_names(chrom):
637
+ """
638
+ Changes UCSC chromosome names to ensembl chromosome names
639
+ and vice versa.
640
+ """
641
+ if chrom.startswith('chr'):
642
+ # remove the chr part from chromosome name
643
+ chrom = chrom[3:]
644
+ if chrom == "M":
645
+ chrom = "MT"
646
+ else:
647
+ # prefix with 'chr' the chromosome name
648
+ chrom = 'chr' + chrom
649
+ if chrom == "chrMT":
650
+ chrom = "chrM"
651
+
652
+ return chrom
653
+
654
+ @staticmethod
655
+ def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=False, verbose=True):
656
+
657
+ """
658
+ uses pyBigWig
659
+ to query a region define by chrom and zones.
660
+ The output is an array that contains the bigwig
661
+ value per base pair. The summary over bins is
662
+ done in a later step when coverage_from_array is called.
663
+ This method is more reliable than querying the bins
664
+ directly from the bigwig, which should be more efficient.
665
+
666
+ By default, any region, even if no chromosome match is found
667
+ on the bigwig file, produces a result. In other words
668
+ no regions are skipped.
669
+
670
+ zones: array as follows zone0: region before the region start,
671
+ zone1: 5' unscaled region (if present)
672
+ zone2: the body of the region (not always present)
673
+ zone3: 3' unscaled region (if present)
674
+ zone4: the region from the end of the region downstream
675
+
676
+ each zone is a tuple containing start, end, and number of bins
677
+
678
+
679
+ This is useful if several matrices wants to be merged
680
+ or if the sorted BED output of one computeMatrix operation
681
+ needs to be used for other cases
682
+ """
683
+ nVals = 0
684
+ for zone, _ in zones:
685
+ for region in zone:
686
+ nVals += region[1] - region[0]
687
+
688
+ values_array = np.zeros(nVals)
689
+ if not nansAsZeros:
690
+ values_array[:] = np.nan
691
+ if chrom not in list(bigwig.chroms().keys()):
692
+ unmod_name = chrom
693
+ chrom = heatmapper.change_chrom_names(chrom)
694
+ if chrom not in list(bigwig.chroms().keys()):
695
+ if verbose:
696
+ sys.stderr.write("Warning: Your chromosome names do not match.\nPlease check that the "
697
+ "chromosome names in your BED file\ncorrespond to the names in your "
698
+ "bigWig file.\nAn empty line will be added to your heatmap.\nThe problematic "
699
+ "chromosome name is {0}\n\n".format(unmod_name))
700
+
701
+ # return empty nan array
702
+ return heatmapper.coverage_from_array(values_array, zones, binSize, avgType)
703
+
704
+ maxLen = bigwig.chroms(chrom)
705
+ startIdx = 0
706
+ endIdx = 0
707
+ for zone, _ in zones:
708
+ for region in zone:
709
+ startIdx = endIdx
710
+ if region[0] < 0:
711
+ endIdx += abs(region[0])
712
+ values_array[startIdx:endIdx] = np.nan
713
+ startIdx = endIdx
714
+ start = max(0, region[0])
715
+ end = min(maxLen, region[1])
716
+ endIdx += end - start
717
+ if start < end:
718
+ # This won't be the case if we extend off the front of a chromosome, such as (-100, 0)
719
+ values_array[startIdx:endIdx] = bigwig.values(chrom, start, end)
720
+ if end < region[1]:
721
+ startIdx = endIdx
722
+ endIdx += region[1] - end
723
+ values_array[startIdx:endIdx] = np.nan
724
+
725
+ # replaces nans for zeros
726
+ if nansAsZeros:
727
+ values_array[np.isnan(values_array)] = 0
728
+
729
+ return heatmapper.coverage_from_array(values_array, zones,
730
+ binSize, avgType)
731
+
732
+ @staticmethod
733
+ def my_average(valuesArray, avgType='mean'):
734
+ """
735
+ computes the mean, median, etc but only for those values
736
+ that are not Nan
737
+ """
738
+ valuesArray = np.ma.masked_invalid(valuesArray)
739
+ avg = np.ma.__getattribute__(avgType)(valuesArray)
740
+ if isinstance(avg, np.ma.core.MaskedConstant):
741
+ return np.nan
742
+ else:
743
+ return avg
744
+
745
+ def matrix_from_dict(self, matrixDict, regionsDict, parameters):
746
+ self.regionsDict = regionsDict
747
+ self.matrixDict = matrixDict
748
+ self.parameters = parameters
749
+ self.lengthDict = OrderedDict()
750
+ self.matrixAvgsDict = OrderedDict()
751
+
752
+ def read_matrix_file(self, matrix_file):
753
+ # reads a bed file containing the position
754
+ # of genomic intervals
755
+ # In case a hash sign '#' is found in the
756
+ # file, this is considered as a delimiter
757
+ # to split the heatmap into groups
758
+
759
+ import json
760
+ regions = []
761
+ matrix_rows = []
762
+ current_group_index = 0
763
+ max_group_bound = None
764
+
765
+ fh = gzip.open(matrix_file)
766
+ for line in fh:
767
+ line = toString(line).strip()
768
+ # read the header file containing the parameters
769
+ # used
770
+ if line.startswith("@"):
771
+ # the parameters used are saved using
772
+ # json
773
+ self.parameters = json.loads(line[1:].strip())
774
+ max_group_bound = self.parameters['group_boundaries'][1]
775
+ continue
776
+
777
+ # split the line into bed interval and matrix values
778
+ region = line.split('\t')
779
+ chrom, start, end, name, score, strand = region[0:6]
780
+ matrix_row = np.ma.masked_invalid(np.fromiter(region[6:], float))
781
+ matrix_rows.append(matrix_row)
782
+ starts = start.split(",")
783
+ ends = end.split(",")
784
+ regs = [(int(x), int(y)) for x, y in zip(starts, ends)]
785
+ # get the group index
786
+ if len(regions) >= max_group_bound:
787
+ current_group_index += 1
788
+ max_group_bound = self.parameters['group_boundaries'][current_group_index + 1]
789
+ regions.append([chrom, regs, name, max_group_bound, strand, score])
790
+
791
+ matrix = np.vstack(matrix_rows)
792
+ self.matrix = _matrix(regions, matrix, self.parameters['group_boundaries'],
793
+ self.parameters['sample_boundaries'],
794
+ group_labels=self.parameters['group_labels'],
795
+ sample_labels=self.parameters['sample_labels'])
796
+
797
+ if 'sort regions' in self.parameters:
798
+ self.matrix.set_sorting_method(self.parameters['sort regions'],
799
+ self.parameters['sort using'])
800
+
801
+ # Versions of computeMatrix before 3.0 didn't have an entry of these per column, fix that
802
+ nSamples = len(self.matrix.sample_labels)
803
+ h = dict()
804
+ for k, v in self.parameters.items():
805
+ if k in self.special_params and type(v) is not list:
806
+ v = [v] * nSamples
807
+ if len(v) == 0:
808
+ v = [None] * nSamples
809
+ h[k] = v
810
+ self.parameters = h
811
+
812
+ return
813
+
814
+ def save_matrix(self, file_name):
815
+ """
816
+ saves the data required to reconstruct the matrix
817
+ the format is:
818
+ A header containing the parameters used to create the matrix
819
+ encoded as:
820
+ @key:value\tkey2:value2 etc...
821
+ The rest of the file has the same first 5 columns of a
822
+ BED file: chromosome name, start, end, name, score and strand,
823
+ all separated by tabs. After the fifth column the matrix
824
+ values are appended separated by tabs.
825
+ Groups are separated by adding a line starting with a hash (#)
826
+ and followed by the group name.
827
+
828
+ The file is gzipped.
829
+ """
830
+ import json
831
+ self.parameters['sample_labels'] = self.matrix.sample_labels
832
+ self.parameters['group_labels'] = self.matrix.group_labels
833
+ self.parameters['sample_boundaries'] = self.matrix.sample_boundaries
834
+ self.parameters['group_boundaries'] = self.matrix.group_boundaries
835
+
836
+ # Redo the parameters, ensuring things related to ticks and labels are repeated appropriately
837
+ nSamples = len(self.matrix.sample_labels)
838
+ h = dict()
839
+ for k, v in self.parameters.items():
840
+ if type(v) is list and len(v) == 0:
841
+ v = None
842
+ if k in self.special_params and type(v) is not list:
843
+ v = [v] * nSamples
844
+ if len(v) == 0:
845
+ v = [None] * nSamples
846
+ h[k] = v
847
+ fh = gzip.open(file_name, 'wb')
848
+ params_str = json.dumps(h, separators=(',', ':'))
849
+ fh.write(toBytes("@" + params_str + "\n"))
850
+ score_list = np.ma.masked_invalid(np.mean(self.matrix.matrix, axis=1))
851
+ for idx, region in enumerate(self.matrix.regions):
852
+ # join np_array values
853
+ # keeping nans while converting them to strings
854
+ if not np.ma.is_masked(score_list[idx]):
855
+ float(score_list[idx])
856
+ matrix_values = "\t".join(
857
+ np.char.mod('%f', self.matrix.matrix[idx, :]))
858
+ starts = ["{0}".format(x[0]) for x in region[1]]
859
+ ends = ["{0}".format(x[1]) for x in region[1]]
860
+ starts = ",".join(starts)
861
+ ends = ",".join(ends)
862
+ # BEDish format (we don't currently store the score)
863
+ fh.write(
864
+ toBytes('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'.format(
865
+ region[0],
866
+ starts,
867
+ ends,
868
+ region[2],
869
+ region[5],
870
+ region[4],
871
+ matrix_values)))
872
+ fh.close()
873
+
874
+ def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_label='TSS', end_label='TES', averagetype='mean'):
875
+ """
876
+ Saves the values averaged by col using the avg_type
877
+ given
878
+
879
+ Args:
880
+ file_handle: file name to save the file
881
+ reference_point_label: Name of the reference point label
882
+ start_label: Name of the star label
883
+ end_label: Name of the end label
884
+ averagetype: average type (e.g. mean, median, std)
885
+
886
+ """
887
+ # get X labels
888
+ w = self.parameters['bin size']
889
+ b = self.parameters['upstream']
890
+ a = self.parameters['downstream']
891
+ c = self.parameters.get('unscaled 5 prime', 0)
892
+ d = self.parameters.get('unscaled 3 prime', 0)
893
+ m = self.parameters['body']
894
+
895
+ xticks = []
896
+ xtickslabel = []
897
+ for idx in range(self.matrix.get_num_samples()):
898
+ if b[idx] < 1e5:
899
+ quotient = 1000
900
+ symbol = 'Kb'
901
+ else:
902
+ quotient = 1e6
903
+ symbol = 'Mb'
904
+
905
+ if m[idx] == 0:
906
+ last = 0
907
+ if len(xticks):
908
+ last = xticks[-1]
909
+ xticks.extend([last + (k / w[idx]) for k in [w[idx], b[idx], b[idx] + a[idx]]])
910
+ xtickslabel.extend(['{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol), reference_point_label,
911
+ '{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)])
912
+
913
+ else:
914
+ xticks_values = [w[idx]]
915
+
916
+ # only if upstream region is set, add a x tick
917
+ if b[idx] > 0:
918
+ xticks_values.append(b[idx])
919
+ xtickslabel.append('{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol))
920
+
921
+ xtickslabel.append(start_label)
922
+
923
+ if c[idx] > 0:
924
+ xticks_values.append(b[idx] + c[idx])
925
+ xtickslabel.append("")
926
+
927
+ if d[idx] > 0:
928
+ xticks_values.append(b[idx] + c[idx] + m[idx])
929
+ xtickslabel.append("")
930
+
931
+ xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx])
932
+ xtickslabel.append(end_label)
933
+
934
+ if a[idx] > 0:
935
+ xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx] + a[idx])
936
+ xtickslabel.append('{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol))
937
+
938
+ last = 0
939
+ if len(xticks):
940
+ last = xticks[-1]
941
+ xticks.extend([last + (k / w[idx]) for k in xticks_values])
942
+ x_axis = np.arange(xticks[-1]) + 1
943
+ labs = []
944
+ for x_value in x_axis:
945
+ if x_value in xticks and xtickslabel[xticks.index(x_value)]:
946
+ labs.append(xtickslabel[xticks.index(x_value)])
947
+ elif x_value in xticks:
948
+ labs.append("tick")
949
+ else:
950
+ labs.append("")
951
+
952
+ with open(file_handle, 'w') as fh:
953
+ # write labels
954
+ fh.write("bin labels\t\t{}\n".format("\t".join(labs)))
955
+ fh.write('bins\t\t{}\n'.format("\t".join([str(x) for x in x_axis])))
956
+
957
+ for sample_idx in range(self.matrix.get_num_samples()):
958
+ for group_idx in range(self.matrix.get_num_groups()):
959
+ sub_matrix = self.matrix.get_matrix(group_idx, sample_idx)
960
+ values = [str(x) for x in np.ma.__getattribute__(averagetype)(sub_matrix['matrix'], axis=0)]
961
+ fh.write("{}\t{}\t{}\n".format(sub_matrix['sample'], sub_matrix['group'], "\t".join(values)))
962
+
963
+ def save_matrix_values(self, file_name):
964
+ # print a header telling the group names and their length
965
+ fh = open(file_name, 'wb')
966
+ info = []
967
+ groups_len = np.diff(self.matrix.group_boundaries)
968
+ for i in range(len(self.matrix.group_labels)):
969
+ info.append("{}:{}".format(self.matrix.group_labels[i],
970
+ groups_len[i]))
971
+ fh.write(toBytes("#{}\n".format("\t".join(info))))
972
+ # add to header the x axis values
973
+ fh.write(toBytes("#downstream:{}\tupstream:{}\tbody:{}\tbin size:{}\tunscaled 5 prime:{}\tunscaled 3 prime:{}\n".format(
974
+ self.parameters['downstream'],
975
+ self.parameters['upstream'],
976
+ self.parameters['body'],
977
+ self.parameters['bin size'],
978
+ self.parameters.get('unscaled 5 prime', 0),
979
+ self.parameters.get('unscaled 3 prime', 0))))
980
+ sample_len = np.diff(self.matrix.sample_boundaries)
981
+ for i in range(len(self.matrix.sample_labels)):
982
+ info.extend([self.matrix.sample_labels[i]] * sample_len[i])
983
+ fh.write(toBytes("{}\n".format("\t".join(info))))
984
+
985
+ fh.close()
986
+ # reopen again using append mode
987
+ fh = open(file_name, 'ab')
988
+ np.savetxt(fh, self.matrix.matrix, fmt="%.4g", delimiter="\t")
989
+ fh.close()
990
+
991
+ def save_BED(self, file_handle):
992
+ boundaries = np.array(self.matrix.group_boundaries)
993
+ # Add a header
994
+ file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group")
995
+ if self.matrix.silhouette is not None:
996
+ file_handle.write("\tsilhouette")
997
+ file_handle.write("\n")
998
+ for idx, region in enumerate(self.matrix.regions):
999
+ # the label id corresponds to the last boundary
1000
+ # that is smaller than the region index.
1001
+ # for example for a boundary array = [0, 10, 20]
1002
+ # and labels ['a', 'b', 'c'],
1003
+ # for index 5, the label is 'a', for
1004
+ # index 10, the label is 'b' etc
1005
+ label_idx = np.flatnonzero(boundaries <= idx)[-1]
1006
+ starts = ["{0}".format(x[0]) for x in region[1]]
1007
+ ends = ["{0}".format(x[1]) for x in region[1]]
1008
+ starts = ",".join(starts)
1009
+ ends = ",".join(ends)
1010
+ file_handle.write(
1011
+ '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t0'.format(
1012
+ region[0],
1013
+ region[1][0][0],
1014
+ region[1][-1][1],
1015
+ region[2],
1016
+ region[5],
1017
+ region[4]))
1018
+ file_handle.write(
1019
+ '\t{0}\t{1}\t{2}\t{3}'.format(
1020
+ len(region[1]),
1021
+ ",".join([str(int(y) - int(x)) for x, y in region[1]]),
1022
+ ",".join([str(int(x) - int(starts[0])) for x, y in region[1]]),
1023
+ self.matrix.group_labels[label_idx]))
1024
+ if self.matrix.silhouette is not None:
1025
+ file_handle.write("\t{}".format(self.matrix.silhouette[idx]))
1026
+ file_handle.write("\n")
1027
+ file_handle.close()
1028
+
1029
+ @staticmethod
1030
+ def matrix_avg(matrix, avgType='mean'):
1031
+ matrix = np.ma.masked_invalid(matrix)
1032
+ return np.ma.__getattribute__(avgType)(matrix, axis=0)
1033
+
1034
+ def get_individual_matrices(self, matrix):
1035
+ """In case multiple matrices are saved one after the other
1036
+ this method splits them appart.
1037
+ Returns a list containing the matrices
1038
+ """
1039
+ num_cols = matrix.shape[1]
1040
+ num_ind_cols = self.get_num_individual_matrix_cols()
1041
+ matrices_list = []
1042
+ for i in range(0, num_cols, num_ind_cols):
1043
+ if i + num_ind_cols > num_cols:
1044
+ break
1045
+ matrices_list.append(matrix[:, i:i + num_ind_cols])
1046
+ return matrices_list
1047
+
1048
+ def get_num_individual_matrix_cols(self):
1049
+ """
1050
+ returns the number of columns that
1051
+ each matrix should have. This is done because
1052
+ the final matrix that is plotted can be composed
1053
+ of smaller matrices that are merged one after
1054
+ the other.
1055
+ """
1056
+ matrixCols = ((self.parameters['downstream'] + self.parameters['upstream'] + self.parameters['body'] + self.parameters['unscaled 5 prime'] + self.parameters['unscaled 3 prime']) //
1057
+ self.parameters['bin size'])
1058
+
1059
+ return matrixCols
1060
+
1061
+
1062
+ def computeSilhouetteScore(d, idx, labels):
1063
+ """
1064
+ Given a square distance matrix with NaN diagonals, compute the silhouette score
1065
+ of a given row (idx). Each row should have an associated label (labels).
1066
+ """
1067
+ keep = ~np.isnan(d[idx, ])
1068
+ foo = np.bincount(labels[keep], weights=d[idx, ][keep])
1069
+ groupSizes = np.bincount(labels[keep])
1070
+ intraIdx = labels[idx]
1071
+ if groupSizes[intraIdx] == 1:
1072
+ return 0
1073
+ intra = foo[labels[idx]] / groupSizes[intraIdx]
1074
+ interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]]
1075
+ inter = np.min(foo[interMask] / groupSizes[interMask])
1076
+ return (inter - intra) / max(inter, intra)
1077
+
1078
+
1079
+ class _matrix(object):
1080
+ """
1081
+ class to hold heatmapper matrices
1082
+ The base data is a large matrix
1083
+ with definition to know the boundaries for row and col divisions.
1084
+ Col divisions represent groups within a subset, e.g. Active and
1085
+ inactive from PolII bigwig data.
1086
+
1087
+ Row division represent different samples, for example
1088
+ PolII in males vs. PolII in females.
1089
+
1090
+ This is an internal class of the heatmapper class
1091
+ """
1092
+
1093
+ def __init__(self, regions, matrix, group_boundaries, sample_boundaries,
1094
+ group_labels=None, sample_labels=None):
1095
+
1096
+ # simple checks
1097
+ assert matrix.shape[0] == group_boundaries[-1], \
1098
+ "row max do not match matrix shape"
1099
+ assert matrix.shape[1] == sample_boundaries[-1], \
1100
+ "col max do not match matrix shape"
1101
+
1102
+ self.regions = regions
1103
+ self.matrix = matrix
1104
+ self.group_boundaries = group_boundaries
1105
+ self.sample_boundaries = sample_boundaries
1106
+ self.sort_method = None
1107
+ self.sort_using = None
1108
+ self.silhouette = None
1109
+
1110
+ if group_labels is None:
1111
+ self.group_labels = ['group {}'.format(x)
1112
+ for x in range(len(group_boundaries) - 1)]
1113
+ else:
1114
+ assert len(group_labels) == len(group_boundaries) - 1, \
1115
+ "number of group labels does not match number of groups"
1116
+ self.group_labels = group_labels
1117
+
1118
+ if sample_labels is None:
1119
+ self.sample_labels = ['sample {}'.format(x)
1120
+ for x in range(len(sample_boundaries) - 1)]
1121
+ else:
1122
+ assert len(sample_labels) == len(sample_boundaries) - 1, \
1123
+ "number of sample labels does not match number of samples"
1124
+ self.sample_labels = sample_labels
1125
+
1126
+ def get_matrix(self, group, sample):
1127
+ """
1128
+ Returns a sub matrix from the large
1129
+ matrix. Group and sample are ids,
1130
+ thus, row = 0, col=0 get the first group
1131
+ of the first sample.
1132
+
1133
+ Returns
1134
+ -------
1135
+ dictionary containing the matrix,
1136
+ the group label and the sample label
1137
+ """
1138
+ group_start = self.group_boundaries[group]
1139
+ group_end = self.group_boundaries[group + 1]
1140
+ sample_start = self.sample_boundaries[sample]
1141
+ sample_end = self.sample_boundaries[sample + 1]
1142
+
1143
+ return {'matrix': np.ma.masked_invalid(self.matrix[group_start:group_end, :][:, sample_start:sample_end]),
1144
+ 'group': self.group_labels[group],
1145
+ 'sample': self.sample_labels[sample]}
1146
+
1147
+ def get_num_samples(self):
1148
+ return len(self.sample_labels)
1149
+
1150
+ def get_num_groups(self):
1151
+ return len(self.group_labels)
1152
+
1153
+ def set_group_labels(self, new_labels):
1154
+ """ sets new labels for groups
1155
+ """
1156
+ if len(new_labels) != len(self.group_labels):
1157
+ raise ValueError("length new labels != length original labels")
1158
+ self.group_labels = new_labels
1159
+
1160
+ def set_sample_labels(self, new_labels):
1161
+ """ sets new labels for groups
1162
+ """
1163
+ if len(new_labels) != len(self.sample_labels):
1164
+ raise ValueError("length new labels != length original labels")
1165
+ self.sample_labels = new_labels
1166
+
1167
+ def set_sorting_method(self, sort_method, sort_using):
1168
+ self.sort_method = sort_method
1169
+ self.sort_using = sort_using
1170
+
1171
+ def get_regions(self):
1172
+ """Returns the regions per group
1173
+
1174
+ Returns
1175
+ ------
1176
+ list
1177
+
1178
+ Each element of the list is itself a list
1179
+ of dictionaries containing the regions info:
1180
+ chrom, start, end, strand, name etc.
1181
+
1182
+ Each element of the list corresponds to each
1183
+ of the groups
1184
+ """
1185
+ regions = []
1186
+ for idx in range(len(self.group_labels)):
1187
+ start = self.group_boundaries[idx]
1188
+ end = self.group_boundaries[idx + 1]
1189
+ regions.append(self.regions[start:end])
1190
+
1191
+ return regions
1192
+
1193
+ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None):
1194
+ """
1195
+ Sorts and rearranges the submatrices according to the
1196
+ sorting method given.
1197
+ """
1198
+ if sort_method == 'no':
1199
+ return
1200
+
1201
+ if (sample_list is not None) and (len(sample_list) > 0):
1202
+ # get the ids that correspond to the selected sample list
1203
+ idx_to_keep = []
1204
+ for sample_idx in sample_list:
1205
+ idx_to_keep += range(self.sample_boundaries[sample_idx], self.sample_boundaries[sample_idx + 1])
1206
+
1207
+ matrix = self.matrix[:, idx_to_keep]
1208
+
1209
+ else:
1210
+ matrix = self.matrix
1211
+
1212
+ # compute the row average:
1213
+ if sort_using == 'region_length':
1214
+ matrix_avgs = list()
1215
+ for x in self.regions:
1216
+ matrix_avgs.append(np.sum([bar[1] - bar[0] for bar in x[1]]))
1217
+ matrix_avgs = np.array(matrix_avgs)
1218
+ elif sort_using == 'mean':
1219
+ matrix_avgs = np.nanmean(matrix, axis=1)
1220
+ elif sort_using == 'mean':
1221
+ matrix_avgs = np.nanmean(matrix, axis=1)
1222
+ elif sort_using == 'median':
1223
+ matrix_avgs = np.nanmedian(matrix, axis=1)
1224
+ elif sort_using == 'max':
1225
+ matrix_avgs = np.nanmax(matrix, axis=1)
1226
+ elif sort_using == 'min':
1227
+ matrix_avgs = np.nanmin(matrix, axis=1)
1228
+ elif sort_using == 'sum':
1229
+ matrix_avgs = np.nansum(matrix, axis=1)
1230
+ else:
1231
+ sys.exit("{} is an unsupported sorting method".format(sort_using))
1232
+
1233
+ # order per group
1234
+ _sorted_regions = []
1235
+ _sorted_matrix = []
1236
+ for idx in range(len(self.group_labels)):
1237
+ start = self.group_boundaries[idx]
1238
+ end = self.group_boundaries[idx + 1]
1239
+ order = matrix_avgs[start:end].argsort()
1240
+ if sort_method == 'descend':
1241
+ order = order[::-1]
1242
+ _sorted_matrix.append(self.matrix[start:end, :][order, :])
1243
+ # sort the regions
1244
+ _reg = self.regions[start:end]
1245
+ for idx in order:
1246
+ _sorted_regions.append(_reg[idx])
1247
+
1248
+ self.matrix = np.vstack(_sorted_matrix)
1249
+ self.regions = _sorted_regions
1250
+ self.set_sorting_method(sort_method, sort_using)
1251
+
1252
+ def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None):
1253
+ matrix = np.asarray(self.matrix)
1254
+ matrix_to_cluster = matrix
1255
+ if clustering_samples is not None:
1256
+ assert all(i > 0 for i in clustering_samples), \
1257
+ "all indices should be bigger than or equal to 1."
1258
+ assert all(i <= len(self.sample_labels) for i in
1259
+ clustering_samples), \
1260
+ "each index should be smaller than or equal to {}(total "\
1261
+ "number of samples.)".format(len(self.sample_labels))
1262
+
1263
+ clustering_samples = np.asarray(clustering_samples) - 1
1264
+
1265
+ samples_cols = []
1266
+ for idx in clustering_samples:
1267
+ samples_cols += range(self.sample_boundaries[idx],
1268
+ self.sample_boundaries[idx + 1])
1269
+
1270
+ matrix_to_cluster = matrix_to_cluster[:, samples_cols]
1271
+ if np.any(np.isnan(matrix_to_cluster)):
1272
+ # replace nans for 0 otherwise kmeans produces a weird behaviour
1273
+ sys.stderr.write("*Warning* For clustering nan values have to be replaced by zeros \n")
1274
+ matrix_to_cluster[np.isnan(matrix_to_cluster)] = 0
1275
+
1276
+ if method == 'kmeans':
1277
+ from scipy.cluster.vq import vq, kmeans
1278
+
1279
+ centroids, _ = kmeans(matrix_to_cluster, k)
1280
+ # order the centroids in an attempt to
1281
+ # get the same cluster order
1282
+ cluster_labels, _ = vq(matrix_to_cluster, centroids)
1283
+
1284
+ if method == 'hierarchical':
1285
+ # normally too slow for large data sets
1286
+ from scipy.cluster.hierarchy import fcluster, linkage
1287
+ Z = linkage(matrix_to_cluster, method='ward', metric='euclidean')
1288
+ cluster_labels = fcluster(Z, k, criterion='maxclust')
1289
+ # hierarchical clustering labels from 1 .. k
1290
+ # while k-means labels 0 .. k -1
1291
+ # Thus, for consistency, we subtract 1
1292
+ cluster_labels -= 1
1293
+
1294
+ # sort clusters
1295
+ _clustered_mean = []
1296
+ _cluster_ids_list = []
1297
+ for cluster in range(k):
1298
+ cluster_ids = np.flatnonzero(cluster_labels == cluster)
1299
+ _cluster_ids_list.append(cluster_ids)
1300
+ _clustered_mean.append(matrix_to_cluster[cluster_ids, :].mean())
1301
+
1302
+ # reorder clusters based on mean
1303
+ cluster_order = np.argsort(_clustered_mean)[::-1]
1304
+ # create groups using the clustering
1305
+ self.group_labels = []
1306
+ self.group_boundaries = [0]
1307
+ _clustered_regions = []
1308
+ _clustered_matrix = []
1309
+ cluster_number = 1
1310
+ for cluster in cluster_order:
1311
+ self.group_labels.append("cluster_{}".format(cluster_number))
1312
+ cluster_number += 1
1313
+ cluster_ids = _cluster_ids_list[cluster]
1314
+ self.group_boundaries.append(self.group_boundaries[-1] +
1315
+ len(cluster_ids))
1316
+ _clustered_matrix.append(self.matrix[cluster_ids, :])
1317
+ for idx in cluster_ids:
1318
+ _clustered_regions.append(self.regions[idx])
1319
+
1320
+ self.regions = _clustered_regions
1321
+ self.matrix = np.vstack(_clustered_matrix)
1322
+
1323
+ return idx
1324
+
1325
+ def computeSilhouette(self, k):
1326
+ if k > 1:
1327
+ from scipy.spatial.distance import pdist, squareform
1328
+
1329
+ silhouette = np.repeat(0.0, self.group_boundaries[-1])
1330
+ groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1])
1331
+ labels = np.repeat(np.arange(k), groupSizes)
1332
+
1333
+ d = pdist(self.matrix)
1334
+ d2 = squareform(d)
1335
+ np.fill_diagonal(d2, np.nan) # This excludes the diagonal
1336
+ for idx in range(len(labels)):
1337
+ silhouette[idx] = computeSilhouetteScore(d2, idx, labels)
1338
+ sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette)))
1339
+ self.silhouette = silhouette
1340
+
1341
+ def removeempty(self):
1342
+ """
1343
+ removes matrix rows containing only zeros or nans
1344
+ """
1345
+ to_keep = []
1346
+ score_list = np.ma.masked_invalid(np.mean(self.matrix, axis=1))
1347
+ for idx, region in enumerate(self.regions):
1348
+ if np.ma.is_masked(score_list[idx]) or float(score_list[idx]) == 0:
1349
+ continue
1350
+ else:
1351
+ to_keep.append(idx)
1352
+ self.regions = [self.regions[x] for x in to_keep]
1353
+ self.matrix = self.matrix[to_keep, :]
1354
+ # adjust sample boundaries
1355
+ to_keep = np.array(to_keep)
1356
+ self.group_boundaries = [len(to_keep[to_keep < x]) for x in self.group_boundaries]
1357
+
1358
+ def flatten(self):
1359
+ """
1360
+ flatten and remove nans from matrix. Useful
1361
+ to get max and mins from matrix.
1362
+
1363
+ :return flattened matrix
1364
+ """
1365
+ matrix_flatten = np.asarray(self.matrix.flatten())
1366
+ # nans are removed from the flattened array
1367
+ matrix_flatten = matrix_flatten[~np.isnan(matrix_flatten)]
1368
+ if len(matrix_flatten) == 0:
1369
+ num_nan = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
1370
+ raise ValueError("matrix only contains nans "
1371
+ "(total nans: {})".format(num_nan))
1372
+ return matrix_flatten
deepTools/source/deeptools/heatmapper_utilities.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib
3
+ matplotlib.use('Agg')
4
+ matplotlib.rcParams['pdf.fonttype'] = 42
5
+ matplotlib.rcParams['svg.fonttype'] = 'none'
6
+ from deeptools import cm # noqa: F401
7
+ import matplotlib.colors as pltcolors
8
+ import plotly.graph_objs as go
9
+
10
+ old_settings = np.seterr(all='ignore')
11
+
12
+
13
def plot_single(ax, ma, average_type, color, label, plot_type='lines'):
    """
    Draws the column-wise summary of `ma` as a line on `ax`, optionally
    decorated with a fill or an error band.

    Parameters
    ----------
    ax : matplotlib axis
        axis to draw on
    ma : numpy array
        matrix that is summarized column-wise according to `average_type`.
    average_type : str
        string values are sum mean median min max std
    color : str
        a valid color: either a html color name, hex
        (e.g #002233), RGB + alpha tuple or list or RGB tuple or list
    label : str
        legend label
    plot_type: str
        'se' adds a standard-error band, 'std' a standard-deviation band,
        'fill' shades the area between the x axis and the line; any other
        value plots only the average line.

    Returns
    -------
    ax
        matplotlib axis

    Examples
    --------

    >>> import matplotlib.pyplot as plt
    >>> import os
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> matrix = np.array([[1,2,3],
    ...                    [4,5,6],
    ...                    [7,8,9]])
    >>> ax = plot_single(ax, matrix -2, 'mean', color=[0.6, 0.8, 0.9], label='fill light blue', plot_type='fill')
    >>> ax = plot_single(ax, matrix, 'mean', color='blue', label='red')
    >>> ax = plot_single(ax, matrix + 5, 'mean', color='red', label='red', plot_type='std')
    >>> ax = plot_single(ax, matrix + 10, 'mean', color='#cccccc', label='gray se', plot_type='se')
    >>> ax = plot_single(ax, matrix + 20, 'mean', color=(0.9, 0.5, 0.9), label='violet', plot_type='std')
    >>> ax = plot_single(ax, matrix + 30, 'mean', color=(0.9, 0.5, 0.9, 0.5), label='violet with alpha', plot_type='std')
    >>> leg = ax.legend()
    >>> plt.savefig("/tmp/test.pdf")
    >>> plt.close()
    >>> fig = plt.figure()
    >>> os.remove("/tmp/test.pdf")


    """
    summary = getattr(np.ma, average_type)(ma, axis=0)
    x = np.arange(len(summary))
    # numpy RGB(A) arrays are not directly usable as colors everywhere;
    # convert them to a hex string first.
    if isinstance(color, np.ndarray):
        color = pltcolors.to_hex(color, keep_alpha=True)
    ax.plot(x, summary, color=color, label=label, alpha=0.9)

    if plot_type == 'fill':
        ax.fill_between(x, summary, facecolor=color, alpha=0.6, edgecolor='none')
    elif plot_type in ('se', 'std'):
        spread = np.std(ma, axis=0)
        if plot_type == 'se':  # standard error of the mean
            spread /= np.sqrt(ma.shape[0])

        # The band between the summary and summary +/- spread is drawn in a
        # translucent version of the line color.
        band_color = pltcolors.colorConverter.to_rgba(color, 0.2)
        ax.fill_between(x, summary, summary + spread, facecolor=band_color, edgecolor='none')
        ax.fill_between(x, summary, summary - spread, facecolor=band_color, edgecolor='none')

    ax.set_xlim(0, max(x))

    return ax
94
def plotly_single(ma, average_type, color, label, plot_type='line'):
    """A plotly version of plot_single. Returns a list of traces.

    `ma` is summarized column-wise with the np.ma function named by
    `average_type`; `color` is either a matplotlib color name/hex string or
    an RGB sequence; `plot_type` of 'fill', 'se' or 'std' adds the matching
    decoration, anything else yields only the summary line.
    """
    summary = list(np.ma.__getattribute__(average_type)(ma, axis=0))
    x = list(np.arange(len(summary)))
    # Named/hex colors are converted to an [r, g, b] list so the components
    # can be spliced into the rgba() strings below. Non-str colors are
    # assumed to already be indexable RGB sequences.
    if isinstance(color, str):
        color = list(matplotlib.colors.to_rgb(color))
    traces = [go.Scatter(x=x, y=summary, name=label, line={'color': "rgba({},{},{},0.9)".format(color[0], color[1], color[2]), }, showlegend=False)]
    if plot_type == 'fill':
        # NOTE(review): fillcolor is given the raw RGB list here, unlike the
        # rgba() strings used elsewhere — confirm plotly accepts this form.
        traces[0].update(fill='tozeroy', fillcolor=color)

    if plot_type in ['se', 'std']:
        if plot_type == 'se':  # standard error
            std = np.std(ma, axis=0) / np.sqrt(ma.shape[0])
        else:
            std = np.std(ma, axis=0)

        # Build a closed polygon (x forward then reversed) whose top edge is
        # summary + std and bottom edge is summary - std.
        x_rev = x[::-1]
        # `summary` is a list but numpy broadcasting coerces it, so
        # list - ndarray yields an ndarray here.
        lower = summary - std
        trace = go.Scatter(x=x + x_rev,
                           y=np.concatenate([summary + std, lower[::-1]]),
                           fill='tozerox',
                           fillcolor="rgba({},{},{},0.2)".format(color[0], color[1], color[2]),
                           # NOTE(review): go.Line is deprecated in newer
                           # plotly releases — confirm supported version.
                           line=go.Line(color='transparent'),
                           showlegend=False,
                           name=label)
        traces.append(trace)

    return traces
124
def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx):
    """
    returns the position and labelling of the xticks that
    correspond to the heatmap

    As of deepTools 3, the various parameters can be lists, in which case we then need to index things (the idx parameter)

    As of matplotlib 3 the ticks in the heatmap need to have 0.5 added to them.

    As of matplotlib 3.1 there is no longer padding added to all ticks. Reference point ticks will be adjusted by width/2
    or width for spacing and the last half of scaled ticks will be shifted by 1 bin so the ticks are at the beginning of bins.

    Parameters
    ----------
    hm : object
        must expose a ``parameters`` mapping with 'bin size', 'upstream',
        'downstream', 'body' and, optionally, 'unscaled 5 prime' /
        'unscaled 3 prime' entries (scalars, or per-sample lists).
    referencePointLabel : str
        tick label for the reference point (used when 'body' is 0)
    startLabel, endLabel : str
        tick labels for the region start/end (used when 'body' > 0)
    idx : int or None
        sample index when the parameters are per-sample lists; None when
        they are scalars.

    Returns
    -------
    (xticks, xtickslabel)
        tick positions expressed in bins and the matching labels
    """
    w = hm.parameters['bin size']
    b = hm.parameters['upstream']
    a = hm.parameters['downstream']
    if idx is not None:
        w = w[idx]
        b = b[idx]
        a = a[idx]

    # The unscaled flank sizes are optional (absent in older matrices), so
    # fall back to 0 when missing. Only catch the errors a missing key or a
    # non-indexable/short value can raise — the previous bare `except:` also
    # swallowed KeyboardInterrupt/SystemExit and genuine bugs.
    try:
        c = hm.parameters['unscaled 5 prime']
        if idx is not None:
            c = c[idx]
    except (KeyError, IndexError, TypeError):
        c = 0
    try:
        d = hm.parameters['unscaled 3 prime']
        if idx is not None:
            d = d[idx]
    except (KeyError, IndexError, TypeError):
        d = 0
    m = hm.parameters['body']
    if idx is not None:
        m = m[idx]

    # Choose the unit for the flank labels from the upstream extent.
    if b < 1e5:
        quotient = 1000
        symbol = 'Kb'
    else:
        quotient = 1e6
        symbol = 'Mb'

    if m == 0:
        # Reference-point mode: upstream edge, reference point, downstream edge.
        xticks = [(k / w) for k in [0, b - 0.5 * w, b + a - w]]
        xtickslabel = ['{0:.1f}'.format(-(float(b) / quotient)),
                       referencePointLabel,
                       '{0:.1f}{1}'.format(float(a) / quotient, symbol)]
    else:
        xticks_values = [0]
        xtickslabel = []

        # only if upstream region is set, add a x tick
        if b > 0:
            xticks_values.append(b)
            xtickslabel.append('{0:.1f}'.format(-(float(b) / quotient)))

        xtickslabel.append(startLabel)

        # unscaled 5'/3' regions get unlabeled ticks at their boundaries,
        # regardless of whether upstream is 0 (not set)
        if c > 0:
            xticks_values.append(b + c)
            xtickslabel.append("")

        if d > 0:
            xticks_values.append(b + c + m)
            xtickslabel.append("")

        # We need to subtract the bin size from the last 2 points so they're placed at the beginning of the bin
        xticks_values.append(b + c + m + d - w)
        xtickslabel.append(endLabel)

        if a > 0:
            xticks_values.append(b + c + m + d + a - w)
            xtickslabel.append('{0:.1f}{1}'.format(float(a) / quotient, symbol))

        xticks = [(k / w) for k in xticks_values]
        xticks = [max(x, 0) for x in xticks]

    return xticks, xtickslabel
deepTools/source/deeptools/mapReduce.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import multiprocessing
2
+ from deeptoolsintervals import GTF
3
+ import random
4
+
5
+ debug = 0
6
+
7
+
8
def mapReduce(staticArgs, func, chromSize,
              genomeChunkLength=None,
              region=None,
              bedFile=None,
              blackListFileName=None,
              numberOfProcessors=4,
              verbose=False,
              includeLabels=False,
              keepExons=False,
              transcriptID="transcriptID",
              exonID="exonID",
              transcript_id_designator="transcript_id",
              self_=None):
    """
    Split the genome into parts that are sent to workers using a defined
    number of processors. Results are collected and returned.

    For each genomic region the given 'func' is called using
    the following parameters:

     chrom, start, end, staticArgs

    The *arg* are static, *pickable* variables that need to be sent
    to workers.

    The genome chunk length corresponds to a fraction of the genome, in bp,
    that is sent to each of the workers for processing.

    Depending on the type of process a larger or shorter region may be
    preferred

    :param chromSize: A list of tuples containing the chromosome
                      name and its length
    :param region: The format is chr:start:end:tileSize (see function
                   getUserRegion)
    :param staticArgs: tuple of arguments that are sent to the given 'func'

    :param func: function to call. The function is called using the
                 following parameters (chrom, start, end, staticArgs)
    :param bedFile: If a bed file is given, the args to the func to be
                    called are extended to include a list of bed
                    defined regions.
    :param blackListFileName: A list of regions to exclude from all computations.
                              Note that this has genomeChunkLength resolution...
    :param self_: In case mapreduce should make a call to an object
                  the self variable has to be passed.
    :param includeLabels: Pass group and transcript labels into the calling
                          function. These are added to the static args
                          (groupLabel and transcriptName).

    If "includeLabels" is true, a tuple of (results, labels) is returned
    """

    if not genomeChunkLength:
        genomeChunkLength = 1e5
    genomeChunkLength = int(genomeChunkLength)

    if verbose:
        print("genome partition size for multiprocessing: {0}".format(
            genomeChunkLength))

    region_start = 0
    region_end = None

    # if a region is set, that means that the task should only cover
    # the given genomic position

    if region:
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, region)
        if verbose:
            print("chrom size: {0}, region start: {1}, region end: {2}, "
                  "genome chunk length sent to each procesor: {3}".format(chromSize, region_start, region_end, genomeChunkLength))

    if bedFile:
        # A single BED file gets the default group name "genes".
        defaultGroup = None
        if len(bedFile) == 1:
            defaultGroup = "genes"
        bed_interval_tree = GTF(bedFile, defaultGroup=defaultGroup, transcriptID=transcriptID, exonID=exonID, transcript_id_designator=transcript_id_designator, keepExons=keepExons)

    if blackListFileName:
        blackList = GTF(blackListFileName)

    TASKS = []
    # iterate over all chromosomes
    for chrom, size in chromSize:
        # the start is zero unless a specific region is defined
        start = 0 if region_start == 0 else region_start
        for startPos in range(start, size, genomeChunkLength):
            endPos = min(size, startPos + genomeChunkLength)

            # Subtract blacklisted intervals from the chunk; otherwise the
            # chunk is used whole.
            if blackListFileName:
                regions = blSubtract(blackList, chrom, [startPos, endPos])
            else:
                regions = [[startPos, endPos]]

            for reg in regions:
                if self_ is not None:
                    argsList = [self_]
                else:
                    argsList = []

                argsList.extend([chrom, reg[0], reg[1]])
                # add to the argument list the static args passed to the function
                argsList.extend(staticArgs)

                # if a bed file is given, append to the TASK list,
                # a list of bed regions that overlap with the
                # current genomeChunk.
                if bedFile:
                    # This effectively creates batches of intervals, which is
                    # generally more performant due to the added overhead of
                    # initializing additional workers.

                    # TODO, there's no point in including the chromosome
                    if includeLabels:
                        bed_regions_list = [[chrom, x[4], x[2], x[3], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, numericGroups=True, includeStrand=True)]
                    else:
                        bed_regions_list = [[chrom, x[4], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, includeStrand=True)]

                    if len(bed_regions_list) == 0:
                        continue
                    # add to argument list, the position of the bed regions to use
                    argsList.append(bed_regions_list)

                TASKS.append(tuple(argsList))

    if len(TASKS) > 1 and numberOfProcessors > 1:
        if verbose:
            print(("using {} processors for {} "
                   "number of tasks".format(numberOfProcessors,
                                            len(TASKS))))
        random.shuffle(TASKS)
        pool = multiprocessing.Pool(numberOfProcessors)
        # NOTE(review): the huge get() timeout presumably keeps the wait
        # interruptible by Ctrl+C — confirm before changing.
        res = pool.map_async(func, TASKS).get(9999999)
        pool.close()
        pool.join()
    else:
        # With a single task or a single processor, run in-process.
        res = list(map(func, TASKS))

    if includeLabels:
        if bedFile:
            return res, bed_interval_tree.labels
        else:
            return res, None
    return res
156
def getUserRegion(chrom_sizes, region_string, max_chunk_size=1e6):
    r"""
    Verifies if a given region argument, given by the user
    is valid. The format of the region_string is chrom:start:end:tileSize
    where start, end and tileSize are optional.

    :param chrom_sizes: dictionary of chromosome/scaffold size. Key=chromosome name
    :param region_string: a string of the form chr:start:end
    :param max_chunk_size: upper limit for the chunk size
    :return: tuple chrom_size for the region start, region end, chunk size

    #>>> data = getUserRegion({'chr2': 1000}, "chr1:10:10")
    #Traceback (most recent call last):
    # ...
    #NameError: Unknown chromosome: chr1
    #Known chromosomes are: ['chr2']

    If the region end is biger than the chromosome size, this
    value is used instead
    >>> getUserRegion({'chr2': 1000}, "chr2:10:1001")
    ([('chr2', 1000)], 10, 1000, 990)

    Test chunk and regions size reduction to match tile size
    >>> getUserRegion({'chr2': 200000}, "chr2:10:123344:3")
    ([('chr2', 123344)], 9, 123345, 123336)

    Test chromosome name mismatch
    >>> getUserRegion({'2': 200000}, "chr2:10:123344:3")
    ([('2', 123344)], 9, 123345, 123336)
    >>> getUserRegion({'chrM': 200000}, "MT:10:123344:3")
    ([('chrM', 123344)], 9, 123345, 123336)
    """
    fields = region_string.split(":")
    chrom = fields[0]
    chrom_sizes = dict(chrom_sizes)

    if chrom not in chrom_sizes:
        # Try the common chr-prefix / mitochondrial naming aliases before
        # rejecting the chromosome outright.
        if chrom == "MT":
            alias = "chrM"
        elif chrom == "chrM":
            alias = "MT"
        elif chrom.startswith("chr"):
            alias = chrom[3:]
        else:
            alias = "chr" + chrom
        if alias not in chrom_sizes:
            raise NameError("Unknown chromosome: %s\nKnown "
                            "chromosomes are: %s " % (chrom, list(chrom_sizes.keys())))
        chrom = alias

    # Optional start / end fields; end is clamped to the chromosome length.
    region_start = int(fields[1]) if len(fields) > 1 else 0
    if len(fields) > 2:
        region_end = min(int(fields[2]), chrom_sizes[chrom])
    else:
        region_end = chrom_sizes[chrom]
    if region_start > region_end or region_start < 0:
        raise NameError("{} not valid. The format is chrom:start:end. "
                        "Without comas, dashes or dots. ".format(region_string))

    tilesize = int(fields[3]) if len(fields) > 3 else None

    chrom_sizes = [(chrom, region_end)]

    # With a tile size, snap the window outward so both edges land on tile
    # boundaries.
    if tilesize:
        region_start -= region_start % tilesize
        region_end += tilesize - (region_end % tilesize)

    chunk_size = int(region_end - region_start)
    if chunk_size > max_chunk_size:
        chunk_size = max_chunk_size
        if tilesize and tilesize < chunk_size:
            chunk_size -= chunk_size % tilesize

    return chrom_sizes, region_start, region_end, int(chunk_size)
239
def blSubtract(t, chrom, chunk):
    """
    Carves any blacklisted intervals found in `t` on `chrom` out of `chunk`
    (a [start, end] list) and returns the remaining pieces as a list of
    [start, end] lists. With no tree or no overlaps, the chunk is returned
    whole (inside a list).

    Note: `chunk` is modified in place while the overlaps are walked.
    """
    if t is None:
        return [chunk]

    hits = t.findOverlaps(chrom, chunk[0], chunk[1])
    if hits is None or len(hits) == 0:
        return [chunk]

    pieces = []
    for hit in hits:
        # Nothing left of the chunk to carve.
        if chunk[1] <= chunk[0]:
            break
        # Keep whatever lies before this blacklisted interval, then advance
        # the chunk start past it.
        if chunk[0] < hit[0]:
            pieces.append([chunk[0], hit[0]])
        chunk[0] = hit[1]
    if chunk[0] < chunk[1]:
        pieces.append([chunk[0], chunk[1]])

    return pieces
deepTools/source/deeptools/misc.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Force numpy to run single threaded -- see issue #697.
# This module MUST be imported before numpy.
# These environment variables are internal to deepTools (they won't exist
# on the shell after the command completes); values set by the user are
# left untouched.
for _var, _threads in (('MKL_NUM_THREADS', 'sequential'),
                       ('NUMEXPR_NUM_THREADS', '1'),
                       ('OMP_NUM_THREADS', '1'),
                       ('VECLIB_MAXIMUM_THREADS', '1')):
    os.environ.setdefault(_var, _threads)
del _var, _threads