[build-system]
# setuptools 77.0.3 is the first release that accepts the SPDX string form of
# `project.license` (PEP 639) used below; older releases reject it.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "doc_redaction"
version = "2.2.0"
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
readme = "README.md"
authors = [
    { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
]
maintainers = [
    { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
]
# This licence type is required to use PyMuPDF.
license = "AGPL-3.0-only"
keywords = [
    "redaction",
    "pdf",
    "nlp",
    "documents",
    "document-processing",
    "gradio",
    "pii",
    "pii-detection",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Legal Industry",
    "Topic :: Text Processing :: General",
    "Topic :: Security :: Cryptography",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
requires-python = ">=3.10"
dependencies = [
    "pdfminer.six<=20260107",
    "pdf2image<=1.17.0",
    "pymupdf<=1.27.1",
    "bleach<=6.3.0",
    "opencv-python<=4.13.0.92",
    "presidio_analyzer<=2.2.362",
    "presidio_anonymizer<=2.2.362",
    "presidio-image-redactor<=0.0.58",
    "pikepdf<=10.3.0",
    "pandas<=2.3.3",
    "scikit-learn<=1.8.0",
    "spacy<=3.8.14",
    "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
    "gradio<=6.10.0",
    "boto3<=1.42.91",
    "pyarrow<=23.0.1",
    "openpyxl<=3.1.5",
    "Faker<=40.8.0",
    "python-levenshtein<=0.27.3",
    "spaczz<=0.6.1",
    "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.5.3/gradio_image_annotation-0.5.3-py3-none-any.whl",
    "rapidfuzz<=3.14.5",
    "python-dotenv<=1.2.2",
    "awslambdaric<=3.1.1",
    "python-docx<=1.2.0",
    "polars<=1.38.1",
    "defusedxml<=0.7.1",
    "numpy<=2.4.4",
    "spaces<=0.48.3",
    "google-genai<=1.73.0",
    "openai<=2.31.0",
    "markdown<=3.10.2",
    "tabulate<=0.10.0",
]

[project.optional-dependencies]
# For testing
dev = ["pytest"]
test = ["pytest", "pytest-cov"]

# To install the app with Paddle and VLM support using pip, run the following
# from the base folder in the correct Python environment:
#   pip install .[paddle,vlm]
# or, if using uv:
#   uv pip install .[paddle,vlm]
# Note: for GPU support, see the Torch installation note on the `vlm` extra below.

# Extra dependencies for PaddleOCR.
# The following installs the CPU version of PaddleOCR. If you want the
# GPU-accelerated version, pip install the relevant wheel for
# paddlepaddle-gpu==3.2.1 from the following link:
# https://www.paddlepaddle.org.cn/packages/stable/cu129/
paddle = [
    "protobuf<=7.34.0",
    "paddlepaddle>=3.0.0,<=3.2.1",
    "paddleocr<=3.3.0",
    "pycocotools<=2.0.10",
]

# Extra dependencies for VLM models.
# The following installs the CPU-compatible version of PyTorch. For Torch CUDA
# support, run this manually after installation:
#   pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129
vlm = [
    "torch<=2.9.1",
    "torchvision<=0.24.1",
    "transformers<=5.5.4",
    "accelerate<=1.13.0",
    # Needed for on-the-fly quantisation in transformers
    "bitsandbytes<=0.49.2",
    # Needed for PaddleOCRVL
    "sentencepiece<=0.2.1",
    # Needed for GPTQ quantised models in transformers. Commented out, as optional.
    #"optimum<=2.1.0",
    # Needed for GPTQ quantised models in transformers. Highly advised to
    # install from a wheel from https://github.com/ModelCloud/GPTQModel
    #"GPTQModel<=5.8.0",
    # Faster inference with transformers. Highly recommended to install from a
    # wheel at https://github.com/Dao-AILab/flash-attention
    #"flash_attn<=2.8.3",
]

# Run Gradio as an MCP server
mcp = [
    "gradio[mcp]<=6.10.0",
]

[project.urls]
Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
Repository = "https://github.com/seanpedrick-case/doc_redaction"

[project.scripts]
cli_redact = "doc_redaction.cli_redact:main"
mcp_doc_redaction = "mcp_doc_redaction.server:main"
doc_redaction_install_deps = "doc_redaction.install_deps:main"

[tool.setuptools]
include-package-data = true
py-modules = [
    "app",
    "agent_routes",
    "cli_redact",
    "lambda_entrypoint",
    "load_dynamo_logs",
    "load_s3_logs",
]

[tool.setuptools.packages.find]
where = ["."]
include = ["doc_redaction*", "tools*", "mcp_doc_redaction*"]
exclude = [
    "test*",
    "skills*",
    "cdk*",
    "src*",
    "example_data*",
]

# Configuration for the Ruff linter:
[tool.ruff]
line-length = 88

[tool.ruff.lint]
select = ["E", "F", "I"]
ignore = [
    "E501", # line-too-long (handled with Black)
    "E402", # module-import-not-at-top-of-file (sometimes needed for conditional imports)
]

[tool.ruff.lint.per-file-ignores]
# Allow unused imports in __init__.py
"__init__.py" = ["F401"]

# Configuration for the Black formatter:
[tool.black]
line-length = 88
target-version = ['py310']

# Configuration for pytest:
[tool.pytest.ini_options]
markers = [
    "integration: optional slow tests (CLI PDF smoke; set PYTEST_CLI_INTEGRATION=1 where needed)",
]
filterwarnings = [
    "ignore::DeprecationWarning:click.parser",
    "ignore::DeprecationWarning:weasel.util.config",
    # NOTE(review): "builtin type" sits in the module field of this filter but
    # looks like a warning *message* prefix; if so the intended form is
    # probably "ignore:builtin type:DeprecationWarning" - confirm before changing.
    "ignore::DeprecationWarning:builtin type",
    "ignore::DeprecationWarning:websockets.legacy",
    "ignore::DeprecationWarning:websockets.server",
    "ignore::DeprecationWarning:spacy.cli._util",
    "ignore::DeprecationWarning:importlib._bootstrap",
]
testpaths = ["test"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "-v",
    "--tb=short",
    "--strict-markers",
    "--disable-warnings",
    "-m",
    "not integration",
]