[build-system]
# setuptools >= 77.0.3 is needed because [project].license is written as an
# SPDX expression string (PEP 639); older setuptools releases only accept the
# table form ({ text = ... } / { file = ... }) and fail metadata validation.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "doc_redaction"
version = "2.2.0"
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
readme = "README.md"
authors = [{ name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" }]
maintainers = [{ name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" }]
# SPDX licence expression. This licence type is required in order to use PyMuPDF.
license = "AGPL-3.0-only"
keywords = [
    "redaction",
    "pdf",
    "nlp",
    "documents",
    "document-processing",
    "gradio",
    "pii",
    "pii-detection",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Legal Industry",
    "Topic :: Text Processing :: General",
    "Topic :: Security :: Cryptography",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
requires-python = ">=3.10"
# Core runtime dependencies (PEP 508 strings). Every version pin below is an
# upper bound only (`<=`); no minimum versions are enforced.
dependencies = [
    "pdfminer.six<=20260107",
    "pdf2image<=1.17.0",
    "pymupdf<=1.27.1",
    "bleach<=6.3.0",
    "opencv-python<=4.13.0.92",
    "presidio_analyzer<=2.2.362",
    "presidio_anonymizer<=2.2.362",
    "presidio-image-redactor<=0.0.58",
    "pikepdf<=10.3.0",
    "pandas<=2.3.3",
    "scikit-learn<=1.8.0",
    "spacy<=3.8.14",
    # Direct URL reference: the spaCy model is fetched from its GitHub release archive.
    "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
    "gradio<=6.10.0",
    "boto3<=1.42.91",
    "pyarrow<=23.0.1",
    "openpyxl<=3.1.5",
    "Faker<=40.8.0",
    "python-levenshtein<=0.27.3",
    "spaczz<=0.6.1",
    # Direct URL reference: prebuilt wheel taken from the project's GitHub release.
    "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.5.3/gradio_image_annotation-0.5.3-py3-none-any.whl",
    "rapidfuzz<=3.14.5",
    "python-dotenv<=1.2.2",
    "awslambdaric<=3.1.1",
    "python-docx<=1.2.0",
    "polars<=1.38.1",
    "defusedxml<=0.7.1",
    "numpy<=2.4.4",
    "spaces<=0.48.3",
    "google-genai<=1.73.0",
    "openai<=2.31.0",
    "markdown<=3.10.2",
    "tabulate<=0.10.0",
]

[project.optional-dependencies]
# Testing extras: `dev` installs pytest only; `test` additionally installs pytest-cov.
dev = ["pytest"]
test = ["pytest", "pytest-cov"]

# To install the app with paddle and vlm support using pip, run the following from the base folder with the correct Python environment active: pip install .[paddle,vlm] (or uv pip install .[paddle,vlm] if using uv). Note that GPU support needs the GPU version of Torch; see the note on the vlm extra below.

# Extra dependencies for PaddleOCR.
# This extra installs the CPU version of PaddleOCR. For the GPU-accelerated version, pip install the relevant paddlepaddle-gpu==3.2.1 wheel from: https://www.paddlepaddle.org.cn/packages/stable/cu129/
paddle = [
    "protobuf<=7.34.0",
    "paddlepaddle>=3.0.0,<=3.2.1",
    "paddleocr<=3.3.0",
    "pycocotools<=2.0.10",
]

# Extra dependencies for VLM models.
# This extra installs the CPU-compatible version of PyTorch. For CUDA support, run the following manually after installation:
#   pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129
vlm = [
    "torch<=2.9.1",
    "torchvision<=0.24.1",
    "transformers<=5.5.4",
    "accelerate<=1.13.0",
    # Needed for on-the-fly quantisation in transformers
    "bitsandbytes<=0.49.2",
    # Needed for PaddleOCRVL
    "sentencepiece<=0.2.1",
    # The entries below are optional and therefore left commented out.
    # Needed for GPTQ quantised models in transformers:
    #"optimum<=2.1.0",
    # Needed for GPTQ quantised models in transformers. Highly advised to install from a wheel from https://github.com/ModelCloud/GPTQModel
    #"GPTQModel<=5.8.0",
    # Faster inference with transformers. Highly recommended to install from a wheel at https://github.com/Dao-AILab/flash-attention
    #"flash_attn<=2.8.3",
]

# Extra for running Gradio as an MCP server (same upper bound as the core gradio pin).
mcp = ["gradio[mcp]<=6.10.0"]

# Project links published in the package metadata.
[project.urls]
Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
Repository = "https://github.com/seanpedrick-case/doc_redaction"

# Console commands created at install time; each value is "importable.module:function".
[project.scripts]
# NOTE(review): `cli_redact` is also listed as a top-level module in [tool.setuptools] py-modules.
# Confirm that `doc_redaction.cli_redact` exists and exposes `main` — TODO verify.
cli_redact = "doc_redaction.cli_redact:main"
mcp_doc_redaction = "mcp_doc_redaction.server:main"
doc_redaction_install_deps = "doc_redaction.install_deps:main"

# setuptools-specific packaging options.
[tool.setuptools]
# Include package data files in the built distribution.
include-package-data = true
# Top-level single-file modules shipped in addition to the packages found by
# [tool.setuptools.packages.find].
py-modules = [
    "app",
    "agent_routes",
    "cli_redact",
    "lambda_entrypoint",
    "load_dynamo_logs",
    "load_s3_logs",
]

# Automatic package discovery for setuptools.
[tool.setuptools.packages.find]
# Search for packages starting from the project root.
where = ["."]
# Only package names matching these glob patterns are picked up.
include = ["doc_redaction*", "tools*", "mcp_doc_redaction*"]
# Package name patterns that are never packaged.
exclude = [
    "test*",
    "skills*",
    "cdk*",
    "src*",
    "example_data*",
]

# Configuration for Ruff linter:
[tool.ruff]
# Same value as the Black line-length configured in [tool.black].
line-length = 88

[tool.ruff.lint]
# Enabled rule families: E = pycodestyle errors, F = Pyflakes, I = isort (import sorting).
select = ["E", "F", "I"]
# Individual rules switched off project-wide.
ignore = [
    "E501",  # line-too-long (handled with Black)
    "E402",  # module-import-not-at-top-of-file (sometimes needed for conditional imports)
]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]  # Allow unused imports in __init__.py

# Configuration for the Black formatter:
[tool.black]
line-length = 88
# Oldest supported interpreter; matches requires-python = ">=3.10".
target-version = ["py310"]

# Configuration for pytest:
[tool.pytest.ini_options]
# Custom markers must be declared here because --strict-markers is enabled below.
markers = [
    "integration: optional slow tests (CLI PDF smoke; set PYTEST_CLI_INTEGRATION=1 where needed)",
]
# Filter format: "action:message:category:module" (message and module are regexes).
filterwarnings = [
    "ignore::DeprecationWarning:click.parser",
    "ignore::DeprecationWarning:weasel.util.config",
    # "builtin type" is the start of the warning message, not a module name,
    # so it belongs in the message field; in the module field it never matched.
    "ignore:builtin type:DeprecationWarning",
    "ignore::DeprecationWarning:websockets.legacy",
    "ignore::DeprecationWarning:websockets.server",
    "ignore::DeprecationWarning:spacy.cli._util",
    "ignore::DeprecationWarning:importlib._bootstrap",
]
testpaths = ["test"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "-v",
    "--tb=short",
    "--strict-markers",
    "--disable-warnings",
    # Tests marked `integration` are deselected by default.
    "-m",
    "not integration",
]