Karim Shoair committed
Commit · 2b837a0
Parent(s): abf8a7e

Going online (first public version)
- .bandit.yml +5 -0
- .flake8 +3 -0
- .github/FUNDING.yml +1 -0
- .github/ISSUE_TEMPLATE/01-bug_report.yml +82 -0
- .github/ISSUE_TEMPLATE/02-feature_request.yml +19 -0
- .github/ISSUE_TEMPLATE/03-other.yml +19 -0
- .github/ISSUE_TEMPLATE/config.yml +1 -0
- .github/PULL_REQUEST_TEMPLATE.md +53 -0
- .github/workflows/publish.yml +31 -0
- .github/workflows/tests.yml +48 -0
- .gitignore +76 -139
- .pre-commit-config.yaml +14 -0
- CONTRIBUTING.md +30 -0
- MANIFEST.in +5 -0
- README.md +434 -0
- ROADMAP.md +13 -0
- benchmarks.py +139 -0
- docs/Core/using scrapling custom types.md +21 -0
- docs/Examples/selectorless_stackoverflow.py +23 -0
- docs/Extending Scrapling/writing storage system.md +17 -0
- docs/index.md +2 -0
- pytest.ini +2 -0
- scrapling/__init__.py +10 -0
- scrapling/custom_types.py +146 -0
- scrapling/mixins.py +74 -0
- scrapling/parser.py +903 -0
- scrapling/py.typed +0 -0
- scrapling/storage_adaptors.py +149 -0
- scrapling/translator.py +148 -0
- scrapling/utils.py +164 -0
- setup.cfg +8 -0
- setup.py +65 -0
- tests/__init__.py +1 -0
- tests/requirements.txt +2 -0
- tests/test_all_functions.py +336 -0
- tox.ini +20 -0
.bandit.yml
ADDED
@@ -0,0 +1,5 @@
skips:
  - B101
  - B311
  - B320
  - B410
.flake8
ADDED
@@ -0,0 +1,3 @@
[flake8]
ignore = E501  # line too long
exclude = .git,__pycache__,docs,.github,build,dist
.github/FUNDING.yml
ADDED
@@ -0,0 +1 @@
github: D4Vinci
.github/ISSUE_TEMPLATE/01-bug_report.yml
ADDED
@@ -0,0 +1,82 @@
name: Bug report
description: Create a bug report to help us address errors in the repository
labels: [bug]
body:
  - type: checkboxes
    attributes:
      label: Have you searched if there is an existing issue for this?
      description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/bug).
      options:
        - label: I have searched the existing issues
          required: true

  - type: input
    attributes:
      label: "Python version (python --version)"
      placeholder: "Python 3.8"
    validations:
      required: true

  - type: input
    attributes:
      label: "Scrapling version (scrapling.__version__)"
      placeholder: "0.1"
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Dependencies version (pip3 freeze)"
      description: >
        This is the output of the command `pip3 freeze --all`. Note that the
        actual output might be different as compared to the placeholder text.
      placeholder: |
        cssselect==1.2.0
        lxml==5.3.0
        orjson==3.10.7
        ...
    validations:
      required: true

  - type: input
    attributes:
      label: "What's your operating system?"
      placeholder: "Windows 10"
    validations:
      required: true

  - type: dropdown
    attributes:
      label: 'Are you using a separate virtual environment?'
      description: "Please pay attention to this question"
      options:
        - 'No'
        - 'Yes'
      default: 0
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Expected behavior"
      description: "Describe the behavior you expect. May include images or videos."
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Actual behavior (Remember to use the `debug` parameter)"
    validations:
      required: true

  - type: textarea
    attributes:
      label: Steps To Reproduce
      description: Steps to reproduce the behavior.
      placeholder: |
        1. In this environment...
        2. With this config...
        3. Run '...'
        4. See error...
    validations:
      required: false
.github/ISSUE_TEMPLATE/02-feature_request.yml
ADDED
@@ -0,0 +1,19 @@
name: Feature request
description: Suggest features, propose improvements, discuss new ideas.
labels: [enhancement]
body:
  - type: checkboxes
    attributes:
      label: Have you searched if there is an existing feature request for this?
      description: Please search [existing requests](https://github.com/D4Vinci/Scrapling/labels/enhancement).
      options:
        - label: I have searched the existing requests
          required: true

  - type: textarea
    attributes:
      label: "Feature description"
      description: >
        This could include new topics or improving any existing features/implementations.
    validations:
      required: true
.github/ISSUE_TEMPLATE/03-other.yml
ADDED
@@ -0,0 +1,19 @@
name: Other
description: Use this for any other issues. PLEASE do not create blank issues
labels: ["awaiting triage"]
body:
  - type: textarea
    id: issuedescription
    attributes:
      label: What would you like to share?
      description: Provide a clear and concise explanation of your issue.
    validations:
      required: true

  - type: textarea
    id: extrainfo
    attributes:
      label: Additional information
      description: Is there anything else we should know about this issue?
    validations:
      required: false
.github/ISSUE_TEMPLATE/config.yml
ADDED
@@ -0,0 +1 @@
blank_issues_enabled: false
.github/PULL_REQUEST_TEMPLATE.md
ADDED
@@ -0,0 +1,53 @@
<!--
  You are amazing! Thanks for contributing to Scrapling!
  Please, DO NOT DELETE ANY TEXT from this template! (unless instructed).
-->

## Proposed change
<!--
  Describe the big picture of your changes here to communicate to the
  maintainers why we should accept this pull request. If it fixes a bug
  or resolves a feature request, be sure to link to that issue in the
  additional information section.
-->


### Type of change:
<!--
  What type of change does your PR introduce to Scrapling?
  NOTE: Please check at least 1 box!
  If your PR requires multiple boxes to be checked, you'll most likely need to
  split it into multiple PRs. This makes things easier and faster to code review.
-->


- [ ] Dependency upgrade
- [ ] Bugfix (non-breaking change which fixes an issue)
- [ ] New integration (thank you!)
- [ ] New feature (which adds functionality to an existing integration)
- [ ] Deprecation (breaking change to happen in the future)
- [ ] Breaking change (fix/feature causing existing functionality to break)
- [ ] Code quality improvements to existing code or addition of tests
- [ ] Add or change doctests? -- Note: Please avoid changing both code and tests in a single pull request.
- [ ] Documentation change?

### Additional information
<!--
  Details are important, and help maintainers process your PR.
  Please be sure to fill out additional details, if applicable.
-->

- This PR fixes or closes issue: fixes #
- This PR is related to issue:
- Link to documentation pull request: **

### Checklist:
* [ ] I have read [CONTRIBUTING.md](/CONTRIBUTING.md).
* [ ] This pull request is all my own work -- I have not plagiarized.
* [ ] I know that pull requests will not be merged if they fail the automated tests.
* [ ] All new Python files are placed inside an existing directory.
* [ ] All filenames are in all lowercase characters with no spaces or dashes.
* [ ] All functions and variable names follow Python naming conventions.
* [ ] All function parameters and return values are annotated with Python [type hints](https://docs.python.org/3/library/typing.html).
* [ ] All functions have doc-strings.
.github/workflows/publish.yml
ADDED
@@ -0,0 +1,31 @@
name: Publish Python 🐍 distributions 📦 to PyPI

on:
  release:
    types: [created]

jobs:
  build-n-publish:
    name: Build and publish Python 🐍 distributions 📦 to PyPI
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.x"  # Latest available Python version

      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip

      - name: Install build
        run: python3 -m pip install --upgrade build twine setuptools

      - name: Build a binary wheel and a source tarball
        run: python3 -m build --sdist --wheel --outdir dist/

      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1.10.3
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
.github/workflows/tests.yml
ADDED
@@ -0,0 +1,48 @@
name: Tests
on: [push, pull_request]

concurrency:
  group: ${{github.workflow}}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - python-version: "3.6"
            env:
              TOXENV: py
          - python-version: "3.7"
            env:
              TOXENV: py
          - python-version: "3.8"
            env:
              TOXENV: py
          - python-version: "3.9"
            env:
              TOXENV: py
          - python-version: "3.10"
            env:
              TOXENV: py
          - python-version: "3.11"
            env:
              TOXENV: py
          - python-version: "3.12"
            env:
              TOXENV: py

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Run tests
        env: ${{ matrix.env }}
        run: |
          pip install -U tox
          tox
.gitignore
CHANGED
@@ -1,128 +1,25 @@
-# Byte-compiled / optimized / DLL files
+# cached files
 __pycache__/
 *.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
 .cache
+.DS_Store
+*~
+.*.sw[po]
+.build
+.ve
+.env
+.pytest
+.benchmarks
+.bootstrap
+.appveyor.token
+*.bak
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
-.pdm.toml
-.pdm-python
-.pdm-build/
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
 
-# SageMath parsed files
-*.sage.py
+# installation package
+*.egg-info/
+dist/
+build/
 
-# Environments
-.env
+# environments
 .venv
 env/
 venv/
@@ -130,33 +27,73 @@ ENV/
 env.bak/
 venv.bak/
 
-# Spyder project settings
-.spyderproject
-.spyproject
+# C extensions
+*.so
+
+# pycharm
+.idea/
 
-# Rope project settings
-.ropeproject
+# vscode
+*.code-workspace
 
-# mkdocs documentation
-/site
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+.eggs
+parts
+bin
+var
+sdist
+wheelhouse
+develop-eggs
+.installed.cfg
+lib
+lib64
+venv*/
+.venv*/
+pyvenv*/
+pip-wheel-metadata/
+poetry.lock
+
+# Installer logs
+pip-log.txt
 
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
+mypy.ini
+
+# test caches
+.tox/
+.pytest_cache/
+.coverage
+htmlcov
+report.xml
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
 
-# Pyre type checker
-.pyre/
+# Buildout
+.mr.developer.cfg
 
-# pytype static type analyzer
-.pytype/
+# IDE project files
+.project
+.pydevproject
+.idea
+*.iml
+*.komodoproject
 
-# Cython debug symbols
-cython_debug/
+# Complexity
+output/*.html
+output/*/index.html
 
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+# Sphinx
+docs/_build
+public/
+web/
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,14 @@
repos:
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.8
    hooks:
      - id: bandit
        args: [-r, -c, .bandit.yml]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.0.0
    hooks:
      - id: flake8
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
CONTRIBUTING.md
ADDED
@@ -0,0 +1,30 @@
# Contributing to Scrapling
Everybody is invited and welcome to contribute to Scrapling. Smaller changes have a better chance of getting included in a timely manner. Adding unit tests for new features, or test cases for bugs you've fixed, helps us ensure that the Pull Request (PR) is fine.

There is a lot to do...
- If you are not a developer, perhaps you would like to help with the [documentation](/docs)?
- If you are a developer, most of the features I'm planning to add in the future are listed in the [roadmap file](/ROADMAP.md), so consider reading it.

Scrapling includes a comprehensive test suite, which can be executed with pytest:
```bash
$ pytest
=============================== test session starts ===============================
platform darwin -- Python 3.12.7, pytest-8.3.3, pluggy-1.5.0
rootdir: /<some_where>/Scrapling
configfile: pytest.ini
plugins: cov-5.0.0, anyio-4.6.0
collected 16 items

tests/test_all_functions.py ................ [100%]

=============================== 16 passed in 0.22s ================================
```
Also, consider setting `debug` to `True` while initializing the Adaptor object, so it's easier to know what's happening in the background.

### The process is straightforward.

- Read [How to get faster PR reviews](https://github.com/kubernetes/community/blob/master/contributors/guide/pull-requests.md#best-practices-for-faster-reviews) by Kubernetes (but skip steps 0 and 1).
- Fork the Scrapling [git repository](https://github.com/D4Vinci/Scrapling).
- Make your changes.
- Ensure the tests pass.
- Create a Pull Request against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling.
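For example, enabling debug mode is just one extra argument. A minimal sketch, assuming any HTML snippet (the `debug` parameter is confirmed by its use in benchmarks.py below):

```python
from scrapling import Adaptor

# `debug=True` makes Scrapling log what it's doing internally while you develop
page = Adaptor('<html><body><p class="msg">hello</p></body></html>', debug=True)
print(page.css('.msg::text'))
```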
MANIFEST.in
ADDED
@@ -0,0 +1,5 @@
include LICENSE
include scrapling/py.typed

recursive-exclude * __pycache__
recursive-exclude * *.py[co]
README.md
ADDED
@@ -0,0 +1,434 @@
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
[](https://badge.fury.io/py/scrapling) [](https://pypi.org/project/scrapling/) [](https://opensource.org/licenses/BSD-3-Clause)

Dealing with failing web scrapers due to website changes? Meet Scrapling.

Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. Whether you're a beginner or an expert, Scrapling provides powerful features while maintaining simplicity.

```python
from scrapling import Adaptor

# Scrape data that survives website changes
page = Adaptor(html, auto_match=True)
products = page.css('.product', auto_save=True)
# Later, even if selectors change:
products = page.css('.product', auto_match=True)  # Still finds them!
```

## Key Features

### Adaptive Scraping
- 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
- 🎯 **Flexible Querying**: Use CSS selectors, XPath, text search, or regex - chain them however you want!
- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found on the page (e.g., other products like the product you found on the page).
- 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using its powerful features.

### Performance
- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup by up to 237x in our tests).
- 🔋 **Memory Efficient**: Optimized data structures for a minimal memory footprint.
- ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library, with more options.

### Developer Experience
- 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
- 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries, with added methods.
- 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
- 🔌 **Scrapy-Compatible API**: Familiar methods and similar pseudo-elements for Scrapy users.
- 📘 **Type hints**: Complete type coverage for better IDE support and fewer bugs.

## Getting Started

Let's walk through a basic example that demonstrates a small group of Scrapling's core features:

```python
import requests
from scrapling import Adaptor

# Fetch a web page
url = 'https://quotes.toscrape.com/'
response = requests.get(url)

# Create an Adaptor instance
page = Adaptor(response.text, url=url)
# Get all strings in the full page
page.get_all_text(ignore_tags=('script', 'style'))

# Get all quotes; any of these methods will return a list of strings (TextHandlers)
quotes = page.css('.quote .text::text')  # CSS selector
quotes = page.xpath('//span[@class="text"]/text()')  # XPath
quotes = page.css('.quote').css('.text::text')  # Chained selectors
quotes = [element.text for element in page.css('.quote').css('.text')]  # Slower than the bulk query above

# Get the first quote element
quote = page.css('.quote').first  # or [0] or .get()

# Working with elements
quote.html_content  # Inner HTML
quote.prettify()  # Prettified version of Inner HTML
quote.attrib  # Element attributes
quote.path  # DOM path to element (List)
```
To keep it simple, all methods can be chained on top of each other as long as you are chaining methods that return an element (an `Adaptor` object) or a list of elements (an `Adaptors` object).

### Installation
Scrapling is a breeze to get started with - it only requires Python 3.6 or newer, and the rest of the requirements are installed automatically with the package.
```bash
# Using pip
pip install scrapling

# Or the latest from GitHub
pip install git+https://github.com/D4Vinci/Scrapling.git@master
```

## Performance

Scrapling isn't just powerful - it's also blazing fast. Scrapling implements many best practices, design patterns, and numerous optimizations to save fractions of seconds, all while focusing exclusively on parsing HTML documents.
Here are benchmarks comparing Scrapling to popular Python libraries in two tests.

### Text Extraction Speed Test (5000 nested elements)

| # | Library | Time (ms) | vs Scrapling |
|---|:-----------------:|:---------:|:------------:|
| 1 | Scrapling | 5.44 | 1.0x |
| 2 | Parsel/Scrapy | 5.53 | 1.017x |
| 3 | Raw Lxml | 6.76 | 1.243x |
| 4 | PyQuery | 21.96 | 4.037x |
| 5 | Selectolax | 67.12 | 12.338x |
| 6 | BS4 with Lxml | 1307.03 | 240.263x |
| 7 | MechanicalSoup | 1322.64 | 243.132x |
| 8 | BS4 with html5lib | 3373.75 | 620.175x |

As you can see, Scrapling is on par with Parsel/Scrapy and slightly faster than raw lxml, which both libraries are built on top of; these are the closest results to Scrapling. PyQuery is also built on top of lxml, yet Scrapling is still 4 times faster.

### Extraction By Text Speed Test

| Library | Time (ms) | vs Scrapling |
|:-----------:|:---------:|:------------:|
| Scrapling | 2.51 | 1.0x |
| AutoScraper | 11.41 | 4.546x |

Scrapling can find elements with more methods, and it returns full `Adaptor` element objects, not only the text like AutoScraper. So, to make this test fair, both libraries extract an element by its text, find similar elements, and then extract the text content of all of them. As you can see, Scrapling is still 4.5 times faster at the same task.
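In Scrapling terms, that fair test boils down to roughly the following sketch (illustrative only; the exact benchmark code lives in [benchmarks.py](/benchmarks.py)):

```python
import requests
from scrapling import Adaptor

response = requests.get('https://books.toscrape.com/index.html')
page = Adaptor(response.text, url=response.url, auto_match=False)
# Find one element by its text, then collect the text of all similar elements
element = page.find_by_text('Tipping the Velvet', first_match=True)
texts = [el.text for el in element.find_similar()]
```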

> All benchmarks' results are an average of 100 runs. See our [benchmarks.py](/benchmarks.py) for methodology and to run your own comparisons.

## Advanced Features
### Smart Navigation
```python
>>> quote.tag
'div'

>>> quote.parent
<data='<div class="col-md-8"> <div class="quote...' parent='<div class="row"> <div class="col-md-8">...'>

>>> quote.parent.tag
'div'

>>> quote.children
[<data='<span class="text" itemprop="text">“The...' parent='<div class="quote" itemscope itemtype="h...'>,
 <data='<span>by <small class="author" itemprop=...' parent='<div class="quote" itemscope itemtype="h...'>,
 <data='<div class="tags"> Tags: <meta class="ke...' parent='<div class="quote" itemscope itemtype="h...'>]

>>> quote.siblings
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 ...]

>>> quote.next  # gets the next element; the same logic applies to `quote.previous`
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>

>>> quote.children.css(".author::text")
['Albert Einstein']

>>> quote.has_class('quote')
True

# Generate new selectors for any element
>>> quote.css_selector
'body > div > div:nth-of-type(2) > div > div'

# Test these selectors in your favorite browser or reuse them in the library's other methods!
>>> quote.xpath_selector
'//body/div/div[2]/div/div'
```
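For instance, continuing the REPL example above, a generated selector can be fed straight back into the page object (a minimal sketch):

```python
# Regenerate a selector for an element, then use it to find the element again
selector = quote.css_selector
same_quote = page.css(selector).first
```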
If you need more than the element's parent, you can iterate over the whole ancestor tree of any element, like below:
```python
for ancestor in quote.iterancestors():
    pass  # do something with each ancestor...
```
You can search for a specific ancestor of an element that satisfies a function; all you need to do is pass a function that takes an `Adaptor` object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below:
```python
>>> quote.find_ancestor(lambda ancestor: ancestor.has_class('row'))
<data='<div class="row"> <div class="col-md-8">...' parent='<div class="container"> <div class="row...'>
```

### Content-based Selection & Finding Similar Elements
You can select elements by their text content in multiple ways; here's a full example on another website:
```python
>>> response = requests.get('https://books.toscrape.com/index.html')

>>> page = Adaptor(response.text, url=response.url)

>>> page.find_by_text('Tipping the Velvet')  # Find the first element whose text fully matches this text
<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

>>> page.find_by_text('Tipping the Velvet', first_match=False)  # Get all matches if there are more
[<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

>>> page.find_by_regex(r'£[\d\.]+')  # Get the first element whose text content matches my price regex
<data='<p class="price_color">£51.77</p>' parent='<div class="product_price"> <p class="pr...'>

>>> page.find_by_regex(r'£[\d\.]+', first_match=False)  # Get all elements that match my price regex
[<data='<p class="price_color">£51.77</p>' parent='<div class="product_price"> <p class="pr...'>,
 <data='<p class="price_color">£53.74</p>' parent='<div class="product_price"> <p class="pr...'>,
 <data='<p class="price_color">£50.10</p>' parent='<div class="product_price"> <p class="pr...'>,
 <data='<p class="price_color">£47.82</p>' parent='<div class="product_price"> <p class="pr...'>,
 ...]
```
Find all elements that are similar to the current element in location and attributes:
```python
# For this case, ignore the 'title' attribute while matching
>>> page.find_by_text('Tipping the Velvet').find_similar(ignore_attributes=['title'])
[<data='<a href="catalogue/a-light-in-the-attic_...' parent='<h3><a href="catalogue/a-light-in-the-at...'>,
 <data='<a href="catalogue/soumission_998/index....' parent='<h3><a href="catalogue/soumission_998/in...'>,
 <data='<a href="catalogue/sharp-objects_997/ind...' parent='<h3><a href="catalogue/sharp-objects_997...'>,
 ...]

# You will notice that the number of elements is 19 not 20 because the current element is not included.
>>> len(page.find_by_text('Tipping the Velvet').find_similar(ignore_attributes=['title']))
19

# Get the `href` attribute from all similar elements
>>> [element.attrib['href'] for element in page.find_by_text('Tipping the Velvet').find_similar(ignore_attributes=['title'])]
['catalogue/a-light-in-the-attic_1000/index.html',
 'catalogue/soumission_998/index.html',
 'catalogue/sharp-objects_997/index.html',
 ...]
```
To increase the complexity a little bit, let's say we want to get all books' data using that element as a starting point, for some reason:
```python
>>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
        print({
            "name": product.css('h3 a::text')[0],
            "price": product.css('.price_color')[0].re_first(r'[\d\.]+'),
            "stock": product.css('.availability::text')[-1].clean()
        })
{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
{'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}
{'name': 'Sharp Objects', 'price': '47.82', 'stock': 'In stock'}
...
```
The [documentation](/docs/Examples) will provide more advanced examples.

### Handling Structural Changes
> Because [the internet archive](https://web.archive.org/) is down at the time of writing this, I can't use real websites as examples even though I tested that before (I mean browsing an old version of a website and then treating the current version of the website as the structural change).

Let's say you are scraping a page with a structure like this:
```html
<div class="container">
    <section class="products">
        <article class="product" id="p1">
            <h3>Product 1</h3>
            <p class="description">Description 1</p>
        </article>
        <article class="product" id="p2">
            <h3>Product 2</h3>
            <p class="description">Description 2</p>
        </article>
    </section>
</div>
```
And you want to scrape the first product, the one with the `p1` ID. You would probably write a selector like this:
```python
page.css('#p1')
```
When website owners implement structural changes like
```html
<div class="new-container">
    <div class="product-wrapper">
        <section class="products">
            <article class="product new-class" data-id="p1">
                <div class="product-info">
                    <h3>Product 1</h3>
                    <p class="new-description">Description 1</p>
                </div>
            </article>
            <article class="product new-class" data-id="p2">
                <div class="product-info">
                    <h3>Product 2</h3>
                    <p class="new-description">Description 2</p>
                </div>
            </article>
        </section>
    </div>
</div>
```
the selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.

```python
# Before the change
page = Adaptor(page_source, url='example.com', auto_match=True)
element = page.css('#p1', auto_save=True)
if not element:  # One day the website changes?
    element = page.css('#p1', auto_match=True)  # Still finds it!
# the rest of the code...
```
> How does the auto-matching work? Check the [FAQs](#faqs) section for that and other possible issues while auto-matching.

**Notes:**
1. Passing the `auto_save` argument without setting `auto_match` to `True` while initializing the Adaptor object will result in the `auto_save` argument being ignored, along with the following warning message:
```text
Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
```
This behavior is purely for performance reasons, so the database gets created/connected only when you are planning to use the auto-matching features. The same applies to the `auto_match` argument.

2. The `auto_match` parameter works only for `Adaptor` instances, not `Adaptors`, so if you do something like this you will get an error
```python
page.css('body').css('#p1', auto_match=True)
```
because you can't auto-match a whole list; you have to be specific and do something like
```python
page.css('body')[0].css('#p1', auto_match=True)
```

### Is That All?
Here's what else you can do with Scrapling:

- Accessing the `lxml.etree` object itself of any element directly:
```python
>>> quote._root
<Element div at 0x107f98870>
```
- Saving and retrieving elements manually to auto-match them outside the `css` and `xpath` methods, but you have to set the identifier yourself.

  - To save an element to the database:
    ```python
    >>> element = page.find_by_text('Tipping the Velvet', first_match=True)
    >>> page.save(element, 'my_special_element')
    ```
  - Later, when you want to retrieve it and relocate it in the page with auto-matching, it would be like this:
    ```python
    >>> element_dict = page.retrieve('my_special_element')
    >>> page.relocate(element_dict, adaptor_type=True)
    [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
    >>> page.relocate(element_dict, adaptor_type=True).css('::text')
    ['Tipping the Velvet']
    ```
  - If you want to keep it as an `lxml.etree` object, leave out the `adaptor_type` argument:
    ```python
    >>> page.relocate(element_dict)
    [<Element a at 0x105a2a7b0>]
    ```

- Doing operations on element content is the same as in Scrapy:
```python
quote.re(r'somethings')  # Get all strings (TextHandlers) that match the regex pattern
quote.re_first(r'something')  # Get the first string (TextHandler) only
quote.json()  # If the content text is jsonable, convert it to JSON using `orjson`, which is 10x faster than the standard json library and provides more options
```
All of these are actually methods of the `TextHandler` object that holds the element's text content, so the same can be done directly on the `.text` property or the result of the equivalent selector function.

- Doing operations on the text content itself includes:
  - Cleaning the text of whitespace and replacing consecutive spaces with a single space:
    ```python
    quote.clean()
    ```
  - You already know about regex matching and fast JSON parsing, but did you know that all strings returned from a regex search are `TextHandler` objects too? So when you have, for example, a JS object assigned to a variable inside JavaScript code and want to extract it with regex and then convert it to a JSON object, in other libraries this would take more than one line of code, but here it's one line:
    ```python
    page.xpath('//script/text()').re_first(r'var dataLayer = (.+);').json()
    ```
  - Sorting all characters in the string as if it were a list and returning the new string:
    ```python
    quote.sort()
    ```
> To be clear, `TextHandler` is a sub-class of Python's `str`, so all normal operations/methods that work with Python strings will work with it.
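A quick illustration of that point, since every extracted string behaves like a normal Python string (a sketch reusing the quotes page from earlier):

```python
import requests
from scrapling import Adaptor

page = Adaptor(requests.get('https://quotes.toscrape.com/').text)
text = page.css('.quote .text::text')[0]  # a TextHandler instance
print(text.upper())   # inherited str methods work as usual
print(text.clean())   # plus the added helpers like clean()
```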

- Any element's attributes are not exactly a dictionary but a read-only sub-class of [mapping](https://docs.python.org/3/glossary.html#term-mapping) called `AttributesHandler`, so it's faster, and the string values it returns are `TextHandler` objects, so all the operations above can be done on them, plus standard dictionary operations that don't modify the data, and more :)
  - Unlike standard dictionaries, here you can search by values too and do partial searches. It might be handy in some cases (it returns a generator of matches):
    ```python
    >>> for item in element.attrib.search_values('catalogue', partial=True):
            print(item)
    {'href': 'catalogue/tipping-the-velvet_999/index.html'}
    ```
  - Serialize the current attributes to JSON bytes:
    ```python
    >>> element.attrib.json_string
    b'{"href":"catalogue/tipping-the-velvet_999/index.html","title":"Tipping the Velvet"}'
    ```
  - Convert them to a normal dictionary:
    ```python
    >>> dict(element.attrib)
    {'href': 'catalogue/tipping-the-velvet_999/index.html',
     'title': 'Tipping the Velvet'}
    ```

Scrapling is under active development, so expect many more features coming soon :)

## More Advanced Usage

There are a lot of deep details skipped here to keep this as short as possible, so to take a deep dive, head to the [docs](/docs) section. I will try to keep it as updated as possible and add complex examples. There I will explain points like how to write your own storage system, how to write spiders that don't depend on selectors at all, and more...

Note that implementing your own storage system can be complex, as there are some strict rules, such as inheriting from the same abstract class, following the singleton design pattern used in the other classes, and more. So make sure to read the docs first.


## FAQs
This section addresses common questions about Scrapling; please read it before opening an issue.

### How does auto-matching work?
1. You need to get a working selector and run it at least once with the `css` or `xpath` methods with the `auto_save` parameter set to `True`, before structural changes happen.
2. Before returning results to you, Scrapling uses its configured database and saves unique properties about that element.
3. Because everything about the element can be changed or removed, nothing from the element itself can be used as a unique identifier in the database. To solve this issue, I made the storage system rely on two things:
   1. The domain of the URL you gave while initializing the first Adaptor object.
   2. The `identifier` parameter you passed to the method while selecting. If you didn't pass one, then the selector string itself will be used as the identifier, but remember that you will then have to use it as the identifier value later, when the structure changes and you want to pass the new selector.

   Together, both are used to retrieve the element's unique properties from the database later.
4. Later, when you enable the `auto_match` parameter for both the Adaptor instance and the method call, the element's properties are retrieved and Scrapling loops over all elements in the page, comparing each one's unique properties to the unique properties we already have for this element; a score is calculated for each one.
5. The comparison between elements is not exact but is about how similar the values are, so everything is taken into consideration, even the values' order - like the order in which the element's class names were written before and the order in which the same class names are written now.
6. The score for each element is stored in the table and, in the end, the element(s) with the highest combined similarity scores are returned.
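As an illustration only - not Scrapling's actual scoring code - an order-sensitive similarity comparison of that flavor could be sketched like this, where `saved` and `candidate` are hypothetical property dicts:

```python
from difflib import SequenceMatcher

def similarity_score(saved: dict, candidate: dict) -> float:
    """Toy scorer: compare each saved property against the candidate's, order-sensitively."""
    ratios = []
    for key, old_value in saved.items():
        new_value = candidate.get(key, '')
        # SequenceMatcher rewards similar content AND similar ordering, so class
        # names written in a different order still score high, just below 1.0
        ratios.append(SequenceMatcher(None, str(old_value), str(new_value)).ratio())
    return sum(ratios) / len(ratios) if ratios else 0.0
```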

### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
Not a big problem; it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you later use the same identifier for a different website that you also initialized without a URL, as the save process will overwrite the previous data and auto-matching uses only the latest saved properties.

### If all things about an element can change or get removed, what are the unique properties to be saved?
For each element, Scrapling will extract:
- The element's tag name, text, attributes (names and values), siblings (tag names only), and path (tag names only).
- The element's parent tag name, attributes (names and values), and text.
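As a purely illustrative sketch (the real record format is up to Scrapling's storage adaptors), such saved properties might look like:

```python
# Hypothetical shape of a saved unique-properties record (illustrative only)
element_properties = {
    'tag': 'a',
    'text': 'Tipping the Velvet',
    'attributes': {'href': 'catalogue/tipping-the-velvet_999/index.html',
                   'title': 'Tipping the Velvet'},
    'siblings': [],                             # sibling tag names only
    'path': ['html', 'body', 'section', 'h3'],  # ancestor tag names only
    'parent': {'tag': 'h3', 'attributes': {}, 'text': 'Tipping the Velvet'},
}
```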

### I have enabled the `auto_save`/`auto_match` parameter while selecting and it got completely ignored with a warning message
That's because passing the `auto_save`/`auto_match` argument without setting `auto_match` to `True` while initializing the Adaptor object will result in the argument being ignored. This behavior is purely for performance reasons, so the database gets created only when you are planning to use the auto-matching features.

### I have done everything as in the docs but the auto-matching didn't return anything, what's wrong?
It could be one of these reasons:
1. No data was saved/stored for this element before.
2. The selector passed is not the one used while storing the element's data. The solution is simple:
   - Pass the old selector again as an identifier to the method called.
   - Retrieve the element with the `retrieve` method using the old selector as the identifier, then save it again with the `save` method and the new selector as the identifier.
   - Start using the `identifier` argument more often if you are planning to use every new selector from now on.
3. The website had some extreme structural changes, like a full redesign. If this happens a lot with this website, the solution would be to make your code as selector-free as possible using Scrapling's features.

### Can Scrapling replace code built on top of BeautifulSoup4?
Pretty much, yeah; almost all features you get from BeautifulSoup can be found or achieved in Scrapling one way or another. In fact, if you see a feature in bs4 that is missing in Scrapling, please make a feature request from the issues tab to let me know.

### Can Scrapling replace code built on top of AutoScraper?
Of course; you can find elements by text/regex, find similar elements in a more reliable way than AutoScraper, and finally save/retrieve elements manually to use later, like the model feature in AutoScraper. I have pulled all the top articles about AutoScraper from Google and tested Scrapling against the examples in them. In all examples, Scrapling got the same results as AutoScraper in much less time.

### Is Scrapling thread-safe?
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.

## Contributing
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!

Please read the [contributing file](/CONTRIBUTING.md) before doing anything.

## License
This work is licensed under BSD-3.

## Acknowledgments
This project includes code adapted from:
- Parsel (BSD License) - Used for the [translator](/scrapling/translator.py) submodule

## Known Issues
- In the auto-matching save process, only the unique properties of the first element from the selection results are saved. So if the selector you are using matches different elements on the page in different locations, auto-matching will probably return only the first element when you relocate it later. This doesn't apply to combined CSS selectors (using commas to combine more than one selector, for example) as these selectors get separated and each selector gets executed alone.
- Currently, Scrapling is not compatible with async/await.

<div align="center"><small>Made with ❤️ by Karim Shoair</small></div><br>
ROADMAP.md
ADDED
@@ -0,0 +1,13 @@
## TODOs
- Add more tests and increase the code coverage.
- Structure the tests folder in a better way.
- Add more documentation.
- Add the browsing ability.
- Create detailed documentation for the 'readthedocs' website, preferably with a GitHub Action for deploying it.
- Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
- Add more functionality to `AttributesHandler` and more navigation functions to the `Adaptor` object (ex: functions similar to map, filter, and reduce, but passed to the element and executed on children, siblings, next elements, etc...)
- Add a `.filter` method to the `Adaptors` object and other similar methods.
- Add functionality to automatically detect pagination URLs.
- Add the ability to auto-detect schemas in pages and manipulate them.
- Add the ability to generate a regex from a group of elements (like for all href attributes).
-
benchmarks.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import timeit
|
| 3 |
+
import functools
|
| 4 |
+
import requests
|
| 5 |
+
from statistics import mean
|
| 6 |
+
|
| 7 |
+
from scrapling import Adaptor
|
| 8 |
+
from parsel import Selector
|
| 9 |
+
from lxml import etree, html
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
from pyquery import PyQuery as pq
|
| 12 |
+
from autoscraper import AutoScraper
|
| 13 |
+
from selectolax.parser import HTMLParser
|
| 14 |
+
from mechanicalsoup import StatefulBrowser
|
| 15 |
+
|
| 16 |
+
large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def benchmark(func):
|
| 20 |
+
@functools.wraps(func)
|
| 21 |
+
def wrapper(*args, **kwargs):
|
| 22 |
+
benchmark_name = func.__name__.replace('test_', '').replace('_', ' ')
|
| 23 |
+
print(f"-> {benchmark_name}", end=" ", flush=True)
|
| 24 |
+
# Warm-up phase
|
| 25 |
+
timeit.repeat(lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals())
|
| 26 |
+
# Measure time (1 run, repeat 100 times, take average)
|
| 27 |
+
times = timeit.repeat(
|
| 28 |
+
lambda: func(*args, **kwargs), number=1, repeat=100, globals=globals(), timer=time.process_time
|
| 29 |
+
)
|
| 30 |
+
avg_time = round(mean(times) * 1000, 2) # Convert seconds to milliseconds
|
| 31 |
+
print(f"average execution time: {min_time} ms")
|
| 32 |
+
return avg_time
|
| 33 |
+
|
| 34 |
+
return wrapper
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@benchmark
|
| 38 |
+
def test_lxml():
|
| 39 |
+
return [
|
| 40 |
+
e.text
|
| 41 |
+
for e in etree.fromstring(
|
| 42 |
+
large_html,
|
| 43 |
+
# Scrapling and Parsel use the same parser inside so this is just to make it fair
|
| 44 |
+
parser=html.HTMLParser(recover=True, huge_tree=True)
|
| 45 |
+
).cssselect('.item')]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@benchmark
|
| 49 |
+
def test_bs4_lxml():
|
| 50 |
+
return [e.text for e in BeautifulSoup(large_html, 'lxml').select('.item')]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@benchmark
|
| 54 |
+
def test_bs4_html5lib():
|
| 55 |
+
return [e.text for e in BeautifulSoup(large_html, 'html5lib').select('.item')]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@benchmark
|
| 59 |
+
def test_pyquery():
|
| 60 |
+
return [e.text() for e in pq(large_html)('.item').items()]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@benchmark
|
| 64 |
+
def test_scrapling():
|
| 65 |
+
# No need to do `.extract()` like parsel to extract text
|
| 66 |
+
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False, debug=False).css('.item')]`
|
| 67 |
+
# for obvious reasons, of course.
|
| 68 |
+
return Adaptor(large_html, auto_match=False, debug=False).css('.item::text')
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@benchmark
|
| 72 |
+
def test_parsel():
|
| 73 |
+
return Selector(text=large_html).css('.item::text').extract()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@benchmark
|
| 77 |
+
def test_mechanicalsoup():
|
| 78 |
+
browser = StatefulBrowser()
|
| 79 |
+
browser.open_fake_page(large_html)
|
| 80 |
+
return [e.text for e in browser.page.select('.item')]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@benchmark
|
| 84 |
+
def test_selectolax():
|
| 85 |
+
return [node.text() for node in HTMLParser(large_html).css('.item')]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def display(results):
|
| 89 |
+
# Sort and display results
|
| 90 |
+
sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
|
| 91 |
+
scrapling_time = results['Scrapling']
|
| 92 |
+
print("\nRanked Results (fastest to slowest):")
|
| 93 |
+
print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
|
| 94 |
+
print('-' * 50)
|
| 95 |
+
for i, (test_name, test_time) in enumerate(sorted_results, 1):
|
| 96 |
+
compare = round(test_time / scrapling_time, 3)
|
| 97 |
+
print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@benchmark
|
| 101 |
+
def test_scrapling_text(request_html):
|
| 102 |
+
# Loop over the resulting elements to get their text too, to make the comparison fairer; otherwise Scrapling would be even faster
|
| 103 |
+
return [
|
| 104 |
+
element.text for element in Adaptor(
|
| 105 |
+
request_html, auto_match=False, debug=False
|
| 106 |
+
).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
|
| 107 |
+
]
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@benchmark
|
| 111 |
+
def test_autoscraper(request_html):
|
| 112 |
+
# autoscraper by default returns elements text
|
| 113 |
+
return AutoScraper().build(html=request_html, wanted_list=['Tipping the Velvet'])
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
if __name__ == "__main__":
|
| 117 |
+
print(' Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n')
|
| 118 |
+
results1 = {
|
| 119 |
+
"Raw Lxml": test_lxml(),
|
| 120 |
+
"Parsel/Scrapy": test_parsel(),
|
| 121 |
+
"Scrapling": test_scrapling(),
|
| 122 |
+
'Selectolax': test_selectolax(),
|
| 123 |
+
"PyQuery": test_pyquery(),
|
| 124 |
+
"BS4 with Lxml": test_bs4_lxml(),
|
| 125 |
+
"MechanicalSoup": test_mechanicalsoup(),
|
| 126 |
+
"BS4 with html5lib": test_bs4_html5lib(),
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
display(results1)
|
| 130 |
+
print('\n' + "="*25)
|
| 131 |
+
req = requests.get('https://books.toscrape.com/index.html')
|
| 132 |
+
print(
|
| 133 |
+
' Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n'
|
| 134 |
+
)
|
| 135 |
+
results2 = {
|
| 136 |
+
"Scrapling": test_scrapling_text(req.text),
|
| 137 |
+
"AutoScraper": test_autoscraper(req.text),
|
| 138 |
+
}
|
| 139 |
+
display(results2)
|
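Since the `benchmark` decorator above is self-contained, it can be reused on any function in this file; a small sketch (the function body here is made up):

```python
@benchmark
def test_plain_string_count():
    # Any deterministic work can be timed the same way
    return large_html.count('item')

test_plain_string_count()  # prints the averaged execution time in ms
```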
docs/Core/using scrapling custom types.md
ADDED
|
@@ -0,0 +1,21 @@
| 1 |
+
> You can take advantage of Scrapling's custom-made types and use them outside the library if you want. It's better than copying their code, after all :)
|
| 2 |
+
|
| 3 |
+
### All current types can be imported alone like below
|
| 4 |
+
```python
|
| 5 |
+
>>> from scrapling import TextHandler, AttributesHandler
|
| 6 |
+
|
| 7 |
+
>>> somestring = TextHandler('{}')
|
| 8 |
+
>>> somestring.json()
|
| 9 |
+
'{}'
|
| 10 |
+
>>> somedict_1 = AttributesHandler({'a': 1})
|
| 11 |
+
>>> somedict_2 = AttributesHandler(a=1)
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
Note that `TextHandler` is a subclass of Python's `str`, so all normal operations/methods that work with Python strings will work.
|
| 15 |
+
If you want to check for the type in your code, it's better to rely on Python's built-in `issubclass` function.
|
| 16 |
+
|
| 17 |
+
The class `AttributesHandler` is a subclass of `collections.abc.Mapping`, so it's immutable (read-only) and all operations are inherited from it. The data passed in can be accessed later through the `._data` attribute, but be careful: it's of type `types.MappingProxyType`, so it's immutable (read-only) as well (faster than `collections.abc.Mapping` by fractions of a second).
|
| 18 |
+
|
| 19 |
+
To put it simply, if you are new to Python: the same operations and methods from Python's standard `dict` type will all work with the `AttributesHandler` class, except the ones that try to modify the actual data.
|
| 20 |
+
|
| 21 |
+
If you want to modify the data inside `AttributesHandler`, you have to convert it to a dictionary first using the `dict` function and modify it outside, as shown in the sketch below.
|
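For example, a quick sketch of that round-trip (the attribute names are made up):

```python
>>> from scrapling import AttributesHandler
>>> attrs = AttributesHandler({'class': 'product', 'id': 'item-1'})
>>> editable = dict(attrs)       # plain mutable copy
>>> editable['class'] = 'sold-out'
>>> attrs['class']               # the original stays read-only
'product'
```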
docs/Examples/selectorless_stackoverflow.py
ADDED
|
@@ -0,0 +1,23 @@
| 1 |
+
"""
|
| 2 |
+
I only made this example to show how Scrapling's features can be used to scrape a website without writing any selectors,
|
| 3 |
+
so this script doesn't depend on the website's structure.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from scrapling import Adaptor
|
| 8 |
+
|
| 9 |
+
response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
|
| 10 |
+
page = Adaptor(response.text, url=response.url)
|
| 11 |
+
# First we will extract the first question title and its author based on the text content
|
| 12 |
+
first_question_title = page.find_by_text('Run Selenium Python Script on Remote Server')
|
| 13 |
+
first_question_author = page.find_by_text('Ryan')
|
| 14 |
+
# If you want, you can extract the other questions' container elements like below
|
| 15 |
+
first_question = first_question_title.find_ancestor(
|
| 16 |
+
lambda ancestor: ancestor.attrib.get('id') and 'question-summary' in ancestor.attrib.get('id')
|
| 17 |
+
)
|
| 18 |
+
rest_of_questions = first_question.find_similar()
|
| 19 |
+
# But since there is nothing to rely on to extract the other titles/authors from these elements without CSS/XPath selectors, due to the website's nature,
|
| 20 |
+
# we will get all the remaining titles/authors on the page using the first title and the first author we got above as a starting point
|
| 21 |
+
for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
|
| 22 |
+
print(i, title.text, author.text)
|
| 23 |
+
|
docs/Extending Scrapling/writing storage system.md
ADDED
|
@@ -0,0 +1,17 @@
| 1 |
+
Scrapling uses SQLite by default, but in case you want to write your own storage system to store element properties for the auto-matching, this tutorial has you covered.
|
| 2 |
+
|
| 3 |
+
You might want to use Firebase, for example, and share the database between multiple spiders on different machines. Using an online database like that is a great idea because this way the spiders share their data with each other.
|
| 4 |
+
|
| 5 |
+
First, to make your storage class work, it must do the big 3 (a minimal sketch follows at the end of this page):
|
| 6 |
+
1. Inherit from the abstract class `scrapling.storage_adaptors.StorageSystemMixin` and accept a string argument which will be the `url` argument to maintain the library logic.
|
| 7 |
+
2. Use the decorator `functools.lru_cache` on top of the class itself to follow the Singleton design pattern, as the other classes do.
|
| 8 |
+
3. Implement methods `save` and `retrieve`, as you see from the type hints:
|
| 9 |
+
- The method `save` returns nothing and will get two arguments from the library
|
| 10 |
+
* The first one is of type `lxml.html.HtmlElement`, which is the element itself. It must be converted to a dictionary using the function `scrapling.utils._StorageTools.element_to_dict` so we keep the same format, then saved to your database as you wish.
|
| 11 |
+
* The second one is a string: the identifier used for retrieval. The combination of this identifier and the `url` argument from initialization must be unique for each row, or the auto-match will be messed up.
|
| 12 |
+
- The method `retrieve` takes a string, which is the identifier; using it together with the `url` passed on initialization, the element's dictionary is retrieved from the database and returned if it exists, otherwise it returns `None`.
|
| 13 |
+
> If the instructions weren't clear enough for you, you can check my implementation using SQLite3 in the [storage_adaptors](/scrapling/storage_adaptors.py) file
|
| 14 |
+
|
| 15 |
+
If your class satisfies this, the rest is easy. If you are planning to use the library in a threaded application, make sure that your class supports it. The default class used is thread-safe.
|
| 16 |
+
|
| 17 |
+
There are some helper functions added to the abstract class if you want to use them. It's easier to see them for yourself in the [code](/scrapling/storage_adaptors.py); it's heavily commented :)
|
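To make the big 3 concrete, here is a minimal in-memory sketch (not a real adaptor from the library; it assumes `StorageSystemMixin.__init__` accepts the `url` string and stores it as `self.url`, so verify that against the [code](/scrapling/storage_adaptors.py) first):

```python
from functools import lru_cache

from lxml import html
from scrapling.storage_adaptors import StorageSystemMixin
from scrapling.utils import _StorageTools


@lru_cache(None)  # rule 2: one cached instance per url (Singleton-style)
class InMemoryStorageSystem(StorageSystemMixin):  # rule 1: inherit from the mixin
    def __init__(self, url: str = None):
        super().__init__(url)
        self._table = {}  # maps (url, identifier) -> element dictionary

    def save(self, element: html.HtmlElement, identifier: str) -> None:
        # rule 3: keep the library's dictionary format before storing
        self._table[(self.url, identifier)] = _StorageTools.element_to_dict(element)

    def retrieve(self, identifier: str) -> dict:
        # returns the stored dictionary, or None if nothing was saved
        return self._table.get((self.url, identifier))
```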
docs/index.md
ADDED
|
@@ -0,0 +1,2 @@
| 1 |
+
# This section is still a work in progress, but any help is highly appreciated
|
| 2 |
+
## I will try to make fully detailed documentation with Sphinx ASAP.
|
pytest.ini
ADDED
|
@@ -0,0 +1,2 @@
| 1 |
+
[pytest]
|
| 2 |
+
addopts = -p no:warnings --doctest-modules --ignore=setup.py
|
scrapling/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
| 1 |
+
# Declare top-level shortcuts
|
| 2 |
+
from scrapling.parser import Adaptor, Adaptors
|
| 3 |
+
from scrapling.custom_types import TextHandler, AttributesHandler
|
| 4 |
+
|
| 5 |
+
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 6 |
+
__version__ = "0.1"
|
| 7 |
+
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
__all__ = ['Adaptor', 'Adaptors', 'TextHandler', 'AttributesHandler']
|
scrapling/custom_types.py
ADDED
|
@@ -0,0 +1,146 @@
| 1 |
+
import re
|
| 2 |
+
from types import MappingProxyType
|
| 3 |
+
from collections.abc import Mapping
|
| 4 |
+
from typing import Dict, List, Union, Pattern
|
| 5 |
+
|
| 6 |
+
from scrapling.utils import _is_iterable, flatten
|
| 7 |
+
|
| 8 |
+
from orjson import loads, dumps
|
| 9 |
+
from w3lib.html import replace_entities as _replace_entities
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TextHandler(str):
|
| 13 |
+
"""Extends standard Python string by adding more functionality"""
|
| 14 |
+
__slots__ = ()
|
| 15 |
+
|
| 16 |
+
def __new__(cls, string):
|
| 17 |
+
# Because str is immutable and we can't override __init__
|
| 18 |
+
if type(string) is str:
|
| 19 |
+
return super().__new__(cls, string)
|
| 20 |
+
else:
|
| 21 |
+
return super().__new__(cls, '')
|
| 22 |
+
|
| 23 |
+
def sort(self, reverse: bool = False) -> str:
|
| 24 |
+
"""Return a sorted version of the string"""
|
| 25 |
+
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 26 |
+
|
| 27 |
+
def clean(self) -> str:
|
| 28 |
+
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 29 |
+
data = re.sub(r'[\t\r\n]', '', self)  # remove tabs, carriage returns, and newlines
|
| 30 |
+
data = re.sub(' +', ' ', data)
|
| 31 |
+
return self.__class__(data.strip())
|
| 32 |
+
|
| 33 |
+
def json(self) -> Dict:
|
| 34 |
+
"""Return json response if the response is jsonable otherwise throw error"""
|
| 35 |
+
# Using __str__ function as a workaround for orjson issue with subclasses of str
|
| 36 |
+
# Check this out: https://github.com/ijl/orjson/issues/445
|
| 37 |
+
return loads(self.__str__())
|
| 38 |
+
|
| 39 |
+
def re(
|
| 40 |
+
self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
|
| 41 |
+
case_sensitive: bool = False, check_match: bool = False
|
| 42 |
+
) -> Union[List[str], bool]:
|
| 43 |
+
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 44 |
+
|
| 45 |
+
:param regex: Can be either a compiled regular expression or a string.
|
| 46 |
+
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 47 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 48 |
+
:param case_sensitive: if disabled (the default), the regex will be compiled to ignore letter case
|
| 49 |
+
:param check_match: used to quickly check if this regex matches or not without any operations on the results
|
| 50 |
+
|
| 51 |
+
"""
|
| 52 |
+
if isinstance(regex, str):
|
| 53 |
+
if not case_sensitive:
|
| 54 |
+
regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
|
| 55 |
+
else:
|
| 56 |
+
regex = re.compile(regex, re.UNICODE)
|
| 57 |
+
|
| 58 |
+
input_text = self.clean() if clean_match else self
|
| 59 |
+
results = regex.findall(input_text)
|
| 60 |
+
if check_match:
|
| 61 |
+
return bool(results)
|
| 62 |
+
|
| 63 |
+
if all(_is_iterable(res) for res in results):
|
| 64 |
+
results = flatten(results)
|
| 65 |
+
|
| 66 |
+
if not replace_entities:
|
| 67 |
+
return [TextHandler(string) for string in results]
|
| 68 |
+
|
| 69 |
+
return [TextHandler(_replace_entities(s)) for s in results]
|
| 70 |
+
|
| 71 |
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
| 72 |
+
clean_match: bool = False, case_sensitive: bool = False,):
|
| 73 |
+
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 74 |
+
|
| 75 |
+
:param regex: Can be either a compiled regular expression or a string.
|
| 76 |
+
:param default: The default value to be returned if there is no match
|
| 77 |
+
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 78 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 79 |
+
:param case_sensitive: if disabled (the default), the regex will be compiled to ignore letter case
|
| 80 |
+
|
| 81 |
+
"""
|
| 82 |
+
result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
|
| 83 |
+
return result[0] if result else default
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class AttributesHandler(Mapping):
|
| 87 |
+
"""A read-only mapping to use instead of the standard dictionary for the speed boost but
|
| 88 |
+
at the same time I use it to add more functionalities.
|
| 89 |
+
If standard dictionary is needed, just convert this class to dictionary with `dict` function
|
| 90 |
+
"""
|
| 91 |
+
__slots__ = ('_data',)
|
| 92 |
+
|
| 93 |
+
def __init__(self, mapping=None, **kwargs):
|
| 94 |
+
mapping = {
|
| 95 |
+
key: TextHandler(value) if type(value) is str else value
|
| 96 |
+
for key, value in mapping.items()
|
| 97 |
+
} if mapping is not None else {}
|
| 98 |
+
|
| 99 |
+
if kwargs:
|
| 100 |
+
mapping.update({
|
| 101 |
+
key: TextHandler(value) if type(value) is str else value
|
| 102 |
+
for key, value in kwargs.items()
|
| 103 |
+
})
|
| 104 |
+
|
| 105 |
+
# Fastest read-only mapping type
|
| 106 |
+
self._data = MappingProxyType(mapping)
|
| 107 |
+
|
| 108 |
+
def get(self, key, default=None):
|
| 109 |
+
"""Acts like standard dictionary `.get()` method"""
|
| 110 |
+
return self._data.get(key, default)
|
| 111 |
+
|
| 112 |
+
def search_values(self, keyword, partial=False):
|
| 113 |
+
"""Search current attributes by values and return dictionary of each matching item
|
| 114 |
+
:param keyword: The keyword to search for in the attributes values
|
| 115 |
+
:param partial: If True, the function checks whether the keyword is contained in each value instead of requiring an exact match
|
| 116 |
+
"""
|
| 117 |
+
for key, value in self._data.items():
|
| 118 |
+
if partial:
|
| 119 |
+
if keyword in value:
|
| 120 |
+
yield AttributesHandler({key: value})
|
| 121 |
+
else:
|
| 122 |
+
if keyword == value:
|
| 123 |
+
yield AttributesHandler({key: value})
|
| 124 |
+
|
| 125 |
+
@property
|
| 126 |
+
def json_string(self):
|
| 127 |
+
"""Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
|
| 128 |
+
return dumps(dict(self._data))
|
| 129 |
+
|
| 130 |
+
def __getitem__(self, key):
|
| 131 |
+
return self._data[key]
|
| 132 |
+
|
| 133 |
+
def __iter__(self):
|
| 134 |
+
return iter(self._data)
|
| 135 |
+
|
| 136 |
+
def __len__(self):
|
| 137 |
+
return len(self._data)
|
| 138 |
+
|
| 139 |
+
def __repr__(self):
|
| 140 |
+
return f"{self.__class__.__name__}({self._data})"
|
| 141 |
+
|
| 142 |
+
def __str__(self):
|
| 143 |
+
return str(self._data)
|
| 144 |
+
|
| 145 |
+
def __contains__(self, key):
|
| 146 |
+
return key in self._data
|
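A quick doctest-style sketch of these helpers (the sample string is made up; note that `re` ignores letter case unless `case_sensitive=True` is passed):

```python
>>> from scrapling import TextHandler
>>> text = TextHandler('Total:  $10 \n and $2  shipping')
>>> text.clean()
'Total: $10 and $2 shipping'
>>> text.re(r'\d+')
['10', '2']
>>> text.re_first(r'\d+')
'10'
```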
scrapling/mixins.py
ADDED
|
@@ -0,0 +1,74 @@
| 1 |
+
|
| 2 |
+
class SelectorsGeneration:
|
| 3 |
+
"""Selectors generation functions
|
| 4 |
+
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
| 5 |
+
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
| 6 |
+
|
| 7 |
+
def __general_selection(self, selection: str = 'css') -> str:
|
| 8 |
+
"""Generate a selector for the current element.
|
| 9 |
+
:return: A string of the generated selector.
|
| 10 |
+
"""
|
| 11 |
+
selectorPath = []
|
| 12 |
+
target = self
|
| 13 |
+
css = selection.lower() == 'css'
|
| 14 |
+
while target is not None:
|
| 15 |
+
if target.parent:
|
| 16 |
+
if target.attrib.get('id'):
|
| 17 |
+
# id is enough
|
| 18 |
+
part = (
|
| 19 |
+
f'#{target.attrib["id"]}' if css
|
| 20 |
+
else f"[@id='{target.attrib['id']}']"
|
| 21 |
+
)
|
| 22 |
+
selectorPath.append(part)
|
| 23 |
+
return (
|
| 24 |
+
" > ".join(reversed(selectorPath)) if css
|
| 25 |
+
else '//*' + "/".join(reversed(selectorPath))
|
| 26 |
+
)
|
| 27 |
+
else:
|
| 28 |
+
part = f'{target.tag}'
|
| 29 |
+
# We won't use classes anymore because some websites share the exact same classes between elements
|
| 30 |
+
# classes = target.attrib.get('class', '').split()
|
| 31 |
+
# if classes and css:
|
| 32 |
+
# part += f".{'.'.join(classes)}"
|
| 33 |
+
# else:
|
| 34 |
+
counter = {}
|
| 35 |
+
for child in target.parent.children:
|
| 36 |
+
counter.setdefault(child.tag, 0)
|
| 37 |
+
counter[child.tag] += 1
|
| 38 |
+
if child._root == target._root:
|
| 39 |
+
break
|
| 40 |
+
|
| 41 |
+
if counter[target.tag] > 1:
|
| 42 |
+
part += (
|
| 43 |
+
f":nth-of-type({counter[target.tag]})" if css
|
| 44 |
+
else f"[{counter[target.tag]}]"
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
selectorPath.append(part)
|
| 48 |
+
target = target.parent
|
| 49 |
+
if target is None or target.tag == 'html':
|
| 50 |
+
return (
|
| 51 |
+
" > ".join(reversed(selectorPath)) if css
|
| 52 |
+
else '//' + "/".join(reversed(selectorPath))
|
| 53 |
+
)
|
| 54 |
+
else:
|
| 55 |
+
break
|
| 56 |
+
|
| 57 |
+
return (
|
| 58 |
+
" > ".join(reversed(selectorPath)) if css
|
| 59 |
+
else '//' + "/".join(reversed(selectorPath))
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
@property
|
| 63 |
+
def css_selector(self) -> str:
|
| 64 |
+
"""Generate a CSS selector for the current element
|
| 65 |
+
:return: A string of the generated selector.
|
| 66 |
+
"""
|
| 67 |
+
return self.__general_selection()
|
| 68 |
+
|
| 69 |
+
@property
|
| 70 |
+
def xpath_selector(self) -> str:
|
| 71 |
+
"""Generate a XPath selector for the current element
|
| 72 |
+
:return: A string of the generated selector.
|
| 73 |
+
"""
|
| 74 |
+
return self.__general_selection('xpath')
|
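A short sketch of what these properties produce (the exact strings depend on the parsed tree, so treat the outputs as approximate):

```python
>>> from scrapling import Adaptor
>>> page = Adaptor('<html><body><div><p>A</p><p>B</p></div></body></html>')
>>> element = page.css('p')[1]
>>> element.css_selector
'body > div > p:nth-of-type(2)'
>>> element.xpath_selector
'//body/div/p[2]'
```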
scrapling/parser.py
ADDED
|
@@ -0,0 +1,903 @@
| 1 |
+
import os
|
| 2 |
+
from difflib import SequenceMatcher
|
| 3 |
+
from typing import Any, Dict, List, Tuple, Optional, Pattern, SupportsIndex, Union, Callable, Generator
|
| 4 |
+
|
| 5 |
+
from scrapling.translator import HTMLTranslator
|
| 6 |
+
from scrapling.mixins import SelectorsGeneration
|
| 7 |
+
from scrapling.custom_types import TextHandler, AttributesHandler
|
| 8 |
+
from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 9 |
+
from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
| 10 |
+
|
| 11 |
+
from lxml import etree, html
|
| 12 |
+
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Adaptor(SelectorsGeneration):
|
| 16 |
+
__slots__ = (
|
| 17 |
+
'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
|
| 18 |
+
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
def __init__(
|
| 22 |
+
self,
|
| 23 |
+
text: Optional[str] = None,
|
| 24 |
+
url: Optional[str] = None,
|
| 25 |
+
body: bytes = b"",
|
| 26 |
+
encoding: str = "utf8",
|
| 27 |
+
huge_tree: bool = True,
|
| 28 |
+
root: Optional[html.HtmlElement] = None,
|
| 29 |
+
keep_comments: Optional[bool] = False,
|
| 30 |
+
auto_match: Optional[bool] = False,
|
| 31 |
+
storage: Any = SQLiteStorageSystem,
|
| 32 |
+
storage_args: Optional[Dict] = None,
|
| 33 |
+
debug: Optional[bool] = True,
|
| 34 |
+
):
|
| 35 |
+
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
| 36 |
+
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
| 37 |
+
|
| 38 |
+
Here we try to extend ``lxml.html.HtmlElement`` while maintaining a simpler interface. We are not
|
| 39 |
+
inheriting from ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
|
| 40 |
+
impossible. You can test it yourself and see the code explode with `AssertionError: invalid Element proxy at...`.
|
| 41 |
+
It's an old issue with lxml; see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
| 42 |
+
|
| 43 |
+
:param text: HTML body passed as text.
|
| 44 |
+
:param url: allows storing a URL with the html data for retrieving later.
|
| 45 |
+
:param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
|
| 46 |
+
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
| 47 |
+
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
| 48 |
+
libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
| 49 |
+
:param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
|
| 50 |
+
Don't use it unless you know what you are doing!
|
| 51 |
+
:param keep_comments: Whether to keep comments while parsing the HTML body. Disabled by default for obvious reasons
|
| 52 |
+
:param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
|
| 53 |
+
priority over all auto-match related arguments/functions in the class.
|
| 54 |
+
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 55 |
+
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 56 |
+
If empty, default values will be used.
|
| 57 |
+
:param debug: Enable debug mode
|
| 58 |
+
"""
|
| 59 |
+
if root is None and not body and text is None:
|
| 60 |
+
raise ValueError("Adaptor class needs text, body, or root arguments to work")
|
| 61 |
+
|
| 62 |
+
if root is None:
|
| 63 |
+
if text is None:
|
| 64 |
+
if not body or not isinstance(body, bytes):
|
| 65 |
+
raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
|
| 66 |
+
|
| 67 |
+
body = body.replace(b"\x00", b"").strip()
|
| 68 |
+
else:
|
| 69 |
+
if not isinstance(text, str):
|
| 70 |
+
raise TypeError(f"text argument must be of type str, got {text.__class__}")
|
| 71 |
+
|
| 72 |
+
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 73 |
+
|
| 74 |
+
parser = html.HTMLParser(
|
| 75 |
+
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 76 |
+
recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
|
| 77 |
+
compact=True, huge_tree=huge_tree, default_doctype=True
|
| 78 |
+
)
|
| 79 |
+
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 80 |
+
|
| 81 |
+
else:
|
| 82 |
+
# All html types inherits from HtmlMixin so this to check for all at once
|
| 83 |
+
if not issubclass(type(root), html.HtmlMixin):
|
| 84 |
+
raise TypeError(
|
| 85 |
+
f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
self._root = root
|
| 89 |
+
|
| 90 |
+
setup_basic_logging(level='debug' if debug else 'info')
|
| 91 |
+
self.__auto_match_enabled = auto_match
|
| 92 |
+
|
| 93 |
+
if self.__auto_match_enabled:
|
| 94 |
+
if not storage_args:
|
| 95 |
+
storage_args = {
|
| 96 |
+
'storage_file': os.path.join(os.path.dirname(__file__), 'elements_storage.db'),
|
| 97 |
+
'url': url
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
if not hasattr(storage, '__wrapped__'):
|
| 101 |
+
raise ValueError("Storage class must be wrapped with cache decorator, see docs for info")
|
| 102 |
+
|
| 103 |
+
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 104 |
+
raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
|
| 105 |
+
|
| 106 |
+
self._storage = storage(**storage_args)
|
| 107 |
+
|
| 108 |
+
self.__keep_comments = keep_comments
|
| 109 |
+
self.__huge_tree_enabled = huge_tree
|
| 110 |
+
self.encoding = encoding
|
| 111 |
+
self.url = url
|
| 112 |
+
# For selector stuff
|
| 113 |
+
self.__attributes = None
|
| 114 |
+
self.__text = None
|
| 115 |
+
self.__tag = None
|
| 116 |
+
self.__debug = debug
|
| 117 |
+
|
| 118 |
+
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
| 119 |
+
@staticmethod
|
| 120 |
+
def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
|
| 121 |
+
"""Return True if given element is a result of a string expression
|
| 122 |
+
Examples:
|
| 123 |
+
Xpath -> '/text()', '/@attribute' etc...
|
| 124 |
+
CSS3 -> '::text', '::attr(attrib)'...
|
| 125 |
+
"""
|
| 126 |
+
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
| 127 |
+
return issubclass(type(element), etree._ElementUnicodeResult)
|
| 128 |
+
|
| 129 |
+
def __get_correct_result(
|
| 130 |
+
self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
|
| 131 |
+
) -> Union[TextHandler, html.HtmlElement, 'Adaptor', str]:
|
| 132 |
+
"""Used internally in all functions to convert results to type (Adaptor|Adaptors) when possible"""
|
| 133 |
+
if self._is_text_node(element):
|
| 134 |
+
# etree._ElementUnicodeResult basically inherit from `str` so it's fine
|
| 135 |
+
return TextHandler(str(element))
|
| 136 |
+
else:
|
| 137 |
+
if issubclass(type(element), html.HtmlMixin):
|
| 138 |
+
return self.__class__(
|
| 139 |
+
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
| 140 |
+
keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
|
| 141 |
+
)
|
| 142 |
+
return element
|
| 143 |
+
|
| 144 |
+
def __convert_results(
|
| 145 |
+
self, result: Union[List[html.HtmlElement], html.HtmlElement]
|
| 146 |
+
) -> Union['Adaptors[Adaptor]', 'Adaptor', List, None]:
|
| 147 |
+
"""Used internally in all functions to convert results to type (Adaptor|Adaptors) in bulk when possible"""
|
| 148 |
+
if result is None:
|
| 149 |
+
return None
|
| 150 |
+
elif result == []: # Lxml will give a warning if I used something like `not result`
|
| 151 |
+
return []
|
| 152 |
+
|
| 153 |
+
if isinstance(result, Adaptors):
|
| 154 |
+
return result
|
| 155 |
+
|
| 156 |
+
if type(result) is list:
|
| 157 |
+
results = [self.__get_correct_result(n) for n in result]
|
| 158 |
+
if all(isinstance(res, self.__class__) for res in results):
|
| 159 |
+
return Adaptors(results)
|
| 160 |
+
return results
|
| 161 |
+
|
| 162 |
+
return self.__get_correct_result(result)
|
| 163 |
+
|
| 164 |
+
def __getstate__(self) -> Any:
|
| 165 |
+
# lxml doesn't like it :)
|
| 166 |
+
raise TypeError("Can't pickle Adaptor objects")
|
| 167 |
+
|
| 168 |
+
# I made the following four properties functions instead of direct variables
|
| 169 |
+
# so they don't slow down initializing many instances of the class; each one gets executed only
|
| 170 |
+
# when the user needs it for the first time for that specific element, then gets cached for next time.
|
| 171 |
+
# Doing that alone made the library's performance tests multiple times faster than before,
|
| 172 |
+
# because I was executing them on initialization :))
|
| 173 |
+
@property
|
| 174 |
+
def tag(self) -> str:
|
| 175 |
+
"""Get tag name of the element"""
|
| 176 |
+
if not self.__tag:
|
| 177 |
+
self.__tag = self._root.tag
|
| 178 |
+
return self.__tag
|
| 179 |
+
|
| 180 |
+
@property
|
| 181 |
+
def text(self) -> TextHandler:
|
| 182 |
+
"""Get text content of the element"""
|
| 183 |
+
if not self.__text:
|
| 184 |
+
self.__text = TextHandler(self._root.text)
|
| 185 |
+
return self.__text
|
| 186 |
+
|
| 187 |
+
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
|
| 188 |
+
"""Get all child strings of this element, concatenated using the given separator.
|
| 189 |
+
|
| 190 |
+
:param separator: Strings will be concatenated using this separator.
|
| 191 |
+
:param strip: If True, strings will be stripped before being concatenated.
|
| 192 |
+
:param ignore_tags: A tuple of all tag names you want to ignore
|
| 193 |
+
:param valid_values: If enabled, elements with text-content that is empty or only whitespaces will be ignored
|
| 194 |
+
|
| 195 |
+
:return: A TextHandler
|
| 196 |
+
"""
|
| 197 |
+
_all_strings = []
|
| 198 |
+
|
| 199 |
+
def _traverse(node: html.HtmlElement) -> None:
|
| 200 |
+
"""Traverse element children and get text content of each
|
| 201 |
+
|
| 202 |
+
:param node: Current node in the tree structure
|
| 203 |
+
:return:
|
| 204 |
+
"""
|
| 205 |
+
if node.tag not in ignore_tags:
|
| 206 |
+
text = node.text
|
| 207 |
+
if text and type(text) is str:
|
| 208 |
+
if valid_values:
|
| 209 |
+
if text.strip():
|
| 210 |
+
_all_strings.append(text if not strip else text.strip())
|
| 211 |
+
else:
|
| 212 |
+
_all_strings.append(text if not strip else text.strip())
|
| 213 |
+
|
| 214 |
+
for branch in node.iterchildren():
|
| 215 |
+
_traverse(branch)
|
| 216 |
+
|
| 217 |
+
# We will start using Lxml directly for the speed boost
|
| 218 |
+
_traverse(self._root)
|
| 219 |
+
|
| 220 |
+
return TextHandler(separator.join([s for s in _all_strings]))
|
| 221 |
+
|
| 222 |
+
@property
|
| 223 |
+
def attrib(self) -> AttributesHandler:
|
| 224 |
+
"""Get attributes of the element"""
|
| 225 |
+
if not self.__attributes:
|
| 226 |
+
self.__attributes = AttributesHandler(self._root.attrib)
|
| 227 |
+
return self.__attributes
|
| 228 |
+
|
| 229 |
+
@property
|
| 230 |
+
def html_content(self) -> str:
|
| 231 |
+
"""Return the inner html code of the element"""
|
| 232 |
+
return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)
|
| 233 |
+
|
| 234 |
+
body = html_content
|
| 235 |
+
|
| 236 |
+
def prettify(self) -> str:
|
| 237 |
+
"""Return a prettified version of the element's inner html-code"""
|
| 238 |
+
return etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False)
|
| 239 |
+
|
| 240 |
+
def has_class(self, class_name: str) -> bool:
|
| 241 |
+
"""Check if element has a specific class
|
| 242 |
+
:param class_name: The class name to check for
|
| 243 |
+
:return: True if element has class with that name otherwise False
|
| 244 |
+
"""
|
| 245 |
+
return class_name in self._root.classes
|
| 246 |
+
|
| 247 |
+
@property
|
| 248 |
+
def parent(self) -> Union['Adaptor', None]:
|
| 249 |
+
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 250 |
+
return self.__convert_results(self._root.getparent())
|
| 251 |
+
|
| 252 |
+
@property
|
| 253 |
+
def children(self) -> Union['Adaptors[Adaptor]', List]:
|
| 254 |
+
"""Return the children elements of the current element or empty list otherwise"""
|
| 255 |
+
return self.__convert_results(list(
|
| 256 |
+
child for child in self._root.iterchildren() if type(child) not in html_forbidden
|
| 257 |
+
))
|
| 258 |
+
|
| 259 |
+
@property
|
| 260 |
+
def siblings(self) -> Union['Adaptors[Adaptor]', List]:
|
| 261 |
+
"""Return other children of the current element's parent or empty list otherwise"""
|
| 262 |
+
if self.parent:
|
| 263 |
+
return Adaptors([child for child in self.parent.children if child._root != self._root])
|
| 264 |
+
return []
|
| 265 |
+
|
| 266 |
+
def iterancestors(self) -> Generator['Adaptor', None, None]:
|
| 267 |
+
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
| 268 |
+
for ancestor in self._root.iterancestors():
|
| 269 |
+
yield self.__convert_results(ancestor)
|
| 270 |
+
|
| 271 |
+
def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
|
| 272 |
+
"""Loop over all ancestors of the element till one match the passed function
|
| 273 |
+
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 274 |
+
:return: The first ancestor that match the function or ``None`` otherwise.
|
| 275 |
+
"""
|
| 276 |
+
for ancestor in self.iterancestors():
|
| 277 |
+
if func(ancestor):
|
| 278 |
+
return ancestor
|
| 279 |
+
return None
|
| 280 |
+
|
| 281 |
+
@property
|
| 282 |
+
def path(self) -> 'Adaptors[Adaptor]':
|
| 283 |
+
"""Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
|
| 284 |
+
lst = list(self.iterancestors())
|
| 285 |
+
return Adaptors(lst)
|
| 286 |
+
|
| 287 |
+
@property
|
| 288 |
+
def next(self) -> Union['Adaptor', None]:
|
| 289 |
+
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 290 |
+
next_element = self._root.getnext()
|
| 291 |
+
if next_element is not None:
|
| 292 |
+
while type(next_element) in html_forbidden:
|
| 293 |
+
# Ignore html comments and unwanted types
|
| 294 |
+
next_element = next_element.getnext()
|
| 295 |
+
|
| 296 |
+
return self.__convert_results(next_element)
|
| 297 |
+
|
| 298 |
+
@property
|
| 299 |
+
def previous(self) -> Union['Adaptor', None]:
|
| 300 |
+
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 301 |
+
prev_element = self._root.getprevious()
|
| 302 |
+
if prev_element is not None:
|
| 303 |
+
while type(prev_element) in html_forbidden:
|
| 304 |
+
# Ignore html comments and unwanted types
|
| 305 |
+
prev_element = prev_element.getprevious()
|
| 306 |
+
|
| 307 |
+
return self.__convert_results(prev_element)
|
| 308 |
+
|
| 309 |
+
def __str__(self) -> str:
|
| 310 |
+
return self.html_content
|
| 311 |
+
|
| 312 |
+
def __repr__(self) -> str:
|
| 313 |
+
length_limit = 40
|
| 314 |
+
data = "<"
|
| 315 |
+
content = clean_spaces(self.html_content)
|
| 316 |
+
if len(content) > length_limit:
|
| 317 |
+
content = content[:length_limit].strip() + '...'
|
| 318 |
+
data += f"data='{content}'"
|
| 319 |
+
|
| 320 |
+
if self.parent:
|
| 321 |
+
parent_content = clean_spaces(self.parent.html_content)
|
| 322 |
+
if len(parent_content) > length_limit:
|
| 323 |
+
parent_content = parent_content[:length_limit].strip() + '...'
|
| 324 |
+
|
| 325 |
+
data += f" parent='{parent_content}'"
|
| 326 |
+
|
| 327 |
+
return data + ">"
|
| 328 |
+
|
| 329 |
+
# From here we start the selecting functions
|
| 330 |
+
def relocate(
|
| 331 |
+
self, element: Union[Dict, html.HtmlElement, 'Adaptor'], percentage: int = 0, adaptor_type: bool = False
|
| 332 |
+
) -> Union[List[Union[html.HtmlElement, None]], 'Adaptors']:
|
| 333 |
+
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 334 |
+
|
| 335 |
+
:param element: The element we want to relocate in the tree
|
| 336 |
+
:param percentage: The minimum percentage to accept; matches scoring lower are discarded. Be aware that the percentage
|
| 337 |
+
calculation depends solely on the page structure, so don't play with this number unless you know
|
| 338 |
+
what you are doing!
|
| 339 |
+
:param adaptor_type: If True, the return result will be converted to `Adaptors` object
|
| 340 |
+
:return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
|
| 341 |
+
"""
|
| 342 |
+
score_table = {}
|
| 343 |
+
# Note: `element` will most likely always be a dictionary at this point.
|
| 344 |
+
if isinstance(element, self.__class__):
|
| 345 |
+
element = element._root
|
| 346 |
+
|
| 347 |
+
if issubclass(type(element), html.HtmlElement):
|
| 348 |
+
element = _StorageTools.element_to_dict(element)
|
| 349 |
+
|
| 350 |
+
# TODO: Optimize the traverse logic a bit, maybe later
|
| 351 |
+
def _traverse(node: html.HtmlElement, ele: Dict) -> None:
|
| 352 |
+
"""Get the matching score of the given element against the node then traverse the children
|
| 353 |
+
|
| 354 |
+
:param node: Current node in the tree structure
|
| 355 |
+
:param ele: The element we are searching for as dictionary
|
| 356 |
+
:return:
|
| 357 |
+
"""
|
| 358 |
+
# Note: the code doesn't stop even if the score is 100%,
|
| 359 |
+
# because there might be other element(s) left in the page with the same score
|
| 360 |
+
score = self.__calculate_similarity_score(ele, node)
|
| 361 |
+
score_table.setdefault(score, []).append(node)
|
| 362 |
+
for branch in node.iterchildren():
|
| 363 |
+
_traverse(branch, ele)
|
| 364 |
+
|
| 365 |
+
# This will block until we traverse all children/branches
|
| 366 |
+
_traverse(self._root, element)
|
| 367 |
+
|
| 368 |
+
if score_table:
|
| 369 |
+
highest_probability = max(score_table.keys())
|
| 370 |
+
if score_table[highest_probability] and highest_probability >= percentage:
|
| 371 |
+
logging.debug(f'Highest probability was {highest_probability}%')
|
| 372 |
+
logging.debug('Top 5 best matching elements are: ')
|
| 373 |
+
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 374 |
+
logging.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
|
| 375 |
+
if not adaptor_type:
|
| 376 |
+
return score_table[highest_probability]
|
| 377 |
+
return self.__convert_results(score_table[highest_probability])
|
| 378 |
+
return []
|
| 379 |
+
|
| 380 |
+
def css(self, selector: str, identifier: str = '',
|
| 381 |
+
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
| 382 |
+
) -> Union['Adaptors[Adaptor]', List]:
|
| 383 |
+
"""Search current tree with CSS3 selectors
|
| 384 |
+
|
| 385 |
+
**Important:
|
| 386 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 387 |
+
and want to relocate the same element(s)**
|
| 388 |
+
|
| 389 |
+
:param selector: The CSS3 selector to be used.
|
| 390 |
+
:param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
|
| 391 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 392 |
+
otherwise the selector will be used.
|
| 393 |
+
:param auto_save: Automatically save new elements for `auto_match` later
|
| 394 |
+
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 395 |
+
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 396 |
+
number unless you know what you are doing!
|
| 397 |
+
|
| 398 |
+
:return: List as :class:`Adaptors`
|
| 399 |
+
"""
|
| 400 |
+
try:
|
| 401 |
+
if not self.__auto_match_enabled:
|
| 402 |
+
# No need to split selectors in this case, let's save some CPU cycles :)
|
| 403 |
+
xpath_selector = HTMLTranslator().css_to_xpath(selector)
|
| 404 |
+
return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
|
| 405 |
+
|
| 406 |
+
results = []
|
| 407 |
+
if ',' in selector:
|
| 408 |
+
for single_selector in split_selectors(selector):
|
| 409 |
+
# I'm doing this only so the `save` function save data correctly for combined selectors
|
| 410 |
+
# Like using the ',' to combine two different selectors that point to different elements.
|
| 411 |
+
xpath_selector = HTMLTranslator().css_to_xpath(single_selector.canonical())
|
| 412 |
+
results += self.xpath(
|
| 413 |
+
xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
|
| 414 |
+
)
|
| 415 |
+
else:
|
| 416 |
+
xpath_selector = HTMLTranslator().css_to_xpath(selector)
|
| 417 |
+
return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
|
| 418 |
+
|
| 419 |
+
return self.__convert_results(results)
|
| 420 |
+
except (SelectorError, SelectorSyntaxError,):
|
| 421 |
+
raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
|
| 422 |
+
|
| 423 |
+
def xpath(self, selector: str, identifier: str = '',
|
| 424 |
+
auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
|
| 425 |
+
) -> Union['Adaptors[Adaptor]', List]:
|
| 426 |
+
"""Search current tree with XPath selectors
|
| 427 |
+
|
| 428 |
+
**Important:
|
| 429 |
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
| 430 |
+
and want to relocate the same element(s)**
|
| 431 |
+
|
| 432 |
+
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 433 |
+
|
| 434 |
+
:param selector: The XPath selector to be used.
|
| 435 |
+
:param auto_match: If enabled, the function will try to relocate the element if it was 'saved' before
|
| 436 |
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
| 437 |
+
otherwise the selector will be used.
|
| 438 |
+
:param auto_save: Automatically save new elements for `auto_match` later
|
| 439 |
+
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 440 |
+
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 441 |
+
number unless you know what you are doing!
|
| 442 |
+
|
| 443 |
+
:return: List as :class:`Adaptors`
|
| 444 |
+
"""
|
| 445 |
+
try:
|
| 446 |
+
selected_elements = self._root.xpath(selector, **kwargs)
|
| 447 |
+
|
| 448 |
+
if selected_elements:
|
| 449 |
+
if not self.__auto_match_enabled and auto_save:
|
| 450 |
+
logging.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
| 451 |
+
|
| 452 |
+
elif self.__auto_match_enabled and auto_save:
|
| 453 |
+
self.save(selected_elements[0], identifier or selector)
|
| 454 |
+
|
| 455 |
+
return self.__convert_results(selected_elements)
|
| 456 |
+
else:
|
| 457 |
+
if self.__auto_match_enabled and auto_match:
|
| 458 |
+
element_data = self.retrieve(identifier or selector)
|
| 459 |
+
if element_data:
|
| 460 |
+
relocated = self.relocate(element_data, percentage)
|
| 461 |
+
if relocated is not None and auto_save:
|
| 462 |
+
self.save(relocated[0], identifier or selector)
|
| 463 |
+
|
| 464 |
+
return self.__convert_results(relocated)
|
| 465 |
+
else:
|
| 466 |
+
return self.__convert_results(selected_elements)
|
| 467 |
+
|
| 468 |
+
elif not self.__auto_match_enabled and auto_match:
|
| 469 |
+
logging.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
| 470 |
+
|
| 471 |
+
return self.__convert_results(selected_elements)
|
| 472 |
+
|
| 473 |
+
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
| 474 |
+
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 475 |
+
|
| 476 |
+
def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
|
| 477 |
+
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 478 |
+
|
| 479 |
+
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
| 480 |
+
:param candidate: The element to compare with the original element.
|
| 481 |
+
:return: A percentage score of how similar is the candidate to the original element
|
| 482 |
+
"""
|
| 483 |
+
score, checks = 0, 0
|
| 484 |
+
candidate = _StorageTools.element_to_dict(candidate)
|
| 485 |
+
|
| 486 |
+
# Possible TODO:
|
| 487 |
+
# Study the idea of giving weight to each test below so some are more important than others
|
| 488 |
+
# Current results: With weights some websites had better score while it was worse for others
|
| 489 |
+
score += 1 if original['tag'] == candidate['tag'] else 0 # * 0.3 # 30%
|
| 490 |
+
checks += 1
|
| 491 |
+
|
| 492 |
+
if original['text']:
|
| 493 |
+
score += SequenceMatcher(None, original['text'], candidate.get('text') or '').ratio() # * 0.3 # 30%
|
| 494 |
+
checks += 1
|
| 495 |
+
|
| 496 |
+
# if both don't have attributes, it still counts for something!
|
| 497 |
+
score += self.__calculate_dict_diff(original['attributes'], candidate['attributes']) # * 0.3 # 30%
|
| 498 |
+
checks += 1
|
| 499 |
+
|
| 500 |
+
# Separate similarity test for class, id, href,... this will help in full structural changes
|
| 501 |
+
for attrib in ('class', 'id', 'href', 'src',):
|
| 502 |
+
if original['attributes'].get(attrib):
|
| 503 |
+
score += SequenceMatcher(
|
| 504 |
+
None, original['attributes'][attrib], candidate['attributes'].get(attrib) or ''
|
| 505 |
+
).ratio() # * 0.3 # 30%
|
| 506 |
+
checks += 1
|
| 507 |
+
|
| 508 |
+
score += SequenceMatcher(None, original['path'], candidate['path']).ratio() # * 0.1 # 10%
|
| 509 |
+
checks += 1
|
| 510 |
+
|
| 511 |
+
if original.get('parent_name'):
|
| 512 |
+
# Then we start comparing parents' data
|
| 513 |
+
if candidate.get('parent_name'):
|
| 514 |
+
score += SequenceMatcher(
|
| 515 |
+
None, original['parent_name'], candidate.get('parent_name') or ''
|
| 516 |
+
).ratio() # * 0.2 # 20%
|
| 517 |
+
checks += 1
|
| 518 |
+
|
| 519 |
+
score += self.__calculate_dict_diff(
|
| 520 |
+
original['parent_attribs'], candidate.get('parent_attribs') or {}
|
| 521 |
+
) # * 0.2 # 20%
|
| 522 |
+
checks += 1
|
| 523 |
+
|
| 524 |
+
if original['parent_text']:
|
| 525 |
+
score += SequenceMatcher(
|
| 526 |
+
None, original['parent_text'], candidate.get('parent_text') or ''
|
| 527 |
+
).ratio() # * 0.1 # 10%
|
| 528 |
+
checks += 1
|
| 529 |
+
# else:
|
| 530 |
+
# # The original element have a parent and this one not, this is not a good sign
|
| 531 |
+
# score -= 0.1
|
| 532 |
+
|
| 533 |
+
if original.get('siblings'):
|
| 534 |
+
score += SequenceMatcher(
|
| 535 |
+
None, original['siblings'], candidate.get('siblings') or []
|
| 536 |
+
).ratio() # * 0.1 # 10%
|
| 537 |
+
checks += 1
|
| 538 |
+
|
| 539 |
+
# How % sure? let's see
|
| 540 |
+
return round((score / checks) * 100, 2)
|
| 541 |
+
|
| 542 |
+
@staticmethod
|
| 543 |
+
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
|
| 544 |
+
"""Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries
|
| 545 |
+
"""
|
| 546 |
+
score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
|
| 547 |
+
score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
|
| 548 |
+
return score
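    # Illustrative example (dictionaries made up for this note): comparing
    # dict1 = {'class': 'product', 'data-id': '1'} with dict2 = {'class': 'product'} gives
    # a keys ratio and a values ratio of 2 * 1 / (2 + 1) ~= 0.67 each, so the method
    # returns roughly 0.67 * 0.5 + 0.67 * 0.5 ~= 0.67.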

    def save(self, element: Union['Adaptor', html.HtmlElement], identifier: str) -> None:
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage; it can be an `Adaptor` or a pure `HtmlElement`
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        if self.__auto_match_enabled:
            if isinstance(element, self.__class__):
                element = element._root

            if self._is_text_node(element):
                element = element.getparent()

            self._storage.save(element, identifier)
        else:
            logging.critical(
                "Can't use the auto-match feature while it's disabled globally; you have to start a new class instance."
            )

    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        if self.__auto_match_enabled:
            return self._storage.retrieve(identifier)

        logging.critical(
            "Can't use the auto-match feature while it's disabled globally; you have to start a new class instance."
        )

    # Operations on text functions
    def json(self) -> Dict:
        """Return the JSON response if the response is JSON-able, otherwise throw an error"""
        return self.text.json()

    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
        """Apply the given regex to the current text and return a list of strings with the matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled, character entity references are replaced by their corresponding character
        """
        return self.text.re(regex, replace_entities)

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
        """Apply the given regex to the text and return the first match if found, otherwise return the default value.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled, character entity references are replaced by their corresponding character
        """
        return self.text.re_first(regex, default, replace_entities)

    def find_similar(
            self,
            similarity_threshold: float = 0.2,
            ignore_attributes: Union[List, Tuple] = ('href', 'src',),
            match_text: bool = False
    ) -> Union['Adaptors[Adaptor]', List]:
        """Find elements that are at the same tree depth in the page, with the same tag name, same parent tag, etc...
        then return the ones whose attributes match the current element's with a percentage higher than the input threshold.

        This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
        a products-list container and want to find other products using that element as a starting point, EXCEPT
        this function works in any case without depending on the element type.

        :param similarity_threshold: The percentage to use while comparing element attributes.
            Note: Elements found before attribute matching/comparison will share the same depth, same tag name,
            same parent tag name, and same grandparent tag name, so they are 99% likely to be correct unless you are
            extremely unlucky; only then does attribute matching come into play. So basically, don't play with this
            number unless you are getting results you don't want.
            Also, if the current element and the candidate element both have no attributes, then it's a 100% match.
        :param ignore_attributes: Attribute names passed here will be ignored while matching the attributes in the last step.
            The default is to ignore `href` and `src`, as URLs can change a lot between elements, making them unreliable
        :param match_text: If True, the elements' text content will be taken into account while matching.
            Not recommended to use in normal cases, but it depends.

        :return: An ``Adaptors`` container of ``Adaptor`` objects or an empty list
        """
        def get_attributes(element: html.HtmlElement) -> Dict:
            """Return the attributes dictionary without the ignored list"""
            return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}

        def are_alike(original: html.HtmlElement, original_attributes: Dict, candidate: html.HtmlElement) -> bool:
            """Calculate a score of how much these elements are alike and return True
            if the score is higher than or equal to the threshold"""
            candidate_attributes = get_attributes(candidate) if ignore_attributes else candidate.attrib
            score, checks = 0, 0

            if original_attributes:
                score += sum(
                    SequenceMatcher(None, v, candidate_attributes.get(k, '')).ratio()
                    for k, v in original_attributes.items()
                )
                checks += len(candidate_attributes)
            else:
                if not candidate_attributes:
                    # Neither has attributes; this must mean something
                    score += 1
                    checks += 1

            if match_text:
                score += SequenceMatcher(
                    None, clean_spaces(original.text or ''), clean_spaces(candidate.text or '')
                ).ratio()
                checks += 1

            if checks:
                return round(score / checks, 2) >= similarity_threshold
            return False

        # We will use the element's root from now on to get the speed boost of using lxml directly
        root = self._root
        current_depth = len(list(root.iterancestors()))
        target_attrs = get_attributes(root) if ignore_attributes else root.attrib
        similar_elements = list()
        # + root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth-1}]")
        parent = root.getparent()
        if parent is not None:
            grandparent = parent.getparent()  # lol
            if grandparent is not None:
                potential_matches = root.xpath(
                    f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
                )
            else:
                potential_matches = root.xpath(f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]")
        else:
            potential_matches = root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth}]")

        for potential_match in potential_matches:
            if potential_match != root and are_alike(root, target_attrs, potential_match):
                similar_elements.append(potential_match)

        return self.__convert_results(similar_elements)

    def find_by_text(
            self, text: str, first_match: bool = True, partial: bool = False,
            case_sensitive: bool = False, clean_match: bool = True
    ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
        """Find elements whose text content fully/partially matches the input.
        :param text: Text query to match
        :param first_match: Return the first element that matches the conditions, enabled by default
        :param partial: If enabled, the function returns elements that contain the input text
        :param case_sensitive: if enabled, letter case will be taken into consideration
        :param clean_match: if enabled, this will ignore all whitespace and consecutive spaces while matching
        """

        results = []
        if not case_sensitive:
            text = text.lower()

        def _traverse(node: Adaptor) -> None:
            """Check if the element matches the given text; otherwise, traverse the children tree and iterate"""
            node_text = node.text
            # if there's already no text in this node, dodge it to save CPU cycles and time
            if node_text:
                if clean_match:
                    node_text = node_text.clean()

                if not case_sensitive:
                    node_text = node_text.lower()

                if partial:
                    if text in node_text:
                        results.append(node)
                elif text == node_text:
                    results.append(node)

            if results and first_match:
                # we got an element, so we should stop
                return

            for branch in node.children:
                _traverse(branch)

        # This will block until we traverse all children/branches
        _traverse(self)

        if first_match:
            if results:
                return results[0]
        return self.__convert_results(results)

    def find_by_regex(
            self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
    ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
        """Find elements whose text content matches the input regex pattern.
        :param query: Regex query to match
        :param first_match: Return the first element that matches the conditions, enabled by default
        :param case_sensitive: if enabled, letter case will be taken into consideration in the regex
        :param clean_match: if enabled, this will ignore all whitespace and consecutive spaces while matching
        """
        results = []

        def _traverse(node: Adaptor) -> None:
            """Check if the element matches the given regex; otherwise, traverse the children tree and iterate"""
            node_text = node.text
            # if there's already no text in this node, dodge it to save CPU cycles and time
            if node_text:
                if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
                    results.append(node)

            if results and first_match:
                # we got an element, so we should stop
                return

            for branch in node.children:
                _traverse(branch)

        # This will block until we traverse all children/branches
        _traverse(self)

        if results and first_match:
            return results[0]
        return self.__convert_results(results)


class Adaptors(List[Adaptor]):
    """
    The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
    """
    __slots__ = ()

    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors[Adaptor]"]:
        lst = super().__getitem__(pos)
        if isinstance(pos, slice):
            return self.__class__(lst)
        else:
            return lst

    def xpath(
            self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
    ) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.xpath()`` method for each element in this list and return
        their results as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use a different selector later
        and want to relocate the same element(s)**

        Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

        :param selector: The XPath selector to be used.
        :param identifier: A string that will be used to retrieve the element's data in auto-matching,
            otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
            Be aware that the percentage calculation depends solely on the page structure, so don't play with this
            number unless you know what you are doing!

        :return: List as :class:`Adaptors`
        """
        results = [
            n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self
        ]
        return self.__class__(flatten(results))

    def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.css()`` method for each element in this list and return
        their results flattened as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use a different selector later
        and want to relocate the same element(s)**

        :param selector: The CSS3 selector to be used.
        :param identifier: A string that will be used to retrieve the element's data in auto-matching,
            otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
            Be aware that the percentage calculation depends solely on the page structure, so don't play with this
            number unless you know what you are doing!

        :return: List as :class:`Adaptors`
        """
        results = [
            n.css(selector, identifier or selector, False, auto_save, percentage) for n in self
        ]
        return self.__class__(flatten(results))

    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
        """Call the ``.re()`` method for each element in this list and return
        their results flattened as a list of TextHandler.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled, character entity references are replaced by their corresponding character
        """
        results = [
            n.text.re(regex, replace_entities) for n in self
        ]
        return flatten(results)

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
        """Call the ``.re_first()`` method for each element in this list and return
        their results flattened as a list of TextHandler.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled, character entity references are replaced by their corresponding character
        """
        results = [
            n.text.re_first(regex, default, replace_entities) for n in self
        ]
        return flatten(results)

    # def __getattr__(self, name):
    #     if name in dir(self.__class__):
    #         return super().__getattribute__(name)
    #
    #     # Execute the method itself on each Adaptor
    #     results = []
    #     for item in self:
    #         results.append(getattr(item, name))
    #
    #     if all(callable(r) for r in results):
    #         def call_all(*args, **kwargs):
    #             final_results = [r(*args, **kwargs) for r in results]
    #             if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
    #                 return self.__class__(final_results)
    #             return final_results
    #
    #         return call_all
    #     else:
    #         # Flatten the result if it's a single-item list containing a list
    #         if len(self) == 1 and isinstance(results[0], list):
    #             return self.__class__(results[0])
    #         return self.__class__(results)

    def get(self, default=None):
        """Returns the first item of the current list
        :param default: the default value to return if the current list is empty
        """
        return self[0] if len(self) > 0 else default

    @property
    def first(self):
        """Returns the first item of the current list or `None` if the list is empty"""
        return self.get()

    @property
    def last(self):
        """Returns the last item of the current list or `None` if the list is empty"""
        return self[-1] if len(self) > 0 else None

    def __getstate__(self) -> Any:
        # lxml doesn't like it :)
        raise TypeError("Can't pickle Adaptors object")
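A quick sketch of how the querying methods above compose; the HTML snippet and selectors are invented for illustration, not taken from the library's docs:

    from scrapling import Adaptor

    page = Adaptor('<div><p class="a">One</p><p class="a">Two</p><p class="b">Three</p></div>', auto_match=False)
    first = page.css('p.a')[0]                        # a single Adaptor
    first.find_similar()                              # Adaptors at the same depth with similar attributes
    page.find_by_text('Two', partial=True)            # first Adaptor whose (cleaned) text contains "Two"
    page.find_by_regex(r'T\w+', first_match=False)    # all Adaptors whose text matches the pattern
    page.css('p').re(r'\w+')                          # regex matches over each element's text, flattened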
scrapling/py.typed
ADDED
File without changes
scrapling/storage_adaptors.py
ADDED
@@ -0,0 +1,149 @@
import orjson
import sqlite3
import logging
import threading
from hashlib import sha256
from abc import ABC, abstractmethod
from typing import Dict, Optional, Union

from scrapling.utils import _StorageTools, cache

from lxml import html
from tldextract import extract as tld


class StorageSystemMixin(ABC):
    # If you want to make your own storage system, you have to inherit from this
    def __init__(self, url: Union[str, None] = None):
        """
        :param url: URL of the website we are working on, to separate its data from other websites' data
        """
        self.url = url

    @cache
    def _get_base_url(self, default_value: str = 'default') -> str:
        if not self.url or type(self.url) is not str:
            return default_value

        try:
            extracted = tld(self.url)
            return extracted.registered_domain or extracted.domain or default_value
        except AttributeError:
            return default_value

    @abstractmethod
    def save(self, element: html.HtmlElement, identifier: str) -> None:
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        raise NotImplementedError('Storage system must implement `save` method')

    @abstractmethod
    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        raise NotImplementedError('Storage system must implement `retrieve` method')

    @staticmethod
    @cache
    def _get_hash(identifier: str) -> str:
        """If you want to hash the identifier in your storage system, use this; it's safer"""
        identifier = identifier.lower().strip()
        if isinstance(identifier, str):
            # Hash functions have to take bytes
            identifier = identifier.encode('utf-8')

        hash_value = sha256(identifier).hexdigest()
        return f"{hash_value}_{len(identifier)}"  # Appending the length reduces the collision chance


@cache(None, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use; it's race-condition safe and thread safe.
    Mainly built so the library can run in threaded frameworks like Scrapy or threaded tools
    > It's optimized for threaded applications, but running it without threads shouldn't make it slow."""
    def __init__(self, storage_file: str, url: Union[str, None] = None):
        """
        :param storage_file: File to be used to store elements
        :param url: URL of the website we are working on, to separate its data from other websites' data
        """
        super().__init__(url)
        self.storage_file = storage_file
        # We use a threading.Lock to ensure thread-safety instead of relying on thread-local storage.
        self.lock = threading.Lock()
        # SQLite's default threading mode in earlier versions is 1, not 2 (1=thread-safe, 2=serialized)
        # `check_same_thread=False` allows the connection to be used across different threads.
        self.connection = sqlite3.connect(self.storage_file, check_same_thread=False)
        # WAL (Write-Ahead Logging) allows for better concurrency.
        self.connection.execute("PRAGMA journal_mode=WAL")
        self.cursor = self.connection.cursor()
        self._setup_database()
        logging.debug(
            f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
        )

    def _setup_database(self) -> None:
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS storage (
                id INTEGER PRIMARY KEY,
                url TEXT,
                identifier TEXT,
                element_data TEXT,
                UNIQUE (url, identifier)
            )
        """)
        self.connection.commit()

    def save(self, element: html.HtmlElement, identifier: str):
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        url = self._get_base_url()
        element_data = _StorageTools.element_to_dict(element)
        with self.lock:
            self.cursor.execute("""
                INSERT OR REPLACE INTO storage (url, identifier, element_data)
                VALUES (?, ?, ?)
            """, (url, identifier, orjson.dumps(element_data)))
            self.cursor.fetchall()
            self.connection.commit()

    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        url = self._get_base_url()
        with self.lock:
            self.cursor.execute(
                "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
                (url, identifier)
            )
            result = self.cursor.fetchone()
            if result:
                return orjson.loads(result[0])
            return None

    def close(self):
        """Close all connections; useful with things like Scrapy's Spider.closed() function/signal"""
        with self.lock:
            self.connection.commit()
            self.cursor.close()
            self.connection.close()

    def __del__(self):
        """To ensure all connections are closed when the object is destroyed."""
        self.close()
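A minimal usage sketch of the SQLite storage; the file name, URL, and element below are made up:

    from lxml import html
    from scrapling.storage_adaptors import SQLiteStorageSystem

    db = SQLiteStorageSystem(storage_file='scrapling_elements.db', url='https://example.com/products')
    element = html.fromstring('<html><body><span class="price">$10.99</span></body></html>').xpath('//span')[0]
    db.save(element, identifier='span.price')  # serializes the element's unique properties with orjson
    db.retrieve('span.price')                  # -> {'tag': 'span', 'attributes': {'class': 'price'}, ...} or None
    db.close()

Note that the `@cache(None, typed=True)` decorator on the class means constructing it again with the same arguments returns the same cached instance, so one connection is shared per (storage_file, url) pair.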
scrapling/translator.py
ADDED
@@ -0,0 +1,148 @@
"""
Most of this file is an adapted version of the parsel library's translator, with some modifications, for one important reason...
To add the pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match the Parsel/Scrapy selectors format,
which will be important in future releases, but most importantly...
so you don't have to learn a new selectors API/method like what bs4 did with soupsieve :)
> if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
"""

import re

from w3lib.html import HTML5_WHITESPACE
from typing import TYPE_CHECKING, Any, Optional, Protocol

from scrapling.utils import cache

from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self


regex = f"[{HTML5_WHITESPACE}]+"
replace_html5_whitespaces = re.compile(regex).sub


class XPathExpr(OriginalXPathExpr):

    textnode: bool = False
    attribute: Optional[str] = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: Optional[str] = None,
    ) -> "Self":
        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        x.textnode = textnode
        x.attribute = attribute
        return x

    def __str__(self) -> str:
        path = super().__str__()
        if self.textnode:
            if path == "*":
                path = "text()"
            elif path.endswith("::*/*"):
                path = path[:-3] + "text()"
            else:
                path += "/text()"

        if self.attribute is not None:
            if path.endswith("::*/*"):
                path = path[:-2]
            path += f"/@{self.attribute}"

        return path

    def join(
        self: "Self",
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> "Self":
        if not isinstance(other, XPathExpr):
            raise ValueError(
                f"Expressions of type {__name__}.XPathExpr can only join expressions"
                f" of the same type (or its descendants), got {type(other)}"
            )
        super().join(combiner, other, *args, **kwargs)
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self


# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
class TranslatorProtocol(Protocol):
    def xpath_element(self, selector: Element) -> OriginalXPathExpr:
        pass

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:
        pass


class TranslatorMixin:
    """This mixin adds support for CSS pseudo-elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        # https://github.com/python/mypy/issues/12344
        xpath = super().xpath_element(selector)  # type: ignore[safe-super]
        return XPathExpr.from_xpath(xpath)

    def xpath_pseudo_element(
        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
    ) -> OriginalXPathExpr:
        """
        Dispatch method that transforms XPath to support pseudo-elements.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
            method = getattr(self, method_name, None)
            if not method:
                raise ExpressionError(
                    f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
                )
            xpath = method(xpath, pseudo_element)
        else:
            method_name = (
                f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
            )
            method = getattr(self, method_name, None)
            if not method:
                raise ExpressionError(
                    f"The pseudo-element ::{pseudo_element} is unknown"
                )
            xpath = method(xpath)
        return xpath

    @staticmethod
    def xpath_attr_functional_pseudo_element(
        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
    ) -> XPathExpr:
        """Support selecting attribute values using the ::attr() pseudo-element"""
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
            )
        return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Support selecting text nodes using the ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)


class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    @cache(maxsize=256)
    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        return super().css_to_xpath(css, prefix)
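To make the pseudo-element support concrete, a small sketch of the translator in use; the exact output strings come from cssselect's translation rules, so treat them as approximate:

    from scrapling.translator import HTMLTranslator

    translator = HTMLTranslator()
    translator.css_to_xpath('a::attr(href)')  # roughly 'descendant-or-self::a/@href'
    translator.css_to_xpath('p::text')        # roughly 'descendant-or-self::p/text()'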
scrapling/utils.py
ADDED
@@ -0,0 +1,164 @@
import re
import os
import logging
from itertools import chain
from logging import handlers
# Using cache on top of a class is a brilliant way to achieve the Singleton design pattern without much code
from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only, so let's keep lru_cache

from typing import Dict, Iterable, Any

from lxml import html
html_forbidden = {html.HtmlComment, }
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)


@cache(None, typed=True)
def setup_basic_logging(level: str = 'debug'):
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
    lvl = levels[level.lower()]
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    # Configure the root logger
    logging.basicConfig(level=lvl, handlers=[handler])


def flatten(lst: Iterable):
    return list(chain.from_iterable(lst))


def _is_iterable(s: Any):
    # This will be used only in regex functions to make sure it's iterable but not string/bytes
    return isinstance(s, (list, tuple,))


@cache(None, typed=True)
class _Logger(object):
    # I will leave this class here for now in case I decide I want to come back to use it :)
    __slots__ = ('console_logger', 'logger_file_path',)
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
        os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
        format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")

        # on-screen output
        lvl = self.levels[level.lower()]
        self.console_logger = logging.getLogger('Scrapling')
        self.console_logger.setLevel(lvl)
        console_handler = logging.StreamHandler()
        console_handler.setLevel(lvl)
        console_handler.setFormatter(format_str)
        self.console_logger.addHandler(console_handler)

        if lvl == logging.DEBUG:
            filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
            self.logger_file_path = filename
            # Automatically rotates the logging file at the specified intervals
            file_handler = handlers.TimedRotatingFileHandler(
                # If more than (backcount+1) exist, the oldest logs will be deleted
                filename=filename, when=when, backupCount=backcount, encoding='utf-8'
            )
            file_handler.setLevel(lvl)
            file_handler.setFormatter(format_str)
            # This is for the logger when it appends the date to the new log
            file_handler.namer = lambda name: name.replace(".log", "") + ".log"
            self.console_logger.addHandler(file_handler)
            self.debug(f'Debug log path: {self.logger_file_path}')
        else:
            self.logger_file_path = None

    def debug(self, message: str) -> None:
        self.console_logger.debug(message)

    def info(self, message: str) -> None:
        self.console_logger.info(message)

    def warning(self, message: str) -> None:
        self.console_logger.warning(message)

    def error(self, message: str) -> None:
        self.console_logger.error(message)

    def critical(self, message: str) -> None:
        self.console_logger.critical(message)


class _StorageTools:
    @staticmethod
    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
        if not element.attrib:
            return {}
        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

    @classmethod
    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
        parent = element.getparent()
        result = {
            'tag': str(element.tag),
            'attributes': cls.__clean_attributes(element),
            'text': element.text.strip() if element.text else None,
            'path': cls._get_element_path(element)
        }
        if parent is not None:
            result.update({
                'parent_name': parent.tag,
                'parent_attribs': dict(parent.attrib),
                'parent_text': parent.text.strip() if parent.text else None
            })

            siblings = [child.tag for child in parent.iterchildren() if child != element]
            if siblings:
                result.update({'siblings': tuple(siblings)})

        children = [child.tag for child in element.iterchildren() if type(child) not in html_forbidden]
        if children:
            result.update({'children': tuple(children)})

        return result

    @classmethod
    def _get_element_path(cls, element: html.HtmlElement):
        parent = element.getparent()
        return tuple(
            (element.tag,) if parent is None else (
                cls._get_element_path(parent) + (element.tag,)
            )
        )


# def _root_type_verifier(method):
#     # Just to make sure we are safe
#     @wraps(method)
#     def _impl(self, *args, **kw):
#         # All html types inherit from HtmlMixin, so this checks for all at once
#         if not issubclass(type(self._root), html.HtmlMixin):
#             raise ValueError(f"Cannot use function on a Node of type {type(self._root)!r}")
#         return method(self, *args, **kw)
#     return _impl


@cache
def clean_spaces(string):
    string = string.replace('\t', ' ')
    string = re.sub('[\n\r]', '', string)
    return re.sub(' +', ' ', string)
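A rough sketch of what `_StorageTools.element_to_dict` and `clean_spaces` produce; the HTML here is made up, and the dictionary shown is approximate:

    from lxml import html
    from scrapling.utils import _StorageTools, clean_spaces

    root = html.fromstring('<html><body><div id="box"><p>Hi</p></div></body></html>')
    p = root.xpath('//p')[0]
    _StorageTools.element_to_dict(p)
    # -> {'tag': 'p', 'attributes': {}, 'text': 'Hi', 'path': ('html', 'body', 'div', 'p'),
    #     'parent_name': 'div', 'parent_attribs': {'id': 'box'}, 'parent_text': None}
    clean_spaces('a\tb\n  c')  # -> 'a b c'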
setup.cfg
ADDED
@@ -0,0 +1,8 @@
[metadata]
name = scrapling
version = 0.1
author = Karim Shoair
author_email = karim.shoair@pm.me
description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.
license = BSD
home-page = https://github.com/D4Vinci/Scrapling
setup.py
ADDED
@@ -0,0 +1,65 @@
from setuptools import setup

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()


setup(
    name="scrapling",
    version="0.1",
    description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
    simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
    impressive speed improvements over many popular scraping tools.""",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Karim Shoair",
    author_email="karim.shoair@pm.me",
    license="BSD",
    packages=["scrapling",],
    zip_safe=False,
    package_dir={
        "scrapling": "scrapling",
    },
    include_package_data=True,
    classifiers=[
        "Operating System :: OS Independent",
        "Development Status :: 4 - Beta",
        # "Development Status :: 5 - Production/Stable",
        # "Development Status :: 6 - Mature",
        # "Development Status :: 7 - Inactive",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Natural Language :: English",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Text Processing :: Markup",
        "Topic :: Text Processing :: Markup :: HTML",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: Implementation :: CPython",
        "Typing :: Typed",
    ],
    # Instead of using requirements file to dodge possible errors from tox?
    install_requires=[
        "requests>=2.3",
        "lxml>=4.5",
        "cssselect>=1.2",
        "w3lib",
        "orjson>=3",
        "tldextract",
    ],
    python_requires=">=3.6",
    url="https://github.com/D4Vinci/Scrapling",
    project_urls={
        "Documentation": "https://github.com/D4Vinci/Scrapling/Docs",  # For now
        "Source": "https://github.com/D4Vinci/Scrapling",
        "Tracker": "https://github.com/D4Vinci/Scrapling/issues",
    }
)
tests/__init__.py
ADDED
@@ -0,0 +1 @@
"""Package for test project."""
tests/requirements.txt
ADDED
@@ -0,0 +1,2 @@
pytest
pytest-cov
tests/test_all_functions.py
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pickle
|
| 3 |
+
import unittest
|
| 4 |
+
from scrapling import Adaptor
|
| 5 |
+
from cssselect import SelectorError, SelectorSyntaxError
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TestParser(unittest.TestCase):
|
| 9 |
+
def setUp(self):
|
| 10 |
+
self.html = '''
|
| 11 |
+
<html>
|
| 12 |
+
<head>
|
| 13 |
+
<title>Complex Web Page</title>
|
| 14 |
+
<style>
|
| 15 |
+
.hidden { display: none; }
|
| 16 |
+
</style>
|
| 17 |
+
</head>
|
| 18 |
+
<body>
|
| 19 |
+
<header>
|
| 20 |
+
<nav>
|
| 21 |
+
<ul>
|
| 22 |
+
<li><a href="#home">Home</a></li>
|
| 23 |
+
<li><a href="#about">About</a></li>
|
| 24 |
+
<li><a href="#contact">Contact</a></li>
|
| 25 |
+
</ul>
|
| 26 |
+
</nav>
|
| 27 |
+
</header>
|
| 28 |
+
<main>
|
| 29 |
+
<section id="products" schema='{"jsonable": "data"}'>
|
| 30 |
+
<h2>Products</h2>
|
| 31 |
+
<div class="product-list">
|
| 32 |
+
<article class="product" data-id="1">
|
| 33 |
+
<h3>Product 1</h3>
|
| 34 |
+
<p class="description">This is product 1</p>
|
| 35 |
+
<span class="price">$10.99</span>
|
| 36 |
+
<div class="hidden stock">In stock: 5</div>
|
| 37 |
+
</article>
|
| 38 |
+
<article class="product" data-id="2">
|
| 39 |
+
<h3>Product 2</h3>
|
| 40 |
+
<p class="description">This is product 2</p>
|
| 41 |
+
<span class="price">$20.99</span>
|
| 42 |
+
<div class="hidden stock">In stock: 3</div>
|
| 43 |
+
</article>
|
| 44 |
+
<article class="product" data-id="3">
|
| 45 |
+
<h3>Product 3</h3>
|
| 46 |
+
<p class="description">This is product 3</p>
|
| 47 |
+
<span class="price">$15.99</span>
|
| 48 |
+
<div class="hidden stock">Out of stock</div>
|
| 49 |
+
</article>
|
| 50 |
+
</div>
|
| 51 |
+
</section>
|
| 52 |
+
<section id="reviews">
|
| 53 |
+
<h2>Customer Reviews</h2>
|
| 54 |
+
<div class="review-list">
|
| 55 |
+
<div class="review" data-rating="5">
|
| 56 |
+
<p class="review-text">Great product!</p>
|
| 57 |
+
<span class="reviewer">John Doe</span>
|
| 58 |
+
</div>
|
| 59 |
+
<div class="review" data-rating="4">
|
| 60 |
+
<p class="review-text">Good value for money.</p>
|
| 61 |
+
<span class="reviewer">Jane Smith</span>
|
| 62 |
+
</div>
|
| 63 |
+
</div>
|
| 64 |
+
</section>
|
| 65 |
+
</main>
|
| 66 |
+
<footer>
|
| 67 |
+
<p>© 2024 Our Company</p>
|
| 68 |
+
</footer>
|
| 69 |
+
<script id="page-data" type="application/json">
|
| 70 |
+
{"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
|
| 71 |
+
</script>
|
| 72 |
+
</body>
|
| 73 |
+
</html>
|
| 74 |
+
'''
|
| 75 |
+
self.page = Adaptor(self.html, auto_match=False, debug=False)
|
| 76 |
+
|
| 77 |
+
def test_css_selector(self):
|
| 78 |
+
"""Test Selecting elements with complex CSS selectors"""
|
| 79 |
+
elements = self.page.css('main #products .product-list article.product')
|
| 80 |
+
self.assertEqual(len(elements), 3)
|
| 81 |
+
|
| 82 |
+
in_stock_products = self.page.css(
|
| 83 |
+
'main #products .product-list article.product:not(:contains("Out of stock"))')
|
| 84 |
+
self.assertEqual(len(in_stock_products), 2)
|
| 85 |
+
|
| 86 |
+
def test_xpath_selector(self):
|
| 87 |
+
"""Test Selecting elements with Complex XPath selectors"""
|
| 88 |
+
reviews = self.page.xpath(
|
| 89 |
+
'//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
|
| 90 |
+
)
|
| 91 |
+
self.assertEqual(len(reviews), 2)
|
| 92 |
+
|
| 93 |
+
high_priced_products = self.page.xpath(
|
| 94 |
+
'//article[contains(@class, "product")]'
|
| 95 |
+
'[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
|
| 96 |
+
)
|
| 97 |
+
self.assertEqual(len(high_priced_products), 2)
|
| 98 |
+
|
| 99 |
+
def test_find_by_text(self):
|
| 100 |
+
"""Test Selecting elements with Text matching"""
|
| 101 |
+
stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=False)
|
| 102 |
+
self.assertEqual(len(stock_info), 2)
|
| 103 |
+
|
| 104 |
+
stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
|
| 105 |
+
self.assertEqual(stock_info.text, 'In stock: 5')
|
| 106 |
+
|
| 107 |
+
stock_info = self.page.find_by_text(r'In stock:', partial=True, first_match=False)
|
| 108 |
+
self.assertEqual(len(stock_info), 2)
|
| 109 |
+
|
| 110 |
+
out_of_stock = self.page.find_by_text('Out of stock', partial=False, first_match=False)
|
| 111 |
+
self.assertEqual(len(out_of_stock), 1)
|
| 112 |
+
|
| 113 |
+
def test_find_similar_elements(self):
|
| 114 |
+
"""Test Finding similar elements of an element"""
|
| 115 |
+
first_product = self.page.css('.product')[0]
|
| 116 |
+
similar_products = first_product.find_similar()
|
| 117 |
+
self.assertEqual(len(similar_products), 2)
|
| 118 |
+
|
| 119 |
+
first_review = self.page.css('.review')[0]
|
| 120 |
+
similar_high_rated_reviews = [
|
| 121 |
+
review
|
| 122 |
+
for review in first_review.find_similar()
|
| 123 |
+
if int(review.attrib.get('data-rating', 0)) >= 4
|
| 124 |
+
]
|
| 125 |
+
self.assertEqual(len(similar_high_rated_reviews), 1)
|
| 126 |
+
|
| 127 |
+
def test_expected_errors(self):
|
| 128 |
+
"""Test errors that should raised if it does"""
|
| 129 |
+
with self.assertRaises(ValueError):
|
| 130 |
+
_ = Adaptor()
|
| 131 |
+
|
| 132 |
+
with self.assertRaises(TypeError):
|
| 133 |
+
_ = Adaptor(root="ayo")
|
| 134 |
+
|
| 135 |
+
with self.assertRaises(TypeError):
|
| 136 |
+
_ = Adaptor(text=1)
|
| 137 |
+
|
| 138 |
+
with self.assertRaises(TypeError):
|
| 139 |
+
_ = Adaptor(body=1)
|
| 140 |
+
|
| 141 |
+
with self.assertRaises(ValueError):
|
| 142 |
+
_ = Adaptor(self.html, storage=object, auto_match=True)
|
| 143 |
+
|
| 144 |
+
def test_pickleable(self):
|
| 145 |
+
"""Test that objects aren't pickleable"""
|
| 146 |
+
table = self.page.css('.product-list')[0]
|
| 147 |
+
with self.assertRaises(TypeError): # Adaptors
|
| 148 |
+
pickle.dumps(table)
|
| 149 |
+
|
| 150 |
+
with self.assertRaises(TypeError): # Adaptor
|
| 151 |
+
pickle.dumps(table[0])
|
| 152 |
+
|
| 153 |
+
def test_overridden(self):
|
| 154 |
+
"""Test overridden functions"""
|
| 155 |
+
table = self.page.css('.product-list')[0]
|
| 156 |
+
self.assertTrue(issubclass(type(table.__str__()), str))
|
| 157 |
+
self.assertTrue(issubclass(type(table.__repr__()), str))
|
| 158 |
+
self.assertTrue(issubclass(type(table.attrib.__str__()), str))
|
| 159 |
+
self.assertTrue(issubclass(type(table.attrib.__repr__()), str))
|
| 160 |
+
|
| 161 |
+
def test_bad_selector(self):
|
| 162 |
+
"""Test object can handle bad selector"""
|
| 163 |
+
with self.assertRaises((SelectorError, SelectorSyntaxError,)):
|
| 164 |
+
self.page.css('4 ayo')
|
| 165 |
+
|
| 166 |
+
with self.assertRaises((SelectorError, SelectorSyntaxError,)):
|
| 167 |
+
self.page.xpath('4 ayo')
|
| 168 |
+
|
| 169 |
+
def test_selectors_generation(self):
|
| 170 |
+
"""Try to create selectors for all elements in the page"""
|
| 171 |
+
def _traverse(element: Adaptor):
|
| 172 |
+
self.assertTrue(type(element.css_selector) is str)
|
| 173 |
+
self.assertTrue(type(element.xpath_selector) is str)
|
| 174 |
+
for branch in element.children:
|
| 175 |
+
_traverse(branch)
|
| 176 |
+
|
| 177 |
+
_traverse(self.page)
|
| 178 |
+
|
| 179 |
+
def test_getting_all_text(self):
|
| 180 |
+
"""Test getting all text"""
|
| 181 |
+
self.assertNotEqual(self.page.get_all_text(), '')
|
| 182 |
+
|
| 183 |
+
def test_element_navigation(self):
|
| 184 |
+
"""Test moving in the page from selected element"""
|
| 185 |
+
table = self.page.css('.product-list')[0]
|
| 186 |
+
|
| 187 |
+
self.assertIsNot(table.path, [])
|
| 188 |
+
self.assertNotEqual(table.html_content, '')
|
| 189 |
+
self.assertNotEqual(table.prettify(), '')
|
| 190 |
+
|
| 191 |
+
parent = table.parent
|
| 192 |
+
self.assertEqual(parent.attrib['id'], 'products')
|
| 193 |
+
|
| 194 |
+
children = table.children
|
| 195 |
+
self.assertEqual(len(children), 3)
|
| 196 |
+
|
| 197 |
+
parent_siblings = parent.siblings
|
| 198 |
+
self.assertEqual(len(parent_siblings), 1)
|
| 199 |
+
|
| 200 |
+
child = table.css('[data-id="1"]')[0]
|
| 201 |
+
next_element = child.next
|
| 202 |
+
self.assertEqual(next_element.attrib['data-id'], '2')
|
| 203 |
+
|
| 204 |
+
prev_element = next_element.previous
|
| 205 |
+
self.assertEqual(prev_element.tag, child.tag)
|
| 206 |
+
|
| 207 |
+
all_prices = self.page.css('.price')
|
| 208 |
+
products_with_prices = [
|
| 209 |
+
price.find_ancestor(lambda p: p.has_class('product'))
|
| 210 |
+
for price in all_prices
|
| 211 |
+
]
|
| 212 |
+
self.assertEqual(len(products_with_prices), 3)
|
| 213 |
+
|
| 214 |
+
def test_empty_return(self):
|
| 215 |
+
"""Test cases where functions shouldn't have results"""
|
| 216 |
+
test_html = """
|
| 217 |
+
<html>
|
| 218 |
+
<span id="a"><a></a><!--comment--></span>
|
| 219 |
+
<span id="b"><!--comment--><a></a></span>
|
| 220 |
+
</html>"""
|
| 221 |
+
soup = Adaptor(test_html, auto_match=False, keep_comments=False)
|
| 222 |
+
html_tag = soup.css('html')[0]
|
| 223 |
+
self.assertEqual(html_tag.path, [])
|
| 224 |
+
self.assertEqual(html_tag.siblings, [])
|
| 225 |
+
self.assertEqual(html_tag.parent, None)
|
| 226 |
+
self.assertEqual(html_tag.find_ancestor(lambda e: e), None)
|
| 227 |
+
|
| 228 |
+
self.assertEqual(soup.css('#a a')[0].next, None)
|
| 229 |
+
self.assertEqual(soup.css('#b a')[0].previous, None)
|
| 230 |
+
|
| 231 |
+
def test_text_to_json(self):
|
| 232 |
+
"""Test converting text to json"""
|
| 233 |
+
script_content = self.page.css('#page-data::text')[0]
|
| 234 |
+
self.assertTrue(issubclass(type(script_content.sort()), str))
|
| 235 |
+
page_data = script_content.json()
|
| 236 |
+
self.assertEqual(page_data['totalProducts'], 3)
|
| 237 |
+
self.assertTrue('lastUpdated' in page_data)
|
| 238 |
+
|
| 239 |
+
    def test_regex_on_text(self):
        """Test doing regex on a selected text"""
        element = self.page.css('[data-id="1"] .price')[0]
        match = element.re_first(r'[\.\d]+')
        self.assertEqual(match, '10.99')
        match = element.text.re(r'(\d+)', replace_entities=False)
        self.assertEqual(len(match), 2)

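Likewise, both elements and their `.text` values carry Scrapy-style regex helpers: `re_first` returns the first match, `re` returns every match. A sketch with invented markup, assuming the results compare equal to plain strings:

from scrapling import Adaptor

page = Adaptor('<span class="price">$10.99</span>', auto_match=False)
price = page.css('.price')[0]

assert price.re_first(r'[\.\d]+') == '10.99'  # first numeric run, as in the test above
assert len(price.text.re(r'(\d+)')) == 2      # '10' and '99'
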
    def test_attribute_operations(self):
        """Test operations on elements attributes"""
        products = self.page.css('.product')
        product_ids = [product.attrib['data-id'] for product in products]
        self.assertEqual(product_ids, ['1', '2', '3'])
        self.assertTrue('data-id' in products[0].attrib)

        reviews = self.page.css('.review')
        review_ratings = [int(review.attrib['data-rating']) for review in reviews]
        self.assertEqual(sum(review_ratings) / len(review_ratings), 4.5)

        key_value = list(products[0].attrib.search_values('1', partial=False))
        self.assertEqual(list(key_value[0].keys()), ['data-id'])

        key_value = list(products[0].attrib.search_values('1', partial=True))
        self.assertEqual(list(key_value[0].keys()), ['data-id'])

        attr_json = self.page.css('#products')[0].attrib['schema'].json()
        self.assertEqual(attr_json, {'jsonable': 'data'})
        self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)

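The attributes mapping has two non-obvious extras exercised here: `search_values` yields the key/value pairs whose value matches (exactly, or as a substring when `partial=True`), and attribute values holding JSON expose the same `.json()` shortcut. A sketch with invented markup:

from scrapling import Adaptor

html = """<div data-id="1" schema='{"jsonable": "data"}'></div>"""
element = Adaptor(html, auto_match=False).css('div')[0]

# Which attributes hold exactly the value '1'?
matches = list(element.attrib.search_values('1', partial=False))
assert list(matches[0].keys()) == ['data-id']

# JSON stored in an attribute can be parsed in place
assert element.attrib['schema'].json() == {'jsonable': 'data'}
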
    def test_element_relocation(self):
        """Test relocating element after structure change"""
        original_html = '''
        <div class="container">
            <section class="products">
                <article class="product" id="p1">
                    <h3>Product 1</h3>
                    <p class="description">Description 1</p>
                </article>
                <article class="product" id="p2">
                    <h3>Product 2</h3>
                    <p class="description">Description 2</p>
                </article>
            </section>
        </div>
        '''
        changed_html = '''
        <div class="new-container">
            <div class="product-wrapper">
                <section class="products">
                    <article class="product new-class" data-id="p1">
                        <div class="product-info">
                            <h3>Product 1</h3>
                            <p class="new-description">Description 1</p>
                        </div>
                    </article>
                    <article class="product new-class" data-id="p2">
                        <div class="product-info">
                            <h3>Product 2</h3>
                            <p class="new-description">Description 2</p>
                        </div>
                    </article>
                </section>
            </div>
        </div>
        '''

        old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
        new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)

        # 'p1' was used as an ID and now it's not, and all the elements in its path have changed
        # This also tests auto-matching with a combined selector at the same time
        _ = old_page.css('#p1, #p2', auto_save=True)[0]
        relocated = new_page.css('#p1', auto_match=True)

        self.assertIsNotNone(relocated)
        self.assertEqual(relocated[0].attrib['data-id'], 'p1')
        self.assertTrue(relocated[0].has_class('new-class'))
        self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')

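Worth spelling out the workflow this test encodes: selecting with `auto_save=True` stores the matched element's identifying information under the page's URL, and a later `css('#p1', auto_match=True)` call on a page with the same URL falls back to that saved data when the literal selector no longer matches anything. That is why `#p1` still resolves here even though the element now carries `data-id="p1"` instead of an `id`.
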
    def test_performance(self):
        """Test parsing and selecting speed"""
        import time
        large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'

        start_time = time.time()
        parsed = Adaptor(large_html, auto_match=False, debug=False)
        elements = parsed.css('.item')
        end_time = time.time()

        self.assertEqual(len(elements), 5000)
        # Converting 5000 elements to a class and doing operations on them will take time
        # Based on my tests (100 runs, 1 loop each), Scrapling takes 10.4ms on average, given the extra work/features it does
        self.assertLess(end_time - start_time, 0.1)

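For anyone reproducing the "100 runs, 1 loop each" figure from the comment above, here is a minimal sketch of such a measurement using the standard library's `timeit` (the snippet is mine, not part of the suite, and the numbers will vary by machine):

import timeit

setup = (
    "from scrapling import Adaptor\n"
    "html = '<html><body>' + '<div class=\"item\">' * 5000 + '</div>' * 5000 + '</body></html>'"
)
stmt = "Adaptor(html, auto_match=False, debug=False).css('.item')"

# 100 runs of 1 loop each, mirroring the comment above
timings = timeit.repeat(stmt, setup=setup, repeat=100, number=1)
print(f'average: {sum(timings) / len(timings) * 1000:.1f} ms')
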

# Use `coverage run -m unittest --verbose tests/test_all_functions.py` instead for the coverage report
# if __name__ == '__main__':
#     unittest.main(verbosity=2)
tox.ini
ADDED
@@ -0,0 +1,20 @@
# Tox (https://tox.readthedocs.io/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = pre-commit,py36,py37,py38,py39,py310,py311,py312

[testenv]
usedevelop = True
changedir = tests
deps =
    -r{toxinidir}/tests/requirements.txt
commands = pytest --cov=scrapling --cov-report=xml

[testenv:pre-commit]
basepython = python3
deps = pre-commit
commands = pre-commit run --all-files --show-diff-on-failure
skip_install = true
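Usage note: running bare `tox` executes every environment in `envlist`. A single environment can be targeted during development, e.g. `tox -e py312` for one interpreter, or `tox -e pre-commit` to run only the lint hooks.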