Karim shoair commited on
Commit
2b837a0
·
1 Parent(s): abf8a7e

Going online (first public version)

Browse files
.bandit.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ skips:
2
+ - B101
3
+ - B311
4
+ - B320
5
+ - B410
.flake8 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [flake8]
2
+ ignore = E501 # line too long
3
+ exclude = .git,__pycache__,docs,.github,build,dist
.github/FUNDING.yml ADDED
@@ -0,0 +1 @@
 
 
1
+ github: D4Vinci
.github/ISSUE_TEMPLATE/01-bug_report.yml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Bug report
2
+ description: Create a bug report to help us address errors in the repository
3
+ labels: [bug]
4
+ body:
5
+ - type: checkboxes
6
+ attributes:
7
+ label: Have you searched if there is an existing issue for this?
8
+ description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/bug).
9
+ options:
10
+ - label: I have searched the existing issues
11
+ required: true
12
+
13
+ - type: input
14
+ attributes:
15
+ label: "Python version (python --version)"
16
+ placeholder: "Python 3.8"
17
+ validations:
18
+ required: true
19
+
20
+ - type: input
21
+ attributes:
22
+ label: "Scrapling version (scrapling.__version__)"
23
+ placeholder: "0.1"
24
+ validations:
25
+ required: true
26
+
27
+ - type: textarea
28
+ attributes:
29
+ label: "Dependencies version (pip3 freeze)"
30
+ description: >
31
+ This is the output of the command `pip3 freeze --all`. Note that the
32
+ actual output might be different as compared to the placeholder text.
33
+ placeholder: |
34
+ cssselect==1.2.0
35
+ lxml==5.3.0
36
+ orjson==3.10.7
37
+ ...
38
+ validations:
39
+ required: true
40
+
41
+ - type: input
42
+ attributes:
43
+ label: "What's your operating system?"
44
+ placeholder: "Windows 10"
45
+ validations:
46
+ required: true
47
+
48
+ - type: dropdown
49
+ attributes:
50
+ label: 'Are you using a separate virtual environment?'
51
+ description: "Please pay attention to this question"
52
+ options:
53
+ - "No"
54
+ - "Yes"
55
+ default: 0
56
+ validations:
57
+ required: true
58
+
59
+ - type: textarea
60
+ attributes:
61
+ label: "Expected behavior"
62
+ description: "Describe the behavior you expect. May include images or videos."
63
+ validations:
64
+ required: true
65
+
66
+ - type: textarea
67
+ attributes:
68
+ label: "Actual behavior (Remember to use `debug` parameter)"
69
+ validations:
70
+ required: true
71
+
72
+ - type: textarea
73
+ attributes:
74
+ label: Steps To Reproduce
75
+ description: Steps to reproduce the behavior.
76
+ placeholder: |
77
+ 1. In this environment...
78
+ 2. With this config...
79
+ 3. Run '...'
80
+ 4. See error...
81
+ validations:
82
+ required: false
.github/ISSUE_TEMPLATE/02-feature_request.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Feature request
2
+ description: Suggest features, propose improvements, discuss new ideas.
3
+ labels: [enhancement]
4
+ body:
5
+ - type: checkboxes
6
+ attributes:
7
+ label: Have you searched if there is an existing feature request for this?
8
+ description: Please search [existing requests](https://github.com/D4Vinci/Scrapling/labels/enhancement).
9
+ options:
10
+ - label: I have searched the existing requests
11
+ required: true
12
+
13
+ - type: textarea
14
+ attributes:
15
+ label: "Feature description"
16
+ description: >
17
+ This could include new topics or improving any existing features/implementations.
18
+ validations:
19
+ required: true
.github/ISSUE_TEMPLATE/03-other.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Other
2
+ description: Use this for any other issues. PLEASE do not create blank issues
3
+ labels: ["awaiting triage"]
4
+ body:
5
+ - type: textarea
6
+ id: issuedescription
7
+ attributes:
8
+ label: What would you like to share?
9
+ description: Provide a clear and concise explanation of your issue.
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: extrainfo
15
+ attributes:
16
+ label: Additional information
17
+ description: Is there anything else we should know about this issue?
18
+ validations:
19
+ required: false
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1 @@
 
 
1
+ blank_issues_enabled: false
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--
2
+ You are amazing! Thanks for contributing to Scrapling!
3
+ Please, DO NOT DELETE ANY TEXT from this template! (unless instructed).
4
+ -->
5
+
6
+ ## Proposed change
7
+ <!--
8
+ Describe the big picture of your changes here to communicate to the
9
+ maintainers why we should accept this pull request. If it fixes a bug
10
+ or resolves a feature request, be sure to link to that issue in the
11
+ additional information section.
12
+ -->
13
+
14
+
15
+ ### Type of change:
16
+ <!--
17
+ What type of change does your PR introduce to Scrapling?
18
+ NOTE: Please, check at least 1 box!
19
+ If your PR requires multiple boxes to be checked, you'll most likely need to
20
+ split it into multiple PRs. This makes things easier and faster to code review.
21
+ -->
22
+
23
+
24
+
25
+ - [ ] Dependency upgrade
26
+ - [ ] Bugfix (non-breaking change which fixes an issue)
27
+ - [ ] New integration (thank you!)
28
+ - [ ] New feature (which adds functionality to an existing integration)
29
+ - [ ] Deprecation (breaking change to happen in the future)
30
+ - [ ] Breaking change (fix/feature causing existing functionality to break)
31
+ - [ ] Code quality improvements to existing code or addition of tests
32
+ - [ ] Add or change doctests? -- Note: Please avoid changing both code and tests in a single pull request.
33
+ - [ ] Documentation change?
34
+
35
+ ### Additional information
36
+ <!--
37
+ Details are important, and help maintainers processing your PR.
38
+ Please be sure to fill out additional details, if applicable.
39
+ -->
40
+
41
+ - This PR fixes or closes issue: fixes #
42
+ - This PR is related to issue:
43
+ - Link to documentation pull request: **
44
+
45
+ ### Checklist:
46
+ * [ ] I have read [CONTRIBUTING.md](/CONTRIBUTING.md).
47
+ * [ ] This pull request is all my own work -- I have not plagiarized.
48
+ * [ ] I know that pull requests will not be merged if they fail the automated tests.
49
+ * [ ] All new Python files are placed inside an existing directory.
50
+ * [ ] All filenames are in all lowercase characters with no spaces or dashes.
51
+ * [ ] All functions and variable names follow Python naming conventions.
52
+ * [ ] All function parameters and return values are annotated with Python [type hints](https://docs.python.org/3/library/typing.html).
53
+ * [ ] All functions have doc-strings.
.github/workflows/publish.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Publish Python 🐍 distributions 📦 to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [created]
6
+
7
+ jobs:
8
+ build-n-publish:
9
+ name: Build and publish Python 🐍 distributions 📦 to PyPI
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - name: Set up Python
14
+ uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.x" # Latest available Python version
17
+
18
+ - name: Upgrade pip
19
+ run: python3 -m pip install --upgrade pip
20
+
21
+ - name: Install build
22
+ run: python3 -m pip install --upgrade build twine setuptools
23
+
24
+ - name: Build a binary wheel and a source tarball
25
+ run: python3 -m build --sdist --wheel --outdir dist/
26
+
27
+ - name: Publish distribution 📦 to PyPI
28
+ uses: pypa/gh-action-pypi-publish@release/v1.10.3
29
+ with:
30
+ user: __token__
31
+ password: ${{ secrets.PYPI_API_TOKEN }}
.github/workflows/tests.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Tests
2
+ on: [push, pull_request]
3
+
4
+ concurrency:
5
+ group: ${{github.workflow}}-${{ github.ref }}
6
+ cancel-in-progress: true
7
+
8
+ jobs:
9
+ tests:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ include:
15
+ - python-version: "3.6"
16
+ env:
17
+ TOXENV: py
18
+ - python-version: "3.7"
19
+ env:
20
+ TOXENV: py
21
+ - python-version: "3.8"
22
+ env:
23
+ TOXENV: py
24
+ - python-version: "3.9"
25
+ env:
26
+ TOXENV: py
27
+ - python-version: "3.10"
28
+ env:
29
+ TOXENV: py
30
+ - python-version: "3.11"
31
+ env:
32
+ TOXENV: py
33
+ - python-version: "3.12"
34
+ env:
35
+ TOXENV: py
36
+
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+ - name: Set up Python ${{ matrix.python-version }}
40
+ uses: actions/setup-python@v5
41
+ with:
42
+ python-version: ${{ matrix.python-version }}
43
+
44
+ - name: Run tests
45
+ env: ${{ matrix.env }}
46
+ run: |
47
+ pip install -U tox
48
+ tox
.gitignore CHANGED
@@ -1,128 +1,25 @@
1
- # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- share/python-wheels/
24
- *.egg-info/
25
- .installed.cfg
26
- *.egg
27
- MANIFEST
28
-
29
- # PyInstaller
30
- # Usually these files are written by a python script from a template
31
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
- *.manifest
33
- *.spec
34
-
35
- # Installer logs
36
- pip-log.txt
37
- pip-delete-this-directory.txt
38
-
39
- # Unit test / coverage reports
40
- htmlcov/
41
- .tox/
42
- .nox/
43
- .coverage
44
- .coverage.*
45
  .cache
46
- nosetests.xml
47
- coverage.xml
48
- *.cover
49
- *.py,cover
50
- .hypothesis/
51
- .pytest_cache/
52
- cover/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- .pybuilder/
76
- target/
77
-
78
- # Jupyter Notebook
79
- .ipynb_checkpoints
80
-
81
- # IPython
82
- profile_default/
83
- ipython_config.py
84
-
85
- # pyenv
86
- # For a library or package, you might want to ignore these files since the code is
87
- # intended to run in multiple environments; otherwise, check them in:
88
- # .python-version
89
-
90
- # pipenv
91
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
- # install all needed dependencies.
95
- #Pipfile.lock
96
-
97
- # poetry
98
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
- # This is especially recommended for binary packages to ensure reproducibility, and is more
100
- # commonly ignored for libraries.
101
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
- #poetry.lock
103
-
104
- # pdm
105
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
- #pdm.lock
107
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
- # in version control.
109
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
- .pdm.toml
111
- .pdm-python
112
- .pdm-build/
113
-
114
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
- __pypackages__/
116
-
117
- # Celery stuff
118
- celerybeat-schedule
119
- celerybeat.pid
120
 
121
- # SageMath parsed files
122
- *.sage.py
 
 
123
 
124
- # Environments
125
- .env
126
  .venv
127
  env/
128
  venv/
@@ -130,33 +27,73 @@ ENV/
130
  env.bak/
131
  venv.bak/
132
 
133
- # Spyder project settings
134
- .spyderproject
135
- .spyproject
 
 
136
 
137
- # Rope project settings
138
- .ropeproject
139
 
140
- # mkdocs documentation
141
- /site
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  # mypy
144
  .mypy_cache/
145
  .dmypy.json
146
  dmypy.json
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- # Pyre type checker
149
- .pyre/
150
 
151
- # pytype static type analyzer
152
- .pytype/
 
 
 
 
153
 
154
- # Cython debug symbols
155
- cython_debug/
 
156
 
157
- # PyCharm
158
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
- # and can be added to the global gitignore or merged into this file. For a more nuclear
161
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
- #.idea/
 
1
+ # cached files
2
  __pycache__/
3
  *.py[cod]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  .cache
5
+ .DS_Store
6
+ *~
7
+ .*.sw[po]
8
+ .build
9
+ .ve
10
+ .env
11
+ .pytest
12
+ .benchmarks
13
+ .bootstrap
14
+ .appveyor.token
15
+ *.bak
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ # installation package
18
+ *.egg-info/
19
+ dist/
20
+ build/
21
 
22
+ # environments
 
23
  .venv
24
  env/
25
  venv/
 
27
  env.bak/
28
  venv.bak/
29
 
30
+ # C extensions
31
+ *.so
32
+
33
+ # pycharm
34
+ .idea/
35
 
36
+ # vscode
37
+ *.code-workspace
38
 
39
+ # Packages
40
+ *.egg
41
+ *.egg-info
42
+ dist
43
+ build
44
+ eggs
45
+ .eggs
46
+ parts
47
+ bin
48
+ var
49
+ sdist
50
+ wheelhouse
51
+ develop-eggs
52
+ .installed.cfg
53
+ lib
54
+ lib64
55
+ venv*/
56
+ .venv*/
57
+ pyvenv*/
58
+ pip-wheel-metadata/
59
+ poetry.lock
60
+
61
+ # Installer logs
62
+ pip-log.txt
63
 
64
  # mypy
65
  .mypy_cache/
66
  .dmypy.json
67
  dmypy.json
68
+ mypy.ini
69
+
70
+ # test caches
71
+ .tox/
72
+ .pytest_cache/
73
+ .coverage
74
+ htmlcov
75
+ report.xml
76
+ nosetests.xml
77
+ coverage.xml
78
+
79
+ # Translations
80
+ *.mo
81
 
82
+ # Buildout
83
+ .mr.developer.cfg
84
 
85
+ # IDE project files
86
+ .project
87
+ .pydevproject
88
+ .idea
89
+ *.iml
90
+ *.komodoproject
91
 
92
+ # Complexity
93
+ output/*.html
94
+ output/*/index.html
95
 
96
+ # Sphinx
97
+ docs/_build
98
+ public/
99
+ web/
 
 
.pre-commit-config.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/PyCQA/bandit
3
+ rev: 1.7.8
4
+ hooks:
5
+ - id: bandit
6
+ args: [-r, -c, .bandit.yml]
7
+ - repo: https://github.com/PyCQA/flake8
8
+ rev: 7.0.0
9
+ hooks:
10
+ - id: flake8
11
+ - repo: https://github.com/pycqa/isort
12
+ rev: 5.13.2
13
+ hooks:
14
+ - id: isort
CONTRIBUTING.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to Scrapling
2
+ Everybody is invited and welcome to contribute to Scrapling. Smaller changes have a better chance to get included in a timely manner. Adding unit tests for new features or test cases for bugs you've fixed helps us to ensure that the Pull Request (PR) is fine.
3
+
4
+ There is a lot to do...
5
+ - If you are not a developer perhaps you would like to help with the [documentation](/docs)?
6
+ - If you are a developer, most of the features I'm planning to add in the future are moved to [roadmap file](/ROADMAP.md) so consider reading it.
7
+
8
+ Scrapling includes a comprehensive test suite which can be executed with pytest:
9
+ ```bash
10
+ $ pytest
11
+ =============================== test session starts ===============================
12
+ platform darwin -- Python 3.12.7, pytest-8.3.3, pluggy-1.5.0
13
+ rootdir: /<some_where>/Scrapling
14
+ configfile: pytest.ini
15
+ plugins: cov-5.0.0, anyio-4.6.0
16
+ collected 16 items
17
+
18
+ tests/test_all_functions.py ................ [100%]
19
+
20
+ =============================== 16 passed in 0.22s ================================
21
+ ```
22
+ Also, consider setting `debug` to `True` while initializing the Adaptor object so it's easier to know what's happening in the background.
23
+
24
+ ### The process is straight-forward.
25
+
26
+ - Read [How to get faster PR reviews](https://github.com/kubernetes/community/blob/master/contributors/guide/pull-requests.md#best-practices-for-faster-reviews) by Kubernetes (but skip step 0 and 1)
27
+ - Fork Scrapling [git repository](https://github.com/D4Vinci/Scrapling).
28
+ - Make your changes.
29
+ - Ensure tests work.
30
+ - Create a Pull Request against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling.
MANIFEST.in ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ include LICENSE
2
+ include scrapling/py.typed
3
+
4
+ recursive-exclude * __pycache__
5
+ recursive-exclude * *.py[co]
README.md ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
2
+ [![PyPI version](https://badge.fury.io/py/scrapling.svg)](https://badge.fury.io/py/scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![License](https://img.shields.io/badge/License-BSD--3-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
3
+
4
+ Dealing with failing web scrapers due to website changes? Meet Scrapling.
5
+
6
+ Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. Whether you're a beginner or an expert, Scrapling provides powerful features while maintaining simplicity.
7
+
8
+ ```python
9
+ from scrapling import Adaptor
10
+
11
+ # Scrape data that survives website changes
12
+ page = Adaptor(html, auto_match=True)
13
+ products = page.css('.product', auto_save=True)
14
+ # Later, even if selectors change:
15
+ products = page.css('.product', auto_match=True) # Still finds them!
16
+ ```
17
+
18
+ ## Key Features
19
+
20
+ ### Adaptive Scraping
21
+ - 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
22
+ - 🎯 **Flexible Querying**: Use CSS selectors, XPath, text search, or regex - chain them however you want!
23
+ - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you want on the page (Ex: other products like the product you found on the page).
24
+ - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using its powerful features.
25
+
26
+ ### Performance
27
+ - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup by up to 237x in our tests).
28
+ - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
29
+ - ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library with more options.
30
+
31
+ ### Developing Experience
32
+ - 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
33
+ - 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries with added methods.
34
+ - 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
35
+ - 🔌 **Scrapy-Compatible API**: Familiar methods and similar pseudo-elements for Scrapy users.
36
+ - 📘 **Type hints**: Complete type coverage for better IDE support and fewer bugs.
37
+
38
+ ## Getting Started
39
+
40
+ Let's walk through a basic example that demonstrates a small group of Scrapling's core features:
41
+
42
+ ```python
43
+ import requests
44
+ from scrapling import Adaptor
45
+
46
+ # Fetch a web page
47
+ url = 'https://quotes.toscrape.com/'
48
+ response = requests.get(url)
49
+
50
+ # Create an Adaptor instance
51
+ page = Adaptor(response.text, url=url)
52
+ # Get all strings in the full page
53
+ page.get_all_text(ignore_tags=('script', 'style'))
54
+
55
+ # Get all quotes, any of these methods will return a list of strings (TextHandlers)
56
+ quotes = page.css('.quote .text::text') # CSS selector
57
+ quotes = page.xpath('//span[@class="text"]/text()') # XPath
58
+ quotes = page.css('.quote').css('.text::text') # Chained selectors
59
+ quotes = [element.text for element in page.css('.quote').css('.text')] # Slower than bulk query above
60
+
61
+ # Get the first quote element
62
+ quote = page.css('.quote').first # or [0] or .get()
63
+
64
+ # Working with elements
65
+ quote.html_content # Inner HTML
66
+ quote.prettify() # Prettified version of Inner HTML
67
+ quote.attrib # Element attributes
68
+ quote.path # DOM path to element (List)
69
+ ```
70
+ To keep it simple, all methods can be chained on top of each other as long as you are chaining methods that return an element (It's called an `Adaptor` object) or a List of Adaptors (It's called `Adaptors` object)
71
+
72
+ ### Installation
73
+ Scrapling is a breeze to get started with - We only require at least Python 3.6 to work and the rest of the requirements are installed automatically with the package.
74
+ ```bash
75
+ # Using pip
76
+ pip install scrapling
77
+
78
+ # Or the latest from GitHub
79
+ pip install git+https://github.com/D4Vinci/Scrapling.git@master
80
+ ```
81
+
82
+ ## Performance
83
+
84
+ Scrapling isn't just powerful - it's also blazing fast. Scrapling implements many best practices, design patterns, and numerous optimizations to save fractions of seconds. All of that while focusing exclusively on parsing HTML documents.
85
+ Here are benchmarks comparing Scrapling to popular Python libraries in two tests.
86
+
87
+ ### Text Extraction Speed Test (5000 nested elements).
88
+
89
+ | # | Library | Time (ms) | vs Scrapling |
90
+ |---|:-----------------:|:---------:|:------------:|
91
+ | 1 | Scrapling | 5.44 | 1.0x |
92
+ | 2 | Parsel/Scrapy | 5.53 | 1.017x |
93
+ | 3 | Raw Lxml | 6.76 | 1.243x |
94
+ | 4 | PyQuery | 21.96 | 4.037x |
95
+ | 5 | Selectolax | 67.12 | 12.338x |
96
+ | 6 | BS4 with Lxml | 1307.03 | 240.263x |
97
+ | 7 | MechanicalSoup | 1322.64 | 243.132x |
98
+ | 8 | BS4 with html5lib | 3373.75 | 620.175x |
99
+
100
+ As you see, Scrapling is on par with Scrapy and slightly faster than Lxml which both libraries are built on top of. These are the closest results to Scrapling. PyQuery is also built on top of Lxml but still, Scrapling is 4 times faster.
101
+
102
+ ### Extraction By Text Speed Test
103
+
104
+ | Library | Time (ms) | vs Scrapling |
105
+ |:-----------:|:---------:|:------------:|
106
+ | Scrapling | 2.51 | 1.0x |
107
+ | AutoScraper | 11.41 | 4.546x |
108
+
109
+ Scrapling can find elements with more methods and it returns full element `Adaptor` objects not only the text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them. As you see, Scrapling is still 4.5 times faster at the same task.
110
+
111
+ > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](/benchmarks.py) for methodology and to run your comparisons.
112
+
113
+ ## Advanced Features
114
+ ### Smart Navigation
115
+ ```python
116
+ >>> quote.tag
117
+ 'div'
118
+
119
+ >>> quote.parent
120
+ <data='<div class="col-md-8"> <div class="quote...' parent='<div class="row"> <div class="col-md-8">...'>
121
+
122
+ >>> quote.parent.tag
123
+ 'div'
124
+
125
+ >>> quote.children
126
+ [<data='<span class="text" itemprop="text">“The...' parent='<div class="quote" itemscope itemtype="h...'>,
127
+ <data='<span>by <small class="author" itemprop=...' parent='<div class="quote" itemscope itemtype="h...'>,
128
+ <data='<div class="tags"> Tags: <meta class="ke...' parent='<div class="quote" itemscope itemtype="h...'>]
129
+
130
+ >>> quote.siblings
131
+ [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
132
+ <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
133
+ <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
134
+ ...]
135
+
136
+ >>> quote.next # gets the next element, the same logic applies to `quote.previous`
137
+ <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>
138
+
139
+ >>> quote.children.css(".author::text")
140
+ ['Albert Einstein']
141
+
142
+ >>> quote.has_class('quote')
143
+ True
144
+
145
+ # Generate new selectors for any element
146
+ >>> quote.css_selector
147
+ 'body > div > div:nth-of-type(2) > div > div'
148
+
149
+ # Test these selectors on your favorite browser or reuse them again in the library in other methods!
150
+ >>> quote.xpath_selector
151
+ '//body/div/div[2]/div/div'
152
+ ```
153
+ If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element like below
154
+ ```python
155
+ for ancestor in quote.iterancestors():
156
+ # do something with it...
157
+ ```
158
+ You can search for a specific ancestor of an element that satisfies a function, all you need to do is to pass a function that takes an `Adaptor` object as an argument and return `True` if the condition satisfies or `False` otherwise like below:
159
+ ```python
160
+ >>> quote.find_ancestor(lambda ancestor: ancestor.has_class('row'))
161
+ <data='<div class="row"> <div class="col-md-8">...' parent='<div class="container"> <div class="row...'>
162
+ ```
163
+
164
+ ### Content-based Selection & Finding Similar Elements
165
+ You can select elements by their text content in multiple ways, here's a full example on another website:
166
+ ```python
167
+ >>> response = requests.get('https://books.toscrape.com/index.html')
168
+
169
+ >>> page = Adaptor(response.text, url=response.url)
170
+
171
+ >>> page.find_by_text('Tipping the Velvet') # Find the first element that its text fully matches this text
172
+ <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
173
+
174
+ >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
175
+ [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
176
+
177
+ >>> page.find_by_regex(r'£[\d\.]+') # Get the first element that its text content matches my price regex
178
+ <data='<p class="price_color">£51.77</p>' parent='<div class="product_price"> <p class="pr...'>
179
+
180
+ >>> page.find_by_regex(r'£[\d\.]+', first_match=False) # Get all elements that matches my price regex
181
+ [<data='<p class="price_color">£51.77</p>' parent='<div class="product_price"> <p class="pr...'>,
182
+ <data='<p class="price_color">£53.74</p>' parent='<div class="product_price"> <p class="pr...'>,
183
+ <data='<p class="price_color">£50.10</p>' parent='<div class="product_price"> <p class="pr...'>,
184
+ <data='<p class="price_color">£47.82</p>' parent='<div class="product_price"> <p class="pr...'>,
185
+ ...]
186
+ ```
187
+ Find all elements that are similar to the current element in location and attributes
188
+ ```python
189
+ # For this case, ignore the 'title' attribute while matching
190
+ >>> page.find_by_text('Tipping the Velvet').find_similar(ignore_attributes=['title'])
191
+ [<data='<a href="catalogue/a-light-in-the-attic_...' parent='<h3><a href="catalogue/a-light-in-the-at...'>,
192
+ <data='<a href="catalogue/soumission_998/index....' parent='<h3><a href="catalogue/soumission_998/in...'>,
193
+ <data='<a href="catalogue/sharp-objects_997/ind...' parent='<h3><a href="catalogue/sharp-objects_997...'>,
194
+ ...]
195
+
196
+ # You will notice that the number of elements is 19 not 20 because the current element is not included.
197
+ >>> len(page.find_by_text('Tipping the Velvet').find_similar(ignore_attributes=['title']))
198
+ 19
199
+
200
+ # Get the `href` attribute from all similar elements
201
+ >>> [element.attrib['href'] for element in page.find_by_text('Tipping the Velvet').find_similar(ignore_attributes=['title'])]
202
+ ['catalogue/a-light-in-the-attic_1000/index.html',
203
+ 'catalogue/soumission_998/index.html',
204
+ 'catalogue/sharp-objects_997/index.html',
205
+ ...]
206
+ ```
207
+ To increase the complexity a little bit, let's say we want to get all books' data using that element as a starting point for some reason
208
+ ```python
209
+ >>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
210
+ print({
211
+ "name": product.css('h3 a::text')[0],
212
+ "price": product.css('.price_color')[0].re_first(r'[\d\.]+'),
213
+ "stock": product.css('.availability::text')[-1].clean()
214
+ })
215
+ {'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
216
+ {'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}
217
+ {'name': 'Sharp Objects', 'price': '47.82', 'stock': 'In stock'}
218
+ ...
219
+ ```
220
+ The [documentation](/docs/Examples) will provide more advanced examples.
221
+
222
+ ### Handling Structural Changes
223
+ > Because [the internet archive](https://web.archive.org/) is down at the time of writing this, I can't use real websites as examples even though I tested that before (I mean browsing an old version of a website and then counting the current version of the website as structural changes)
224
+
225
+ Let's say you are scraping a page with a structure like this:
226
+ ```html
227
+ <div class="container">
228
+ <section class="products">
229
+ <article class="product" id="p1">
230
+ <h3>Product 1</h3>
231
+ <p class="description">Description 1</p>
232
+ </article>
233
+ <article class="product" id="p2">
234
+ <h3>Product 2</h3>
235
+ <p class="description">Description 2</p>
236
+ </article>
237
+ </section>
238
+ </div>
239
+ ```
240
+ and you want to scrape the first product, the one with the `p1` ID. You will probably write a selector like this
241
+ ```python
242
+ page.css('#p1')
243
+ ```
244
+ When website owners implement structural changes like
245
+ ```html
246
+ <div class="new-container">
247
+ <div class="product-wrapper">
248
+ <section class="products">
249
+ <article class="product new-class" data-id="p1">
250
+ <div class="product-info">
251
+ <h3>Product 1</h3>
252
+ <p class="new-description">Description 1</p>
253
+ </div>
254
+ </article>
255
+ <article class="product new-class" data-id="p2">
256
+ <div class="product-info">
257
+ <h3>Product 2</h3>
258
+ <p class="new-description">Description 2</p>
259
+ </div>
260
+ </article>
261
+ </section>
262
+ </div>
263
+ </div>
264
+ ```
265
+ The selector will no longer function and your code needs maintenance. That's where Scrapling auto-matching feature comes into play.
266
+
267
+ ```python
268
+ # Before the change
269
+ page = Adaptor(page_source, url='example.com', auto_match=True)
270
+ element = page.css('#p1', auto_save=True)
271
+ if not element: # One day website changes?
272
+ element = page.css('#p1', auto_match=True) # Still finds it!
273
+ # the rest of the code...
274
+ ```
275
+ > How does the auto-matching work? Check the [FAQs](#FAQs) section for that and other possible issues while auto-matching.
276
+
277
+ **Notes:**
278
+ 1. Passing the `auto_save` argument without setting `auto_match` to `True` while initializing the Adaptor object will only result in ignoring the `auto_save` argument value and the following warning message
279
+ ```text
280
+ Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
281
+ ```
282
+ This behavior is purely for performance reasons so the database gets created/connected only when you are planning to use the auto-matching features. Same case with the `auto_match` argument.
283
+
284
+ 2. The `auto_match` parameter works only for `Adaptor` instances not `Adaptors` so if you do something like this you will get an error
285
+ ```python
286
+ page.css('body').css('#p1', auto_match=True)
287
+ ```
288
+ because you can't auto-match a whole list, you have to be specific and do something like
289
+ ```python
290
+ page.css('body')[0].css('#p1', auto_match=True)
291
+ ```
292
+
293
+ ### Is That All?
294
+ Here's what else you can do with Scrapling:
295
+
296
+ - Accessing the `lxml.etree` object itself of any element directly
297
+ ```python
298
+ >>> quote._root
299
+ <Element div at 0x107f98870>
300
+ ```
301
+ - Saving and retrieving elements manually to auto-match them outside the `css` and the `xpath` methods but you have to set the identifier by yourself.
302
+
303
+ - To save element to the database:
304
+ ```python
305
+ >>> element = page.find_by_text('Tipping the Velvet', first_match=True)
306
+ >>> page.save(element, 'my_special_element')
307
+ ```
308
+ - Now later when you want to retrieve it and relocate it in the page with auto-matching, it would be like this
309
+ ```python
310
+ >>> element_dict = page.retrieve('my_special_element')
311
+ >>> page.relocate(element_dict, adaptor_type=True)
312
+ [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
313
+ >>> page.relocate(element_dict, adaptor_type=True).css('::text')
314
+ ['Tipping the Velvet']
315
+ ```
316
+ - if you want to keep it as `lxml.etree` object, leave the `adaptor_type` argument
317
+ ```python
318
+ >>> page.relocate(element_dict)
319
+ [<Element a at 0x105a2a7b0>]
320
+ ```
321
+
322
+ - Doing operations on element content is the same as scrapy
323
+ ```python
324
+ quote.re(r'somethings') # Get all strings (TextHandlers) that match the regex pattern
325
+ quote.re_first(r'something') # Get the first string (TextHandler) only
326
+ quote.json() # If the content text is jsonable, then convert it to json using `orjson` which is 10x faster than the standard json library and provides more options
327
+ ```
328
+ In fact, all of these methods are methods of the `TextHandler` class that holds the text content, so the same can be done directly if you call the `.text` property or equivalent selector function.
329
+
330
+
331
+ - Doing operations on the text content itself includes
332
+ - Cleaning the text from any white spaces and replacing consecutive spaces with single space
333
+ ```python
334
+ quote.clean()
335
+ ```
336
+ - You already know about the regex matching and the fast json parsing but did you know that all strings returned from the regex search are actually `TextHandler` objects too? so in cases where you have for example a JS object assigned to a JS variable inside JS code and want to extract it with regex and then convert it to json object, in other libraries, these would be more than 1 line of code but here you can do it in 1 line like this
337
+ ```python
338
+ page.xpath('//script/text()').re_first(r'var dataLayer = (.+);').json()
339
+ ```
340
+ - Sort all characters in the string as if it were a list and return the new string
341
+ ```python
342
+ quote.sort()
343
+ ```
344
+ > To be clear, `TextHandler` is a sub-class of Python's `str` so all normal operations/methods that work with Python strings will work with it.
345
+
346
+ - Any element's attributes are not exactly a dictionary but a sub-class of [mapping](https://docs.python.org/3/glossary.html#term-mapping) called `AttributesHandler` that's read-only so it's faster and string values returned are actually `TextHandler` objects so all operations above can be done on them, standard dictionary operations that doesn't modify the data, and more :)
347
+ - Unlike standard dictionaries, here you can search by values too and can do partial searches. It might be handy in some cases (returns a generator of matches)
348
+ ```python
349
+ >>> for item in element.attrib.search_values('catalogue', partial=True):
350
+ print(item)
351
+ {'href': 'catalogue/tipping-the-velvet_999/index.html'}
352
+ ```
353
+ - Serialize the current attributes to JSON bytes:
354
+ ```python
355
+ >>> element.attrib.json_string
356
+ b'{"href":"catalogue/tipping-the-velvet_999/index.html","title":"Tipping the Velvet"}'
357
+ ```
358
+ - Converting it to a normal dictionary
359
+ ```python
360
+ >>> dict(element.attrib)
361
+ {'href': 'catalogue/tipping-the-velvet_999/index.html',
362
+ 'title': 'Tipping the Velvet'}
363
+ ```
364
+
365
+ Scrapling is under active development so expect many more features coming soon :)
366
+
367
+ ## More Advanced Usage
368
+
369
+ There are a lot of deep details skipped here to make this as short as possible so to take a deep dive, head to the [docs](/docs) section. I will try to keep it updated as possible and add complex examples. There I will explain points like how to write your storage system, write spiders that don't depend on selectors at all, and more...
370
+
371
+ Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
372
+
373
+
374
+ ## FAQs
375
+ This section addresses common questions about Scrapling, please read this section before opening an issue.
376
+
377
+ ### How does auto-matching work?
378
+ 1. You need to get a working selector and run it at least once with methods `css` or `xpath` with the `auto_save` parameter set to `True` before structural changes happen.
379
+ 2. Before returning results for you, Scrapling uses its configured database and saves unique properties about that element.
380
+ 3. Now because everything about the element can be changed or removed, nothing from the element can be used as a unique identifier for the database. To solve this issue, I made the storage system rely on two things:
381
+ 1. The domain of the URL you gave while initializing the first Adaptor object
382
+ 2. The `identifier` parameter you passed to the method while selecting. If you didn't pass one, then the selector string itself will be used as an identifier but remember you will have to use it as an identifier value later when the structure changes and you want to pass the new selector.
383
+
384
+ Together both are used to retrieve the element's unique properties from the database later.
385
+ 4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call. The element properties are retrieved and Scrapling loops over all elements in the page and compares each one's unique properties to the unique properties we already have for this element and a score is calculated for each one.
386
+ 5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
387
+ 6. The score for each element is stored in the table and in the end, the element(s) with the highest combined similarity scores are returned.
388
+
389
+ ### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
390
+ Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
391
+
392
+ ### If all things about an element can change or get removed, what are the unique properties to be saved?
393
+ For each element, Scrapling will extract:
394
+ - Element tag name, text, attributes (names and values), siblings (tag names only), and path (tag names only).
395
+ - Element's parent tag name, attributes (names and values), and text.
396
+
397
+ ### I have enabled the `auto_save`/`auto_match` parameter while selecting and it got completely ignored with a warning message
398
+ That's because passing the `auto_save`/`auto_match` argument without setting `auto_match` to `True` while initializing the Adaptor object will only result in ignoring the `auto_save`/`auto_match` argument value. This behavior is purely for performance reasons so the database gets created only when you are planning to use the auto-matching features.
399
+
400
+ ### I have done everything as the docs but the auto-matching didn't return anything, what's wrong?
401
+ It could be one of these reasons:
402
+ 1. No data were saved/stored for this element before.
403
+ 2. The selector passed is not the one used while storing element data. The solution is simple
404
+ - Pass the old selector again as an identifier to the method called.
405
+ - Retrieve the element with the retrieve method using the old selector as identifier then save it again with the save method and the new selector as identifier.
406
+ - Start using the identifier argument more often if you are planning to use every new selector from now on.
407
+ 3. The website had some extreme structural changes like a new full design. If this happens a lot with this website, the solution would be to make your code as selector-free as possible using Scrapling features.
408
+
409
+ ### Can Scrapling replace code built on top of BeautifulSoup4?
410
+ Pretty much yeah, almost all features you get from BeautifulSoup can be found or achieved in Scrapling one way or another. In fact, if you see there's a feature in bs4 that is missing in Scrapling, please make a feature request from the issues tab to let me know.
411
+
412
+ ### Can Scrapling replace code built on top of AutoScraper?
413
+ Of course, you can find elements by text/regex, find similar elements in a more reliable way than AutoScraper, and finally save/retrieve elements manually to use later as the model feature in AutoScraper. I have pulled all top articles about AutoScraper from Google and tested Scrapling against examples in them. In all examples, Scrapling got the same results as AutoScraper in much less time.
414
+
415
+ ### Is Scrapling thread-safe?
416
+ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
417
+
418
+ ## Contributing
419
+ Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
420
+
421
+ Please read the [contributing file](/CONTRIBUTING.md) before doing anything.
422
+
423
+ ## License
424
+ This work is licensed under BSD-3
425
+
426
+ ## Acknowledgments
427
+ This project includes code adapted from:
428
+ - Parsel (BSD License) - Used for [translator](/scrapling/translator.py) submodule
429
+
430
+ ## Known Issues
431
+ - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
432
+ - Currently, Scrapling is not compatible with async/await.
433
+
434
+ <div align="center"><small>Made with ❤️ by Karim Shoair</small></div><br>
ROADMAP.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## TODOs
2
+ - Add more tests and increase the code coverage.
3
+ - Structure the tests folder in a better way.
4
+ - Add more documentation.
5
+ - Add the browsing ability.
6
+ - Create detailed documentation for 'readthedocs' website, preferably add Github action for deploying it.
7
+ - Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
8
+ - Need to add more functionality to `AttributesHandler` and more navigation functions to `Adaptor` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)
9
+ - Add `.filter` method to `Adaptors` object and other similar methods.
10
+ - Add functionality to automatically detect pagination URLs
11
+ - Add the ability to auto-detect schemas in pages and manipulate them
12
+ - Add ability to generate a regex from a group of elements (Like for all href attributes)
13
+ -
benchmarks.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import timeit
3
+ import functools
4
+ import requests
5
+ from statistics import mean
6
+
7
+ from scrapling import Adaptor
8
+ from parsel import Selector
9
+ from lxml import etree, html
10
+ from bs4 import BeautifulSoup
11
+ from pyquery import PyQuery as pq
12
+ from autoscraper import AutoScraper
13
+ from selectolax.parser import HTMLParser
14
+ from mechanicalsoup import StatefulBrowser
15
+
16
+ large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
17
+
18
+
19
def benchmark(func):
    """Decorator that times *func* and returns its average runtime in milliseconds.

    The wrapped function is executed a few times as a warm-up, then measured
    with ``timeit.repeat`` (100 single-run repeats using CPU time via
    ``time.process_time``). The mean of those runs, converted to milliseconds,
    is printed and returned *instead of* the function's own result.

    :param func: zero-or-more-argument callable to benchmark; its name (minus
        a leading ``test_``) is used as the printed benchmark label.
    :return: the wrapping function.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        benchmark_name = func.__name__.replace('test_', '').replace('_', ' ')
        print(f"-> {benchmark_name}", end=" ", flush=True)
        # Warm-up phase so import/caching costs don't skew the measured runs.
        # Note: `globals=` is unnecessary when timing a callable, so it's dropped.
        timeit.repeat(lambda: func(*args, **kwargs), number=2, repeat=2)
        # Measure time (1 run, repeated 100 times, take the average)
        times = timeit.repeat(
            lambda: func(*args, **kwargs), number=1, repeat=100, timer=time.process_time
        )
        # FIX: this value is the mean, not the minimum -- renamed from `min_time`
        avg_time = round(mean(times) * 1000, 2)  # Convert to milliseconds
        print(f"average execution time: {avg_time} ms")
        return avg_time

    return wrapper
35
+
36
+
37
@benchmark
def test_lxml():
    """Benchmark raw lxml: select '.item' via cssselect and read each element's text."""
    # Scrapling and Parsel use this same parser internally, so configuring
    # lxml identically keeps the comparison fair.
    parser = html.HTMLParser(recover=True, huge_tree=True)
    tree = etree.fromstring(large_html, parser=parser)
    return [element.text for element in tree.cssselect('.item')]
46
+
47
+
48
@benchmark
def test_bs4_lxml():
    """Benchmark BeautifulSoup (lxml backend): select '.item' and collect text."""
    soup = BeautifulSoup(large_html, 'lxml')
    return [tag.text for tag in soup.select('.item')]
51
+
52
+
53
@benchmark
def test_bs4_html5lib():
    """Benchmark BeautifulSoup (html5lib backend): select '.item' and collect text."""
    soup = BeautifulSoup(large_html, 'html5lib')
    return [tag.text for tag in soup.select('.item')]
56
+
57
+
58
@benchmark
def test_pyquery():
    """Benchmark PyQuery: iterate the '.item' matches and read their text."""
    document = pq(large_html)
    return [item.text() for item in document('.item').items()]
61
+
62
+
63
@benchmark
def test_scrapling():
    """Benchmark Scrapling via the '::text' CSS pseudo-element.

    No need to call `.extract()` as with parsel to get text. This is also
    faster than looping over `.css('.item')` results and reading `.text`
    on each element, for obvious reasons.
    """
    page = Adaptor(large_html, auto_match=False, debug=False)
    return page.css('.item::text')
69
+
70
+
71
@benchmark
def test_parsel():
    """Benchmark Parsel (Scrapy's selector library) on the same '.item' query."""
    selector = Selector(text=large_html)
    return selector.css('.item::text').extract()
74
+
75
+
76
@benchmark
def test_mechanicalsoup():
    """Benchmark MechanicalSoup (BeautifulSoup under the hood) via a fake page."""
    browser = StatefulBrowser()
    browser.open_fake_page(large_html)
    soup_page = browser.page
    return [tag.text for tag in soup_page.select('.item')]
81
+
82
+
83
@benchmark
def test_selectolax():
    """Benchmark Selectolax's HTML parser on the '.item' selector."""
    tree = HTMLParser(large_html)
    return [match.text() for match in tree.css('.item')]
86
+
87
+
88
def display(results):
    """Pretty-print benchmark timings ranked from fastest to slowest.

    :param results: mapping of library name -> average time in ms; must
        contain a 'Scrapling' key, which is used as the comparison baseline.
    """
    baseline = results['Scrapling']
    ranked = sorted(results.items(), key=lambda item: item[1])  # fastest first
    print("\nRanked Results (fastest to slowest):")
    print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
    print('-' * 50)
    for rank, (name, elapsed) in enumerate(ranked, 1):
        ratio = round(elapsed / baseline, 3)
        print(f" {rank}. {name:<18} | {str(elapsed):<15} | {ratio}")
98
+
99
+
100
@benchmark
def test_scrapling_text(request_html):
    """Benchmark Scrapling's text search plus `find_similar` on a real page.

    The resulting elements are looped over to pull their text as well, to keep
    the comparison with AutoScraper fair -- otherwise Scrapling would be even
    faster.
    """
    anchor = Adaptor(request_html, auto_match=False, debug=False).find_by_text(
        'Tipping the Velvet', first_match=True
    )
    return [sibling.text for sibling in anchor.find_similar(ignore_attributes=['title'])]
108
+
109
+
110
@benchmark
def test_autoscraper(request_html):
    """Benchmark AutoScraper building a model for the same wanted text.

    AutoScraper returns the matched elements' text by default.
    """
    scraper = AutoScraper()
    return scraper.build(html=request_html, wanted_list=['Tipping the Velvet'])
114
+
115
+
116
+ if __name__ == "__main__":
117
+ print(' Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n')
118
+ results1 = {
119
+ "Raw Lxml": test_lxml(),
120
+ "Parsel/Scrapy": test_parsel(),
121
+ "Scrapling": test_scrapling(),
122
+ 'Selectolax': test_selectolax(),
123
+ "PyQuery": test_pyquery(),
124
+ "BS4 with Lxml": test_bs4_lxml(),
125
+ "MechanicalSoup": test_mechanicalsoup(),
126
+ "BS4 with html5lib": test_bs4_html5lib(),
127
+ }
128
+
129
+ display(results1)
130
+ print('\n' + "="*25)
131
+ req = requests.get('https://books.toscrape.com/index.html')
132
+ print(
133
+ ' Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n'
134
+ )
135
+ results2 = {
136
+ "Scrapling": test_scrapling_text(req.text),
137
+ "AutoScraper": test_autoscraper(req.text),
138
+ }
139
+ display(results2)
docs/Core/using scrapling custom types.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ > You can take advantage from the custom-made types for Scrapling and use it outside the library if you want. It's better than copying their code after all :)
2
+
3
+ ### All current types can be imported alone like below
4
+ ```python
5
+ >>> from scrapling import TextHandler, AttributesHandler
6
+
7
+ >>> somestring = TextHandler('{}')
8
+ >>> somestring.json()
9
+ {}
10
+ >>> somedict_1 = AttributesHandler({'a': 1})
11
+ >>> somedict_2 = AttributesHandler(a=1)
12
+ ```
13
+
14
+ Note `TextHandler` is a sub-class of Python's `str` so all normal operations/methods that work with Python strings will work.
15
+ If you want to check for the type in your code, it's better to depend on Python built-in function `issubclass`.
16
+
17
+ The class `AttributesHandler` is a sub-class of `collections.abc.Mapping` so it's immutable (read-only) and all operations are inherited from it. The data passed can be accessed later through the `._data` attribute but be careful: it's of type `types.MappingProxyType` so it's immutable (read-only) as well (faster than `collections.abc.Mapping` by fractions of seconds).
18
+
19
+ So basically to make it simple to you if you are new to Python, the same operations and methods from Python standard `dict` type will all work with class `AttributesHandler` except the ones that try to modify the actual data.
20
+
21
+ If you want to modify the data inside `AttributesHandler`, you have to convert it to dictionary first like with using the `dict` function and modify it outside.
docs/Examples/selectorless_stackoverflow.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ I only made this example to show how Scrapling features can be used to scrape a website without writing any selector
3
+ so this script doesn't depend on the website structure.
4
+ """
5
+
6
+ import requests
7
+ from scrapling import Adaptor
8
+
9
+ response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
10
+ page = Adaptor(response.text, url=response.url)
11
+ # First we will extract the first question title and its author based on the text content
12
+ first_question_title = page.find_by_text('Run Selenium Python Script on Remote Server')
13
+ first_question_author = page.find_by_text('Ryan')
14
+ # If you want you can extract other questions tags like below
15
+ first_question = first_question_title.find_ancestor(
16
+ lambda ancestor: ancestor.attrib.get('id') and 'question-summary' in ancestor.attrib.get('id')
17
+ )
18
+ rest_of_questions = first_question.find_similar()
19
+ # But since nothing to rely on to extract other titles/authors from these elements without CSS/XPath selectors due to the website nature
20
+ # We will get all the rest of the titles/authors in the page depending on the first title and the first author we got above as a starting point
21
+ for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
22
+ print(i, title.text, author.text)
23
+
docs/Extending Scrapling/writing storage system.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Scrapling by default is using SQLite but in case you want to write your storage system to store elements properties there for the auto-matching, this tutorial got you covered.
2
+
3
+ You might want to use FireBase for example and share the database between multiple spiders on different machines; it's a great idea to use an online database like that because this way the spiders will share data with each other.
4
+
5
+ So first to make your storage class work, it must do the big 3:
6
+ 1. Inherit from the abstract class `scrapling.storage_adaptors.StorageSystemMixin` and accept a string argument which will be the `url` argument to maintain the library logic.
7
+ 2. Use the decorator `functools.lru_cache` on top of the class itself to follow the Singleton design pattern as other classes.
8
+ 3. Implement methods `save` and `retrieve`, as you see from the type hints:
9
+ - The method `save` returns nothing and will get two arguments from the library
10
+ * The first one is of type `lxml.html.HtmlElement` which is the element itself, ofc. It must be converted to dictionary using the function `scrapling.utils._StorageTools.element_to_dict` so we keep the same format then saved to your database as you wish.
11
+ * The second one is string which is the identifier used for retrieval. The combination of this identifier and the `url` argument from initialization must be unique for each row or the auto-match will be messed up.
12
+ - The method `retrieve` takes a string which is the identifier; using it with the `url` passed on initialization, the element's dictionary is retrieved from the database and returned if it exists, otherwise it returns `None`
13
+ > If the instructions weren't clear enough for you, you can check my implementation using SQLite3 in [storage_adaptors](/scrapling/storage_adaptors.py) file
14
+
15
+ If your class satisfies this, the rest is easy. If you are planning to use the library in a threaded application, make sure that your class supports it. The default used class is thread-safe.
16
+
17
+ There are some helper functions added to the abstract class if you want to use it. It's easier to see it for yourself in the [code](/scrapling/storage_adaptors.py), it's heavily commented :)
docs/index.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # This section is still under work but any help is highly appreciated
2
+ ## I will try to make full detailed documentation with Sphinx ASAP.
pytest.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [pytest]
2
+ addopts = -p no:warnings --doctest-modules --ignore=setup.py
scrapling/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Declare top-level shortcuts
2
+ from scrapling.parser import Adaptor, Adaptors
3
+ from scrapling.custom_types import TextHandler, AttributesHandler
4
+
5
+ __author__ = "Karim Shoair (karim.shoair@pm.me)"
6
+ __version__ = "0.1"
7
+ __copyright__ = "Copyright (c) 2024 Karim Shoair"
8
+
9
+
10
+ __all__ = ['Adaptor', 'Adaptors', 'TextHandler', 'AttributesHandler']
scrapling/custom_types.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from types import MappingProxyType
3
+ from collections.abc import Mapping
4
+ from typing import Dict, List, Union, Pattern
5
+
6
+ from scrapling.utils import _is_iterable, flatten
7
+
8
+ from orjson import loads, dumps
9
+ from w3lib.html import replace_entities as _replace_entities
10
+
11
+
12
class TextHandler(str):
    """Extends standard Python string by adding more functionality"""
    __slots__ = ()

    def __new__(cls, string):
        # Because str is immutable and we can't override __init__.
        # FIX: use isinstance instead of `type(...) is str` so str subclasses
        # (including TextHandler itself) keep their content instead of
        # silently collapsing to an empty string.
        if isinstance(string, str):
            return super().__new__(cls, string)
        else:
            return super().__new__(cls, '')

    def sort(self, reverse: bool = False) -> str:
        """Return a sorted version of the string"""
        return self.__class__("".join(sorted(self, reverse=reverse)))

    def clean(self) -> str:
        """Return a new version of the string after removing all white spaces and consecutive spaces"""
        # FIX: the old pattern r'[\t|\r|\n]' also stripped literal '|'
        # characters -- '|' inside a character class is not alternation.
        data = re.sub(r'[\t\r\n]', '', self)
        data = re.sub(' +', ' ', data)
        return self.__class__(data.strip())

    def json(self) -> Dict:
        """Return json response if the response is jsonable otherwise throw error"""
        # Using __str__ function as a workaround for orjson issue with subclasses of str
        # Check this out: https://github.com/ijl/orjson/issues/445
        return loads(self.__str__())

    def re(
        self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
        case_sensitive: bool = False, check_match: bool = False
    ) -> Union[List[str], bool]:
        """Apply the given regex to the current text and return a list of strings with the matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if True, the pattern is compiled WITH re.IGNORECASE.
            NOTE(review): the flag name is inverted relative to its effect; the
            behavior is kept as-is for backward compatibility with callers.
        :param check_match: used to quickly check if this regex matches or not without any operations on the results

        """
        if isinstance(regex, str):
            if not case_sensitive:
                regex = re.compile(regex, re.UNICODE)
            else:
                regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)

        input_text = self.clean() if clean_match else self
        results = regex.findall(input_text)
        if check_match:
            return bool(results)

        # Patterns with capture groups make findall return tuples; flatten them
        # so the caller always gets a flat list of strings.
        if all(_is_iterable(res) for res in results):
            results = flatten(results)

        if not replace_entities:
            return [TextHandler(string) for string in results]

        return [TextHandler(_replace_entities(s)) for s in results]

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
                 clean_match: bool = False, case_sensitive: bool = False,):
        """Apply the given regex to text and return the first match if found, otherwise return the default value.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if True, the pattern is compiled with re.IGNORECASE
            (see the note on :meth:`re` -- the name is historical/inverted)

        """
        result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
        return result[0] if result else default
84
+
85
+
86
class AttributesHandler(Mapping):
    """Immutable mapping for element attributes.

    Behaves like a read-only ``dict`` (backed by ``MappingProxyType`` for the
    speed boost) while adding extras such as value search and JSON
    serialization. Convert with ``dict()`` if a mutable standard dictionary
    is needed.
    """
    __slots__ = ('_data',)

    def __init__(self, mapping=None, **kwargs):
        # Merge the positional mapping (if any) with the keyword arguments,
        # wrapping every plain-str value in TextHandler on the way in.
        combined = dict(mapping) if mapping is not None else {}
        combined.update(kwargs)

        # Fastest read-only mapping type
        self._data = MappingProxyType({
            key: TextHandler(value) if type(value) is str else value
            for key, value in combined.items()
        })

    def get(self, key, default=None):
        """Acts like standard dictionary `.get()` method"""
        return self._data.get(key, default)

    def search_values(self, keyword, partial=False):
        """Yield a one-item AttributesHandler for every attribute whose value matches.

        :param keyword: the value (or substring, when ``partial`` is True) to look for
        :param partial: match by containment instead of strict equality
        """
        for name, value in self._data.items():
            hit = (keyword in value) if partial else (keyword == value)
            if hit:
                yield AttributesHandler({name: value})

    @property
    def json_string(self):
        """Serialize current attributes to JSON bytes; raises if they aren't JSON-serializable."""
        return dumps(dict(self._data))

    def __getitem__(self, key):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._data})"

    def __str__(self):
        return str(self._data)

    def __contains__(self, key):
        return key in self._data
scrapling/mixins.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
class SelectorsGeneration:
    """Selectors generation functions.

    Tries to generate selectors the way Firefox does — or maybe cleaner ones!?
    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""

    def __general_selection(self, selection: str = 'css') -> str:
        """Generate a selector for the current element.
        :param selection: Either 'css' or 'xpath' to choose the output syntax.
        :return: A string of the generated selector.
        """
        use_css = selection.lower() == 'css'
        path_parts = []

        def _joined(xpath_prefix: str) -> str:
            # Assemble the collected parts from root to target in the requested syntax
            ordered = list(reversed(path_parts))
            return " > ".join(ordered) if use_css else xpath_prefix + "/".join(ordered)

        node = self
        while node is not None:
            if not node.parent:
                break

            unique_id = node.attrib.get('id')
            if unique_id:
                # An `id` attribute should be unique on the page, so it's enough on its own
                path_parts.append(f'#{unique_id}' if use_css else f"[@id='{unique_id}']")
                return _joined('//*')

            piece = f'{node.tag}'
            # Classes are intentionally NOT used here: some websites share the
            # exact same classes between different elements.
            tag_counts = {}
            for sibling in node.parent.children:
                # Count same-tag siblings up to (and including) the current node
                tag_counts[sibling.tag] = tag_counts.get(sibling.tag, 0) + 1
                if sibling._root == node._root:
                    break

            if tag_counts[node.tag] > 1:
                # Disambiguate among same-tag siblings by position
                piece += (
                    f":nth-of-type({tag_counts[node.tag]})" if use_css
                    else f"[{tag_counts[node.tag]}]"
                )

            path_parts.append(piece)
            node = node.parent
            if node is None or node.tag == 'html':
                return _joined('//')

        return _joined('//')

    @property
    def css_selector(self) -> str:
        """Generate a CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection()

    @property
    def xpath_selector(self) -> str:
        """Generate a XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection('xpath')
scrapling/parser.py ADDED
@@ -0,0 +1,903 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from difflib import SequenceMatcher
3
+ from typing import Any, Dict, List, Tuple, Optional, Pattern, SupportsIndex, Union, Callable, Generator
4
+
5
+ from scrapling.translator import HTMLTranslator
6
+ from scrapling.mixins import SelectorsGeneration
7
+ from scrapling.custom_types import TextHandler, AttributesHandler
8
+ from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
9
+ from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
10
+
11
+ from lxml import etree, html
12
+ from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
13
+
14
+
15
+ class Adaptor(SelectorsGeneration):
16
+ __slots__ = (
17
+ 'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
18
+ '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
19
+ )
20
+
21
+ def __init__(
22
+ self,
23
+ text: Optional[str] = None,
24
+ url: Optional[str] = None,
25
+ body: bytes = b"",
26
+ encoding: str = "utf8",
27
+ huge_tree: bool = True,
28
+ root: Optional[html.HtmlElement] = None,
29
+ keep_comments: Optional[bool] = False,
30
+ auto_match: Optional[bool] = False,
31
+ storage: Any = SQLiteStorageSystem,
32
+ storage_args: Optional[Dict] = None,
33
+ debug: Optional[bool] = True,
34
+ ):
35
+ """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
36
+ with expressions in CSS, XPath, or with simply text. Check the docs for more info.
37
+
38
+ Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface, We are not
39
+ inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable which makes a lot of reference jobs
40
+ not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
41
+ It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
42
+
43
+ :param text: HTML body passed as text.
44
+ :param url: allows storing a URL with the html data for retrieving later.
45
+ :param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
46
+ :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
47
+ :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
48
+ libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
49
+ :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
50
+ Don't use it unless you know what you are doing!
51
+ :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
52
+ :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
53
+ priority over all auto-match related arguments/functions in the class.
54
+ :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
55
+ :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
56
+ If empty, default values will be used.
57
+ :param debug: Enable debug mode
58
+ """
59
+ if root is None and not body and text is None:
60
+ raise ValueError("Adaptor class needs text, body, or root arguments to work")
61
+
62
+ if root is None:
63
+ if text is None:
64
+ if not body or not isinstance(body, bytes):
65
+ raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
66
+
67
+ body = body.replace(b"\x00", b"").strip()
68
+ else:
69
+ if not isinstance(text, str):
70
+ raise TypeError(f"text argument must be of type str, got {text.__class__}")
71
+
72
+ body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
73
+
74
+ parser = html.HTMLParser(
75
+ # https://lxml.de/api/lxml.etree.HTMLParser-class.html
76
+ recover=True, remove_blank_text=True, remove_comments=(keep_comments is True), encoding=encoding,
77
+ compact=True, huge_tree=huge_tree, default_doctype=True
78
+ )
79
+ self._root = etree.fromstring(body, parser=parser, base_url=url)
80
+
81
+ else:
82
+ # All html types inherits from HtmlMixin so this to check for all at once
83
+ if not issubclass(type(root), html.HtmlMixin):
84
+ raise TypeError(
85
+ f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
86
+ )
87
+
88
+ self._root = root
89
+
90
+ setup_basic_logging(level='debug' if debug else 'info')
91
+ self.__auto_match_enabled = auto_match
92
+
93
+ if self.__auto_match_enabled:
94
+ if not storage_args:
95
+ storage_args = {
96
+ 'storage_file': os.path.join(os.path.dirname(__file__), 'elements_storage.db'),
97
+ 'url': url
98
+ }
99
+
100
+ if not hasattr(storage, '__wrapped__'):
101
+ raise ValueError("Storage class must be wrapped with cache decorator, see docs for info")
102
+
103
+ if not issubclass(storage.__wrapped__, StorageSystemMixin):
104
+ raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
105
+
106
+ self._storage = storage(**storage_args)
107
+
108
+ self.__keep_comments = keep_comments
109
+ self.__huge_tree_enabled = huge_tree
110
+ self.encoding = encoding
111
+ self.url = url
112
+ # For selector stuff
113
+ self.__attributes = None
114
+ self.__text = None
115
+ self.__tag = None
116
+ self.__debug = debug
117
+
118
+ # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
119
+ @staticmethod
120
+ def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
121
+ """Return True if given element is a result of a string expression
122
+ Examples:
123
+ Xpath -> '/text()', '/@attribute' etc...
124
+ CSS3 -> '::text', '::attr(attrib)'...
125
+ """
126
+ # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
127
+ return issubclass(type(element), etree._ElementUnicodeResult)
128
+
129
+ def __get_correct_result(
130
+ self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
131
+ ) -> Union[TextHandler, html.HtmlElement, 'Adaptor', str]:
132
+ """Used internally in all functions to convert results to type (Adaptor|Adaptors) when possible"""
133
+ if self._is_text_node(element):
134
+ # etree._ElementUnicodeResult basically inherit from `str` so it's fine
135
+ return TextHandler(str(element))
136
+ else:
137
+ if issubclass(type(element), html.HtmlMixin):
138
+ return self.__class__(
139
+ root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
140
+ keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
141
+ )
142
+ return element
143
+
144
+ def __convert_results(
145
+ self, result: Union[List[html.HtmlElement], html.HtmlElement]
146
+ ) -> Union['Adaptors[Adaptor]', 'Adaptor', List, None]:
147
+ """Used internally in all functions to convert results to type (Adaptor|Adaptors) in bulk when possible"""
148
+ if result is None:
149
+ return None
150
+ elif result == []: # Lxml will give a warning if I used something like `not result`
151
+ return []
152
+
153
+ if isinstance(result, Adaptors):
154
+ return result
155
+
156
+ if type(result) is list:
157
+ results = [self.__get_correct_result(n) for n in result]
158
+ if all(isinstance(res, self.__class__) for res in results):
159
+ return Adaptors(results)
160
+ return results
161
+
162
+ return self.__get_correct_result(result)
163
+
164
    def __getstate__(self) -> Any:
        # lxml proxies can't survive pickling (see class docstring), so forbid it explicitly
        raise TypeError("Can't pickle Adaptor objects")
167
+
168
+ # The following four properties I made them into functions instead of variables directly
169
+ # So they don't slow down the process of initializing many instances of the class and gets executed only
170
+ # when the user need them for the first time for that specific element and gets cached for next times
171
+ # Doing that only made the library performance test sky rocked multiple times faster than before
172
+ # because I was executing them on initialization before :))
173
+ @property
174
+ def tag(self) -> str:
175
+ """Get tag name of the element"""
176
+ if not self.__tag:
177
+ self.__tag = self._root.tag
178
+ return self.__tag
179
+
180
+ @property
181
+ def text(self) -> TextHandler:
182
+ """Get text content of the element"""
183
+ if not self.__text:
184
+ self.__text = TextHandler(self._root.text)
185
+ return self.__text
186
+
187
+ def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
188
+ """Get all child strings of this element, concatenated using the given separator.
189
+
190
+ :param separator: Strings will be concatenated using this separator.
191
+ :param strip: If True, strings will be stripped before being concatenated.
192
+ :param ignore_tags: A tuple of all tag names you want to ignore
193
+ :param valid_values: If enabled, elements with text-content that is empty or only whitespaces will be ignored
194
+
195
+ :return: A TextHandler
196
+ """
197
+ _all_strings = []
198
+
199
+ def _traverse(node: html.HtmlElement) -> None:
200
+ """Traverse element children and get text content of each
201
+
202
+ :param node: Current node in the tree structure
203
+ :return:
204
+ """
205
+ if node.tag not in ignore_tags:
206
+ text = node.text
207
+ if text and type(text) is str:
208
+ if valid_values:
209
+ if text.strip():
210
+ _all_strings.append(text if not strip else text.strip())
211
+ else:
212
+ _all_strings.append(text if not strip else text.strip())
213
+
214
+ for branch in node.iterchildren():
215
+ _traverse(branch)
216
+
217
+ # We will start using Lxml directly for the speed boost
218
+ _traverse(self._root)
219
+
220
+ return TextHandler(separator.join([s for s in _all_strings]))
221
+
222
+ @property
223
+ def attrib(self) -> AttributesHandler:
224
+ """Get attributes of the element"""
225
+ if not self.__attributes:
226
+ self.__attributes = AttributesHandler(self._root.attrib)
227
+ return self.__attributes
228
+
229
    @property
    def html_content(self) -> str:
        """Return the inner html code of the element"""
        return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)

    # Convenience alias so the markup can also be read as `element.body`
    body = html_content
235
+
236
    def prettify(self) -> str:
        """Return a prettified version of the element's inner html-code"""
        # Same serialization as `html_content`, with `pretty_print` enabled
        return etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False)
239
+
240
+ def has_class(self, class_name: str) -> bool:
241
+ """Check if element has a specific class
242
+ :param class_name: The class name to check for
243
+ :return: True if element has class with that name otherwise False
244
+ """
245
+ return class_name in self._root.classes
246
+
247
+ @property
248
+ def parent(self) -> Union['Adaptor', None]:
249
+ """Return the direct parent of the element or ``None`` otherwise"""
250
+ return self.__convert_results(self._root.getparent())
251
+
252
+ @property
253
+ def children(self) -> Union['Adaptors[Adaptor]', List]:
254
+ """Return the children elements of the current element or empty list otherwise"""
255
+ return self.__convert_results(list(
256
+ child for child in self._root.iterchildren() if type(child) not in html_forbidden
257
+ ))
258
+
259
+ @property
260
+ def siblings(self) -> Union['Adaptors[Adaptor]', List]:
261
+ """Return other children of the current element's parent or empty list otherwise"""
262
+ if self.parent:
263
+ return Adaptors([child for child in self.parent.children if child._root != self._root])
264
+ return []
265
+
266
+ def iterancestors(self) -> Generator['Adaptor', None, None]:
267
+ """Return a generator that loops over all ancestors of the element, starting with element's parent."""
268
+ for ancestor in self._root.iterancestors():
269
+ yield self.__convert_results(ancestor)
270
+
271
+ def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
272
+ """Loop over all ancestors of the element till one match the passed function
273
+ :param func: A function that takes each ancestor as an argument and returns True/False
274
+ :return: The first ancestor that match the function or ``None`` otherwise.
275
+ """
276
+ for ancestor in self.iterancestors():
277
+ if func(ancestor):
278
+ return ancestor
279
+ return None
280
+
281
+ @property
282
+ def path(self) -> 'Adaptors[Adaptor]':
283
+ """Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
284
+ lst = list(self.iterancestors())
285
+ return Adaptors(lst)
286
+
287
+ @property
288
+ def next(self) -> Union['Adaptor', None]:
289
+ """Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
290
+ next_element = self._root.getnext()
291
+ if next_element is not None:
292
+ while type(next_element) in html_forbidden:
293
+ # Ignore html comments and unwanted types
294
+ next_element = next_element.getnext()
295
+
296
+ return self.__convert_results(next_element)
297
+
298
+ @property
299
+ def previous(self) -> Union['Adaptor', None]:
300
+ """Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
301
+ prev_element = self._root.getprevious()
302
+ if prev_element is not None:
303
+ while type(prev_element) in html_forbidden:
304
+ # Ignore html comments and unwanted types
305
+ prev_element = prev_element.getprevious()
306
+
307
+ return self.__convert_results(prev_element)
308
+
309
    def __str__(self) -> str:
        # The string form of an element is its full inner HTML markup
        return self.html_content
311
+
312
+ def __repr__(self) -> str:
313
+ length_limit = 40
314
+ data = "<"
315
+ content = clean_spaces(self.html_content)
316
+ if len(content) > length_limit:
317
+ content = content[:length_limit].strip() + '...'
318
+ data += f"data='{content}'"
319
+
320
+ if self.parent:
321
+ parent_content = clean_spaces(self.parent.html_content)
322
+ if len(parent_content) > length_limit:
323
+ parent_content = parent_content[:length_limit].strip() + '...'
324
+
325
+ data += f" parent='{parent_content}'"
326
+
327
+ return data + ">"
328
+
329
+ # From here we start the selecting functions
330
    def relocate(
            self, element: Union[Dict, html.HtmlElement, 'Adaptor'], percentage: int = 0, adaptor_type: bool = False
    ) -> Union[List[Union[html.HtmlElement, None]], 'Adaptors']:
        """This function will search again for the element in the page tree, used automatically on page structure change

        :param element: The element we want to relocate in the tree
        :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
            calculation depends solely on the page structure so don't play with this number unless you must know
            what you are doing!
        :param adaptor_type: If True, the return result will be converted to `Adaptors` object
        :return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
        """
        # Maps similarity score -> list of candidate nodes that scored it
        score_table = {}
        # Note: `element` will be most likely always be a dictionary at this point.
        if isinstance(element, self.__class__):
            element = element._root

        if issubclass(type(element), html.HtmlElement):
            element = _StorageTools.element_to_dict(element)

        # TODO: Optimize the traverse logic a bit, maybe later
        def _traverse(node: html.HtmlElement, ele: Dict) -> None:
            """Get the matching score of the given element against the node then traverse the children

            :param node: Current node in the tree structure
            :param ele: The element we are searching for as dictionary
            :return:
            """
            # Hence: the code doesn't stop even if the score was 100%
            # because there might be another element(s) left in page with the same score
            score = self.__calculate_similarity_score(ele, node)
            score_table.setdefault(score, []).append(node)
            for branch in node.iterchildren():
                _traverse(branch, ele)

        # This will block until we traverse all children/branches
        _traverse(self._root, element)

        if score_table:
            highest_probability = max(score_table.keys())
            # Only accept the best match if it clears the caller's threshold
            if score_table[highest_probability] and highest_probability >= percentage:
                logging.debug(f'Highest probability was {highest_probability}%')
                logging.debug('Top 5 best matching elements are: ')
                for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
                    logging.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
                if not adaptor_type:
                    return score_table[highest_probability]
                return self.__convert_results(score_table[highest_probability])
        return []
379
+
380
    def css(self, selector: str, identifier: str = '',
            auto_match: bool = False, auto_save: bool = False, percentage: int = 0
            ) -> Union['Adaptors[Adaptor]', List]:
        """Search current tree with CSS3 selectors

        **Important:
        It's recommended to use the identifier argument if you plan to use different selector later
        and want to relocate the same element(s)**

        :param selector: The CSS3 selector to be used.
        :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
        :param identifier: A string that will be used to save/retrieve element's data in auto-matching
            otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
            Be aware that the percentage calculation depends solely on the page structure so don't play with this
            number unless you must know what you are doing!

        :return: List as :class:`Adaptors`
        """
        try:
            if not self.__auto_match_enabled:
                # No need to split selectors in this case, let's save some CPU cycles :)
                xpath_selector = HTMLTranslator().css_to_xpath(selector)
                return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)

            results = []
            if ',' in selector:
                # Each comma-combined sub-selector is translated and saved separately
                # so the `save` function stores data correctly for combined selectors
                # that point to different elements.
                for single_selector in split_selectors(selector):
                    xpath_selector = HTMLTranslator().css_to_xpath(single_selector.canonical())
                    results += self.xpath(
                        xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                    )
            else:
                xpath_selector = HTMLTranslator().css_to_xpath(selector)
                return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)

            return self.__convert_results(results)
        except (SelectorError, SelectorSyntaxError,):
            # Re-raise with the offending selector for easier debugging
            raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
422
+
423
+ def xpath(self, selector: str, identifier: str = '',
424
+ auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
425
+ ) -> Union['Adaptors[Adaptor]', List]:
426
+ """Search current tree with XPath selectors
427
+
428
+ **Important:
429
+ It's recommended to use the identifier argument if you plan to use different selector later
430
+ and want to relocate the same element(s)**
431
+
432
+ Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
433
+
434
+ :param selector: The XPath selector to be used.
435
+ :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
436
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching
437
+ otherwise the selector will be used.
438
+ :param auto_save: Automatically save new elements for `auto_match` later
439
+ :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
440
+ Be aware that the percentage calculation depends solely on the page structure so don't play with this
441
+ number unless you must know what you are doing!
442
+
443
+ :return: List as :class:`Adaptors`
444
+ """
445
+ try:
446
+ selected_elements = self._root.xpath(selector, **kwargs)
447
+
448
+ if selected_elements:
449
+ if not self.__auto_match_enabled and auto_save:
450
+ logging.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
451
+
452
+ elif self.__auto_match_enabled and auto_save:
453
+ self.save(selected_elements[0], identifier or selector)
454
+
455
+ return self.__convert_results(selected_elements)
456
+ else:
457
+ if self.__auto_match_enabled and auto_match:
458
+ element_data = self.retrieve(identifier or selector)
459
+ if element_data:
460
+ relocated = self.relocate(element_data, percentage)
461
+ if relocated is not None and auto_save:
462
+ self.save(relocated[0], identifier or selector)
463
+
464
+ return self.__convert_results(relocated)
465
+ else:
466
+ return self.__convert_results(selected_elements)
467
+
468
+ elif not self.__auto_match_enabled and auto_match:
469
+ logging.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
470
+
471
+ return self.__convert_results(selected_elements)
472
+
473
+ except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
474
+ raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
475
+
476
    def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
        """Used internally to calculate a score that shows how candidate element similar to the original one

        :param original: The original element in the form of the dictionary generated from `element_to_dict` function
        :param candidate: The element to compare with the original element.
        :return: A percentage score of how similar is the candidate to the original element
        """
        # `score` accumulates per-test similarity, `checks` counts the tests run,
        # so the final result is an average expressed as a percentage
        score, checks = 0, 0
        candidate = _StorageTools.element_to_dict(candidate)

        # Possible TODO:
        # Study the idea of giving weight to each test below so some are more important than others
        # Current results: With weights some websites had better score while it was worse for others
        score += 1 if original['tag'] == candidate['tag'] else 0  # * 0.3  # 30%
        checks += 1

        if original['text']:
            score += SequenceMatcher(None, original['text'], candidate.get('text') or '').ratio()  # * 0.3  # 30%
            checks += 1

        # if both doesn't have attributes, it still count for something!
        score += self.__calculate_dict_diff(original['attributes'], candidate['attributes'])  # * 0.3  # 30%
        checks += 1

        # Separate similarity test for class, id, href,... this will help in full structural changes
        for attrib in ('class', 'id', 'href', 'src',):
            if original['attributes'].get(attrib):
                score += SequenceMatcher(
                    None, original['attributes'][attrib], candidate['attributes'].get(attrib) or ''
                ).ratio()  # * 0.3  # 30%
                checks += 1

        score += SequenceMatcher(None, original['path'], candidate['path']).ratio()  # * 0.1  # 10%
        checks += 1

        if original.get('parent_name'):
            # Then we start comparing parents' data
            if candidate.get('parent_name'):
                score += SequenceMatcher(
                    None, original['parent_name'], candidate.get('parent_name') or ''
                ).ratio()  # * 0.2  # 20%
                checks += 1

                score += self.__calculate_dict_diff(
                    original['parent_attribs'], candidate.get('parent_attribs') or {}
                )  # * 0.2  # 20%
                checks += 1

                if original['parent_text']:
                    score += SequenceMatcher(
                        None, original['parent_text'], candidate.get('parent_text') or ''
                    ).ratio()  # * 0.1  # 10%
                    checks += 1
            # else:
            #     # The original element have a parent and this one not, this is not a good sign
            #     score -= 0.1

        if original.get('siblings'):
            score += SequenceMatcher(
                None, original['siblings'], candidate.get('siblings') or []
            ).ratio()  # * 0.1  # 10%
            checks += 1

        # How % sure? let's see
        return round((score / checks) * 100, 2)
541
+
542
+ @staticmethod
543
+ def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
544
+ """Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries
545
+ """
546
+ score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
547
+ score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
548
+ return score
549
+
550
+ def save(self, element: Union['Adaptor', html.HtmlElement], identifier: str) -> None:
551
+ """Saves the element's unique properties to the storage for retrieval and relocation later
552
+
553
+ :param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
554
+ :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
555
+ the docs for more info.
556
+ """
557
+ if self.__auto_match_enabled:
558
+ if isinstance(element, self.__class__):
559
+ element = element._root
560
+
561
+ if self._is_text_node(element):
562
+ element = element.getparent()
563
+
564
+ self._storage.save(element, identifier)
565
+ else:
566
+ logging.critical(
567
+ "Can't use Auto-match features with disabled globally, you have to start a new class instance."
568
+ )
569
+
570
+ def retrieve(self, identifier: str) -> Optional[Dict]:
571
+ """Using the identifier, we search the storage and return the unique properties of the element
572
+
573
+ :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
574
+ the docs for more info.
575
+ :return: A dictionary of the unique properties
576
+ """
577
+ if self.__auto_match_enabled:
578
+ return self._storage.retrieve(identifier)
579
+
580
+ logging.critical(
581
+ "Can't use Auto-match features with disabled globally, you have to start a new class instance."
582
+ )
583
+
584
+ # Operations on text functions
585
    def json(self) -> Dict:
        """Return json response if the response is jsonable otherwise throws error"""
        # Delegates to `TextHandler.json()` on this element's text content
        return self.text.json()
588
+
589
    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
        """Apply the given regex to the current text and return a list of strings with the matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        """
        # Thin delegation to `TextHandler.re` on this element's text content
        return self.text.re(regex, replace_entities)
596
+
597
    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
        """Apply the given regex to text and return the first match if found, otherwise return the default value.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        """
        # Thin delegation to `TextHandler.re_first` on this element's text content
        return self.text.re_first(regex, default, replace_entities)
606
+
607
    def find_similar(
            self,
            similarity_threshold: float = 0.2,
            ignore_attributes: Union[List, Tuple] = ('href', 'src',),
            match_text: bool = False
    ) -> Union['Adaptors[Adaptor]', List]:
        """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
        then return the ones that match the current element attributes with percentage higher than the input threshold.

        This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
        a products-list container and want to find other products using that element as a starting point EXCEPT
        this function works in any case without depending on the element type.

        :param similarity_threshold: The percentage to use while comparing elements attributes.
            Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
            same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
            extremely unlucky, then attributes matching comes into play, so basically don't play with this number unless
            you are getting results you don't want.
            Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.
        :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in last step.
            The default value is to ignore `href` and `src` as URLs can change a lot between elements so it's unreliable
        :param match_text: If True, elements text content will be taken into calculation while matching.
            Not recommended to use in normal cases but it depends.

        :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
        """
        def get_attributes(element: html.HtmlElement) -> Dict:
            """Return attributes dictionary without the ignored list"""
            return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}

        def are_alike(original: html.HtmlElement, original_attributes: Dict, candidate: html.HtmlElement) -> bool:
            """Calculate a score of how much these elements are alike and return True
            if score is higher or equal the threshold"""
            candidate_attributes = get_attributes(candidate) if ignore_attributes else candidate.attrib
            score, checks = 0, 0

            if original_attributes:
                # Fuzzy-compare each original attribute value against the candidate's
                # value for the same attribute name (missing attribute compares as '').
                score += sum(
                    SequenceMatcher(None, v, candidate_attributes.get(k, '')).ratio()
                    for k, v in original_attributes.items()
                )
                # NOTE(review): the score sums over the ORIGINAL's attributes but is
                # normalized by the CANDIDATE's attribute count — this is asymmetric when
                # the two elements have different numbers of attributes; confirm intended.
                checks += len(candidate_attributes)
            else:
                if not candidate_attributes:
                    # Both doesn't have attributes, this must mean something
                    score += 1
                    checks += 1

            if match_text:
                # Text similarity contributes one extra normalized check.
                score += SequenceMatcher(
                    None, clean_spaces(original.text or ''), clean_spaces(candidate.text or '')
                ).ratio()
                checks += 1

            if checks:
                # Average the per-check ratios and compare against the threshold.
                return round(score / checks, 2) >= similarity_threshold
            return False

        # We will use the elements root from now on to get the speed boost of using Lxml directly
        root = self._root
        # Depth = number of ancestors; candidates must sit at the exact same depth.
        current_depth = len(list(root.iterancestors()))
        target_attrs = get_attributes(root) if ignore_attributes else root.attrib
        similar_elements = list()
        # + root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth-1}]")
        parent = root.getparent()
        if parent is not None:
            grandparent = parent.getparent()  # lol
            if grandparent is not None:
                # Constrain candidates by the grandparent/parent/self tag chain plus depth.
                potential_matches = root.xpath(
                    f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
                )
            else:
                # No grandparent: constrain by parent/self tag chain plus depth.
                potential_matches = root.xpath(f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]")
        else:
            # Root-level element: only the tag name and depth can be used.
            potential_matches = root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth}]")

        # Keep every structural candidate (excluding the element itself) whose
        # attributes/text pass the similarity threshold.
        for potential_match in potential_matches:
            if potential_match != root and are_alike(root, target_attrs, potential_match):
                similar_elements.append(potential_match)

        return self.__convert_results(similar_elements)
688
+
689
    def find_by_text(
            self, text: str, first_match: bool = True, partial: bool = False,
            case_sensitive: bool = False, clean_match: bool = True
    ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
        """Find elements whose text content fully/partially matches the input.

        :param text: Text query to match
        :param first_match: Return first element that matches conditions, enabled by default
        :param partial: If enabled, function return elements that contains the input text
        :param case_sensitive: if enabled, letters case will be taken into consideration
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        """

        results = []
        if not case_sensitive:
            # Normalize the query once; node text is lowered inside the traversal.
            text = text.lower()

        def _traverse(node: Adaptor) -> None:
            """Check if element matches given text otherwise, traverse the children tree and iterate"""
            node_text = node.text
            # if there's already no text in this node, dodge it to save CPU cycles and time
            if node_text:
                if clean_match:
                    # Normalize whitespace the same way the query is expected to be written.
                    node_text = node_text.clean()

                if not case_sensitive:
                    node_text = node_text.lower()

                if partial:
                    if text in node_text:
                        results.append(node)
                elif text == node_text:
                    results.append(node)

            if results and first_match:
                # we got an element so we should stop
                # (recursive calls already queued by ancestors still run, but they
                # bail out immediately at this same check before matching anything)
                return

            # Depth-first descent into the element's children.
            for branch in node.children:
                _traverse(branch)

        # This will block until we traverse all children/branches
        _traverse(self)

        if first_match:
            if results:
                return results[0]
        # No first-match requested (or nothing found): return all hits as Adaptors.
        return self.__convert_results(results)
736
+
737
    def find_by_regex(
            self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
    ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
        """Find elements whose text content matches the input regex pattern.

        :param query: Regex query to match
        :param first_match: Return first element that matches conditions, enabled by default
        :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        """
        results = []

        def _traverse(node: Adaptor) -> None:
            """Check if element matches given regex otherwise, traverse the children tree and iterate"""
            node_text = node.text
            # if there's already no text in this node, dodge it to save CPU cycles and time
            if node_text:
                # The TextHandler applies cleaning/case options and returns a boolean
                # when `check_match=True` instead of the matches themselves.
                if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
                    results.append(node)

            if results and first_match:
                # we got an element so we should stop
                return

            # Depth-first descent into the element's children.
            for branch in node.children:
                _traverse(branch)

        # This will block until we traverse all children/branches
        _traverse(self)

        if results and first_match:
            return results[0]
        return self.__convert_results(results)
769
+
770
+
771
class Adaptors(List[Adaptor]):
    """
    The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
    """
    __slots__ = ()

    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors[Adaptor]"]:
        # Slicing returns another Adaptors so chained calls keep working;
        # single-index access returns the Adaptor itself.
        lst = super().__getitem__(pos)
        if isinstance(pos, slice):
            return self.__class__(lst)
        else:
            return lst

    def xpath(
            self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
    ) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.xpath()`` method for each element in this list and return
        their results as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use different selector later
        and want to relocate the same element(s)**

        Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

        :param selector: The XPath selector to be used.
        :param identifier: A string that will be used to retrieve element's data in auto-matching
            otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
            Be aware that the percentage calculation depends solely on the page structure so don't play with this
            number unless you must know what you are doing!

        :return: List as :class:`Adaptors`
        """
        # Third positional argument (auto_match) is forced to False here; the
        # identifier defaults to the selector when not supplied.
        results = [
            n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self
        ]
        # Each element returns a list, so flatten one level into a single Adaptors.
        return self.__class__(flatten(results))

    def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.css()`` method for each element in this list and return
        their results flattened as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use different selector later
        and want to relocate the same element(s)**

        :param selector: The CSS3 selector to be used.
        :param identifier: A string that will be used to retrieve element's data in auto-matching
            otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
            Be aware that the percentage calculation depends solely on the page structure so don't play with this
            number unless you must know what you are doing!

        :return: List as :class:`Adaptors`
        """
        # Same contract as `xpath` above, minus the XPath-variable kwargs.
        results = [
            n.css(selector, identifier or selector, False, auto_save, percentage) for n in self
        ]
        return self.__class__(flatten(results))

    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
        """Call the ``.re()`` method for each element in this list and return
        their results flattened as List of TextHandler.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        """
        results = [
            n.text.re(regex, replace_entities) for n in self
        ]
        return flatten(results)

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
        """Call the ``.re_first()`` method for each element in this list and return
        their results flattened as List of TextHandler.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        """
        # NOTE(review): each item in `results` is a single per-element first match
        # (not a list); if TextHandler.re_first returns a plain string, `flatten`
        # will iterate it character-by-character — verify the intended return type.
        results = [
            n.text.re_first(regex, default, replace_entities) for n in self
        ]
        return flatten(results)

    # def __getattr__(self, name):
    #     if name in dir(self.__class__):
    #         return super().__getattribute__(name)
    #
    #     # Execute the method itself on each Adaptor
    #     results = []
    #     for item in self:
    #         results.append(getattr(item, name))
    #
    #     if all(callable(r) for r in results):
    #         def call_all(*args, **kwargs):
    #             final_results = [r(*args, **kwargs) for r in results]
    #             if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
    #                 return self.__class__(final_results)
    #             return final_results
    #
    #         return call_all
    #     else:
    #         # Flatten the result if it's a single-item list containing a list
    #         if len(self) == 1 and isinstance(results[0], list):
    #             return self.__class__(results[0])
    #         return self.__class__(results)

    def get(self, default=None):
        """Returns the first item of the current list

        :param default: the default value to return if the current list is empty
        """
        return self[0] if len(self) > 0 else default

    @property
    def first(self):
        """Returns the first item of the current list or `None` if the list is empty"""
        return self.get()

    @property
    def last(self):
        """Returns the last item of the current list or `None` if the list is empty"""
        return self[-1] if len(self) > 0 else None

    def __getstate__(self) -> Any:
        # lxml don't like it :)
        # (underlying lxml elements are not picklable, so refuse pickling outright)
        raise TypeError("Can't pickle Adaptors object")
scrapling/py.typed ADDED
File without changes
scrapling/storage_adaptors.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import orjson
2
+ import sqlite3
3
+ import logging
4
+ import threading
5
+ from hashlib import sha256
6
+ from abc import ABC, abstractmethod
7
+ from typing import Dict, Optional, Union
8
+
9
+ from scrapling.utils import _StorageTools, cache
10
+
11
+ from lxml import html
12
+ from tldextract import extract as tld
13
+
14
+
15
class StorageSystemMixin(ABC):
    """Abstract base for storage backends used by the auto-match feature.

    If you want to make your own storage system, you have to inherit from this
    class and implement both :meth:`save` and :meth:`retrieve`.
    """
    def __init__(self, url: Union[str, None] = None):
        """
        :param url: URL of the website we are working on to separate it from other websites data
        """
        self.url = url

    # NOTE(review): `lru_cache` on an instance method keys on `self` and keeps the
    # instance alive for the cache's lifetime; acceptable here since instances are
    # few and long-lived, but worth revisiting.
    @cache
    def _get_base_url(self, default_value: str = 'default') -> str:
        """Return the registered domain of ``self.url`` (e.g. "example.com"),
        or `default_value` when the URL is missing/invalid.
        Used to namespace stored elements per website."""
        if not self.url or not isinstance(self.url, str):
            return default_value

        try:
            extracted = tld(self.url)
            # Prefer the full registered domain, then the bare domain part.
            return extracted.registered_domain or extracted.domain or default_value
        except AttributeError:
            return default_value

    @abstractmethod
    def save(self, element: html.HtmlElement, identifier: str) -> None:
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        raise NotImplementedError('Storage system must implement `save` method')

    @abstractmethod
    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        # Fixed: the message previously said `save`, which made failures misleading.
        raise NotImplementedError('Storage system must implement `retrieve` method')

    @staticmethod
    @cache
    def _get_hash(identifier: str) -> str:
        """If you want to hash identifier in your storage system, use this safer"""
        identifier = identifier.lower().strip()
        if isinstance(identifier, str):
            # Hash functions have to take bytes
            identifier = identifier.encode('utf-8')

        hash_value = sha256(identifier).hexdigest()
        return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
65
+
66
+
67
# NOTE(review): caching the class itself makes construction return the same instance
# (and thus the same DB connection) for identical (storage_file, url) arguments —
# effectively a per-argument singleton.
@cache(None, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use, it's race condition safe and thread safe.
    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
    > It's optimized for threaded applications but running it without threads shouldn't make it slow."""
    def __init__(self, storage_file: str, url: Union[str, None] = None):
        """
        :param storage_file: File to be used to store elements
        :param url: URL of the website we are working on to separate it from other websites data

        """
        super().__init__(url)
        self.storage_file = storage_file
        # We use a threading.Lock to ensure thread-safety instead of relying on thread-local storage.
        self.lock = threading.Lock()
        # >SQLite default mode in earlier version is 1 not 2 (1=thread-safe 2=serialized)
        # `check_same_thread=False` to allow it to be used across different threads.
        self.connection = sqlite3.connect(self.storage_file, check_same_thread=False)
        # WAL (Write-Ahead Logging) allows for better concurrency.
        self.connection.execute("PRAGMA journal_mode=WAL")
        self.cursor = self.connection.cursor()
        self._setup_database()
        logging.debug(
            f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
        )

    def _setup_database(self) -> None:
        # (url, identifier) is UNIQUE, so `save` can use INSERT OR REPLACE to upsert.
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS storage (
                id INTEGER PRIMARY KEY,
                url TEXT,
                identifier TEXT,
                element_data TEXT,
                UNIQUE (url, identifier)
            )
        """)
        self.connection.commit()

    def save(self, element: html.HtmlElement, identifier: str):
        """Saves the elements unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        # Namespace by website so identical identifiers on different sites don't collide.
        url = self._get_base_url()
        element_data = _StorageTools.element_to_dict(element)
        with self.lock:
            self.cursor.execute("""
                INSERT OR REPLACE INTO storage (url, identifier, element_data)
                VALUES (?, ?, ?)
            """, (url, identifier, orjson.dumps(element_data)))
            # Drain the cursor before committing.
            self.cursor.fetchall()
            self.connection.commit()

    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        url = self._get_base_url()
        with self.lock:
            self.cursor.execute(
                "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
                (url, identifier)
            )
            result = self.cursor.fetchone()
            if result:
                # Stored as JSON bytes by `save`; decode back to a dict.
                return orjson.loads(result[0])
        return None

    def close(self):
        """Close all connections, will be useful when with some things like scrapy Spider.closed() function/signal"""
        with self.lock:
            self.connection.commit()
            self.cursor.close()
            self.connection.close()

    def __del__(self):
        """To ensure all connections are closed when the object is destroyed."""
        # NOTE(review): if `close()` was already called explicitly, calling it again
        # here will raise (commit on a closed connection) during garbage collection —
        # consider guarding with a `closed` flag.
        self.close()
scrapling/translator.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
3
+ To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
4
+ which will be important in future releases but most importantly...
5
+ so you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
6
+ > if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
7
+ """
8
+
9
+ import re
10
+
11
+ from w3lib.html import HTML5_WHITESPACE
12
+ from typing import TYPE_CHECKING, Any, Optional, Protocol
13
+
14
+ from scrapling.utils import cache
15
+
16
+ from cssselect.xpath import ExpressionError
17
+ from cssselect.xpath import XPathExpr as OriginalXPathExpr
18
+ from cssselect import HTMLTranslator as OriginalHTMLTranslator
19
+ from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
20
+
21
+ if TYPE_CHECKING:
22
+ # typing.Self requires Python 3.11
23
+ from typing_extensions import Self
24
+
25
+
26
+ regex = f"[{HTML5_WHITESPACE}]+"
27
+ replace_html5_whitespaces = re.compile(regex).sub
28
+
29
+
30
class XPathExpr(OriginalXPathExpr):
    """XPath expression that also carries pseudo-element state.

    Tracks whether the originating CSS selector ended with ``::text`` or
    ``::attr(NAME)`` so :meth:`__str__` can append the matching XPath suffix.
    """

    # Set by `from_xpath` when the CSS selector used a pseudo-element.
    textnode: bool = False
    attribute: Optional[str] = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: Optional[str] = None,
    ) -> "Self":
        """Copy a plain cssselect expression and attach pseudo-element flags to it."""
        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        x.textnode = textnode
        x.attribute = attribute
        return x

    def __str__(self) -> str:
        path = super().__str__()
        if self.textnode:
            # Rewrite the trailing node test so the expression selects text nodes.
            if path == "*":
                path = "text()"
            elif path.endswith("::*/*"):
                path = path[:-3] + "text()"
            else:
                path += "/text()"

        if self.attribute is not None:
            # Rewrite the trailing node test so the expression selects the attribute.
            if path.endswith("::*/*"):
                path = path[:-2]
            path += f"/@{self.attribute}"

        return path

    def join(
        self: "Self",
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> "Self":
        """Join with another expression; the right-hand side's pseudo-element flags win."""
        if not isinstance(other, XPathExpr):
            # Fixed typo in the error message: "can ony join" -> "can only join".
            raise ValueError(
                f"Expressions of type {__name__}.XPathExpr can only join expressions"
                f" of the same type (or its descendants), got {type(other)}"
            )
        super().join(combiner, other, *args, **kwargs)
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
80
+
81
+
82
# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
class TranslatorProtocol(Protocol):
    """Structural type of the cssselect translator that `TranslatorMixin` expects
    to be mixed into (anything providing these two methods qualifies)."""

    def xpath_element(self, selector: Element) -> OriginalXPathExpr:
        # Provided by the concrete cssselect translator.
        pass

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:
        # Provided by the concrete cssselect translator.
        pass
89
+
90
+
91
class TranslatorMixin:
    """This mixin adds support to CSS pseudo elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        # https://github.com/python/mypy/issues/12344
        # Wrap the base translator's result in our XPathExpr so pseudo-element
        # flags can be attached later.
        xpath = super().xpath_element(selector)  # type: ignore[safe-super]
        return XPathExpr.from_xpath(xpath)

    def xpath_pseudo_element(
        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
    ) -> OriginalXPathExpr:
        """
        Dispatch method that transforms XPath to support pseudo-elements.

        Looks up a handler method by naming convention:
        ``xpath_<name>_functional_pseudo_element`` for ``::name(...)`` forms and
        ``xpath_<name>_simple_pseudo_element`` for plain ``::name`` forms.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            # e.g. ::attr(href) -> xpath_attr_functional_pseudo_element
            method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
            method = getattr(self, method_name, None)
            if not method:
                raise ExpressionError(
                    f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
                )
            xpath = method(xpath, pseudo_element)
        else:
            # e.g. ::text -> xpath_text_simple_pseudo_element
            method_name = (
                f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
            )
            method = getattr(self, method_name, None)
            if not method:
                raise ExpressionError(
                    f"The pseudo-element ::{pseudo_element} is unknown"
                )
            xpath = method(xpath)
        return xpath

    @staticmethod
    def xpath_attr_functional_pseudo_element(
        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
    ) -> XPathExpr:
        """Support selecting attribute values using ::attr() pseudo-element"""
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
            )
        return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Support selecting text nodes using ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)
143
+
144
+
145
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    """cssselect HTML translator with the ``::text``/``::attr()`` pseudo-elements
    from `TranslatorMixin`, plus memoized CSS-to-XPath conversion."""

    # NOTE(review): `lru_cache` on an instance method keys on `self` and keeps the
    # translator instance alive; fine for a long-lived singleton translator, but
    # worth confirming that's how this class is used.
    @cache(maxsize=256)
    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        # Pure function of (css, prefix), so caching is safe and saves re-parsing.
        return super().css_to_xpath(css, prefix)
scrapling/utils.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import logging
4
+ from itertools import chain
5
+ from logging import handlers
6
+ # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
7
+ from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
8
+
9
+ from typing import Dict, Iterable, Any
10
+
11
+ from lxml import html
12
+ html_forbidden = {html.HtmlComment, }
13
+ logging.basicConfig(
14
+ level=logging.ERROR,
15
+ format='%(asctime)s - %(levelname)s - %(message)s',
16
+ handlers=[
17
+ logging.StreamHandler()
18
+ ]
19
+ )
20
+
21
+
22
@cache(None, typed=True)
def setup_basic_logging(level: str = 'debug'):
    """Configure the root logger with a stream handler at the requested level.

    :param level: One of 'debug', 'info', 'warning', 'error' or 'critical'
        (case-insensitive).
    :raises KeyError: If an unknown level name is passed.

    Note: the function is cached, so repeated calls with the same argument are no-ops.
    """
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
    lvl = levels[level.lower()]
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    # Bug fix: `logging.basicConfig` is a no-op when the root logger already has
    # handlers — and this module installs one via `basicConfig` at import time —
    # so the previous implementation never changed anything. Configure the root
    # logger explicitly so the requested level and format actually take effect.
    root_logger = logging.getLogger()
    for old_handler in root_logger.handlers[:]:
        root_logger.removeHandler(old_handler)
    root_logger.addHandler(handler)
    root_logger.setLevel(lvl)
37
+
38
+
39
def flatten(lst: Iterable):
    """Flatten one level of nesting: return a single list with the items of every sub-iterable."""
    flattened = []
    for sub_iterable in lst:
        flattened.extend(sub_iterable)
    return flattened
41
+
42
+
43
+ def _is_iterable(s: Any):
44
+ # This will be used only in regex functions to make sure it's iterable but not string/bytes
45
+ return isinstance(s, (list, tuple,))
46
+
47
+
48
# Caching the class makes construction return the same instance for identical
# arguments — a lightweight singleton.
@cache(None, typed=True)
class _Logger(object):
    # I will leave this class here for now in case I decide I want to come back to use it :)
    __slots__ = ('console_logger', 'logger_file_path',)
    # Mapping of accepted level names to their stdlib logging constants.
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
        """Build a console logger, plus a rotating file logger when level is 'debug'.

        :param filename: Log file name (stored under `<package>/logs/`).
        :param level: One of the keys in `levels` (case-insensitive); raises KeyError otherwise.
        :param when: Rotation interval passed to TimedRotatingFileHandler.
        :param backcount: How many rotated files to keep before the oldest is deleted.
        """
        os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
        format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")

        # on-screen output
        lvl = self.levels[level.lower()]
        self.console_logger = logging.getLogger('Scrapling')
        self.console_logger.setLevel(lvl)
        console_handler = logging.StreamHandler()
        console_handler.setLevel(lvl)
        console_handler.setFormatter(format_str)
        self.console_logger.addHandler(console_handler)

        # File logging is only enabled at debug level.
        if lvl == logging.DEBUG:
            filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
            self.logger_file_path = filename
            # Automatically generates the logging file at specified intervals
            file_handler = handlers.TimedRotatingFileHandler(
                # If more than (backcount+1) existed, oldest logs will be deleted
                filename=filename, when=when, backupCount=backcount, encoding='utf-8'
            )
            file_handler.setLevel(lvl)
            file_handler.setFormatter(format_str)
            # This for the logger when it appends the date to the new log
            file_handler.namer = lambda name: name.replace(".log", "") + ".log"
            self.console_logger.addHandler(file_handler)
            self.debug(f'Debug log path: {self.logger_file_path}')
        else:
            self.logger_file_path = None

    # Thin wrappers over the underlying logger, one per level.
    def debug(self, message: str) -> None:
        self.console_logger.debug(message)

    def info(self, message: str) -> None:
        self.console_logger.info(message)

    def warning(self, message: str) -> None:
        self.console_logger.warning(message)

    def error(self, message: str) -> None:
        self.console_logger.error(message)

    def critical(self, message: str) -> None:
        self.console_logger.critical(message)
104
+
105
+
106
class _StorageTools:
    """Helpers to serialize an lxml element's distinguishing properties into a dict
    for the auto-match storage systems."""

    @staticmethod
    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
        # Keep only attributes with non-empty (after stripping) values that are not forbidden.
        if not element.attrib:
            return {}
        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

    @classmethod
    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
        """Capture the element's tag, attributes, text, tree path, and (when present)
        parent/sibling/children info — everything auto-matching relies on later."""
        parent = element.getparent()
        result = {
            'tag': str(element.tag),
            'attributes': cls.__clean_attributes(element),
            'text': element.text.strip() if element.text else None,
            'path': cls._get_element_path(element)
        }
        if parent is not None:
            result.update({
                'parent_name': parent.tag,
                'parent_attribs': dict(parent.attrib),
                'parent_text': parent.text.strip() if parent.text else None
            })

            # Sibling tag names (excluding the element itself) add structural context.
            siblings = [child.tag for child in parent.iterchildren() if child != element]
            if siblings:
                result.update({'siblings': tuple(siblings)})

        # Skip comment nodes and the like (see module-level `html_forbidden`).
        children = [child.tag for child in element.iterchildren() if type(child) not in html_forbidden]
        if children:
            result.update({'children': tuple(children)})

        return result

    @classmethod
    def _get_element_path(cls, element: html.HtmlElement):
        """Return the chain of tag names from the document root down to `element`,
        e.g. ('html', 'body', 'div', 'a'), built recursively."""
        parent = element.getparent()
        return tuple(
            (element.tag,) if parent is None else (
                cls._get_element_path(parent) + (element.tag,)
            )
        )
147
+
148
+
149
+ # def _root_type_verifier(method):
150
+ # # Just to make sure we are safe
151
+ # @wraps(method)
152
+ # def _impl(self, *args, **kw):
153
+ # # All html types inherits from HtmlMixin so this to check for all at once
154
+ # if not issubclass(type(self._root), html.HtmlMixin):
155
+ # raise ValueError(f"Cannot use function on a Node of type {type(self._root)!r}")
156
+ # return method(self, *args, **kw)
157
+ # return _impl
158
+
159
+
160
@cache
def clean_spaces(string):
    """Normalize whitespace: tabs become spaces, newlines/carriage returns are
    removed, and runs of spaces collapse to a single space.

    Cached because the same text fragments get cleaned repeatedly while matching.
    """
    string = string.replace('\t', ' ')
    # Bug fix: the original pattern '[\n|\r]' put a literal '|' inside the character
    # class, silently stripping pipe characters from the text as well.
    string = re.sub('[\n\r]', '', string)
    return re.sub(' +', ' ', string)
setup.cfg ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [metadata]
2
+ name = scrapling
3
+ version = 0.1
4
+ author = Karim Shoair
5
+ author_email = karim.shoair@pm.me
6
+ description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.
7
+ license = BSD
8
+ home-page = https://github.com/D4Vinci/Scrapling
setup.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup

# The README doubles as the PyPI long description.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()


setup(
    name="scrapling",
    version="0.1",
    # The core-metadata "Summary" field must be a single line; the previous
    # multi-line triple-quoted string produces invalid metadata that
    # packaging tools reject or mangle on upload.
    description=(
        "Scrapling is a powerful, flexible, and high-performance web scraping "
        "library for Python. It simplifies the process of extracting data from "
        "websites, even when they undergo structural changes, and offers "
        "impressive speed improvements over many popular scraping tools."
    ),
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Karim Shoair",
    author_email="karim.shoair@pm.me",
    license="BSD",
    packages=["scrapling",],
    zip_safe=False,
    package_dir={
        "scrapling": "scrapling",
    },
    include_package_data=True,
    classifiers=[
        "Operating System :: OS Independent",
        "Development Status :: 4 - Beta ",
        # "Development Status :: 5 - Production/Stable",
        # "Development Status :: 6 - Mature",
        # "Development Status :: 7 - Inactive",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Natural Language :: English",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Text Processing :: Markup",
        "Topic :: Text Processing :: Markup :: HTML",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: Implementation :: CPython",
        "Typing :: Typed",
    ],
    # Instead of using requirements file to dodge possible errors from tox?
    install_requires=[
        "requests>=2.3",
        "lxml>=4.5",
        "cssselect>=1.2",
        "w3lib",
        "orjson>=3",
        "tldextract",
    ],
    # NOTE(review): the library appears to use functools.cache, which is
    # Python 3.9+ — confirm whether ">=3.6" (and the 3.6–3.8 classifiers
    # above) are actually supported before releasing.
    python_requires=">=3.6",
    url="https://github.com/D4Vinci/Scrapling",
    project_urls={
        "Documentation": "https://github.com/D4Vinci/Scrapling/Docs",  # For now
        "Source": "https://github.com/D4Vinci/Scrapling",
        "Tracker": "https://github.com/D4Vinci/Scrapling/issues",
    }
)
tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Package for test project."""
tests/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pytest
2
+ pytest-cov
tests/test_all_functions.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pickle
3
+ import unittest
4
+ from scrapling import Adaptor
5
+ from cssselect import SelectorError, SelectorSyntaxError
6
+
7
+
8
class TestParser(unittest.TestCase):
    """End-to-end tests for the Adaptor parser.

    Covers CSS/XPath selection, text search, element navigation, selector
    generation, JSON extraction, attribute utilities, auto-match relocation
    after DOM changes, and a parsing-speed smoke test — all against the
    fixed HTML fixture built in ``setUp``.
    """

    def setUp(self):
        # Fixture page: a nav header, a product list (one item out of
        # stock), two reviews, a JSON-bearing attribute on #products, and a
        # JSON <script> block — exercising most selection paths below.
        self.html = '''
        <html>
        <head>
            <title>Complex Web Page</title>
            <style>
                .hidden { display: none; }
            </style>
        </head>
        <body>
            <header>
                <nav>
                    <ul>
                        <li><a href="#home">Home</a></li>
                        <li><a href="#about">About</a></li>
                        <li><a href="#contact">Contact</a></li>
                    </ul>
                </nav>
            </header>
            <main>
                <section id="products" schema='{"jsonable": "data"}'>
                    <h2>Products</h2>
                    <div class="product-list">
                        <article class="product" data-id="1">
                            <h3>Product 1</h3>
                            <p class="description">This is product 1</p>
                            <span class="price">$10.99</span>
                            <div class="hidden stock">In stock: 5</div>
                        </article>
                        <article class="product" data-id="2">
                            <h3>Product 2</h3>
                            <p class="description">This is product 2</p>
                            <span class="price">$20.99</span>
                            <div class="hidden stock">In stock: 3</div>
                        </article>
                        <article class="product" data-id="3">
                            <h3>Product 3</h3>
                            <p class="description">This is product 3</p>
                            <span class="price">$15.99</span>
                            <div class="hidden stock">Out of stock</div>
                        </article>
                    </div>
                </section>
                <section id="reviews">
                    <h2>Customer Reviews</h2>
                    <div class="review-list">
                        <div class="review" data-rating="5">
                            <p class="review-text">Great product!</p>
                            <span class="reviewer">John Doe</span>
                        </div>
                        <div class="review" data-rating="4">
                            <p class="review-text">Good value for money.</p>
                            <span class="reviewer">Jane Smith</span>
                        </div>
                    </div>
                </section>
            </main>
            <footer>
                <p>&copy; 2024 Our Company</p>
            </footer>
            <script id="page-data" type="application/json">
                {"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
            </script>
        </body>
        </html>
        '''
        self.page = Adaptor(self.html, auto_match=False, debug=False)

    def test_css_selector(self):
        """Test selecting elements with complex CSS selectors."""
        elements = self.page.css('main #products .product-list article.product')
        self.assertEqual(len(elements), 3)

        # :contains() is a non-standard extension; two of the three
        # products are in stock in the fixture.
        in_stock_products = self.page.css(
            'main #products .product-list article.product:not(:contains("Out of stock"))')
        self.assertEqual(len(in_stock_products), 2)

    def test_xpath_selector(self):
        """Test selecting elements with complex XPath selectors."""
        reviews = self.page.xpath(
            '//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
        )
        self.assertEqual(len(reviews), 2)

        # Strip the leading "$" and thousands separators before the numeric
        # comparison; $20.99 and $15.99 qualify.
        high_priced_products = self.page.xpath(
            '//article[contains(@class, "product")]'
            '[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
        )
        self.assertEqual(len(high_priced_products), 2)

    def test_find_by_text(self):
        """Test selecting elements by text matching (regex and literal)."""
        stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=False)
        self.assertEqual(len(stock_info), 2)

        stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
        self.assertEqual(stock_info.text, 'In stock: 5')

        stock_info = self.page.find_by_text(r'In stock:', partial=True, first_match=False)
        self.assertEqual(len(stock_info), 2)

        out_of_stock = self.page.find_by_text('Out of stock', partial=False, first_match=False)
        self.assertEqual(len(out_of_stock), 1)

    def test_find_similar_elements(self):
        """Test finding elements structurally similar to a given element."""
        first_product = self.page.css('.product')[0]
        similar_products = first_product.find_similar()
        # find_similar() excludes the element itself, so 2 of 3 remain.
        self.assertEqual(len(similar_products), 2)

        first_review = self.page.css('.review')[0]
        similar_high_rated_reviews = [
            review
            for review in first_review.find_similar()
            if int(review.attrib.get('data-rating', 0)) >= 4
        ]
        self.assertEqual(len(similar_high_rated_reviews), 1)

    def test_expected_errors(self):
        """Test that the expected errors are raised on constructor misuse."""
        with self.assertRaises(ValueError):
            _ = Adaptor()

        with self.assertRaises(TypeError):
            _ = Adaptor(root="ayo")

        with self.assertRaises(TypeError):
            _ = Adaptor(text=1)

        with self.assertRaises(TypeError):
            _ = Adaptor(body=1)

        with self.assertRaises(ValueError):
            _ = Adaptor(self.html, storage=object, auto_match=True)

    def test_pickleable(self):
        """Test that objects aren't pickleable."""
        table = self.page.css('.product-list')[0]
        with self.assertRaises(TypeError):  # Adaptors
            pickle.dumps(table)

        with self.assertRaises(TypeError):  # Adaptor
            pickle.dumps(table[0])

    def test_overridden(self):
        """Test overridden dunder functions return proper strings."""
        table = self.page.css('.product-list')[0]
        self.assertTrue(issubclass(type(table.__str__()), str))
        self.assertTrue(issubclass(type(table.__repr__()), str))
        self.assertTrue(issubclass(type(table.attrib.__str__()), str))
        self.assertTrue(issubclass(type(table.attrib.__repr__()), str))

    def test_bad_selector(self):
        """Test that invalid selectors raise selector errors."""
        with self.assertRaises((SelectorError, SelectorSyntaxError,)):
            self.page.css('4 ayo')

        with self.assertRaises((SelectorError, SelectorSyntaxError,)):
            self.page.xpath('4 ayo')

    def test_selectors_generation(self):
        """Try to create selectors for all elements in the page."""
        def _traverse(element: Adaptor):
            self.assertTrue(type(element.css_selector) is str)
            self.assertTrue(type(element.xpath_selector) is str)
            for branch in element.children:
                _traverse(branch)

        _traverse(self.page)

    def test_getting_all_text(self):
        """Test getting all text."""
        self.assertNotEqual(self.page.get_all_text(), '')

    def test_element_navigation(self):
        """Test moving around the page from a selected element."""
        table = self.page.css('.product-list')[0]

        self.assertIsNot(table.path, [])
        self.assertNotEqual(table.html_content, '')
        self.assertNotEqual(table.prettify(), '')

        parent = table.parent
        self.assertEqual(parent.attrib['id'], 'products')

        children = table.children
        self.assertEqual(len(children), 3)

        parent_siblings = parent.siblings
        self.assertEqual(len(parent_siblings), 1)

        child = table.css('[data-id="1"]')[0]
        next_element = child.next
        self.assertEqual(next_element.attrib['data-id'], '2')

        prev_element = next_element.previous
        self.assertEqual(prev_element.tag, child.tag)

        # Every price span should have a .product ancestor.
        all_prices = self.page.css('.price')
        products_with_prices = [
            price.find_ancestor(lambda p: p.has_class('product'))
            for price in all_prices
        ]
        self.assertEqual(len(products_with_prices), 3)

    def test_empty_return(self):
        """Test cases where functions shouldn't have results."""
        test_html = """
        <html>
            <span id="a"><a></a><!--comment--></span>
            <span id="b"><!--comment--><a></a></span>
        </html>"""
        soup = Adaptor(test_html, auto_match=False, keep_comments=False)
        html_tag = soup.css('html')[0]
        self.assertEqual(html_tag.path, [])
        self.assertEqual(html_tag.siblings, [])
        self.assertEqual(html_tag.parent, None)
        self.assertEqual(html_tag.find_ancestor(lambda e: e), None)

        # With comments stripped, these <a> tags have no next/previous.
        self.assertEqual(soup.css('#a a')[0].next, None)
        self.assertEqual(soup.css('#b a')[0].previous, None)

    def test_text_to_json(self):
        """Test converting selected text to JSON."""
        script_content = self.page.css('#page-data::text')[0]
        # NOTE(review): .sort() here appears to return a string (text-handler
        # API), not the in-place list sort — confirm against TextHandler.
        self.assertTrue(issubclass(type(script_content.sort()), str))
        page_data = script_content.json()
        self.assertEqual(page_data['totalProducts'], 3)
        self.assertTrue('lastUpdated' in page_data)

    def test_regex_on_text(self):
        """Test doing regex on a selected text."""
        element = self.page.css('[data-id="1"] .price')[0]
        match = element.re_first(r'[\.\d]+')
        self.assertEqual(match, '10.99')
        match = element.text.re(r'(\d+)', replace_entities=False)
        self.assertEqual(len(match), 2)

    def test_attribute_operations(self):
        """Test operations on element attributes."""
        products = self.page.css('.product')
        product_ids = [product.attrib['data-id'] for product in products]
        self.assertEqual(product_ids, ['1', '2', '3'])
        self.assertTrue('data-id' in products[0].attrib)

        reviews = self.page.css('.review')
        review_ratings = [int(review.attrib['data-rating']) for review in reviews]
        self.assertEqual(sum(review_ratings) / len(review_ratings), 4.5)

        key_value = list(products[0].attrib.search_values('1', partial=False))
        self.assertEqual(list(key_value[0].keys()), ['data-id'])

        key_value = list(products[0].attrib.search_values('1', partial=True))
        self.assertEqual(list(key_value[0].keys()), ['data-id'])

        # The schema attribute holds JSON text; json_string serializes the
        # whole attrib mapping to bytes.
        attr_json = self.page.css('#products')[0].attrib['schema'].json()
        self.assertEqual(attr_json, {'jsonable': 'data'})
        self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)

    def test_element_relocation(self):
        """Test relocating an element after the page structure changes."""
        original_html = '''
        <div class="container">
            <section class="products">
                <article class="product" id="p1">
                    <h3>Product 1</h3>
                    <p class="description">Description 1</p>
                </article>
                <article class="product" id="p2">
                    <h3>Product 2</h3>
                    <p class="description">Description 2</p>
                </article>
            </section>
        </div>
        '''
        changed_html = '''
        <div class="new-container">
            <div class="product-wrapper">
                <section class="products">
                    <article class="product new-class" data-id="p1">
                        <div class="product-info">
                            <h3>Product 1</h3>
                            <p class="new-description">Description 1</p>
                        </div>
                    </article>
                    <article class="product new-class" data-id="p2">
                        <div class="product-info">
                            <h3>Product 2</h3>
                            <p class="new-description">Description 2</p>
                        </div>
                    </article>
                </section>
            </div>
        </div>
        '''

        old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
        new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)

        # 'p1' was used as ID and now it's not and all the path elements have changes
        # Also at the same time testing auto-match vs combined selectors
        _ = old_page.css('#p1, #p2', auto_save=True)[0]
        relocated = new_page.css('#p1', auto_match=True)

        self.assertIsNotNone(relocated)
        self.assertEqual(relocated[0].attrib['data-id'], 'p1')
        self.assertTrue(relocated[0].has_class('new-class'))
        self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')

    def test_performance(self):
        """Test parsing and selecting speed."""
        import time
        large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'

        start_time = time.time()
        parsed = Adaptor(large_html, auto_match=False, debug=False)
        elements = parsed.css('.item')
        end_time = time.time()

        self.assertEqual(len(elements), 5000)
        # Converting 5000 elements to a class and doing operations on them will take time
        # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
        self.assertLess(end_time - start_time, 0.1)
+
333
+
334
+ # Use `coverage run -m unittest --verbose tests/test_all_functions.py` instead for the coverage report
335
+ # if __name__ == '__main__':
336
+ # unittest.main(verbosity=2)
tox.ini ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tox (https://tox.readthedocs.io/) is a tool for running tests
2
+ # in multiple virtualenvs. This configuration file will run the
3
+ # test suite on all supported python versions. To use it, "pip install tox"
4
+ # and then run "tox" from this directory.
5
+
6
+ [tox]
7
+ envlist = pre-commit,py36,py37,py38,py39,py310,py311,py312
8
+
9
+ [testenv]
10
+ usedevelop = True
11
+ changedir = tests
12
+ deps =
13
+ -r{toxinidir}/tests/requirements.txt
14
+ commands = pytest --cov=scrapling --cov-report=xml
15
+
16
+ [testenv:pre-commit]
17
+ basepython = python3
18
+ deps = pre-commit
19
+ commands = pre-commit run --all-files --show-diff-on-failure
20
+ skip_install = true