Commit ·
493d4f4
1
Parent(s): d2cc651
updated pattern
Browse files
- README.md +12 -1
- get_text.ipynb +98 -74
- textData.md +49 -0
- updated_api.py +4 -4
README.md
CHANGED
|
@@ -1,4 +1,15 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
config the ngrok auth: ngrok config add-authtoken 2Qm8hS1zPhVXiLjEdlI4738tLzF_2QJwGJMK5oTbQD33QSVXS
|
| 3 |
|
| 4 |
ngrok http --domain=batnlp.ngrok.app 1111
|
|
|
|
| 1 |
+
# Arabic NLP
|
| 2 |
+
HuggingFace: https://huggingface.co/rakib72642/Arabic_NLP
|
| 3 |
+
|
| 4 |
+
sudo apt install iproute2 && sudo apt install wget && sudo apt install unzip && sudo apt install nvtop && sudo apt-get install git-lfs && sudo apt-get update && sudo apt-get install libgl1 && curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null && echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list && sudo apt update && sudo apt install ngrok && ngrok config add-authtoken 2Qm8hS1zPhVXiLjEdlI4738tLzF_2QJwGJMK5oTbQD33QSVXS && sudo apt update && sudo apt upgrade && ngrok http --domain=hawkeyes.ngrok.app 8000
|
| 5 |
+
|
| 6 |
+
git clone https://huggingface.co/rakib72642/Arabic_NLP && cd Arabic_NLP && sudo apt update && sudo apt upgrade && python updated_api.py
|
| 7 |
+
|
| 8 |
+
cd Arabic_NLP && python updated_api.py
|
| 9 |
+
|
| 10 |
+
hypercorn updated_api:app --bind 127.0.0.1:8020 --workers 4
|
| 11 |
+
|
| 12 |
+
|
| 13 |
config the ngrok auth: ngrok config add-authtoken 2Qm8hS1zPhVXiLjEdlI4738tLzF_2QJwGJMK5oTbQD33QSVXS
|
| 14 |
|
| 15 |
ngrok http --domain=batnlp.ngrok.app 1111
|
get_text.ipynb
CHANGED
|
@@ -2,26 +2,9 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"metadata": {},
|
| 7 |
-
"outputs": [
|
| 8 |
-
{
|
| 9 |
-
"ename": "ModuleNotFoundError",
|
| 10 |
-
"evalue": "No module named 'certifi'",
|
| 11 |
-
"output_type": "error",
|
| 12 |
-
"traceback": [
|
| 13 |
-
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 14 |
-
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
| 15 |
-
"\u001b[1;32md:\\Projects\\BAT\\BAT_NLP_Campaign\\get_text.ipynb Cell 1\u001b[0m line \u001b[0;36m2\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mre\u001b[39;00m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mupdated_api\u001b[39;00m \u001b[39mimport\u001b[39;00m \u001b[39m*\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtyping_extensions\u001b[39;00m \u001b[39mimport\u001b[39;00m Annotated\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnltk\u001b[39;00m\n",
|
| 16 |
-
"File \u001b[1;32md:\\Projects\\BAT\\BAT_NLP_Campaign\\updated_api.py:9\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39muvicorn\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnltk\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mhttpx\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mfastapi\u001b[39;00m \u001b[39mimport\u001b[39;00m FastAPI\n\u001b[0;32m 11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpydantic\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseModel\n",
|
| 17 |
-
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m__version__\u001b[39;00m \u001b[39mimport\u001b[39;00m __description__, __title__, __version__\n\u001b[1;32m----> 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_api\u001b[39;00m \u001b[39mimport\u001b[39;00m delete, get, head, options, patch, post, put, request, stream\n\u001b[0;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_auth\u001b[39;00m \u001b[39mimport\u001b[39;00m Auth, BasicAuth, DigestAuth, NetRCAuth\n\u001b[0;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_client\u001b[39;00m \u001b[39mimport\u001b[39;00m USE_CLIENT_DEFAULT, AsyncClient, Client\n",
|
| 18 |
-
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\_api.py:4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtyping\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mcontextlib\u001b[39;00m \u001b[39mimport\u001b[39;00m contextmanager\n\u001b[1;32m----> 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_client\u001b[39;00m \u001b[39mimport\u001b[39;00m Client\n\u001b[0;32m 5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_config\u001b[39;00m \u001b[39mimport\u001b[39;00m DEFAULT_TIMEOUT_CONFIG\n\u001b[0;32m 6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_models\u001b[39;00m \u001b[39mimport\u001b[39;00m Response\n",
|
| 19 |
-
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\_client.py:11\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m__version__\u001b[39;00m \u001b[39mimport\u001b[39;00m __version__\n\u001b[0;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_auth\u001b[39;00m \u001b[39mimport\u001b[39;00m Auth, BasicAuth, FunctionAuth\n\u001b[1;32m---> 11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_config\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[0;32m 12\u001b[0m DEFAULT_LIMITS,\n\u001b[0;32m 13\u001b[0m DEFAULT_MAX_REDIRECTS,\n\u001b[0;32m 14\u001b[0m DEFAULT_TIMEOUT_CONFIG,\n\u001b[0;32m 15\u001b[0m Limits,\n\u001b[0;32m 16\u001b[0m Proxy,\n\u001b[0;32m 17\u001b[0m Timeout,\n\u001b[0;32m 18\u001b[0m )\n\u001b[0;32m 19\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_decoders\u001b[39;00m \u001b[39mimport\u001b[39;00m SUPPORTED_DECODERS\n\u001b[0;32m 20\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_exceptions\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[0;32m 21\u001b[0m InvalidURL,\n\u001b[0;32m 22\u001b[0m RemoteProtocolError,\n\u001b[0;32m 23\u001b[0m TooManyRedirects,\n\u001b[0;32m 24\u001b[0m request_context,\n\u001b[0;32m 25\u001b[0m )\n",
|
| 20 |
-
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\_config.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtyping\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpathlib\u001b[39;00m \u001b[39mimport\u001b[39;00m Path\n\u001b[1;32m----> 7\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mcertifi\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_compat\u001b[39;00m \u001b[39mimport\u001b[39;00m set_minimum_tls_version_1_2\n\u001b[0;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_models\u001b[39;00m \u001b[39mimport\u001b[39;00m Headers\n",
|
| 21 |
-
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'certifi'"
|
| 22 |
-
]
|
| 23 |
-
}
|
| 24 |
-
],
|
| 25 |
"source": [
|
| 26 |
"import re\n",
|
| 27 |
"from updated_api import *\n",
|
|
@@ -45,7 +28,7 @@
|
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"cell_type": "code",
|
| 48 |
-
"execution_count":
|
| 49 |
"metadata": {},
|
| 50 |
"outputs": [],
|
| 51 |
"source": [
|
|
@@ -58,7 +41,7 @@
|
|
| 58 |
},
|
| 59 |
{
|
| 60 |
"cell_type": "code",
|
| 61 |
-
"execution_count":
|
| 62 |
"metadata": {},
|
| 63 |
"outputs": [],
|
| 64 |
"source": [
|
|
@@ -80,20 +63,9 @@
|
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"cell_type": "code",
|
| 83 |
-
"execution_count":
|
| 84 |
"metadata": {},
|
| 85 |
-
"outputs": [
|
| 86 |
-
{
|
| 87 |
-
"data": {
|
| 88 |
-
"text/plain": [
|
| 89 |
-
"<coroutine object detect_audio at 0x00000255D1384900>"
|
| 90 |
-
]
|
| 91 |
-
},
|
| 92 |
-
"execution_count": 17,
|
| 93 |
-
"metadata": {},
|
| 94 |
-
"output_type": "execute_result"
|
| 95 |
-
}
|
| 96 |
-
],
|
| 97 |
"source": [
|
| 98 |
"filename = input(\"Give Audio Name: \")\n",
|
| 99 |
"audio_url = upload(filename)\n",
|
|
@@ -103,7 +75,7 @@
|
|
| 103 |
},
|
| 104 |
{
|
| 105 |
"cell_type": "code",
|
| 106 |
-
"execution_count":
|
| 107 |
"metadata": {},
|
| 108 |
"outputs": [],
|
| 109 |
"source": [
|
|
@@ -116,7 +88,7 @@
|
|
| 116 |
},
|
| 117 |
{
|
| 118 |
"cell_type": "code",
|
| 119 |
-
"execution_count":
|
| 120 |
"metadata": {},
|
| 121 |
"outputs": [],
|
| 122 |
"source": [
|
|
@@ -129,7 +101,7 @@
|
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"cell_type": "code",
|
| 132 |
-
"execution_count":
|
| 133 |
"metadata": {},
|
| 134 |
"outputs": [],
|
| 135 |
"source": [
|
|
@@ -149,7 +121,7 @@
|
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"cell_type": "code",
|
| 152 |
-
"execution_count":
|
| 153 |
"metadata": {},
|
| 154 |
"outputs": [],
|
| 155 |
"source": [
|
|
@@ -171,29 +143,9 @@
|
|
| 171 |
},
|
| 172 |
{
|
| 173 |
"cell_type": "code",
|
| 174 |
-
"execution_count":
|
| 175 |
"metadata": {},
|
| 176 |
-
"outputs": [
|
| 177 |
-
{
|
| 178 |
-
"name": "stdout",
|
| 179 |
-
"output_type": "stream",
|
| 180 |
-
"text": [
|
| 181 |
-
"{'Unique Capsule': ['unique capsul'], 'Refreshing Taste and Smell': ['refreshing taste smell'], 'Benson & Hadges Breeze': [('banson', 'b', 'a', 'hages niyashe ekti unique capsule offer panson hages', 'br'), ('panson', 'p', 'a', 'hages', 'br')]}\n"
|
| 182 |
-
]
|
| 183 |
-
},
|
| 184 |
-
{
|
| 185 |
-
"data": {
|
| 186 |
-
"text/plain": [
|
| 187 |
-
"{'Unique Capsule': 1,\n",
|
| 188 |
-
" 'Refreshing Taste and Smell': 1,\n",
|
| 189 |
-
" 'Benson & Hadges Breeze': 2}"
|
| 190 |
-
]
|
| 191 |
-
},
|
| 192 |
-
"execution_count": 10,
|
| 193 |
-
"metadata": {},
|
| 194 |
-
"output_type": "execute_result"
|
| 195 |
-
}
|
| 196 |
-
],
|
| 197 |
"source": [
|
| 198 |
"text = \"Clean text : apnea janet kushihaban banson hages niyashe ekti unique capsule offer panson hages bridge panson hages breeze air capsule atom agnoton tharna refreshing taste smell darn offer tea trial cora jonu apnea ekti trial kit nitaparin thunobat\"\n",
|
| 199 |
"\n",
|
|
@@ -202,7 +154,7 @@
|
|
| 202 |
},
|
| 203 |
{
|
| 204 |
"cell_type": "code",
|
| 205 |
-
"execution_count":
|
| 206 |
"metadata": {},
|
| 207 |
"outputs": [],
|
| 208 |
"source": [
|
|
@@ -215,7 +167,7 @@
|
|
| 215 |
},
|
| 216 |
{
|
| 217 |
"cell_type": "code",
|
| 218 |
-
"execution_count":
|
| 219 |
"metadata": {},
|
| 220 |
"outputs": [],
|
| 221 |
"source": [
|
|
@@ -237,17 +189,9 @@
|
|
| 237 |
},
|
| 238 |
{
|
| 239 |
"cell_type": "code",
|
| 240 |
-
"execution_count":
|
| 241 |
"metadata": {},
|
| 242 |
-
"outputs": [
|
| 243 |
-
{
|
| 244 |
-
"name": "stdout",
|
| 245 |
-
"output_type": "stream",
|
| 246 |
-
"text": [
|
| 247 |
-
"<assemblyai.transcriber.Transcript object at 0x0000029377EFD480>\n"
|
| 248 |
-
]
|
| 249 |
-
}
|
| 250 |
-
],
|
| 251 |
"source": [
|
| 252 |
"import assemblyai as aai\n",
|
| 253 |
"from updated_api import *\n",
|
|
@@ -266,12 +210,92 @@
|
|
| 266 |
"print(transcript)\n"
|
| 267 |
]
|
| 268 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
{
|
| 270 |
"cell_type": "code",
|
| 271 |
"execution_count": null,
|
| 272 |
"metadata": {},
|
| 273 |
"outputs": [],
|
| 274 |
-
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
}
|
| 276 |
],
|
| 277 |
"metadata": {
|
|
@@ -290,7 +314,7 @@
|
|
| 290 |
"name": "python",
|
| 291 |
"nbconvert_exporter": "python",
|
| 292 |
"pygments_lexer": "ipython3",
|
| 293 |
-
"version": "3.
|
| 294 |
}
|
| 295 |
},
|
| 296 |
"nbformat": 4,
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
"metadata": {},
|
| 7 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"source": [
|
| 9 |
"import re\n",
|
| 10 |
"from updated_api import *\n",
|
|
|
|
| 28 |
},
|
| 29 |
{
|
| 30 |
"cell_type": "code",
|
| 31 |
+
"execution_count": null,
|
| 32 |
"metadata": {},
|
| 33 |
"outputs": [],
|
| 34 |
"source": [
|
|
|
|
| 41 |
},
|
| 42 |
{
|
| 43 |
"cell_type": "code",
|
| 44 |
+
"execution_count": null,
|
| 45 |
"metadata": {},
|
| 46 |
"outputs": [],
|
| 47 |
"source": [
|
|
|
|
| 63 |
},
|
| 64 |
{
|
| 65 |
"cell_type": "code",
|
| 66 |
+
"execution_count": null,
|
| 67 |
"metadata": {},
|
| 68 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
"source": [
|
| 70 |
"filename = input(\"Give Audio Name: \")\n",
|
| 71 |
"audio_url = upload(filename)\n",
|
|
|
|
| 75 |
},
|
| 76 |
{
|
| 77 |
"cell_type": "code",
|
| 78 |
+
"execution_count": null,
|
| 79 |
"metadata": {},
|
| 80 |
"outputs": [],
|
| 81 |
"source": [
|
|
|
|
| 88 |
},
|
| 89 |
{
|
| 90 |
"cell_type": "code",
|
| 91 |
+
"execution_count": null,
|
| 92 |
"metadata": {},
|
| 93 |
"outputs": [],
|
| 94 |
"source": [
|
|
|
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"cell_type": "code",
|
| 104 |
+
"execution_count": null,
|
| 105 |
"metadata": {},
|
| 106 |
"outputs": [],
|
| 107 |
"source": [
|
|
|
|
| 121 |
},
|
| 122 |
{
|
| 123 |
"cell_type": "code",
|
| 124 |
+
"execution_count": null,
|
| 125 |
"metadata": {},
|
| 126 |
"outputs": [],
|
| 127 |
"source": [
|
|
|
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"cell_type": "code",
|
| 146 |
+
"execution_count": null,
|
| 147 |
"metadata": {},
|
| 148 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
"source": [
|
| 150 |
"text = \"Clean text : apnea janet kushihaban banson hages niyashe ekti unique capsule offer panson hages bridge panson hages breeze air capsule atom agnoton tharna refreshing taste smell darn offer tea trial cora jonu apnea ekti trial kit nitaparin thunobat\"\n",
|
| 151 |
"\n",
|
|
|
|
| 154 |
},
|
| 155 |
{
|
| 156 |
"cell_type": "code",
|
| 157 |
+
"execution_count": null,
|
| 158 |
"metadata": {},
|
| 159 |
"outputs": [],
|
| 160 |
"source": [
|
|
|
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"cell_type": "code",
|
| 170 |
+
"execution_count": null,
|
| 171 |
"metadata": {},
|
| 172 |
"outputs": [],
|
| 173 |
"source": [
|
|
|
|
| 189 |
},
|
| 190 |
{
|
| 191 |
"cell_type": "code",
|
| 192 |
+
"execution_count": null,
|
| 193 |
"metadata": {},
|
| 194 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
"source": [
|
| 196 |
"import assemblyai as aai\n",
|
| 197 |
"from updated_api import *\n",
|
|
|
|
| 210 |
"print(transcript)\n"
|
| 211 |
]
|
| 212 |
},
|
| 213 |
+
{
|
| 214 |
+
"cell_type": "code",
|
| 215 |
+
"execution_count": 57,
|
| 216 |
+
"metadata": {},
|
| 217 |
+
"outputs": [
|
| 218 |
+
{
|
| 219 |
+
"name": "stdout",
|
| 220 |
+
"output_type": "stream",
|
| 221 |
+
"text": [
|
| 222 |
+
"Processing Audio\n",
|
| 223 |
+
"town saw revival new business opening\n",
|
| 224 |
+
"{'Unique Capsule': [], 'Refreshing Taste and Smell': [], 'Benson & Hadges Breeze': []}\n"
|
| 225 |
+
]
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"data": {
|
| 229 |
+
"text/plain": [
|
| 230 |
+
"{'Unique Capsule': 0,\n",
|
| 231 |
+
" 'Refreshing Taste and Smell': 0,\n",
|
| 232 |
+
" 'Benson & Hadges Breeze': 0}"
|
| 233 |
+
]
|
| 234 |
+
},
|
| 235 |
+
"execution_count": 57,
|
| 236 |
+
"metadata": {},
|
| 237 |
+
"output_type": "execute_result"
|
| 238 |
+
}
|
| 239 |
+
],
|
| 240 |
+
"source": [
|
| 241 |
+
"import re\n",
|
| 242 |
+
"from nlp_api import *\n",
|
| 243 |
+
"from typing_extensions import Annotated\n",
|
| 244 |
+
"import string\n",
|
| 245 |
+
"\n",
|
| 246 |
+
"patterns = {\n",
|
| 247 |
+
" 'Unique Capsule': r\"\\b(((u(?:nit|niq).*?)\\s+(?:capsul))|(?:.*?uni.*?capsul))\",\n",
|
| 248 |
+
" 'Refreshing Taste and Smell': r\"\\b((((ref|rif|rip|rep|ep|pre).*?)\\s+t(?:a|e|i|y)s(.*?)\\s+(sm|(?:.*?(sm|m)))(?:el|il|al|ol|.*?))|((?:in.*?)\\s+t(?:a|e|i|y)s.*?\\s+(.*?)(sm|m)(?:el|il|al|ol|ail|eal)))\",\n",
|
| 249 |
+
" 'Benson & Hadges Breeze':r\"\\b((b|p|v|f)(?:(an|en|a|e)(?:s|ch|t)(?:on|an|en).*?)\\s+h(?:.*?)\\s+(b|p|v|f)(?:re|ee|e|ri))\",\n",
|
| 250 |
+
"}\n",
|
| 251 |
+
"\n",
|
| 252 |
+
"\n",
|
| 253 |
+
" # Find and count matches for each pattern\n",
|
| 254 |
+
"def nlp_bat(text):\n",
|
| 255 |
+
" results = {}\n",
|
| 256 |
+
" all_match = {}\n",
|
| 257 |
+
" for name, pattern in patterns.items():\n",
|
| 258 |
+
" matches = re.findall(pattern, text, re.IGNORECASE)\n",
|
| 259 |
+
" m = {name:matches}\n",
|
| 260 |
+
" all_match.update(m)\n",
|
| 261 |
+
" count = len(matches)\n",
|
| 262 |
+
" results[name] = count\n",
|
| 263 |
+
" \n",
|
| 264 |
+
" \n",
|
| 265 |
+
" print(all_match) \n",
|
| 266 |
+
"\n",
|
| 267 |
+
" return results\n",
|
| 268 |
+
"\n",
|
| 269 |
+
"async def lemmatize_and_clean(text):\n",
|
| 270 |
+
" words = nltk.word_tokenize(text.lower())\n",
|
| 271 |
+
" words = [word for word in words if word.isalpha() and word not in set(stopwords.words('english'))]\n",
|
| 272 |
+
" lemmatizer = WordNetLemmatizer()\n",
|
| 273 |
+
" words = [await asyncio.to_thread(lemmatizer.lemmatize, word) for word in words]\n",
|
| 274 |
+
" return ' '.join(words)\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"\n",
|
| 277 |
+
"# # input\n",
|
| 278 |
+
"filename = input(\"Give Audio Name: \")\n",
|
| 279 |
+
"audio_url = upload(filename)\n",
|
| 280 |
+
"\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"# # transcribe\n",
|
| 283 |
+
"detect_audio(audio_url, 'file_title')"
|
| 284 |
+
]
|
| 285 |
+
},
|
| 286 |
{
|
| 287 |
"cell_type": "code",
|
| 288 |
"execution_count": null,
|
| 289 |
"metadata": {},
|
| 290 |
"outputs": [],
|
| 291 |
+
"source": [
|
| 292 |
+
"patterns = {\n",
|
| 293 |
+
" \"Clear\":r\"\\b((c|k)(:?l..r))\",\n",
|
| 294 |
+
" \"Confidence\":r\"\\b(((f|t|th)(u|i))(?:(|c|q|k|ck)(?:(a|e|o))))|(((f|t|th)(u|i))(?:(|c|q|k|ck)(?:a|.a)))\",\n",
|
| 295 |
+
" \"Revival\":r\"((a)(?:(sh|yush|rch)))\",\n",
|
| 296 |
+
" \"Anti-Dandruff\":r\"((al)(?:.*?(k|q)(?:a|i|o|u)(?:(s|sh))))|((k|q)(?:a|i|o|u)(?:(s|sh)((?:a|o))))|((k|q)(?:a|i|o|u)(?:(s|sh)(r(?:a|o|u))))\",\n",
|
| 297 |
+
"}"
|
| 298 |
+
]
|
| 299 |
}
|
| 300 |
],
|
| 301 |
"metadata": {
|
|
|
|
| 314 |
"name": "python",
|
| 315 |
"nbconvert_exporter": "python",
|
| 316 |
"pygments_lexer": "ipython3",
|
| 317 |
+
"version": "3.12.2"
|
| 318 |
}
|
| 319 |
},
|
| 320 |
"nbformat": 4,
|
textData.md
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1. clear fikoh ash al qasra
|
| 2 |
+
2. clear fika ash al qaeda
|
| 3 |
+
3. clear fikah ash al kishra
|
| 4 |
+
4. ash alokosra vika clear
|
| 5 |
+
5. clear al kashara
|
| 6 |
+
6. clear fuqah ayush vidal kashakti
|
| 7 |
+
7. clear fuqah ayush vidal kashakti
|
| 8 |
+
8. ash alokosra vika clear
|
| 9 |
+
9. clear fik india loch
|
| 10 |
+
10. al koshro clear fikah asha
|
| 11 |
+
11. clear vika asha al koshuru
|
| 12 |
+
12. clear fik ash vid al khushra
|
| 13 |
+
13. asha al ghoshuru clear fikah
|
| 14 |
+
14. asha al kosharo clear
|
| 15 |
+
15. lehrer fik ash dudul kusha
|
| 16 |
+
16. clear fika taj vit al khashoggi
|
| 17 |
+
17. fikah clear asha al kosru
|
| 18 |
+
18. clear fikar ash vid al khusho
|
| 19 |
+
19. clear fika inash bidal kushu may
|
| 20 |
+
20. clear fika ash mid al kushu
|
| 21 |
+
21. clear fico bush
|
| 22 |
+
22. clear ar vital kosho
|
| 23 |
+
23. clear fico vidal kusho
|
| 24 |
+
24. clear fikah ash al kishra
|
| 25 |
+
25. clear thickah ash al kishra
|
| 26 |
+
26. fika vidal kishra ash fika
|
| 27 |
+
27. clear fik ayesh vidal kashir
|
| 28 |
+
28. player fika faith al kish ash
|
| 29 |
+
30. fid alakishra ash fika clear ash fika alakashra
|
| 30 |
+
31. clear fikach ash vidal kosher
|
| 31 |
+
32. clear fica arch alcohol
|
| 32 |
+
33. clear fikach ash vidal kosher
|
| 33 |
+
34. clear fika ash al kosher
|
| 34 |
+
35. clear thicker arch alcohol
|
| 35 |
+
# ###################################################
|
| 36 |
+
1. yunkenulil belsamil mudadil il kashrati yuhadiya farwata rasi waemna al jafa
|
| 37 |
+
2. yusaidu shampul mudadul il kushrati ala muharrabatil kushrati wal hakati
|
| 38 |
+
4. intarshal edabul kilesi kiyo fishabiyetihi
|
| 39 |
+
6. alistair mul mustamirula jatil mudodati lil kushrati yunkinu yuaziza sahata faruatiratsi
|
| 40 |
+
7. hatidanal fashali kafur satin lita lumiyua zizuthiqata
|
| 41 |
+
8. artini zojajatan mina sham ko shafi
|
| 42 |
+
10. alistair mul muntadimul muntajatil mudadati lil kashrati yumkinu yukala minta yuji faruatirat
|
| 43 |
+
11. tahdida ehdefin kabilatin litikata
|
| 44 |
+
12. tahdida ehdefin kabilatin litikata
|
| 45 |
+
13. ihatota nafsibi afraid
|
| 46 |
+
14. shahid al mokaotari khiyu inti ashan ladazuari badata tashdidi
|
| 47 |
+
15. shahid al fari kuriyad yuntyashan mahmuda ribin jadedan
|
| 48 |
+
16. town saw revival new business opening
|
| 49 |
+
|
updated_api.py
CHANGED
|
@@ -62,12 +62,12 @@ async def lemmatize_and_clean(text):
|
|
| 62 |
|
| 63 |
|
| 64 |
patterns = {
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
| 68 |
}
|
| 69 |
|
| 70 |
-
|
| 71 |
async def nlp_bat(text):
|
| 72 |
results = {}
|
| 73 |
all_match = {}
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
patterns = {
|
| 65 |
+
"Clear":r"\b(((c|k)(?:(l..r|lir|lar|il))))",
|
| 66 |
+
"Confidence":r"\b((((f|t|th)(u|i))(?:(|c|q|k|ck)(?:(a|e|o))))|(((f|t|th)(u|i))(?:(|c|q|k|ck)(?:a|.a))))",
|
| 67 |
+
"Revival":r"\b(((a)(?:(sh|yush|rch))))",
|
| 68 |
+
"Anti-Dandruff":r"\b(((al)(?:.*?(k|q|kh)(?:a|i|o|u)(?:(s|sh))))|((k|q|kh)(?:a|i|o|u)(?:(s|sh)((?:a|o))))|((k|q|kh)(?:a|i|o|u)(?:(s|sh)(r(?:a|o|u)))))",
|
| 69 |
}
|
| 70 |
|
|
|
|
| 71 |
async def nlp_bat(text):
|
| 72 |
results = {}
|
| 73 |
all_match = {}
|