{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Meme caption extraction with EasyOCR\n",
    "\n",
    "Runs OCR on every image in the per-template folders under `DATA_DIR`, keeps the text that passes the confidence thresholds, cleans it with `purify`, and writes the de-duplicated `Text` / `Class` table to `OUTPUT_PATH`. `Class` is the zero-based position of the template folder in `all_files`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "from difflib import SequenceMatcher\n",
    "from os import listdir\n",
    "from pathlib import Path\n",
    "\n",
    "import easyocr\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from autocorrect import Speller"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder holding one sub-folder of images per meme template.\n",
    "DATA_DIR = Path(\"D:/\")\n",
    "# Spreadsheet the final table is written to.\n",
    "OUTPUT_PATH = \"NLP_classes.xlsx\"\n",
    "\n",
    "# A detection is discarded when its OCR confidence is below this value.\n",
    "MIN_WORD_CONFIDENCE = 0.5\n",
    "# An image is discarded when the mean confidence of its kept detections is below this value.\n",
    "MIN_IMAGE_CONFIDENCE = 0.7\n",
    "# Images with no detection above MIN_WORD_CONFIDENCE are kept with empty text\n",
    "# (purify() turns that into \"no text\"); set to False to drop them instead.\n",
    "KEEP_TEXTLESS_IMAGES = True\n",
    "\n",
    "# Sub-folder names under DATA_DIR; the position in this list is the class label.\n",
    "all_files = [\"monkey_puppet\", \"surprised_pikachu\", \"well_yes_but_actually_no\", \"10_Guy\", \"Spiderman_Computer_Desk\", \"Kevin_Hart\", \"laughing_leo\", \"Lisa\", \"Roll_Safe_Think_About_It\", \"Change_My_Mind\", \"Futurama_Fry\", \"First_World_Problems\"]\n",
    "reader = easyocr.Reader(['en'])\n",
    "spell = Speller(lang='en')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Captions printed on the template image itself; they are blanked out of the OCR text.\n",
    "TEMPLATE_CAPTIONS = (\"well yes but actually no\", \"change my mind\")\n",
    "# Words that may precede a speaker label, as in \"my mom: ...\".\n",
    "SPEAKER_PREFIXES = {\"my\", \"her\", \"his\", \"him\"}\n",
    "\n",
    "\n",
    "def only_text(lis):\n",
    "    \"\"\"Return item 1 (the recognised string) of every detection in `lis`.\"\"\"\n",
    "    return [detection[1] for detection in lis]\n",
    "\n",
    "\n",
    "def only_eval(lis):\n",
    "    \"\"\"Return item 2 (the confidence) of every detection in `lis` as a float.\"\"\"\n",
    "    return [float(detection[2]) for detection in lis]\n",
    "\n",
    "\n",
    "def confident_text(detections):\n",
    "    \"\"\"Join the confidently recognised strings of one image.\n",
    "\n",
    "    Detections below MIN_WORD_CONFIDENCE are dropped. Returns None when the\n",
    "    image should be skipped: the mean confidence of the remaining detections\n",
    "    is below MIN_IMAGE_CONFIDENCE, or nothing remains and\n",
    "    KEEP_TEXTLESS_IMAGES is False. Otherwise returns the remaining strings\n",
    "    joined by single spaces (possibly an empty string).\n",
    "    \"\"\"\n",
    "    kept = [\n",
    "        (word, score)\n",
    "        for word, score in zip(only_text(detections), only_eval(detections))\n",
    "        if score >= MIN_WORD_CONFIDENCE\n",
    "    ]\n",
    "    if not kept:\n",
    "        # Checked before np.mean so an empty list never reaches it (that used\n",
    "        # to emit \"Mean of empty slice\" and compare against nan).\n",
    "        return \"\" if KEEP_TEXTLESS_IMAGES else None\n",
    "    if np.mean([score for _, score in kept]) < MIN_IMAGE_CONFIDENCE:\n",
    "        return None\n",
    "    return \" \".join(word for word, _ in kept)\n",
    "\n",
    "\n",
    "def _is_watermark(token):\n",
    "    \"\"\"Return True for tokens that look like the imgflip.com watermark.\"\"\"\n",
    "    return (\n",
    "        token == \"com\"\n",
    "        or SequenceMatcher(a=token, b=\"imgflip\").ratio() > .8\n",
    "        or SequenceMatcher(a=token, b=\"imgflip.com\").ratio() > .8\n",
    "    )\n",
    "\n",
    "\n",
    "def purify(text):\n",
    "    \"\"\"Clean one OCR caption.\n",
    "\n",
    "    Lower-cases and spell-corrects `text`, blanks out TEMPLATE_CAPTIONS,\n",
    "    drops every watermark token, and puts a newline token in front of each\n",
    "    speaker label (a token ending in \":\"; when the token before it is one of\n",
    "    SPEAKER_PREFIXES the newline goes in front of that word instead).\n",
    "    Tokens are joined with single spaces.\n",
    "\n",
    "    Returns \"no text\" when `text` is empty or nothing is left after cleaning.\n",
    "    \"\"\"\n",
    "    if not text or not text.strip():\n",
    "        return \"no text\"\n",
    "    pu_text = spell(text.lower())\n",
    "    for caption in TEMPLATE_CAPTIONS:\n",
    "        # str.replace returns a new string, so the result has to be kept.\n",
    "        pu_text = pu_text.replace(caption, \" \")\n",
    "    tokens = [token for token in pu_text.split() if not _is_watermark(token)]\n",
    "    if not tokens:\n",
    "        return \"no text\"\n",
    "    # Build a new list rather than inserting into the list being iterated:\n",
    "    # inserting in place shifts the \":\" token forward so it is visited again\n",
    "    # and the loop never ends.\n",
    "    out = []\n",
    "    for token in tokens:\n",
    "        if token.endswith(\":\") and out:\n",
    "            has_prefix = out[-1].lower() in SPEAKER_PREFIXES\n",
    "            position = len(out) - 1 if has_prefix else len(out)\n",
    "            # No newline at the very start of the caption.\n",
    "            if position > 0:\n",
    "                out.insert(position, \"\\n\")\n",
    "        out.append(token)\n",
    "    return \" \".join(out)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run OCR over every template folder\n",
    "\n",
    "This is the slow step: the last recorded run of this cell took about 42 minutes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = []\n",
    "all_class = []\n",
    "for label, folder_name in enumerate(all_files):\n",
    "    folder = DATA_DIR / folder_name\n",
    "    # Sorted so the row order is the same on every run.\n",
    "    for image_name in sorted(listdir(folder)):\n",
    "        image_path = folder / image_name\n",
    "        try:\n",
    "            result = reader.readtext(str(image_path), paragraph=False)\n",
    "        except Exception as exc:\n",
    "            # Best effort: skip files that cannot be read, but report which ones.\n",
    "            warnings.warn(f\"Skipping {image_path}: {exc}\")\n",
    "            continue\n",
    "        caption = confident_text(result)\n",
    "        if caption is not None:\n",
    "            text.append(caption)\n",
    "            all_class.append(label)\n",
    "\n",
    "assert len(text) == len(all_class)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean the text and build the table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "purified_text = [purify(str(caption)) for caption in text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = (\n",
    "    pd.DataFrame({\"Text\": purified_text, \"Class\": all_class})\n",
    "    .drop_duplicates()\n",
    "    # reset_index returns a new frame; drop=True keeps the old index out of the columns.\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_excel(OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.value_counts([\"Class\"])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}