{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-03-10T00:01:02.682985Z",
     "start_time": "2022-03-10T00:00:58.195800Z"
    }
   },
   "outputs": [],
   "source": [
    "import easyocr\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from os import listdir\n",
    "from difflib import SequenceMatcher\n",
    "from autocorrect import Speller"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-03-10T00:01:10.249133Z",
     "start_time": "2022-03-10T00:01:02.715992Z"
    }
   },
   "outputs": [],
   "source": [
    "# Folder names of the meme templates to process, one folder per template.\n",
    "all_files = [\n",
    "    \"monkey_puppet\",\n",
    "    \"surprised_pikachu\",\n",
    "    \"well_yes_but_actually_no\",\n",
    "    \"10_Guy\",\n",
    "    \"Spiderman_Computer_Desk\",\n",
    "    \"Kevin_Hart\",\n",
    "    \"laughing_leo\",\n",
    "    \"Lisa\",\n",
    "    \"Roll_Safe_Think_About_It\",\n",
    "    \"Change_My_Mind\",\n",
    "    \"Futurama_Fry\",\n",
    "    \"First_World_Problems\",\n",
    "]\n",
    "\n",
    "# English OCR reader and English spell-checker, created once for the whole notebook.\n",
    "reader = easyocr.Reader(['en'])\n",
    "spell = Speller(lang='en')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-03-10T00:01:10.297143Z",
     "start_time": "2022-03-10T00:01:10.282141Z"
    }
   },
   "outputs": [],
   "source": [
    "def only_text(lis):\n",
    "    \"\"\"Return the text component of every OCR detection in `lis`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    lis : iterable of sequences\n",
    "        Detections; element 1 of each detection is taken as its text.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        One entry per detection, in input order.\n",
    "    \"\"\"\n",
    "    # Iterate over the argument, not over a notebook-level variable, so the\n",
    "    # function gives the right answer for whatever list it is handed.\n",
    "    return [detection[1] for detection in lis]\n",
    "def only_eval(lis):\n",
    "    \"\"\"Return the score component of every OCR detection in `lis` as floats.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    lis : iterable of sequences\n",
    "        Detections; element 2 of each detection is converted with float().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of float\n",
    "        One score per detection, in input order.\n",
    "    \"\"\"\n",
    "    # Iterate over the argument, not over a notebook-level variable, so the\n",
    "    # function gives the right answer for whatever list it is handed.\n",
    "    return [float(detection[2]) for detection in lis]\n",
    "def purify(text):\n",
    "    \"\"\"Spell-correct one image's OCR text and strip template captions and watermark tokens.\n",
    "\n",
    "    Steps, in order:\n",
    "      1. Empty input is reported as the placeholder \"no text\".\n",
    "      2. The text is lower-cased and passed through the notebook-level `spell` callable.\n",
    "      3. The fixed template phrases are blanked out.\n",
    "      4. The first token that resembles the watermark strings (or equals \"com\") is dropped.\n",
    "      5. A newline token is placed in front of every token ending in \":\"; when the\n",
    "         token right before it is my/her/his/him, the newline goes in front of that\n",
    "         word instead.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Raw text of one image.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The cleaned tokens joined by single spaces, or \"no text\" for empty input.\n",
    "    \"\"\"\n",
    "    if not text:\n",
    "        return \"no text\"\n",
    "    pu_text = spell(text.lower())\n",
    "    # str.replace returns a new string; the result must be re-bound or the\n",
    "    # phrases are never actually removed.\n",
    "    for phrase in (\"well yes but actually no\", \"change my mind\"):\n",
    "        pu_text = pu_text.replace(phrase, \" \")\n",
    "    # split() without arguments also discards the empty tokens that the\n",
    "    # blanked-out phrases leave behind.\n",
    "    tokens = pu_text.split()\n",
    "    for idx, token in enumerate(tokens):\n",
    "        looks_like_watermark = (\n",
    "            SequenceMatcher(a=token, b=\"imgflib\").ratio() > .8\n",
    "            or SequenceMatcher(a=token, b=\"imgflib.com\").ratio() > .8\n",
    "            or token == \"com\"\n",
    "        )\n",
    "        if looks_like_watermark:\n",
    "            # Only the first matching token is removed; stopping right after\n",
    "            # the deletion keeps the iteration safe.\n",
    "            del tokens[idx]\n",
    "            break\n",
    "    # Build a new list rather than inserting into the list being iterated:\n",
    "    # an in-place insert shifts the current token one slot forward, so the\n",
    "    # loop would meet the same token again on every step and never finish.\n",
    "    determiners = {\"my\", \"her\", \"his\", \"him\"}\n",
    "    marked = []\n",
    "    for token in tokens:\n",
    "        if token.endswith(\":\"):\n",
    "            # marked[-1] is always the previous real token (newlines are only\n",
    "            # ever placed in front of tokens). Exact set membership replaces the\n",
    "            # old substring test, and the emptiness check stops the first token\n",
    "            # from looking back at the last one.\n",
    "            if marked and marked[-1].lower() in determiners:\n",
    "                marked.insert(len(marked) - 1, \"\\n\")\n",
    "            else:\n",
    "                marked.append(\"\\n\")\n",
    "        marked.append(token)\n",
    "    return \" \".join(marked)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-03-10T00:43:29.097665Z",
     "start_time": "2022-03-10T00:01:10.330151Z"
    },
    "code_folding": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\amrsh\\anaconda3\\lib\\site-packages\\numpy\\core\\fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.\n",
      "  return _methods._mean(a, axis=axis, dtype=dtype,\n",
      "C:\\Users\\amrsh\\anaconda3\\lib\\site-packages\\numpy\\core\\_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  ret = ret.dtype.type(ret / rcount)\n"
     ]
    }
   ],
   "source": [
    "# Confidence cut-offs applied to the OCR output.\n",
    "MIN_FRAGMENT_CONF = .5  # a fragment scored below this is discarded\n",
    "MIN_IMAGE_CONF = .7     # an image whose kept fragments average below this is discarded\n",
    "\n",
    "text = []       # one joined string per kept image, across all folders\n",
    "all_class = []  # class label of each entry in `text` (position of its folder in all_files)\n",
    "for class_index, file in enumerate(all_files):\n",
    "    joined_text = []\n",
    "    for img_name in listdir(r\"D:\\{}\".format(file)):\n",
    "        img_path = r\"D:\\{}\\{}\".format(file, img_name)\n",
    "        try:\n",
    "            result = reader.readtext(img_path, paragraph=False)\n",
    "            fragments = only_text(result)\n",
    "            scores = only_eval(result)\n",
    "        except Exception as exc:\n",
    "            # Best effort: one unreadable file must not abort the whole run, but\n",
    "            # the failure is reported instead of vanishing. Catching Exception\n",
    "            # (not everything) lets an interrupt still stop the loop.\n",
    "            print(\"skipped {}: {!r}\".format(img_path, exc))\n",
    "            continue\n",
    "        # Filter text and score as pairs. Removing by value from two parallel\n",
    "        # lists drops the first equal element, which can be the wrong one when\n",
    "        # a fragment or a score occurs twice.\n",
    "        kept = [(frag, score) for frag, score in zip(fragments, scores)\n",
    "                if not (score < MIN_FRAGMENT_CONF)]\n",
    "        # An image left with no fragments is kept as an empty string; checking\n",
    "        # `kept` first avoids taking the mean of an empty list.\n",
    "        if kept and np.mean([score for _, score in kept]) < MIN_IMAGE_CONF:\n",
    "            continue\n",
    "        joined_text.append(\" \".join(frag for frag, _ in kept))\n",
    "    text.extend(joined_text)\n",
    "    # Labels stay floating point (0.0, 1.0, ...), one per kept image.\n",
    "    all_class.extend(np.full(len(joined_text), float(class_index)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2022-03-10T00:01:04.412Z"
    }
   },
   "outputs": [],
   "source": [
    "# Run every extracted text through purify(); str() guards against non-string entries.\n",
    "purified_text = [purify(str(raw_text)) for raw_text in text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2022-03-10T00:01:16.513Z"
    }
   },
   "outputs": [],
   "source": [
    "data = {\"Text\": purified_text, \"Class\": all_class}\n",
    "# reset_index returns a new frame, so its result has to be assigned to take\n",
    "# effect; drop=True discards the old index (which has gaps after the\n",
    "# duplicates are removed) instead of adding it as a column.\n",
    "df = pd.DataFrame(data).drop_duplicates().reset_index(drop=True)\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2022-03-10T00:02:25.881Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write the table to an Excel workbook; the relative path resolves against the working directory.\n",
    "output_path = \"NLP_classes.xlsx\"\n",
    "df.to_excel(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2022-03-09T23:19:05.044525Z",
     "start_time": "2022-03-09T22:44:01.280Z"
    }
   },
   "outputs": [],
   "source": [
    "# Number of rows per class label.\n",
    "df.value_counts(subset=[\"Class\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}