{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "2eLLHNlMyFkT" }, "outputs": [], "source": [ "import tensorflow as tf\n", "import pickle\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "import kagglehub\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Download the house-price dataset from Kaggle.\n", "path = kagglehub.dataset_download(\"shree1992/housedata\")\n", "\n", "print(\"Path to dataset files:\", path)\n", "df = pd.read_csv(path + \"/data.csv\")\n", "# Keep only numeric feature columns: drop the target, identifiers, dates and year fields.\n", "X = df.drop([\"price\", \"street\", \"city\", \"statezip\", \"country\", \"date\", \"yr_built\", \"yr_renovated\", \"waterfront\"], axis=1)\n", "y = df[\"price\"]\n", "# Fixed random_state so the split (and every downstream number) is reproducible.\n", "X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.1, random_state=42)\n", "print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)\n", "scaler = StandardScaler()\n", "X_train = scaler.fit_transform(X_train)\n", "X_test = scaler.transform(X_test)\n", "# Express prices in millions so the gradients are well-conditioned.\n", "Y_train = Y_train/1e6\n", "Y_test = Y_test/1e6\n" ] }, { "cell_type": "code", "source": [ "\n", "# Initialise parameters: one weight per feature, a single bias term.\n", "w = np.zeros(X_train.shape[1])\n", "b = np.zeros(1)\n", "\n", "def linear_func(X_train, w, b):\n", "    \"\"\"Linear model prediction: X @ w + b.\"\"\"\n", "    return np.dot(X_train, w) + b\n", "\n", "def mean_squared_error_cost_func(X_train, Y_train, w, b):\n", "    \"\"\"Half mean-squared-error cost, computed sample by sample.\"\"\"\n", "    m = X_train.shape[0]\n", "    # Positional indexing: a pandas Series keeps its shuffled labels after\n", "    # train_test_split, so label-based Y_train[i,] would be wrong/raise.\n", "    Y_np = np.asarray(Y_train)\n", "    summation = 0\n", "    for i in range(m):\n", "        summation += ((np.dot(w, X_train[i]) + b) - Y_np[i])**2\n", "    MSE = (1/(2*m))*(summation)\n", "    return MSE" ], "metadata": { "id": "Vz6pFm1J4-4c" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "NON-Vectorized" ], "metadata": { "id": "GHrilIG-qJt-" } }, { "cell_type": "code", "source": 
[ "alpha = 0.0001  # learning rate\n", "\n", "def mean_squared_error_cost_func_deriv(X_train, Y_train, w, b):\n", "    \"\"\"One gradient-descent step on (w, b), with explicit per-sample loops.\"\"\"\n", "    X_train_np = X_train\n", "    Y_train_np = Y_train.values\n", "    m = X_train_np.shape[0]\n", "    summation = 0\n", "    for i in range(m):\n", "        summation += (((np.dot(w, X_train_np[i]) + b) - Y_train_np[i]) * X_train_np[i])\n", "    w = w - (alpha*((1/m)*summation))\n", "\n", "    # NOTE: the bias gradient below is computed with the already-updated w\n", "    # (sequential rather than simultaneous update) — kept as in the original.\n", "    summation = 0\n", "    for i in range(m):\n", "        summation += ((np.dot(w, X_train_np[i]) + b) - Y_train_np[i])\n", "    b = b - (alpha*((1/m)*summation))\n", "    return w, b\n", "\n", "def gradient_descent(alpha, w, b, X_train, Y_train):\n", "    \"\"\"Run 1000 update steps, logging the training MSE every 100 iterations.\"\"\"\n", "    for i in range(1000):\n", "        w, b = mean_squared_error_cost_func_deriv(X_train, Y_train, w, b)\n", "        if i % 100 == 0:\n", "            preds = np.dot(X_train, w) + b  # include the bias in the diagnostic\n", "            MSE = np.mean((preds - Y_train)**2)\n", "            print(f\"Iteration {i}, MSE: {MSE}, w: {w}, b: {b}\")\n", "    return w, b\n", "\n", "W, B = gradient_descent(alpha, w, b, X_train, Y_train)\n", "\n", "def predict(x, W, B):\n", "    \"\"\"Predict one value per row of x with the trained parameters.\"\"\"\n", "    y_pred = np.zeros(x.shape[0])\n", "    for i in range(x.shape[0]):\n", "        y_pred[i] = linear_func(x[i], W, B)\n", "    return y_pred\n", "\n", "\n", "y_pred = predict(X_test, W, B)\n", "\n", "\n" ], "metadata": { "id": "cn4BqyN6-h9_" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Vectorized" ], "metadata": { "id": "acKPA93MqM9j" } }, { "cell_type": "code", "source": [ "alpha = 0.0001  # learning rate\n", "\n", "def mean_squared_error_cost_func_deriv(X_train, Y_train, w, b):\n", "    \"\"\"One vectorized gradient-descent step; w and b are updated in place.\"\"\"\n", "    m = X_train.shape[0]\n", "    predictions = np.dot(X_train, w) + b\n", "    errors = predictions - Y_train\n", "\n", "    dw = (1 / m) * np.dot(X_train.T, errors)\n", "    db = (1 / m) * np.sum(errors)\n", "\n", "    w -= alpha * dw\n", "    b -= alpha * db\n", "\n", "    return w, b\n", "\n", "def gradient_descent(alpha, w, b, X_train, Y_train):\n", "    \"\"\"Run 1000 vectorized update steps, logging training MSE every 100 iterations.\"\"\"\n", "    for i in range(1000):\n", "        w, b = mean_squared_error_cost_func_deriv(X_train, Y_train, w, b)\n", "        if i % 100 == 0:\n", "            preds = np.dot(X_train, w) + b  # include the bias in the diagnostic\n", "            MSE = 
np.mean((preds - Y_train)**2)\n", "            print(f\"Iteration {i}, MSE: {MSE}, w: {w}, b: {b}\")\n", "    return w, b\n", "\n", "W, B = gradient_descent(alpha, w, b, X_train, Y_train)\n", "\n", "def predict(x, W, B):\n", "    \"\"\"Predict one value per row of x with the trained parameters.\"\"\"\n", "    y_pred = np.zeros(x.shape[0])\n", "    for i in range(x.shape[0]):\n", "        y_pred[i] = linear_func(x[i], W, B)\n", "    return y_pred\n", "\n", "\n", "y_pred = predict(X_test, W, B)" ], "metadata": { "id": "PFikMk1ZqGsK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.metrics import mean_squared_error\n", "mse = mean_squared_error(Y_test.values, y_pred)\n", "print(\"Mean Squared Error:\", mse)\n", "print(Y_test.values)\n", "print(\"BREAK\")\n", "print(y_pred)\n", "print(W, B)\n", "\n", "# Visualise the fit against the first (standardised) feature.\n", "X_feature = X_train[:, 0]\n", "x_range = np.linspace(X_feature.min(), X_feature.max(), 100)\n", "\n", "# Use the TRAINED parameters W, B — the initial w, b would draw the wrong line.\n", "y_line = W[0] * x_range + B\n", "plt.scatter(X_feature, Y_train, color='blue', label='Training Data')\n", "\n", "plt.plot(x_range, y_line, color='red', label='Linear Regression Line')\n", "plt.xlabel('Feature 1')\n", "plt.ylabel('Y')\n", "plt.title('Linear Regression (Feature 1)')\n", "plt.legend()\n", "plt.show()\n" ], "metadata": { "id": "AE7epvEkcuGQ" }, "execution_count": null, "outputs": [] } ] }