{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lzBc7Vwg8l_w"
      },
      "outputs": [],
      "source": [
        "# Importing Libraries\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "# Import dataset\n",
        "dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\\t')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# library to clean data\n",
        "import re\n",
        "\n",
        "# Natural Language Tool Kit\n",
        "import nltk\n",
        "\n",
        "nltk.download('stopwords')\n",
        "\n",
        "# to remove stopword\n",
        "from nltk.corpus import stopwords\n",
        "\n",
        "# for Stemming propose\n",
        "from nltk.stem.porter import PorterStemmer\n",
        "\n",
        "# Initialize empty array\n",
        "# to append clean text\n",
        "corpus = []\n",
        "\n",
        "# 1000 (reviews) rows to clean\n",
        "for i in range(0, 1000):\n",
        "\n",
        "\t# column : \"Review\", row ith\n",
        "\treview = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])\n",
        "\n",
        "\t# convert all cases to lower cases\n",
        "\treview = review.lower()\n",
        "\n",
        "\t# split to array(default delimiter is \" \")\n",
        "\treview = review.split()\n",
        "\n",
        "\t# creating PorterStemmer object to\n",
        "\t# take main stem of each word\n",
        "\tps = PorterStemmer()\n",
        "\n",
        "\t# loop for stemming each word\n",
        "\t# in string array at ith row\n",
        "\treview = [ps.stem(word) for word in review\n",
        "\t\t\t\tif not word in set(stopwords.words('english'))]\n",
        "\n",
        "\t# rejoin all string array elements\n",
        "\t# to create back into a string\n",
        "\treview = ' '.join(review)\n",
        "\n",
        "\t# append each string to create\n",
        "\t# array of clean text\n",
        "\tcorpus.append(review)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PWD4G-Kh86do",
        "outputId": "59b39470-89bf-4466-f9f1-de11dca5114e"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Creating the Bag of Words model\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "\n",
        "# To extract max 1500 feature.\n",
        "# \"max_features\" is attribute to\n",
        "# experiment with to get better results\n",
        "cv = CountVectorizer(max_features = 1500)\n",
        "\n",
        "# X contains corpus (dependent variable)\n",
        "X = cv.fit_transform(corpus).toarray()\n",
        "\n",
        "# y contains answers if review\n",
        "# is positive or negative\n",
        "y = dataset.iloc[:, 1].values"
      ],
      "metadata": {
        "id": "P7MAkjaO88Zp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Splitting the dataset into\n",
        "# the Training set and Test set\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# experiment with \"test_size\"\n",
        "# to get better results\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)"
      ],
      "metadata": {
        "id": "t0rmp-4l8-GU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Fitting Random Forest Classification\n",
        "# to the Training set\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "\n",
        "# n_estimators can be said as number of\n",
        "# trees, experiment with n_estimators\n",
        "# to get better results\n",
        "model = RandomForestClassifier(n_estimators = 501,\n",
        "\t\t\t\t\t\t\tcriterion = 'entropy')\n",
        "\n",
        "model.fit(X_train, y_train)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 74
        },
        "id": "TQnRMDwY8_BM",
        "outputId": "a4653f6e-9ad4-4273-ed2b-8d8aa97a505b"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "RandomForestClassifier(criterion='entropy', n_estimators=501)"
            ],
            "text/html": [
              "<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(criterion=&#x27;entropy&#x27;, n_estimators=501)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(criterion=&#x27;entropy&#x27;, n_estimators=501)</pre></div></div></div></div></div>"
            ]
          },
          "metadata": {},
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Predicting the Test set results\n",
        "y_pred = model.predict(X_test)\n",
        "\n",
        "y_pred"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "KjrLCFtC9AXl",
        "outputId": "9a81aeb7-8b11-46e5-9c4a-77820729d593"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,\n",
              "       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,\n",
              "       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,\n",
              "       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,\n",
              "       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,\n",
              "       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,\n",
              "       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,\n",
              "       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,\n",
              "       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,\n",
              "       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,\n",
              "       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,\n",
              "       0, 0, 0, 1, 0, 0, 1, 1])"
            ]
          },
          "metadata": {},
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Making the Confusion Matrix\n",
        "from sklearn.metrics import confusion_matrix\n",
        "\n",
        "cm = confusion_matrix(y_test, y_pred)\n",
        "\n",
        "cm"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "HIrqSoXY9Bpf",
        "outputId": "7f00bb75-92a7-4fcb-e112-510c84db469d"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([[102,  28],\n",
              "       [ 41,  79]])"
            ]
          },
          "metadata": {},
          "execution_count": 11
        }
      ]
    }
  ]
}