{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Text Preprocessing Walkthrough\n",
        "\n",
        "Applies common NLP preprocessing steps to a small sample corpus, one step per section: cleaning, tokenization, stopword removal, stemming, lemmatization, contraction expansion, emoji conversion, spell correction, and part-of-speech tagging.\n",
        "\n",
        "The notebook is meant to be run top to bottom on a fresh kernel (Restart → Run All)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Setup\n",
        "\n",
        "Third-party packages that are not preinstalled on Colab, pinned to the versions this notebook was last run with."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "%pip install -q contractions==0.1.73 emoji==2.15.0 pyspellchecker==0.9.0"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import re\n",
        "import string\n",
        "\n",
        "import contractions\n",
        "import emoji\n",
        "import nltk\n",
        "from bs4 import BeautifulSoup\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
        "from nltk.tokenize import word_tokenize\n",
        "from spellchecker import SpellChecker"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Every NLTK resource used below, fetched once so later cells stay free of download noise.\n",
        "NLTK_RESOURCES = (\n",
        "    \"punkt\",\n",
        "    \"punkt_tab\",\n",
        "    \"stopwords\",\n",
        "    \"wordnet\",\n",
        "    \"averaged_perceptron_tagger\",\n",
        "    \"averaged_perceptron_tagger_eng\",\n",
        ")\n",
        "\n",
        "for resource in NLTK_RESOURCES:\n",
        "    nltk.download(resource, quiet=True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Sample corpus"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "b4qtzc9tgTxt"
      },
      "outputs": [],
      "source": [
        "corpus = [\n",
        "    \"I can't wait for the new season of my favorite show! 😍\",\n",
        "    \"The COVID-19 pandemic has affected millions of people worldwide.\",\n",
        "    \"U.S. stocks fell on Friday after news of rising inflation.\",\n",
        "    \"Welcome to the website!\",\n",
        "    \"Python is a great programming language!!! ??\",\n",
        "    \"Check out https://www.example.com for more info!\",\n",
        "    \"He won 1st prize in the comp3tition!!!\",\n",
        "    \"I luvv this movie sooo much!!!\"\n",
        "]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pCWTr72ag5Pl"
      },
      "source": [
        "## Cleaning text (lowercase, HTML, URLs, numbers, punctuation, special characters)\n",
        "\n",
        "URLs are removed *before* punctuation is deleted. In an earlier run, deleting punctuation first collapsed `https://www.example.com` into the single junk token `httpswwwexamplecom`, which then polluted every later step."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_nw7AUzKgdL1"
      },
      "outputs": [],
      "source": [
        "URL_PATTERN = re.compile(r\"https?://\\S+|www\\.\\S+\")\n",
        "PUNCTUATION_TABLE = str.maketrans(\"\", \"\", string.punctuation)\n",
        "\n",
        "\n",
        "def clean_text(text):\n",
        "    \"\"\"Normalize one raw document into lowercase words separated by single spaces.\n",
        "\n",
        "    Steps, in order: lowercase, strip HTML markup, drop URLs, delete digits,\n",
        "    delete ASCII punctuation, then replace every remaining run of non-word\n",
        "    characters (whitespace, emoji, other symbols) with one space.\n",
        "\n",
        "    Args:\n",
        "        text: Raw document string.\n",
        "\n",
        "    Returns:\n",
        "        The cleaned string, without leading or trailing whitespace. It is\n",
        "        empty when the input held nothing but markup, URLs, digits or symbols.\n",
        "    \"\"\"\n",
        "    text = text.lower()\n",
        "    text = BeautifulSoup(text, \"html.parser\").get_text()\n",
        "    # Must run before punctuation is deleted, while the URL is still recognizable.\n",
        "    text = URL_PATTERN.sub(\" \", text)\n",
        "    text = re.sub(r\"\\d+\", \"\", text)\n",
        "    # Punctuation is deleted rather than replaced by a space (\"can't\" -> \"cant\").\n",
        "    text = text.translate(PUNCTUATION_TABLE)\n",
        "    # Whitespace is itself a non-word character, so this also collapses repeated spaces.\n",
        "    return re.sub(r\"\\W+\", \" \", text).strip()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "cleaned_corpus = [clean_text(doc) for doc in corpus]\n",
        "cleaned_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RU9e-5dLg9p1"
      },
      "source": [
        "## Tokenization"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "juPS9kH5gdOb"
      },
      "outputs": [],
      "source": [
        "tokenized_corpus = [word_tokenize(doc) for doc in cleaned_corpus]\n",
        "tokenized_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AvxTMs4FhBFM"
      },
      "source": [
        "## Stopword removal"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LFxoXiofgdRE"
      },
      "outputs": [],
      "source": [
        "stop_words = set(stopwords.words(\"english\"))\n",
        "filtered_corpus = [\n",
        "    [word for word in doc if word not in stop_words]\n",
        "    for doc in tokenized_corpus\n",
        "]\n",
        "filtered_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ac3a1Zuih1C2"
      },
      "source": [
        "## Stemming"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "w5gdFe8egdTk"
      },
      "outputs": [],
      "source": [
        "stemmer = PorterStemmer()\n",
        "stemmed_corpus = [[stemmer.stem(word) for word in doc] for doc in filtered_corpus]\n",
        "stemmed_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7gy5fJdSh8mm"
      },
      "source": [
        "## Lemmatization\n",
        "\n",
        "Called without a part of speech, `WordNetLemmatizer` treats every token as a noun. In an earlier run that left verb forms such as \"affected\" and \"rising\" unchanged and reduced \"us\" to \"u\". Here each token is lemmatized with the part of speech assigned by the tagger, and tokens outside the adjective / noun / adverb / verb classes are left untouched."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "31cDsSvTgdWF"
      },
      "outputs": [],
      "source": [
        "# First letter of a Penn Treebank tag -> WordNet part-of-speech code.\n",
        "WORDNET_POS_BY_TAG_PREFIX = {\"J\": \"a\", \"N\": \"n\", \"R\": \"r\", \"V\": \"v\"}\n",
        "\n",
        "\n",
        "def lemmatize_tokens(tokens, lemmatizer):\n",
        "    \"\"\"Lemmatize tokens using the part of speech assigned by ``nltk.pos_tag``.\n",
        "\n",
        "    Args:\n",
        "        tokens: List of word strings from one document.\n",
        "        lemmatizer: Object with a ``lemmatize(word, pos)`` method, e.g. a\n",
        "            ``WordNetLemmatizer``.\n",
        "\n",
        "    Returns:\n",
        "        A new list with one entry per input token. Tokens whose tag does not\n",
        "        start with J, N, R or V are returned unchanged.\n",
        "    \"\"\"\n",
        "    lemmas = []\n",
        "    for word, tag in nltk.pos_tag(tokens):\n",
        "        wordnet_pos = WORDNET_POS_BY_TAG_PREFIX.get(tag[:1])\n",
        "        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos) if wordnet_pos else word)\n",
        "    return lemmas"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "lemmatizer = WordNetLemmatizer()\n",
        "lemmatized_corpus = [lemmatize_tokens(doc, lemmatizer) for doc in filtered_corpus]\n",
        "lemmatized_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aZ80ryxEiCC9"
      },
      "source": [
        "## Contractions expansion\n",
        "\n",
        "Runs on the raw corpus, because cleaning deletes the apostrophes that mark contractions."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pZ8p5KhggdYx"
      },
      "outputs": [],
      "source": [
        "# slang=False: with the default slang expansion an earlier run rewrote \"U.S.\" as \"YOU.S.\".\n",
        "expanded_corpus = [contractions.fix(doc, slang=False) for doc in corpus]\n",
        "expanded_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Emoji conversion\n",
        "\n",
        "Runs on the raw corpus, because cleaning removes emoji along with other special characters."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "K-GofDMmiGxE"
      },
      "outputs": [],
      "source": [
        "emoji_corpus = [emoji.demojize(doc) for doc in corpus]\n",
        "emoji_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Spell correction"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fH0C8tm2iGz2"
      },
      "outputs": [],
      "source": [
        "spell = SpellChecker()\n",
        "# correction() returns None when it has no candidate; keep the original token in that case\n",
        "# so the corrected corpus never contains None.\n",
        "corrected_corpus = [\n",
        "    [spell.correction(word) or word for word in doc]\n",
        "    for doc in tokenized_corpus\n",
        "]\n",
        "corrected_corpus"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Part-of-speech tagging"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qmuTPXNtiG2h"
      },
      "outputs": [],
      "source": [
        "pos_tagged_corpus = [nltk.pos_tag(doc) for doc in tokenized_corpus]\n",
        "pos_tagged_corpus"
      ]
    }
  ]
}