Sentiment Analysis
March 13, 2025
[1]: import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
# Load dataset
data = pd.read_csv('kindle_review.csv')
df = data[['reviewText', 'rating']].copy()
[2]: df
[2]: reviewText rating
0 Jace Rankin may be short, but he's nothing to … 3
1 Great short read. I didn't want to put it dow… 5
2 I'll start by saying this is the first of four… 3
3 Aggie is Angela Lansbury who carries pocketboo… 3
4 I did not expect this type of book to be in li… 4
… … …
11995 Valentine cupid is a vampire- Jena and Ian ano… 4
11996 I have read all seven books in this series. Ap… 5
11997 This book really just wasn't my cuppa. The si… 3
11998 tried to use it to charge my kindle, it didn't… 1
11999 Taking Instruction is a look into the often hi… 3
[12000 rows x 2 columns]
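Before collapsing the five-point ratings into binary labels, it can be worth checking their distribution, since the resulting positive/negative classes are likely to be imbalanced. A small optional check (not part of the original run):

print(df['rating'].value_counts().sort_index())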
[3]: # Convert rating to binary (0: negative, 1: positive)
df['rating'] = df['rating'].apply(lambda x: 0 if x < 3 else 1)
# Lowercase
df['reviewText'] = df['reviewText'].str.lower()
# Remove URLs first, then strip the remaining special characters
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
# Download NLTK resources
nltk.download(['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'])
# Tokenization
df['tokens'] = df['reviewText'].apply(word_tokenize)
# Remove stopwords
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
# Stemming
stemmer = PorterStemmer()
df['stemmed'] = df['tokens'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])
# POS Tagging
df['pos_tags'] = df['tokens'].apply(pos_tag)
# Lemmatization with POS
lemmatizer = WordNetLemmatizer()
def lemmatize_with_pos(tagged_tokens):
    # Map Penn Treebank tags to WordNet POS codes; adjectives are tagged 'JJ',
    # so the leading letter must be translated to 'a' rather than used directly.
    tag_map = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    lemmatized = []
    for word, tag in tagged_tokens:
        pos = tag_map.get(tag[0], 'n')  # default to noun
        lemmatized.append(lemmatizer.lemmatize(word, pos))
    return lemmatized
df['lemmatized'] = df['pos_tags'].apply(lemmatize_with_pos)
[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\itzsh\AppData\Roaming\nltk_data…
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\itzsh\AppData\Roaming\nltk_data…
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data] C:\Users\itzsh\AppData\Roaming\nltk_data…
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] C:\Users\itzsh\AppData\Roaming\nltk_data…
[nltk_data] Package averaged_perceptron_tagger is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data] C:\Users\itzsh\AppData\Roaming\nltk_data…
[nltk_data] Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data] C:\Users\itzsh\AppData\Roaming\nltk_data…
[nltk_data] Package words is already up-to-date!
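Although the POS tags are reused below, the imported ne_chunk (and the maxent_ne_chunker / words resources downloaded above) is never actually called. A minimal sketch of how it could be applied to one review follows; because the text has already been lowercased and stripped of stopwords, the chunker will find few entities, so this is purely illustrative.

# Hypothetical example, not executed above: named-entity chunking on one review
sample_tags = df['pos_tags'].iloc[0]            # (word, POS) pairs from pos_tag
tree = ne_chunk(sample_tags)                    # nltk Tree with NE subtrees
entities = [' '.join(tok for tok, _ in subtree.leaves())
            for subtree in tree if hasattr(subtree, 'label')]
print(entities)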
[4]: from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
# Bag of Words (BOW)
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['lemmatized'].apply(' '.join))
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['lemmatized'].apply(' '.join))
# Word2Vec
# Train Word2Vec model
sentences = df['lemmatized'].tolist()
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# Convert sentences to vectors by averaging word vectors
def sentence_vector(sentence):
    # Average the word vectors; fall back to a zero vector if no token is in the vocabulary
    vectors = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)
X_w2v = np.array([sentence_vector(sentence) for sentence in sentences])
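As a quick qualitative check of the Word2Vec model (not part of the original run), the nearest neighbours of a frequent token can be inspected; 'book' is assumed to occur in the lemmatized corpus, which is virtually certain for Kindle reviews.

# Illustrative: five most similar tokens to 'book' in the learned embedding space
print(w2v_model.wv.most_similar('book', topn=5))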
[5]: from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Split data; a fixed random_state keeps the three feature matrices aligned with the same y_train / y_test
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, df['rating'], test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(X_tfidf, df['rating'], test_size=0.2, random_state=42)
X_train_w2v, X_test_w2v, _, _ = train_test_split(X_w2v, df['rating'], test_size=0.2, random_state=42)
# Train models
nb_bow = GaussianNB().fit(X_train_bow.toarray(), y_train)
nb_tfidf = GaussianNB().fit(X_train_tfidf.toarray(), y_train)
nb_w2v = GaussianNB().fit(X_train_w2v, y_train)
# Evaluate
y_pred_bow = nb_bow.predict(X_test_bow.toarray())
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf.toarray())
y_pred_w2v = nb_w2v.predict(X_test_w2v)
print("BOW Accuracy:", accuracy_score(y_test, y_pred_bow))
print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Word2Vec Accuracy:", accuracy_score(y_test, y_pred_w2v))
BOW Accuracy: 0.57125
TF-IDF Accuracy: 0.4525
Word2Vec Accuracy: 0.5091666666666667
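Accuracy alone can be misleading when the binary labels are imbalanced, as they likely are here. A hedged sketch of a fuller evaluation for the BOW model:

from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision/recall/F1 and the confusion matrix for the BOW Naive Bayes model
print(classification_report(y_test, y_pred_bow, target_names=['negative', 'positive']))
print(confusion_matrix(y_test, y_pred_bow))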
[6]: # CNN Model using existing Word2Vec embeddings
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import train_test_split
# Convert lemmatized tokens to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['lemmatized'].apply(lambda x: ' '.join(x)))
sequences = tokenizer.texts_to_sequences(df['lemmatized'].apply(lambda x: ' '.join(x)))
# Pad sequences
max_len = max(len(s) for s in sequences)
X_cnn = pad_sequences(sequences, maxlen=max_len, padding='post')
y = df['rating']
# Split data with fixed random_state
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, y, test_size=0.2, random_state=42)
# Load existing Word2Vec embeddings
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
if word in w2v_model.wv:
embedding_matrix[i] = w2v_model.wv[word]
# Build CNN model
model_cnn = Sequential()
model_cnn.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(1, activation='sigmoid'))
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train and evaluate
model_cnn.fit(X_train_cnn, y_train_cnn, epochs=20, validation_data=(X_test_cnn, y_test_cnn))
loss, accuracy = model_cnn.evaluate(X_test_cnn, y_test_cnn)
print(f"CNN Accuracy: {accuracy:.4f}")
C:\Users\itzsh\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\layers\core\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.
  warnings.warn(
Epoch 1/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 63ms/step - accuracy: 0.7099 - loss: 0.5623 - val_accuracy: 0.7725 - val_loss: 0.4640
Epoch 2/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 19s 62ms/step - accuracy: 0.7932 - loss: 0.4413 - val_accuracy: 0.7887 - val_loss: 0.4366
Epoch 3/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 66ms/step - accuracy: 0.8290 - loss: 0.3919 - val_accuracy: 0.7171 - val_loss: 0.6283
Epoch 4/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 22s 73ms/step - accuracy: 0.8139 - loss: 0.3938 - val_accuracy: 0.7871 - val_loss: 0.4415
Epoch 5/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.8338 - loss: 0.3696 - val_accuracy: 0.7742 - val_loss: 0.4720
Epoch 6/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 21s 71ms/step - accuracy: 0.8592 - loss: 0.3223 - val_accuracy: 0.7942 - val_loss: 0.4341
Epoch 7/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.8816 - loss: 0.2871 - val_accuracy: 0.7967 - val_loss: 0.4406
Epoch 8/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.8980 - loss: 0.2596 - val_accuracy: 0.7942 - val_loss: 0.4483
Epoch 9/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.9035 - loss: 0.2508 - val_accuracy: 0.7887 - val_loss: 0.4552
Epoch 10/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 21s 68ms/step - accuracy: 0.9062 - loss: 0.2372 - val_accuracy: 0.7896 - val_loss: 0.4663
Epoch 11/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 68ms/step - accuracy: 0.9428 - loss: 0.1871 - val_accuracy: 0.7833 - val_loss: 0.5050
Epoch 12/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.9312 - loss: 0.1930 - val_accuracy: 0.7804 - val_loss: 0.4908
Epoch 13/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.9480 - loss: 0.1580 - val_accuracy: 0.7862 - val_loss: 0.5018
Epoch 14/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 68ms/step - accuracy: 0.9576 - loss: 0.1441 - val_accuracy: 0.7725 - val_loss: 0.5382
Epoch 15/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 21s 69ms/step - accuracy: 0.9662 - loss: 0.1268 - val_accuracy: 0.7754 - val_loss: 0.5315
Epoch 16/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.9718 - loss: 0.1106 - val_accuracy: 0.7808 - val_loss: 0.5851
Epoch 17/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.9747 - loss: 0.1027 - val_accuracy: 0.7754 - val_loss: 0.5782
Epoch 18/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 67ms/step - accuracy: 0.9827 - loss: 0.0863 - val_accuracy: 0.7717 - val_loss: 0.6004
Epoch 19/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 68ms/step - accuracy: 0.9826 - loss: 0.0792 - val_accuracy: 0.7783 - val_loss: 0.6086
Epoch 20/20
300/300 ━━━━━━━━━━━━━━━━━━━━ 20s 68ms/step - accuracy: 0.9850 - loss: 0.0752 - val_accuracy: 0.7650 - val_loss: 0.6401
75/75 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step - accuracy: 0.7521 - loss: 0.6422
CNN Accuracy: 0.7650
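The trained CNN can also be used to score unseen text by repeating the same preprocessing steps. The function below is an illustrative sketch (not executed above) that reuses the fitted tokenizer, max_len, and lemmatize_with_pos from earlier cells; the example review string is made up.

def score_review(text):
    # Apply (approximately) the same preprocessing as the training data
    text = re.sub(r'http\S+', '', str(text).lower())
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = [w for w in word_tokenize(text) if w not in stop_words]
    lemmas = lemmatize_with_pos(pos_tag(tokens))
    seq = pad_sequences(tokenizer.texts_to_sequences([' '.join(lemmas)]),
                        maxlen=max_len, padding='post')
    return float(model_cnn.predict(seq)[0][0])   # estimated probability of a positive review

print(score_review("Could not put this book down, absolutely loved it!"))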