# Fake News Classifier
# -----------------------------------------------------------
import nlp_utils
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Loading dataset
# --------------------------------------------------------------
# Read the training data from train.csv (path is relative to the working directory).
df=pd.read_csv('train.csv')
# Bare expressions, presumably left over from notebook cells: they are
# evaluated but nothing is displayed when this runs as a plain script.
df['title']
df['text']
# Number of rows for each distinct value in the 'label' column.
df['label'].value_counts()
# Number of missing values per column.
df.isnull().sum()
# Text cleaning
# ______________________________________________________________________
import re
import string
# Remove all alphanumeric characters
# ______________________________________________________________________
# Apply all the lambda functions written previously to the 'text' column through .map
# ______________________________________________________________________
# Pass the 'text' column through the cleaning helpers, one after another.
# NOTE(review): alphanumeric, punc_lower, remove_n and remove_non_ascii are not
# defined in the visible part of this file -- confirm they are in scope here.
# The chain is parenthesised so the assignment may span several lines; a bare
# line break straight after `=` is a syntax error.
df['text'] = (
    df['text']
    .map(alphanumeric)
    .map(punc_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)
# Notebook-style inspection of the cleaned column; displays nothing in a script.
df['text']
import nltk
# Make sure the NLTK 'stopwords' corpus is available locally
# (presumably needs network access the first time -- TODO confirm for the target environment).
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words; rem_stopword() tests tokens for membership in this name.
stop_words=stopwords.words('english')
#DataFrame.apply(Function_to_apply_to_each_row)
def rem_stopword(data, exclude=None):
    """Return *data* with stop words removed.

    The text is split on whitespace, every token found in the stop-word
    collection is dropped, and the surviving tokens are joined back together
    with single spaces. Matching is exact and case-sensitive.

    Args:
        data: The text to filter.
        exclude: Optional collection of words to drop. When omitted, the
            module-level ``stop_words`` is used, as before. Passing a ``set``
            gives constant-time lookups instead of a linear scan per token.

    Returns:
        The filtered text as one space-separated string (empty if nothing
        survives).
    """
    if exclude is None:
        # Default to the module-level stop-word collection.
        exclude = stop_words
    return " ".join(word for word in data.split() if word not in exclude)
# Target values: the 'label' column of the DataFrame.
Y=df['label']
# Count vectorizer
# ______________________________________________________________________
# Fit a multinomial Naive Bayes classifier on tfidf_train / Y_train and score
# it on tfidf_test / Y_test.
# NOTE(review): MultinomialNB, metrics and the train/test objects are not
# defined in the visible part of this file -- confirm they are in scope.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = MultinomialNB().fit(tfidf_train, Y_train)
pred = clf.predict(tfidf_test)

# Accuracy of the predictions against the held-out labels.
score = metrics.accuracy_score(y_true=Y_test, y_pred=pred)
print("accuracy: %0.3f" % (score,))

# Confusion matrix of held-out labels versus predictions.
cm = metrics.confusion_matrix(y_true=Y_test, y_pred=pred)
print(cm)
# Naive Bayes model on count-vectorized features
# __________________________________________________________________
# Fit a fresh multinomial Naive Bayes classifier on count_train / Y_train and
# score it on count_test / Y_test. This rebinds the module-level `clf` and `score`.
# NOTE(review): MultinomialNB, metrics and the train/test objects are not
# defined in the visible part of this file -- confirm they are in scope.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = MultinomialNB().fit(count_train, Y_train)
pred1 = clf.predict(count_test)

# Accuracy of the predictions against the held-out labels.
score = metrics.accuracy_score(y_true=Y_test, y_pred=pred1)
print("accuracy: %0.3f" % (score,))

# Confusion matrix of held-out labels versus predictions.
cm2 = metrics.confusion_matrix(y_true=Y_test, y_pred=pred1)
print(cm2)