VIVEKANANDHA
COLLEGE OF ARTS AND SCIENCES FOR WOMEN
(AUTONOMOUS)
(An ISO 9001:2015 Certified Institution;
Affiliated to Periyar University, Salem; Approved by AICTE;
Re-accredited with "A++" Grade by NAAC; Recognized U/S 12(B) and 2(f) of the UGC Act, 1956)
Elayampalayam, Tiruchengode - 637 205.
MASTER OF SCIENCE IN COMPUTER SCIENCE
PRACTICAL RECORD
NAME :
REG.NO :
NATURAL LANGUAGE PROCESSING LAB
(24P2CSEP01)
SEMESTER – II
2024-2026
VIVEKANANDHA
COLLEGE OF ARTS AND SCIENCES FOR WOMEN
(AUTONOMOUS)
(An ISO 9001:2015 Certified Institution;
Affiliated to Periyar University, Salem; Approved by AICTE;
Re-accredited with "A++" Grade by NAAC; Recognized U/S 12(B) and 2(f) of the UGC Act, 1956)
Elayampalayam, Tiruchengode - 637 205.
MASTER OF SCIENCE IN COMPUTER SCIENCE
Certified that this is a bona fide record of practical work done by
Ms./Mrs. ______________________ Reg. No: ______________________ in the
NATURAL LANGUAGE PROCESSING LAB (24P2CSEP01) at the Vivekanandha College of
Arts and Sciences for Women (Autonomous), Elayampalayam, Tiruchengode.
Staff In-Charge Head of the Department
Submitted for the University Practical Examinations held on ______________ at
the PG and Research Department of Computer Science and Applications,
Vivekanandha College of Arts and Sciences for Women (Autonomous),
Elayampalayam, Tiruchengode.
Internal Examiner External Examiner
INDEX

S.NO  DATE  CONTENTS                                                 PAGE NO.  SIGN
 1          Tokenize a given text
 2          Sentences of a text document
 3          Tokenize text with stop words as delimiters
 4          Remove stop words and punctuations in a text
 5          A. Perform Stemming
            B. Lemmatize a given Text
 6          Extract Usernames from Email
 7          Common words in text excluding stop words
 8          Spell correction in a given text
 9          Classify a Text as Positive/Negative Sentiment
10          Root word of any word in a sentence
11          a) Load the iris data from a given CSV file into a dataframe
            b) Extract Noun and Verb phrases from a text
12          Sets of synonyms and antonyms of a given word
13          Print the first 15 random combined labeled male and
            labeled female names from the names corpus
PROGRAM
1. Tokenize a given text
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt') # Download tokenizer data
# Example text
text = "NLP makes machines understand language. Tokenization is the first step."
# Sentence Tokenization
print("Sentences:", sent_tokenize(text))
# Word Tokenization
print("Words:", word_tokenize(text))
OUTPUT
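Expected output (a sketch, assuming the punkt data downloads successfully; the nltk download log lines are omitted):
Sentences: ['NLP makes machines understand language.', 'Tokenization is the first step.']
Words: ['NLP', 'makes', 'machines', 'understand', 'language', '.', 'Tokenization', 'is', 'the', 'first', 'step', '.']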
PROGRAM
2. Sentences of a text document
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt') # Download tokenizer data
# Read the text from a file
file_path = "example.txt" # Replace with your file path
with open(file_path, 'r') as file:
    text = file.read()
# Sentence Tokenization
sentences = sent_tokenize(text)
# Display the sentences
print("Sentences in the document:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}: {sentence}")
Note: save a text file named example.txt in the same folder as the notebook before running this program.
OUTPUT
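For instance, if example.txt contained the hypothetical text "NLP is fun. It has many applications.", the program would print:
Sentences in the document:
1: NLP is fun.
2: It has many applications.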
PROGRAM
3. Tokenize text with stop words as delimiters
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# Download necessary data
nltk.download('punkt')
nltk.download('stopwords')
# Example text
text = "I enjoy learning Python and coding."
# Define stop words
stop_words = set(stopwords.words('english'))
# Tokenize the text
words = word_tokenize(text)
# Treat stop words as delimiters: keep only the tokens that are not stop words
tokens_without_stopwords = [word for word in words if word.lower() not in stop_words]
# Output the result
print("Original Tokens:", words)
print("Tokens without Stop Words:", tokens_without_stopwords)
OUTPUT
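Expected output (a sketch; note that the token '.' survives because it is not a stop word):
Original Tokens: ['I', 'enjoy', 'learning', 'Python', 'and', 'coding', '.']
Tokens without Stop Words: ['enjoy', 'learning', 'Python', 'coding', '.']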
PROGRAM
4. Remove stop words and punctuations in a text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
# Download necessary data
nltk.download('punkt')
nltk.download('stopwords')
# Example text
text = "Python is great! It's simple and powerful."
# Define stop words
stop_words = set(stopwords.words('english'))
# Tokenize the text
words = word_tokenize(text)
# Remove stop words and punctuation
tokens_cleaned = [word for word in words if word.lower() not in stop_words and word not in string.punctuation]
# Output the result
print("Tokens without Stop Words and Punctuation:", tokens_cleaned)
OUTPUT
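Expected output (a sketch, assuming the current NLTK English stop-word list; the clitic "'s" survives because it is neither a stop word nor a single punctuation character):
Tokens without Stop Words and Punctuation: ['Python', 'great', "'s", 'simple', 'powerful']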
PROGRAM
5. A. Perform Stemming
# import these modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
# choose some words to be stemmed
words = ["pythonprogramming", "programs", "programmer", "event", "thankyou"]
for w in words:
    print(w, " : ", ps.stem(w))
OUTPUT
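Expected output (a sketch of PorterStemmer's behaviour; note that stems such as 'programm' need not be dictionary words):
pythonprogramming  :  pythonprogram
programs  :  program
programmer  :  programm
event  :  event
thankyou  :  thankyou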
PROGRAM
5. B. Lemmatize A Given Text
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Download necessary resources
nltk.download('punkt')
nltk.download('wordnet')
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_text
text = "The cats are chasing mice and playing in the garden"
lemmatized_text = lemmatize_text(text)
print("Original Text:", text)
print("Lemmatized Text:", lemmatized_text)
OUTPUT
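Expected output (a sketch; without POS hints the lemmatizer treats every token as a noun, so 'chasing' and 'playing' are left unchanged):
Original Text: The cats are chasing mice and playing in the garden
Lemmatized Text: The cat are chasing mouse and playing in the garden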
PROGRAM
6. Extract Usernames from Email
# Using regular expressions
import re
# Defining an email string ("alice@example.com" is a placeholder; the original address was redacted)
e = "alice@example.com"
# Using the search function to find a match for the username part of the email
match = re.search(r'([\w.+-]+)@', e)
# If a match is found, extract the username (the part before '@') using the group() method
if match:
    username = match.group(1)
    print(username)
OUTPUT
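With the placeholder address used above, the program prints the username:
alice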
PROGRAM
7. Find the most common words in the text excluding stop words
import nltk
from nltk.corpus import stopwords
from collections import Counter
import string
# Download stopwords (only needed once)
nltk.download("stopwords")
def most_common_words(text, n=10):
    stop_words = set(stopwords.words("english"))  # Load stop words
    # Convert to lowercase & remove punctuation
    words = text.lower().translate(str.maketrans("", "", string.punctuation)).split()
    filtered_words = [word for word in words if word not in stop_words]  # Remove stop words
    word_counts = Counter(filtered_words)  # Count words
    return word_counts.most_common(n)  # Get the most common words
# Example text
text = "This is a simple example text. This text is just for testing the most common words."
# Get the top 5 most common words
result = most_common_words(text, 5)
# Print result
print(result)
OUTPUT
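Expected output (a sketch; ties among the words that occur once are broken by their order of first appearance):
[('text', 2), ('simple', 1), ('example', 1), ('testing', 1), ('common', 1)]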
PROGRAM
8. Spell correction in a given text
import nltk
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
# Download the words corpus (only needed once)
nltk.download('words')
# list of correctly spelled words to compare against
correct_words = words.words()
# list of incorrect spellings that need to be corrected
incorrect_words = ['happpy', 'azmaing', 'intelliengt']
# loop for finding correct spellings based on Jaccard distance
# between character-bigram sets, and printing the closest word
for word in incorrect_words:
    temp = [(jaccard_distance(set(ngrams(word, 2)), set(ngrams(w, 2))), w)
            for w in correct_words if w[0] == word[0]]
    print(sorted(temp, key=lambda val: val[0])[0][1])
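OUTPUT
Expected output (a sketch, assuming the NLTK words corpus; each misspelling maps to its nearest neighbour under Jaccard distance):
happy
amazing
intelligent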
PROGRAM
9. Classify A Text as Positive/Negative Sentiment
from textblob import TextBlob
text_1 = "The movie was so awesome."
text_2 = "The food here tastes terrible."
# Determining the polarity
p_1 = TextBlob(text_1).sentiment.polarity
p_2 = TextBlob(text_2).sentiment.polarity
# Determining the subjectivity
s_1 = TextBlob(text_1).sentiment.subjectivity
s_2 = TextBlob(text_2).sentiment.subjectivity
print("Polarity of Text 1 is", p_1)
print("Polarity of Text 2 is", p_2)
print("Subjectivity of Text 1 is", s_1)
print("Subjectivity of Text 2 is", s_2)
PROGRAM
10. Find the ROOT word of any word in a sentence
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
words = ["renting", "renter", "rental", "rents", "apple"]
all_rents = {}
for word in words:
    stem = stemmer.stem(word)
    if stem not in all_rents:
        all_rents[stem] = []
    all_rents[stem].append(word)
print(all_rents)
OUTPUT
{'rent': ['renting', 'rents'], 'renter': ['renter'], 'rental': ['rental'], 'appl': ['apple']}
PROGRAM
11. a) Load the iris data from a given CSV file into a dataframe and print
the shape of the data, type of the data, and first 3 rows.
import pandas as pd
data = pd.read_csv("iris.csv")
print("Shape of the data:")
print(data.shape)
print("\nData Type:")
print(type(data))
print("\nFirst 3 rows:")
print(data.head(3))
OUTPUT
Shape of the data:
(150, 6)
Data Type:
<class 'pandas.core.frame.DataFrame'>
First 3 rows:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
PROGRAM
11.b) Extract Noun and Verb phrases from a text
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def chunk_sentence(sentence):
    words = word_tokenize(sentence)  # Tokenize words
    tagged_words = pos_tag(words)  # Perform POS tagging
    # Define grammar for chunking
    grammar = r"""
        NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}               # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
        CLAUSE: {<NP><VP>}           # Chunk NP, VP pairs
    """
    parser = RegexpParser(grammar)  # Create a chunk parser
    chunked_sentence = parser.parse(tagged_words)  # Apply parsing
    return chunked_sentence
# Example sentence
sentence = "The quick brown fox jumps over the lazy dog"
# Perform chunking
chunked_sentence = chunk_sentence(sentence)
# Print chunked result
print(chunked_sentence)
# Optional: Draw chunk tree (Only works in GUI-supported environments)
chunked_sentence.draw()
OUTPUT
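Expected output (a sketch; the averaged perceptron tagger typically tags 'brown' as NN here, and the cascaded rules then chunk the whole sentence into a CLAUSE):
(S
  (CLAUSE
    (NP The/DT quick/JJ brown/NN fox/NN)
    (VP jumps/VBZ (PP over/IN (NP the/DT lazy/JJ dog/NN)))))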
PROGRAM
12. Write a Python NLTK program to find the sets of synonyms and
antonyms of a given word.
def synonym_antonym_extractor(phrase):
    from nltk.corpus import wordnet
    synonyms = []
    antonyms = []
    for syn in wordnet.synsets(phrase):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    print(set(synonyms))
    print(set(antonyms))
synonym_antonym_extractor(phrase="word")
OUTPUT
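The exact sets printed depend on the installed WordNet version; the synonym set for "word" contains entries such as 'word', 'news', and 'parole', and the antonym set is usually empty.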
PROGRAM
13. Print the first 15 random combined labeled male and labeled female
names from the names corpus.
import random
import nltk
from nltk.corpus import names
# Download the names corpus (only needed once)
nltk.download('names')
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_male_names = [(str(name), 'male') for name in male_names]
labeled_female_names = [(str(name), 'female') for name in female_names]
# combine labeled male and labeled female names
labeled_all_names = labeled_male_names + labeled_female_names
# shuffle the labeled names array
random.shuffle(labeled_all_names)
print("First 15 random labeled combined names:")
print(labeled_all_names[:15])
OUTPUT
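Because the list is shuffled, the 15 (name, label) pairs printed differ on every run; each entry is a tuple such as ('SomeName', 'male') or ('SomeName', 'female').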