LAB 1 -
import re
def process_text(text):
    words = re.split(r'[\s\r\n\W]+', text)
    dates = re.findall(r'\b\d{2}/\d{2}/\d{4}\b', text)
    phones = re.findall(r'(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})', text)
    return words, dates, phones
text = "Call me on 123-456-7890 or (987) 654 3210. Let's meet on 15/04/2025.\nOkay?"
words, dates, phones = process_text(text)
print("Words:", words)
print("Dates:", dates)
print("Phones:", phones)
LAB 2 -
import re
def clean_edges(text):
    return re.sub(r'^\W+|\W+$', '', text)    # strip non-alphanumerics at both ends
def count_non_alnum(text):
    return len(re.findall(r'\W', text))
def replace_non_alnum(text, char='-'):
    return re.sub(r'\W', char, text)
s = "!!Hello, World!!"
print("1.", clean_edges(s))
print("2.", count_non_alnum(s))
print("3.", replace_non_alnum(s, '-'))
LAB 3 -
import random
def split_pairs(word):
    return [(word[:i], word[i:]) for i in range(1, len(word))]
def prefixes_suffixes(word):
    prefixes = [word[:i] for i in range(1, len(word)+1)]
    suffixes = [word[i:] for i in range(len(word))]
    return prefixes, suffixes
def random_splits(word):
    positions = list(range(1, len(word)))
    random.shuffle(positions)    # shuffle the split points in place
    return [(word[:i], word[i:]) for i in positions]
w = "carried"
print("1. All pairs:", split_pairs(w))
p, s = prefixes_suffixes(w)
print("2. Prefixes:", p)
print(" Suffixes:", s)
print("3. Random splits:", random_splits(w))
LAB 4 -
from collections import Counter
def get_ngrams(text, n):
    words = text.lower().split()
    return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
def ngram_freq(text, n):
    return Counter(get_ngrams(text, n))
def ngram_probs(text, n):
    grams = get_ngrams(text, n)
    total = len(grams)
    freq = Counter(grams)
    return {g: freq[g]/total for g in freq}
def reverse_ngrams(text, n):
    words = text.lower().split()[::-1]
    return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
text = "the quick brown fox jumps over the lazy dog"
print("1. Frequencies:", ngram_freq(text, 2))
print("2. Probabilities:", ngram_probs(text, 2))
print("3. Reverse n-grams:", reverse_ngrams(text, 2))
LAB 5 -
import re
def remove_digits(text):
    return re.sub(r'\d', '', text)
def count_digits(text):
    return len(re.findall(r'\d', text))
def extract_digits(text):
    return re.findall(r'\d', text)
def greedy_tokenize(text):
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    return re.findall(pattern, text)
# Example (the e-mail address below is a placeholder)
text = "Contact me at john.doe@example.com before 02/28/2025."
print("1. No digits:", remove_digits(text))
print("2. Digit count:", count_digits(text))
print("3. Digits found:", extract_digits(text))
print("4. Greedy tokenized:", greedy_tokenize(text))
LAB 6 -
import re
def remove_noise(text):
    return re.sub(r'#\w+', '', text)          # drop hashtags
def remove_emojis(text):
    return re.sub(r'[^\w\s,]', '', text)      # drops emojis (and other symbols/punctuation)
def normalize(text):
    return ' '.join(text.lower().split())
def extract_dates(text):
    pattern = r'(\d{1,2}/\d{1,2}/\d{4}|\d{1,2}-\d{1,2}-\d{4}|[A-Za-z]+\s\d{1,2},\s\d{4})'
    return re.findall(pattern, text)
def extract_phone(text):
    # non-capturing group so findall returns the whole match, not just the country code
    matches = re.findall(r'(?:\+?\d{1,2}[\s-]?)?\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}', text)
    return [re.sub(r'\D', '', m)[-10:] for m in matches]
# Example input
text = "Hey 😊 call me at (123) 456-7890 or 123-456-7890 on 01/02/2025 or Jan 5, 2024! #happy"
print("1. Cleaned Text:", remove_noise(text))
print("2. No Emojis:", remove_emojis(text))
print("3. Normalized:", normalize(text))
print("4. Dates:", extract_dates(text))
print("5. Phones:", extract_phone(text))
LAB 7 -
import re
import spacy
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Load spaCy for NER
nlp = spacy.load('en_core_web_sm')
# Custom Tokenizer: Removes hashtags, mentions, and URLs
def custom_tokenizer(text):
    text = re.sub(r'(@\w+|#\w+|http\S+)', '', text)  # Remove mentions, hashtags, URLs
    return re.findall(r'\b\w+\b', text.lower())      # Tokenize and convert to lowercase
# Stemming and Lemmatization
def apply_stemming_and_lemmatization(text):
    words = custom_tokenizer(text)
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed = [stemmer.stem(word) for word in words]
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return stemmed, lemmatized
# NER and Text Normalization
def ner_and_normalize(text):
    doc = nlp(text)
    entities = {'PERSON': [], 'ORG': [], 'DATE': []}
    for ent in doc.ents:
        if ent.label_ == 'PERSON': entities['PERSON'].append(ent.text)
        if ent.label_ == 'ORG': entities['ORG'].append(ent.text)
        if ent.label_ == 'DATE': entities['DATE'].append(ent.text)
    # Normalize Dates and Money
    text = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}', 'DATE', text)  # Standardize dates
    text = re.sub(r'\$\d+', 'MONEY', text)                   # Standardize money
    return entities, text
# Example Text
text = "John went to the store on 12/25/2025 to buy a laptop worth $500."
# Stemming, Lemmatization, NER and Normalization
stemmed, lemmatized = apply_stemming_and_lemmatization(text)
entities, normalized_text = ner_and_normalize(text)
# Results
print("Stemming:", stemmed)
print("Lemmatization:", lemmatized)
print("Entities:", entities)
print("Normalized Text:", normalized_text)
LAB 8 -
import re
# Dictionary for common slangs and abbreviations
slang_dict = {
    "brb": "be right back",
    "btw": "by the way",
    "lol": "laughing out loud",
    "idk": "I don't know",
    "smh": "shaking my head",
    "tbh": "to be honest",
    "fomo": "fear of missing out",
}
# Dictionary for emojis
emoji_dict = {
    ":)": "happy",
    ":(": "sad",
    ":D": "grinning",
    ":P": "playful",
    "XD": "laughing",
    "<3": "love",
}
# Function to replace slangs, emojis, and standardize punctuation
def standardize_text(text):
    # Replace slangs
    for slang, full_form in slang_dict.items():
        text = re.sub(rf"\b{slang}\b", full_form, text, flags=re.IGNORECASE)
    # Replace emojis
    for emoji, meaning in emoji_dict.items():
        text = text.replace(emoji, meaning)
    # Standardize punctuation (e.g., multiple exclamation marks)
    text = re.sub(r'!+', '!', text)    # Multiple exclamations become a single one
    text = re.sub(r'\?+', '?', text)   # Multiple question marks become a single one
    text = re.sub(r'\s+', ' ', text)   # Normalize spaces to a single space
    # Convert to lowercase
    text = text.lower()
    return text
# Test the function
text = "BRB, I'll be back in a sec!!! LOL, this is amazing :) XD"
standardized_text = standardize_text(text)
# Output the result
print("Original Text:", text)
print("Standardized Text:", standardized_text)
LAB 9 -
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
# Function to perform POS tagging, NER, and noun phrase extraction
def analyze_text(text):
    # Process the text using spaCy
    doc = nlp(text)
    # Part of Speech Tagging (POS) for each word
    print("POS Tagging:")
    for token in doc:
        print(f'{token.text}: {token.pos_}')
    # Syntax structure and Subject-Verb-Object relationships
    print("\nSyntax Structure (Subject-Verb-Object):")
    for token in doc:
        if 'subj' in token.dep_:
            subject = token.text
            verb = [anc for anc in token.ancestors if anc.pos_ == 'VERB']
            if verb:
                verb = verb[0].text
                print(f'Subject: {subject}, Verb: {verb}')
    # Extracting Noun Phrases
    print("\nNoun Phrases:")
    for np in doc.noun_chunks:
        print(np.text)
    # Named Entity Recognition (NER)
    print("\nNamed Entities:")
    for ent in doc.ents:
        print(f'{ent.text} ({ent.label_})')
    # Visualize the dependency tree (jupyter=True assumes a notebook environment)
    displacy.render(doc, style='dep', jupyter=True)
# Test the function
text = "Apple is looking at buying U.K. startup for $1 billion. Tim Cook, CEO of
Apple, announced the news."
analyze_text(text)
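# Note: jupyter=True only renders inline inside a notebook. In a plain script,
# one option is to start spaCy's built-in visualizer server instead:
#   displacy.serve(doc, style='dep')   # serves the dependency tree on a local port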
LAB 10 -
import nltk
import numpy as np
from hmmlearn import hmm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
# Load dataset (20 Newsgroups dataset as example)
newsgroups = fetch_20newsgroups(subset='all',
                                categories=['sci.space', 'rec.sport.baseball'])  # any two categories work; these are examples
texts = newsgroups.data
labels = newsgroups.target
# Preprocess text (tokenize and vectorize using CountVectorizer)
vectorizer = CountVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform(texts).toarray()
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)
# HMM Model for Text Classification (using discrete emissions)
class HMMTextClassifier:
    def __init__(self, n_components=5):
        self.model = hmm.MultinomialHMM(n_components=n_components)
    def fit(self, X_train, y_train):
        # Fit the HMM on the training count matrix (labels are not used; the HMM is unsupervised)
        self.model.fit(X_train)
    def predict(self, X_test):
        # Predict a hidden state for each test document
        return self.model.predict(X_test)
hmm_model = HMMTextClassifier(n_components=5)
hmm_model.fit(X_train, y_train)
hmm_predictions = hmm_model.predict(X_test)
hmm_accuracy = accuracy_score(y_test, hmm_predictions)
print(f"HMM Accuracy: {hmm_accuracy}")
# Hybrid HMM-Naïve Bayes Model
class HybridHMMNB:
    def __init__(self):
        self.hmm_model = hmm.MultinomialHMM(n_components=5)
        self.nb_model = MultinomialNB()
    def fit(self, X_train, y_train):
        # Train HMM to extract features (hidden-state assignments)
        self.hmm_model.fit(X_train)
        hmm_features_train = self.hmm_model.predict(X_train)
        # Train Naive Bayes on the extracted HMM features
        self.nb_model.fit(hmm_features_train.reshape(-1, 1), y_train)
    def predict(self, X_test):
        # Predict HMM features and then classify using Naive Bayes
        hmm_features_test = self.hmm_model.predict(X_test)
        return self.nb_model.predict(hmm_features_test.reshape(-1, 1))
hybrid_model = HybridHMMNB()
hybrid_model.fit(X_train, y_train)
hybrid_predictions = hybrid_model.predict(X_test)
hybrid_accuracy = accuracy_score(y_test, hybrid_predictions)
print(f"Hybrid HMM-Naïve Bayes Accuracy: {hybrid_accuracy}")
# Conclusion: Compare accuracies
print(f"Accuracy Comparison: HMM vs Hybrid Model")
print(f"HMM Accuracy: {hmm_accuracy}")
print(f"Hybrid Model Accuracy: {hybrid_accuracy}")