LAB 1 -
import re
def process_text(text):
    words = re.split(r'[\s\r\n\W]+', text)
    dates = re.findall(r'\b\d{2}/\d{2}/\d{4}\b', text)
    phones = re.findall(r'(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})', text)
    return words, dates, phones
text = "Call me on 123-456-7890 or (987) 654 3210. Let's meet on 15/04/2025.\nOkay?"
words, dates, phones = process_text(text)
print("Words:", words)
print("Dates:", dates)
print("Phones:", phones)
LAB 2 -
import re
def clean_edges(text):
    return re.sub(r'^\W+|\W+$', '', text)    # strip non-alphanumerics at both ends
def count_non_alnum(text):
    return len(re.findall(r'\W', text))
def replace_non_alnum(text, char='-'):
    return re.sub(r'\W', char, text)
s = "!!Hello, World!!"
print("1.", clean_edges(s))
print("2.", count_non_alnum(s))
print("3.", replace_non_alnum(s, '-'))
LAB 3 -
import random
def split_pairs(word):
    return [(word[:i], word[i:]) for i in range(1, len(word))]
def prefixes_suffixes(word):
    prefixes = [word[:i] for i in range(1, len(word)+1)]
    suffixes = [word[i:] for i in range(len(word))]
    return prefixes, suffixes
def random_splits(word):
    positions = list(range(1, len(word)))
    random.shuffle(positions)    # shuffle the split points in place
    return [(word[:i], word[i:]) for i in positions]
w = "carried"
print("1. All pairs:", split_pairs(w))
p, s = prefixes_suffixes(w)
print("2. Prefixes:", p)
print(" Suffixes:", s)
print("3. Random splits:", random_splits(w))
LAB 4 -
from collections import Counter
def get_ngrams(text, n):
    words = text.lower().split()
    return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
def ngram_freq(text, n):
    return Counter(get_ngrams(text, n))
def ngram_probs(text, n):
    grams = get_ngrams(text, n)
    total = len(grams)
    freq = Counter(grams)
    return {g: freq[g]/total for g in freq}
def reverse_ngrams(text, n):
    words = text.lower().split()[::-1]
    return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
text = "the quick brown fox jumps over the lazy dog"
print("1. Frequencies:", ngram_freq(text, 2))
print("2. Probabilities:", ngram_probs(text, 2))
print("3. Reverse n-grams:", reverse_ngrams(text, 2))
LAB 5 -
import re
def remove_digits(text):
    return re.sub(r'\d', '', text)
def count_digits(text):
    return len(re.findall(r'\d', text))
def extract_digits(text):
    return re.findall(r'\d', text)
def greedy_tokenize(text):
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    return re.findall(pattern, text)
# Example (the e-mail address below is a placeholder)
text = "Contact me at john.doe@example.com before 02/28/2025."
print("1. No digits:", remove_digits(text))
print("2. Digit count:", count_digits(text))
print("3. Digits found:", extract_digits(text))
print("4. Greedy tokenized:", greedy_tokenize(text))
LAB 6 -
import re
def remove_noise(text):
    return re.sub(r'#\w+', '', text)          # drop hashtags
def remove_emojis(text):
    return re.sub(r'[^\w\s,]', '', text)      # drops emojis (and other symbols/punctuation)
def normalize(text):
    return ' '.join(text.lower().split())
def extract_dates(text):
    pattern = r'(\d{1,2}/\d{1,2}/\d{4}|\d{1,2}-\d{1,2}-\d{4}|[A-Za-z]+\s\d{1,2},\s\d{4})'
    return re.findall(pattern, text)
def extract_phone(text):
    # non-capturing group so findall returns the whole match, not just the country code
    matches = re.findall(r'(?:\+?\d{1,2}[\s-]?)?\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}', text)
    return [re.sub(r'\D', '', m)[-10:] for m in matches]
# Example input
text = "Hey 😊 call me at (123) 456-7890 or 123-456-7890 on 01/02/2025 or Jan 5, 2024! #happy"
print("1. Cleaned Text:", remove_noise(text))
print("2. No Emojis:", remove_emojis(text))
print("3. Normalized:", normalize(text))
print("4. Dates:", extract_dates(text))
print("5. Phones:", extract_phone(text))
LAB 7 -
import re
import spacy
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Load spaCy for NER
nlp = spacy.load('en_core_web_sm')
# Custom Tokenizer: Removes hashtags, mentions, and URLs
def custom_tokenizer(text):
    text = re.sub(r'(@\w+|#\w+|http\S+)', '', text)  # Remove mentions, hashtags, URLs
    return re.findall(r'\b\w+\b', text.lower())      # Tokenize and convert to lowercase
# Stemming and Lemmatization
def apply_stemming_and_lemmatization(text):
    words = custom_tokenizer(text)
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed = [stemmer.stem(word) for word in words]
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return stemmed, lemmatized
# NER and Text Normalization
def ner_and_normalize(text):
    doc = nlp(text)
    entities = {'PERSON': [], 'ORG': [], 'DATE': []}
    for ent in doc.ents:
        if ent.label_ == 'PERSON': entities['PERSON'].append(ent.text)
        if ent.label_ == 'ORG': entities['ORG'].append(ent.text)
        if ent.label_ == 'DATE': entities['DATE'].append(ent.text)
    # Normalize Dates and Money
    text = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}', 'DATE', text)  # Standardize dates
    text = re.sub(r'\$\d+', 'MONEY', text)                   # Standardize money
    return entities, text
# Example Text
text = "John went to the store on 12/25/2025 to buy a laptop worth $500."
# Stemming, Lemmatization, NER and Normalization
stemmed, lemmatized = apply_stemming_and_lemmatization(text)
entities, normalized_text = ner_and_normalize(text)
# Results
print("Stemming:", stemmed)
print("Lemmatization:", lemmatized)
print("Entities:", entities)
print("Normalized Text:", normalized_text)
LAB 8 -
import re
# Dictionary for common slangs and abbreviations
slang_dict = {
    "brb": "be right back",
    "btw": "by the way",
    "lol": "laughing out loud",
    "idk": "I don't know",
    "smh": "shaking my head",
    "tbh": "to be honest",
    "fomo": "fear of missing out",
}
# Dictionary for emojis
emoji_dict = {
    ":)": "happy",
    ":(": "sad",
    ":D": "grinning",
    ":P": "playful",
    "XD": "laughing",
    "<3": "love",
}
# Function to replace slangs, emojis, and standardize punctuation
def standardize_text(text):
    # Replace slangs
    for slang, full_form in slang_dict.items():
        text = re.sub(rf"\b{slang}\b", full_form, text, flags=re.IGNORECASE)
    # Replace emojis
    for emoji, meaning in emoji_dict.items():
        text = text.replace(emoji, meaning)
    # Standardize punctuation (e.g., multiple exclamation marks)
    text = re.sub(r'!+', '!', text)    # Multiple exclamations become a single one
    text = re.sub(r'\?+', '?', text)   # Multiple question marks become a single one
    text = re.sub(r'\s+', ' ', text)   # Normalize spaces to a single space
    # Convert to lowercase
    text = text.lower()
    return text
# Test the function
text = "BRB, I'll be back in a sec!!! LOL, this is amazing :) XD"
standardized_text = standardize_text(text)
# Output the result
print("Original Text:", text)
print("Standardized Text:", standardized_text)
LAB 9 -
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
# Function to perform POS tagging, NER, and noun phrase extraction
def analyze_text(text):
    # Process the text using spaCy
    doc = nlp(text)
    # Part of Speech Tagging (POS) for each word
    print("POS Tagging:")
    for token in doc:
        print(f'{token.text}: {token.pos_}')
    # Syntax structure and Subject-Verb-Object relationships
    print("\nSyntax Structure (Subject-Verb-Object):")
    for token in doc:
        if 'subj' in token.dep_:
            subject = token.text
            verb = [anc for anc in token.ancestors if anc.pos_ == 'VERB']
            if verb:
                verb = verb[0].text
                print(f'Subject: {subject}, Verb: {verb}')
    # Extracting Noun Phrases
    print("\nNoun Phrases:")
    for np in doc.noun_chunks:
        print(np.text)
    # Named Entity Recognition (NER)
    print("\nNamed Entities:")
    for ent in doc.ents:
        print(f'{ent.text} ({ent.label_})')
    # Visualize the dependency tree (jupyter=True assumes a notebook environment)
    displacy.render(doc, style='dep', jupyter=True)
# Test the function
text = "Apple is looking at buying U.K. startup for $1 billion. Tim Cook, CEO of
Apple, announced the news."
analyze_text(text)
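# Note: jupyter=True only renders inline inside a notebook. In a plain script,
# one option is to start spaCy's built-in visualizer server instead:
#   displacy.serve(doc, style='dep')   # serves the dependency tree on a local port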
LAB 10 -
import nltk
import numpy as np
from hmmlearn import hmm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
# Load dataset (20 Newsgroups dataset as example)
newsgroups = fetch_20newsgroups(subset='all',
                                categories=['sci.space', 'rec.sport.baseball'])  # any two categories work; these are examples
texts = newsgroups.data
labels = newsgroups.target
# Preprocess text (tokenize and vectorize using CountVectorizer)
vectorizer = CountVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform(texts).toarray()
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)
# HMM Model for Text Classification (using discrete emissions)
class HMMTextClassifier:
    def __init__(self, n_components=5):
        self.model = hmm.MultinomialHMM(n_components=n_components)
    def fit(self, X_train, y_train):
        # Fit the HMM on the training count matrix (labels are not used; the HMM is unsupervised)
        self.model.fit(X_train)
    def predict(self, X_test):
        # Predict a hidden state for each test document
        return self.model.predict(X_test)
hmm_model = HMMTextClassifier(n_components=5)
hmm_model.fit(X_train, y_train)
hmm_predictions = hmm_model.predict(X_test)
hmm_accuracy = accuracy_score(y_test, hmm_predictions)
print(f"HMM Accuracy: {hmm_accuracy}")
# Hybrid HMM-Naïve Bayes Model
class HybridHMMNB:
    def __init__(self):
        self.hmm_model = hmm.MultinomialHMM(n_components=5)
        self.nb_model = MultinomialNB()
    def fit(self, X_train, y_train):
        # Train HMM to extract features (hidden-state assignments)
        self.hmm_model.fit(X_train)
        hmm_features_train = self.hmm_model.predict(X_train)
        # Train Naive Bayes on the extracted HMM features
        self.nb_model.fit(hmm_features_train.reshape(-1, 1), y_train)
    def predict(self, X_test):
        # Predict HMM features and then classify using Naive Bayes
        hmm_features_test = self.hmm_model.predict(X_test)
        return self.nb_model.predict(hmm_features_test.reshape(-1, 1))
hybrid_model = HybridHMMNB()
hybrid_model.fit(X_train, y_train)
hybrid_predictions = hybrid_model.predict(X_test)
hybrid_accuracy = accuracy_score(y_test, hybrid_predictions)
print(f"Hybrid HMM-Naïve Bayes Accuracy: {hybrid_accuracy}")
# Conclusion: Compare accuracies
print(f"Accuracy Comparison: HMM vs Hybrid Model")
print(f"HMM Accuracy: {hmm_accuracy}")
print(f"Hybrid Model Accuracy: {hybrid_accuracy}")