The document consists of a series of Python labs that cover various text processing techniques using regular expressions, natural language processing, and machine learning. Each lab demonstrates different functionalities such as text cleaning, tokenization, n-gram analysis, named entity recognition, and classification using HMM and Naive Bayes models. The labs utilize libraries like re, spacy, nltk, and sklearn to perform tasks related to text analysis and manipulation.

LAB 1 -

import re

def process_text(text):
    # Split on whitespace/non-word characters to get word tokens
    words = re.split(r'[\s\r\n\W]+', text)
    # Dates in DD/MM/YYYY format
    dates = re.findall(r'\b\d{2}/\d{2}/\d{4}\b', text)
    # Phone numbers such as 123-456-7890 or (987) 654 3210
    phones = re.findall(r'(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})', text)
    return words, dates, phones

text = "Call me on 123-456-7890 or (987) 654 3210. Let's meet on 15/04/2025.\nOkay?"
words, dates, phones = process_text(text)
print("Words:", words)
print("Dates:", dates)
print("Phones:", phones)

LAB 2 -

import re

def clean_edges(text):
    # Strip non-alphanumeric characters from both ends of the string
    return re.sub(r'^\W+|\W+$', '', text)

def count_non_alnum(text):
    # Count non-alphanumeric characters
    return len(re.findall(r'\W', text))

def replace_non_alnum(text, char='-'):
    # Replace every non-alphanumeric character with the given character
    return re.sub(r'\W', char, text)

s = "!!Hello, World!!"
print("1.", clean_edges(s))
print("2.", count_non_alnum(s))
print("3.", replace_non_alnum(s, '-'))

LAB 3 -

import random

def split_pairs(word):
    # All (prefix, suffix) pairs from every internal split point
    return [(word[:i], word[i:]) for i in range(1, len(word))]

def prefixes_suffixes(word):
    prefixes = [word[:i] for i in range(1, len(word)+1)]
    suffixes = [word[i:] for i in range(len(word))]
    return prefixes, suffixes

def random_splits(word):
    # Same pairs as split_pairs, but in a shuffled order
    positions = list(range(1, len(word)))
    random.shuffle(positions)
    return [(word[:i], word[i:]) for i in positions]

w = "carried"
print("1. All pairs:", split_pairs(w))
p, s = prefixes_suffixes(w)
print("2. Prefixes:", p)
print("   Suffixes:", s)
print("3. Random splits:", random_splits(w))

LAB 4 -

from collections import Counter

def get_ngrams(text, n):
    # Lowercase, split on whitespace, and build n-word tuples
    words = text.lower().split()
    return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]

def ngram_freq(text, n):
    # Frequency count of each n-gram
    return Counter(get_ngrams(text, n))

def ngram_probs(text, n):
    # Relative frequency (probability) of each n-gram
    grams = get_ngrams(text, n)
    total = len(grams)
    freq = Counter(grams)
    return {g: freq[g]/total for g in freq}

def reverse_ngrams(text, n):
    # n-grams over the word sequence in reverse order
    words = text.lower().split()[::-1]
    return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]

text = "the quick brown fox jumps over the lazy dog"
print("1. Frequencies:", ngram_freq(text, 2))
print("2. Probabilities:", ngram_probs(text, 2))
print("3. Reverse n-grams:", reverse_ngrams(text, 2))

LAB 5 -

import re

def remove_digits(text):
    return re.sub(r'\d', '', text)

def count_digits(text):
    return len(re.findall(r'\d', text))

def extract_digits(text):
    return re.findall(r'\d', text)

def greedy_tokenize(text):
    # Greedily match dates (MM/DD/YYYY style) or email addresses
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    return re.findall(pattern, text)

# Example (the original email address was lost in the source; a placeholder is used)
text = "Contact me at john.doe@example.com before 02/28/2025."
print("1. No digits:", remove_digits(text))
print("2. Digit count:", count_digits(text))
print("3. Digits found:", extract_digits(text))
print("4. Greedy tokenized:", greedy_tokenize(text))

LAB 6 -

import re

def remove_noise(text):
    # Remove hashtags
    return re.sub(r'#\w+', '', text)

def remove_emojis(text):
    # Keep only word characters, whitespace, and commas
    return re.sub(r'[^\w\s,]', '', text)

def normalize(text):
    # Lowercase and collapse whitespace
    return ' '.join(text.lower().split())

def extract_dates(text):
    # Dates like 01/02/2025, 01-02-2025, or Jan 5, 2024
    pattern = r'(\d{1,2}/\d{1,2}/\d{4}|\d{1,2}-\d{1,2}-\d{4}|[A-Za-z]+\s\d{1,2},\s\d{4})'
    return re.findall(pattern, text)

def extract_phone(text):
    # finditer keeps the full match rather than just the optional country-code group
    matches = [m.group() for m in re.finditer(
        r'(\+?\d{1,2}[\s-]?)?\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}', text)]
    # Strip non-digits and keep the last 10 digits (drops any country code)
    return [re.sub(r'\D', '', m)[-10:] for m in matches]

# Example input
text = "Hey 😊 call me at (123) 456-7890 or 123-456-7890 on 01/02/2025 or Jan 5, 2024! #happy"

print("1. Cleaned Text:", remove_noise(text))
print("2. No Emojis:", remove_emojis(text))
print("3. Normalized:", normalize(text))
print("4. Dates:", extract_dates(text))
print("5. Phones:", extract_phone(text))

LAB 7 -

import re
import spacy
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Load spaCy for NER
nlp = spacy.load('en_core_web_sm')

# Custom tokenizer: removes hashtags, mentions, and URLs
def custom_tokenizer(text):
    text = re.sub(r'(@\w+|#\w+|http\S+)', '', text)  # Remove mentions, hashtags, URLs
    return re.findall(r'\b\w+\b', text.lower())       # Tokenize and convert to lowercase

# Stemming and lemmatization
def apply_stemming_and_lemmatization(text):
    words = custom_tokenizer(text)
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed = [stemmer.stem(word) for word in words]
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return stemmed, lemmatized

# NER and text normalization
def ner_and_normalize(text):
    doc = nlp(text)
    entities = {'PERSON': [], 'ORG': [], 'DATE': []}
    for ent in doc.ents:
        if ent.label_ == 'PERSON': entities['PERSON'].append(ent.text)
        if ent.label_ == 'ORG': entities['ORG'].append(ent.text)
        if ent.label_ == 'DATE': entities['DATE'].append(ent.text)
    # Normalize dates and money
    text = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}', 'DATE', text)  # Standardize dates
    text = re.sub(r'\$\d+', 'MONEY', text)                    # Standardize money
    return entities, text

# Example text
text = "John went to the store on 12/25/2025 to buy a laptop worth $500."

# Stemming, lemmatization, NER, and normalization
stemmed, lemmatized = apply_stemming_and_lemmatization(text)
entities, normalized_text = ner_and_normalize(text)

# Results
print("Stemming:", stemmed)
print("Lemmatization:", lemmatized)
print("Entities:", entities)
print("Normalized Text:", normalized_text)

LAB 8 -

import re

# Dictionary of common slang and abbreviations
slang_dict = {
    "brb": "be right back",
    "btw": "by the way",
    "lol": "laughing out loud",
    "idk": "I don't know",
    "smh": "shaking my head",
    "tbh": "to be honest",
    "fomo": "fear of missing out",
}

# Dictionary of emoticons
emoji_dict = {
    ":)": "happy",
    ":(": "sad",
    ":D": "grinning",
    ":P": "playful",
    "XD": "laughing",
    "<3": "love",
}

# Replace slang and emoticons, then standardize punctuation
def standardize_text(text):
    # Replace slang (case-insensitive, whole words only)
    for slang, full_form in slang_dict.items():
        text = re.sub(rf"\b{slang}\b", full_form, text, flags=re.IGNORECASE)

    # Replace emoticons
    for emoji, meaning in emoji_dict.items():
        text = text.replace(emoji, meaning)

    # Standardize punctuation
    text = re.sub(r'!+', '!', text)    # Multiple exclamation marks become one
    text = re.sub(r'\?+', '?', text)   # Multiple question marks become one
    text = re.sub(r'\s+', ' ', text)   # Collapse whitespace to single spaces

    # Convert to lowercase
    text = text.lower()

    return text

# Test the function
text = "BRB, I'll be back in a sec!!! LOL, this is amazing :) XD"
standardized_text = standardize_text(text)

# Output the result
print("Original Text:", text)
print("Standardized Text:", standardized_text)

LAB 9 -

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

# POS tagging, NER, noun phrase extraction, and dependency visualization
def analyze_text(text):
    # Process the text using spaCy
    doc = nlp(text)

    # Part-of-speech tag for each token
    print("POS Tagging:")
    for token in doc:
        print(f'{token.text}: {token.pos_}')

    # Syntax structure and subject-verb relationships
    print("\nSyntax Structure (Subject-Verb-Object):")
    for token in doc:
        if 'subj' in token.dep_:
            subject = token.text
            # The verb governing the subject: filter its ancestors for VERB tokens
            verb = [anc for anc in token.ancestors if anc.pos_ == 'VERB']
            if verb:
                verb = verb[0].text
                print(f'Subject: {subject}, Verb: {verb}')

    # Extracting noun phrases
    print("\nNoun Phrases:")
    for np in doc.noun_chunks:
        print(np.text)

    # Named Entity Recognition (NER)
    print("\nNamed Entities:")
    for ent in doc.ents:
        print(f'{ent.text} ({ent.label_})')

    # Visualize the dependency tree (jupyter=True renders inline in a notebook)
    displacy.render(doc, style='dep', jupyter=True)

# Test the function
text = ("Apple is looking at buying U.K. startup for $1 billion. "
        "Tim Cook, CEO of Apple, announced the news.")
analyze_text(text)

LAB 10 -

import nltk
import numpy as np
from hmmlearn import hmm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups

# Load dataset (20 Newsgroups dataset as example)
# NOTE: the two category names were lost in the source; example categories are used here
newsgroups = fetch_20newsgroups(subset='all',
                                categories=['rec.autos', 'sci.space'])
texts = newsgroups.data
labels = newsgroups.target

# Preprocess text (tokenize and vectorize using CountVectorizer)
vectorizer = CountVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform(texts).toarray()

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3,
                                                    random_state=42)

# HMM model for text classification (using discrete emissions)
# NOTE: depending on the hmmlearn version, MultinomialHMM may expect symbol sequences
# or an explicit n_trials; CategoricalHMM is the closest equivalent in recent releases.
class HMMTextClassifier:
    def __init__(self, n_components=5):
        self.model = hmm.MultinomialHMM(n_components=n_components)

    def fit(self, X_train, y_train):
        # Fit the HMM on the training data (labels are unused; the HMM is unsupervised)
        self.model.fit(X_train)

    def predict(self, X_test):
        # Return the most likely hidden state for each test sample
        # (these states are compared directly with class labels, a rough heuristic)
        return self.model.predict(X_test)

hmm_model = HMMTextClassifier(n_components=5)
hmm_model.fit(X_train, y_train)
hmm_predictions = hmm_model.predict(X_test)
hmm_accuracy = accuracy_score(y_test, hmm_predictions)
print(f"HMM Accuracy: {hmm_accuracy}")

# Hybrid HMM-Naïve Bayes model
class HybridHMMNB:
    def __init__(self):
        self.hmm_model = hmm.MultinomialHMM(n_components=5)
        self.nb_model = MultinomialNB()

    def fit(self, X_train, y_train):
        # Train the HMM to extract hidden-state features
        self.hmm_model.fit(X_train)
        hmm_features_train = self.hmm_model.predict(X_train)

        # Train Naive Bayes on the extracted HMM features
        self.nb_model.fit(hmm_features_train.reshape(-1, 1), y_train)

    def predict(self, X_test):
        # Predict HMM features and then classify using Naive Bayes
        hmm_features_test = self.hmm_model.predict(X_test)
        return self.nb_model.predict(hmm_features_test.reshape(-1, 1))

hybrid_model = HybridHMMNB()
hybrid_model.fit(X_train, y_train)
hybrid_predictions = hybrid_model.predict(X_test)
hybrid_accuracy = accuracy_score(y_test, hybrid_predictions)
print(f"Hybrid HMM-Naïve Bayes Accuracy: {hybrid_accuracy}")

# Conclusion: compare accuracies
print("Accuracy Comparison: HMM vs Hybrid Model")
print(f"HMM Accuracy: {hmm_accuracy}")
print(f"Hybrid Model Accuracy: {hybrid_accuracy}")
