Lab Program 1
!pip install gensim
corpus = ['king is a strong man', 'queen is a wise woman', 'boy is a young man',
          'girl is a young woman', 'prince is a young', 'prince will be strong',
          'princess is young', 'man is strong', 'woman is pretty', 'prince is a boy',
          'prince will be king', 'princess is a girl', 'princess will be queen']
print(corpus)
statements_list = []
for cor in corpus:
    statements_list.append(cor.split())  # tokenize each sentence on whitespace
print(statements_list)
from gensim.parsing.preprocessing import STOPWORDS
documents = [[word for word in document if word not in STOPWORDS]
             for document in statements_list]
documents
import gensim
from gensim.models import Word2Vec
model = Word2Vec(documents, min_count=1, vector_size=3, window=3)
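# Addition (not in the original lab): with such a tiny corpus the vectors are only
# illustrative; a quick sanity check of what the model actually learned.
# model.wv.key_to_index is the gensim 4.x word-to-index vocabulary mapping.
print(model.wv.key_to_index)
print(model.wv['king'])  # the raw 3-dimensional vector for 'king'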
# Assuming you have already trained your Word2Vec model and it's stored in the 'model' variable
# 1. Addition and Subtraction:
vector1 = model.wv['king']
vector2 = model.wv['man']
sum_vector = vector1 + vector2
print("sum vector ",sum_vector)
diff_vector = vector1 - vector2
print("difference vector ",sum_vector)
# 2. Cosine Similarity:
similarity = model.wv.similarity('king', 'queen')
print(f"Cosine Similarity between 'king' and 'queen': {similarity}")
# 3. Finding Most Similar Words:
similar_words = model.wv.most_similar('king', topn=5)
print(f"Most Similar words to 'king': {similar_words}")
# 4. Analogy Example:
analogy_vector = model.wv['king'] - model.wv['man'] + model.wv['woman']
most_similar = model.wv.most_similar(positive=[analogy_vector], topn=1)
print(f"Analogy Result (king - man + woman): {most_similar}")
Lab Program 2
import gensim
from gensim.models import Word2Vec
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Sample domain-specific corpus (Technology)
technology_corpus = [
"Artificial intelligence is transforming various industries.",
"Machine learning algorithms improve predictive analytics.",
"Cloud computing enables scalable infrastructure for businesses.",
"Cybersecurity is crucial for protecting sensitive data.",
"Blockchain technology ensures secure and decentralized transactions.",
"The Internet of Things connects smart devices seamlessly.",
"Big data analytics helps organizations make data-driven decisions.",
"Quantum computing has the potential to revolutionize cryptography.",
"Edge computing brings computation closer to data sources.",
"Natural language processing enhances human-computer interactions."
]
# Basic text preprocessing function (tokenization & lowercasing)
def simple_tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())
# Preprocess corpus manually
preprocessed_corpus = [simple_tokenize(sentence) for sentence in technology_corpus]
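# Addition (illustrative check): the regex \b\w+\b keeps alphanumeric tokens and
# drops punctuation, and the text is lowercased first.
print(preprocessed_corpus[0])
# expected: ['artificial', 'intelligence', 'is', 'transforming', 'various', 'industries']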
# Train Word2Vec model
model = Word2Vec(sentences=preprocessed_corpus, vector_size=50, window=5,
                 min_count=1, workers=4)
# Select 10 domain-specific words
selected_words = ["ai", "machine", "cloud", "cybersecurity", "blockchain", "iot",
"data", "quantum", "edge", "nlp"]
# Filter selected words to include only words present in the model's vocabulary
selected_words = [word for word in selected_words if word in model.wv]
# Extract word embeddings for selected words
word_vectors = [model.wv[word] for word in selected_words if word in model.wv]
# Reduce dimensionality using PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(word_vectors)
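# Addition: report how much of the 50-dimensional structure the 2-D projection keeps.
print("explained variance ratio:", pca.explained_variance_ratio_)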
# Create DataFrame for visualization
df_embeddings = pd.DataFrame(reduced_vectors, columns=["x", "y"], index=selected_words)
# Plot embeddings
plt.figure(figsize=(10, 6))
plt.scatter(df_embeddings["x"], df_embeddings["y"], marker='o')
for word, (x, y) in zip(df_embeddings.index, reduced_vectors):
    plt.text(x, y, word, fontsize=12)
[Link]("PCA Component 1")
[Link]("PCA Component 2")
[Link]("Word Embeddings Visualization (Technology Domain)")
[Link]()
# Function to get semantically similar words
def get_similar_words(word, top_n=5):
    if word in model.wv:
        return model.wv.most_similar(word, topn=top_n)
    else:
        return f"Word '{word}' not in vocabulary."
# Example usage
input_word = "technology"
similar_words = get_similar_words(input_word)
print(f"Top 5 words similar to '{input_word}':", similar_words)