IR Practical Manual 2
Code :
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopWords = stopwords.words('english')

document1 = "The quick brown fox jumped over the lazy dog"
# document2 was not preserved in the source; a placeholder sentence is assumed here
document2 = "The lazy dog slept in the sun"

tokens1 = document1.lower().split()
tokens2 = document2.lower().split()

inverted_index = {}
occ_num_doc1 = {}
occ_num_doc2 = {}

# Build the inverted index, skipping stop words and recording term counts per document
for term in set(tokens1).union(set(tokens2)):
    if term in stopWords:
        continue
    documents = []
    if term in tokens1:
        documents.append("Document 1")
        occ_num_doc1[term] = tokens1.count(term)
    if term in tokens2:
        documents.append("Document 2")
        occ_num_doc2[term] = tokens2.count(term)
    inverted_index[term] = documents

# Print each term with its posting list and occurrence counts
for term, docs in inverted_index.items():
    print(term, "->", end=" ")
    for doc in docs:
        if doc == "Document 1":
            print(f"{doc} ({occ_num_doc1[term]})", end=" ")
        else:
            print(f"{doc} ({occ_num_doc2[term]})", end=" ")
    print()
OUTPUT :
[nltk_data] C:\Users\shivl\AppData\Roaming\nltk_data...
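The index listing itself is missing from the output above. As a quick usage sketch against the index built in the code (and the placeholder document2 assumed there), a single term can be looked up like this:

# Look up the posting list and per-document counts for one term
term = "lazy"
print(term, "->", inverted_index.get(term, []))
print("Count in Document 1:", occ_num_doc1.get(term, 0))
print("Count in Document 2:", occ_num_doc2.get(term, 0))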
Implement the vector space model with TF-IDF weighting and cosine similarity.
Code :
documents = {
    1: "apple banana orange",   # document 1 was not preserved in the source; placeholder assumed
    2: "apple banana",
    3: "banana orange",
    4: "apple",
}

def build_index(docs):
    # Map every term to the set of document ids that contain it
    index = {}
    for doc_id, text in docs.items():
        terms = set(text.split())
        for term in terms:
            if term not in index:
                index[term] = {doc_id}
            else:
                index[term].add(doc_id)
    return index

inverted_index = build_index(documents)
all_docs_set = set(documents.keys())

def boolean_query(operator, operands):
    # Evaluate a single AND / OR / NOT query against the inverted index
    if not operands:
        return []
    if operator == "and":
        result = set.intersection(*(inverted_index.get(t, set()) for t in operands))
        return list(result)
    if operator == "or":
        result = set().union(*(inverted_index.get(t, set()) for t in operands))
        return list(result)
    operand_set = inverted_index.get(operands[0], set())   # NOT
    return list(all_docs_set.difference(operand_set))
OUTPUT :
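The output for this part is missing from the source. As a usage sketch against the boolean-retrieval code above (document 1 is the placeholder assumed there), the three operators can be exercised like this:

print("apple AND banana ->", boolean_query("and", ["apple", "banana"]))
print("apple OR orange  ->", boolean_query("or", ["apple", "orange"]))
print("NOT banana       ->", boolean_query("not", ["banana"]))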
CODE :
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
import nltk
import numpy as np
from numpy import linalg as LA

nltk.download('stopwords')
stopWords = stopwords.words('english')

# The train/test sentences were not preserved in the source; assumed from the count vectors in the output below
train_set = ["The sky is blue.", "The sun is bright."]
test_set = ["The sun in the sky is bright."]

vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()
print(trainVectorizerArray)

# Cosine similarity between each training vector and each test vector
cos_sim = lambda a, b: round(np.inner(a, b) / (LA.norm(a) * LA.norm(b)), 3)
for vector in trainVectorizerArray:
    print(vector)
    for testV in testVectorizerArray:
        print(testV)
        cosine = cos_sim(vector, testV)
        print(cosine)

transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
OUTPUT :
[nltk_data] C:\Users\shivl\AppData\Roaming\nltk_data...
[0 1 0 1]]
[1 0 1 0]
[0 1 1 1]
0.408
[0 1 0 1]
[0 1 1 1]
0.816
[[0.70710678 0. 0.70710678 0. ]
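A quick check of the cosine values above: the count vector [0 1 0 1] shares two terms with the test vector [0 1 1 1], so its similarity is 2 / (sqrt(2) * sqrt(3)) ≈ 0.816, while [1 0 1 0] shares only one term, giving 1 / (sqrt(2) * sqrt(3)) ≈ 0.408. The same check in code:

import numpy as np
a, b = np.array([0, 1, 0, 1]), np.array([0, 1, 1, 1])
print(round(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)), 3))  # 0.816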
CODE :
def edit_distance(str1, str2, m, n):
    # Recursive Levenshtein distance between str1[:m] and str2[:n]
    if m == 0:
        return n
    if n == 0:
        return m
    if str1[m-1] == str2[n-1]:
        return edit_distance(str1, str2, m-1, n-1)
    return 1 + min(edit_distance(str1, str2, m, n-1),      # insert
                   edit_distance(str1, str2, m-1, n),      # delete
                   edit_distance(str1, str2, m-1, n-1))    # replace

str1 = "sunday"
str2 = "saturday"
print(edit_distance(str1, str2, len(str1), len(str2)))
OUTPUT :
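The output for this part is missing from the source; the function above returns 3 for "sunday" and "saturday" (insert 'a', insert 't', substitute 'n' with 'r'). Because the plain recursion recomputes the same subproblems repeatedly, a memoized variant is a common refinement; a minimal sketch:

from functools import lru_cache

@lru_cache(maxsize=None)
def edit_distance_memo(a, b):
    # Same recurrence as above, but results are cached per (prefix, prefix) pair
    if not a:
        return len(b)
    if not b:
        return len(a)
    if a[-1] == b[-1]:
        return edit_distance_memo(a[:-1], b[:-1])
    return 1 + min(edit_distance_memo(a, b[:-1]),
                   edit_distance_memo(a[:-1], b),
                   edit_distance_memo(a[:-1], b[:-1]))

print(edit_distance_memo("sunday", "saturday"))  # 3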
Calculate precision, recall, and F-measure for a given set of retrieval results.
Use an evaluation toolkit to measure average precision and other evaluation metrics.
CODE :
A)
# The retrieved and relevant sets were not preserved in the source; these
# placeholders reproduce the counts shown in the output (TP=1, FP=2, FN=1)
retrieved_set = {"doc1", "doc2", "doc3"}
relevant_set = {"doc1", "doc4"}

true_positive = len(retrieved_set.intersection(relevant_set))
false_positive = len(retrieved_set.difference(relevant_set))
false_negative = len(relevant_set.difference(retrieved_set))
'''
(Optional)
PPT values:
true_positive = 20
false_positive = 10
false_negative = 30
'''
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f_measure = 2 * precision * recall / (precision + recall)

print(f"True Positive: {true_positive}")
print(f"False Positive: {false_positive}")
print(f"False Negative: {false_negative}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-measure: {f_measure}")
OUTPUT :
True Positive: 1
False Positive: 2
False Negative: 1
Precision: 0.3333333333333333
Recall: 0.5
F-measure: 0.4
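These values follow directly from the definitions: precision = TP / (TP + FP) = 1 / (1 + 2) ≈ 0.333, recall = TP / (TP + FN) = 1 / (1 + 1) = 0.5, and F-measure = 2PR / (P + R) = 2 * 0.333 * 0.5 / 0.833 ≈ 0.4.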
B)
from sklearn.metrics import average_precision_score
y_true = [0, 1, 1, 0, 1, 1]
y_scores = [0.1, 0.6, 0.8, 0.3, 0.7, 0.9]   # predicted scores not preserved in the source; placeholder values assumed
print("Average Precision:", average_precision_score(y_true, y_scores))
OUTPUT :
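The output for part B is missing from the source. Beyond average precision, the same toolkit exposes the other standard metrics; a minimal sketch that thresholds the assumed scores at 0.5 to obtain hard predictions:

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
y_pred = [1 if s >= 0.5 else 0 for s in y_scores]
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))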
Implement a text classification algorithm (e.g., Naive Bayes or Support Vector Machines).
CODE :
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
# The training/test frames were not preserved in the source; small placeholder data is assumed
df_train = pd.DataFrame({"data": ["I have a fever and a cough", "Headache and sore throat all day",
                                  "Enjoying a sunny day outside", "Going for a walk in the park"],
                         "flu": [1, 1, 0, 0]})
df_test = pd.DataFrame({"data": ["fever with a bad headache"]})
X, y = df_train["data"], df_train["flu"]
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)
classifier = MultinomialNB()
classifier.fit(X_train_counts, y_train)
new_data_counts = vectorizer.transform(df_test["data"])
predictions = classifier.predict(new_data_counts)
print(predictions)
print(f"Accuracy: {accuracy_score(y_test, classifier.predict(X_test_counts)):.2f}")
print("Classification Report:")
print(classification_report(y_test, classifier.predict(X_test_counts)))
df_test['flu_prediction'] = predictions
OUTPUT :
Accuracy: 1.00
Classification Report:
accuracy 1.00 1
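The support of 1 in the report shows that the held-out split contained a single document, so the perfect accuracy mostly reflects the tiny dataset rather than the classifier. If class confidences are wanted as well, MultinomialNB exposes predict_proba; a one-line sketch against the objects defined above:

print(classifier.predict_proba(new_data_counts))  # probability of each class for the new document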
Apply the clustering algorithm to a set of documents and evaluate the clustering results.
CODE :
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
documents = ["cats purr and meow", "dogs bark loudly", "kittens and cats", "stock prices rose today", "dogs chase cats"]  # corpus not preserved in the source; placeholder assumed
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
print(kmeans.labels_)
OUTPUT :
[0 2 0 1 0]
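The task also asks for an evaluation of the clustering result. One common intrinsic measure is the silhouette coefficient; a minimal sketch against the vectors and labels produced above (the actual score depends on the placeholder corpus assumed in the code):

from sklearn.metrics import silhouette_score
# Silhouette ranges from -1 (poor) to 1 (well-separated clusters)
print("Silhouette score:", silhouette_score(X, kmeans.labels_))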
PRACTICAL NO 7
CODE :
import requests
import time
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup

def get_html(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None

def save_and_print_robots_txt(url):
    # Download the site's robots.txt, save it locally and print it
    robots_url = urljoin(url, "/robots.txt")
    robots_content = get_html(robots_url)
    if robots_content:
        with open("robots.txt", "w", encoding="utf-8") as file:
            file.write(robots_content)
        print("robots.txt content:")
        print(robots_content)
    else:
        print("robots.txt could not be retrieved.")

def is_allowed_by_robots(url):
    # Check the saved robots.txt; default to allowed if it cannot be parsed
    try:
        parser = RobotFileParser()
        with open("robots.txt", "r", encoding="utf-8") as file:
            parser.parse(file.read().splitlines())
        return parser.can_fetch("*", url)
    except Exception:
        return True

visited = set()

def crawl_recursive(url, depth, max_depth=2, delay=1):
    if depth > max_depth or url in visited or not is_allowed_by_robots(url):
        return
    visited.add(url)
    time.sleep(delay)  # politeness delay between requests
    html = get_html(url)
    if html:
        print(f"Crawling: {url}")
        soup = BeautifulSoup(html, "html.parser")  # link extraction assumed via BeautifulSoup
        for tag in soup.find_all("a", href=True):
            link = urljoin(url, tag["href"])
            crawl_recursive(link, depth + 1)

def crawl(start_url):
    save_and_print_robots_txt(start_url)
    crawl_recursive(start_url, 1)

# Example usage:
crawl("https://wikipedia.com")
OUTPUT :
robots.txt content:
Crawling: https://wikipedia.com
Crawling: https://en.wikipedia.org/
Crawling: https://ja.wikipedia.org/
Crawling: https://ru.wikipedia.org/
Crawling: https://de.wikipedia.org/
Crawling: https://es.wikipedia.org/
Crawling: https://fr.wikipedia.org/
Crawling: https://it.wikipedia.org/
Crawling: https://zh.wikipedia.org/
Crawling: https://fa.wikipedia.org/
Crawling: https://pl.wikipedia.org/
Crawling: https://ar.wikipedia.org/
Crawling: https://arz.wikipedia.org/
Crawling: https://nl.wikipedia.org/
Crawling: https://pt.wikipedia.org/
Crawling: https://ceb.wikipedia.org/
Crawling:
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /
# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /
User-agent: IsraBot
Disallow:
User-agent: Orthogaffe
PRACTICAL NO 8
Implement the PageRank algorithm to rank web pages based on link analysis.
Apply the PageRank algorithm to a small web graph and analyze the results.
CODE :
import numpy as np

def page_rank(graph, damping_factor=0.85, max_iterations=100, tolerance=1.0e-6):
    # graph[i] is the list of pages that page i links out to
    num_nodes = len(graph)
    page_ranks = np.full(num_nodes, 1.0 / num_nodes)
    for _ in range(max_iterations):
        prev_page_ranks = np.copy(page_ranks)
        for node in range(num_nodes):
            incoming_links = [i for i in range(num_nodes) if node in graph[i]]
            if not incoming_links:
                continue
            page_ranks[node] = (1 - damping_factor) / num_nodes + \
                damping_factor * sum(prev_page_ranks[link] / len(graph[link])
                                     for link in incoming_links)
        if np.allclose(page_ranks, prev_page_ranks, atol=tolerance):
            break
    return page_ranks

if __name__ == "__main__":
    web_graph = [
        [1, 2],
        [0, 2],
        [0, 1],
        [1, 2],
    ]
    result = page_rank(web_graph)
    for i, pr in enumerate(result):
        print(f"Page {i}: {pr}")
OUTPUT :
Page 0: 0.6725117940472367
Page 1: 0.7470731975560085
Page 2: 0.7470731975560085
Page 3: 0.25
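A short analysis of these numbers: page 3 has no incoming links, so the update loop skips it and it keeps its initial score of 1/4 = 0.25; pages 1 and 2 each receive links from the other three pages in symmetric positions, which is why their scores tie at about 0.747; page 0 is linked only by pages 1 and 2 and therefore lands slightly lower, at about 0.673.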
PRACTICAL NO 9
Train the ranking model using labelled data and evaluate its effectiveness.
CODE :
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
# Training features and labels were not preserved in the source; placeholder values assumed
X_train = np.array([
    [1.0, 0.2], [0.9, 0.1], [0.2, 0.8], [0.1, 0.9],
])
y_train = np.array(["relevant", "relevant", "not relevant", "not relevant"])
le = LabelEncoder()
y_train = le.fit_transform(y_train)  # Make sure labels are in the proper range (0, 1)
# Train RankSVM (the original model was not preserved; a linear SVC used pointwise is assumed)
rank_svm = SVC(kernel="linear")
rank_svm.fit(X_train, y_train)
test_data = np.array([[0.8, 0.3], [0.2, 0.7]])  # placeholder test queries
predictions = rank_svm.predict(test_data)
print(predictions)
OUTPUT :
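The output for this practical is missing from the source. To cover the "evaluate its effectiveness" part of the task, a held-out split plus a standard metric is the usual route; a minimal sketch re-using the placeholder data assumed above (the model is refit on the reduced training split):

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.25, random_state=0)
rank_svm.fit(X_tr, y_tr)
print("Held-out accuracy:", accuracy_score(y_te, rank_svm.predict(X_te)))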
CODE :
from transformers import pipeline

# Sample text (the passage was truncated in the source; the closing line is
# restored from the abstractive summary shown in the output)
text = """
Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives
its environment and takes actions
that maximize its chance of achieving its goals. Colloquially, the term "artificial intelligence" is often
used to describe machines
that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving".
"""

# Summarization and question-answering pipelines (model choices were not
# preserved in the source; common defaults are assumed here)
summarizer_extractive = pipeline("summarization")
summarizer_abstractive = pipeline("summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline("question-answering")

# 1. Extractive Summarization
extractive_summary = summarizer_extractive(text, max_length=60, min_length=25, do_sample=False)
print("Extractive Summary:")
print(extractive_summary[0]['summary_text'])

# 2. Abstractive Summarization
abstractive_summary = summarizer_abstractive(text, max_length=50, min_length=25,
                                             do_sample=False)
print("Abstractive Summary:")
print(abstractive_summary[0]['summary_text'])

# 3. Question Answering
context = text  # The passage from which the answer will be extracted
question = "What do AI textbooks define the field as?"  # question not preserved in the source; placeholder assumed
answer = qa_pipeline(question=question, context=context)
print("Question:", question)
print("Answer:", answer['answer'])
OUTPUT :
Extractive Summary:
================================================================================
Abstractive Summary:
leading AI textbooks define the field as the study of "intelligent agents" the term "artificial intelligence" is often used to describe machines that mimic "cognitive" functions .
================================================================================