
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Read the CSV into the DataFrame that every later section works on
df = pd.read_csv(filepath_or_buffer='dataset.csv')

# EDA: split the columns by dtype, then summarise each group.
# Only int64/float64 columns are treated as numerical and only object columns
# as categorical; a column of any other dtype lands in neither group.
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Column counts per group, one per line
print(
    f'Numerical Features: {len(numerical_features)}',
    f'Categorical Features: {len(categorical_features)}',
    sep='\n',
)

# Descriptive statistics for the numerical columns
numeric_summary = df.loc[:, numerical_features].describe()
print(numeric_summary)

# The five most frequent values of each categorical column
for cat_feature in categorical_features:
    print(f'\nFrequency of categories in {cat_feature}:')
    top_categories = df[cat_feature].value_counts().head(5)
    print(top_categories)

# Temporal Distribution of Attacks: row count per 'iyear' value, sorted by
# that value, drawn as a line chart
attacks_per_year = df['iyear'].value_counts().sort_index()
year_axes = attacks_per_year.plot(kind='line')
year_axes.set_title('Number of Attacks Over the Years')
year_axes.set_xlabel('Year')
year_axes.set_ylabel('Number of Attacks')
plt.show()

# Spatial Distribution of Attacks: the ten most frequent 'country_txt' values
# as a horizontal bar chart
top_countries = df['country_txt'].value_counts().head(10)
country_axes = top_countries.plot.barh()
country_axes.set(
    title='Top 10 Countries with the Most Attacks',
    xlabel='Number of Attacks',
    ylabel='Country',
)
plt.show()

# Missing Data Analysis: percentage of null cells per column, largest first
missing_percentage = df.isnull().mean().mul(100).sort_values(ascending=False)
# Show the ten columns with the highest share of missing values
print(missing_percentage.head(10))

# Word Cloud for Textual Analysis
# Join every non-missing entry of the 'summary' column into one string.
# Bracket access avoids any clash between the column name and a DataFrame
# attribute, and astype(str) keeps str.join from raising TypeError if the
# column holds non-string values once the missing entries are dropped.
text = " ".join(df['summary'].dropna().astype(str))

if text.strip():
    wordcloud = WordCloud(background_color='white').generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # the image has no meaningful axes
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words, so skip
    # the plot instead of crashing when the column has no usable text.
    print('No summary text available; skipping word cloud.')
