
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Reshape
from transformers import DistilBertTokenizer, TFDistilBertModel

# Assuming the dataset has been preprocessed
# df = pd.read_csv('path_to_your_preprocessed_dataset.csv')
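# Illustrative only (assumption): a tiny synthetic DataFrame with the column names
# used below ('text_column', 'target') plus hypothetical 'category_column' and
# 'numeric_column' columns for Series 2. Replace it with your own preprocessed data.
df = pd.DataFrame({
    'text_column': ["great product, works as advertised",
                    "arrived broken, very disappointed"] * 50,
    'category_column': ["electronics", "toys"] * 50,
    'numeric_column': [19.99, 4.50] * 50,
    'target': [1, 0] * 50,
})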

# Feature Extraction with DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_distilbert = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

def extract_features(texts):
    # Tokenize the texts and run them through DistilBERT; the final hidden state of the
    # [CLS] token serves as a fixed-size embedding for each text.
    inputs = tokenizer(texts, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model_distilbert(inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()
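# Optional sketch (assumption): running every text through DistilBERT in one call can
# exhaust memory on larger datasets; this hypothetical wrapper processes the texts in
# smaller batches and stacks the resulting embeddings.
def extract_features_batched(texts, batch_size=32):
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    return np.concatenate([extract_features(batch) for batch in batches], axis=0)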

# Preparing the Data for Experimental Series

# Series 1: Textual Features Only
text_features = extract_features(df['text_column'].tolist())
X_train, X_test, y_train, y_test = train_test_split(text_features, df['target'], test_size=0.2, random_state=42)

# Series 2: Combined Features (Textual, Categorical, Numerical)
# Assuming 'cat_num_features' is an array of preprocessed categorical and numerical features
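# Illustrative sketch (assumption): one way to build 'cat_num_features' is to one-hot
# encode a hypothetical categorical column and standard-scale a hypothetical numeric
# column, then stack the two blocks side by side.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
cat_part = OneHotEncoder().fit_transform(df[['category_column']]).toarray()
num_part = StandardScaler().fit_transform(df[['numeric_column']])
cat_num_features = np.concatenate((cat_part, num_part), axis=1)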
combined_features = np.concatenate((text_features, cat_num_features), axis=1)
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(combined_features, df['target'], test_size=0.2, random_state=42)

# Define the Model
def build_model(input_shape):
    # The extracted features are flat vectors, but LSTM layers expect 3D input,
    # so reshape each vector to a single timestep of shape (1, input_shape).
    input_layer = Input(shape=(input_shape,))
    x = Reshape((1, input_shape))(input_layer)
    x = LSTM(64, return_sequences=True)(x)
    x = Dropout(0.5)(x)
    x = LSTM(32)(x)
    x = Dense(64, activation='relu')(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
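# Optional sanity check: DistilBERT-base produces 768-dimensional [CLS] embeddings,
# so the text-only architecture can be inspected with:
# build_model(768).summary()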

# Train and Evaluate the Model for Series 1
model_series1 = build_model(X_train.shape[1])
model_series1.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
predictions_series1 = (model_series1.predict(X_test) > 0.5).astype(int).ravel()
print("Accuracy Series 1:", accuracy_score(y_test, predictions_series1))

# Train and Evaluate the Model for Series 2
model_series2 = build_model(X_train_combined.shape[1])
model_series2.fit(X_train_combined, y_train_combined, epochs=10, batch_size=32, validation_split=0.2)
predictions_series2 = (model_series2.predict(X_test_combined) > 0.5).astype(int).ravel()
print("Accuracy Series 2:", accuracy_score(y_test_combined, predictions_series2))
