12/2/25, 1:10 AM SHAP_Assignment
In [1]: import numpy as np
import pandas as pd
from [Link] import RandomForestRegressor
from sklearn.model_selection import train_test_split
from [Link] import StandardScaler
from lime import lime_tabular
import shap
import torch
from [Link] import DeepLift
print("All imports OK")
print("numpy:", np.__version__)
print("shap:", shap.__version__)
C:\Users\Lenovo\anaconda3\envs\explain_env\lib\site-packages\tqdm\auto.py: Tqd
mWarning: IProgress not found. Please update jupyter and ipywidgets. See https://
ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
All imports OK
numpy: 1.26.4
shap: 0.48.0
In [2]: import os, zipfile, [Link]
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from [Link] import StandardScaler
from [Link] import RandomForestRegressor
from lime import lime_tabular
import shap
import torch
import [Link] as nn
from [Link] import Dataset, DataLoader
from [Link] import DeepLift
data_dir = "turbofan_data"
[Link](data_dir, exist_ok=True)
zip_url = "[Link]
zip_path = "[Link]"
if not [Link](zip_path):
print("Downloading dataset...")
[Link](zip_url, zip_path)
with [Link](zip_path, "r") as zf:
[Link](data_dir)
print("Download + extract done.")
else:
print("Zip already present.")
Zip already present.
In [5]: import os
import pandas as pd
from sklearn.model_selection import train_test_split
from [Link] import StandardScaler
[Link] 1/11
12/2/25, 1:10 AM SHAP_Assignment
from [Link] import RandomForestRegressor
# adjust this line ⬇️
train_file = [Link](data_dir, "CMAPSSData", "train_FD001.txt")
cols = ['engine_id', 'cycle'] + [f'setting_{i}' for i in range(1,4)] + [f's_{i}'
df = pd.read_csv(train_file, sep=r"\s+", header=None, names=cols)
# compute RUL
max_cycle = [Link]('engine_id')['cycle'].max().reset_index().rename(columns=
df = [Link](max_cycle, on='engine_id')
df['RUL'] = df['max_cycle'] - df['cycle']
[Link](columns=['max_cycle'], inplace=True)
feature_cols = [c for c in [Link] if c not in ['engine_id','cycle','RUL']]
X = df[feature_cols].values
y = df['RUL'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_st
scaler = StandardScaler().fit(X_train)
X_train_s = [Link](X_train)
X_val_s = [Link](X_val)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
[Link](X_train_s, y_train)
print("RF val R^2:", [Link](X_val_s, y_val))
[Link] 2/11
12/2/25, 1:10 AM SHAP_Assignment
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[5], line 11
8 train_file = [Link](data_dir, "CMAPSSData", "train_FD001.txt")
10 cols = ['engine_id', 'cycle'] + [f'setting_{i}' for i in range(1,4)] +
[f's_{i}' for i in range(1,22)]
---> 11 df = pd.read_csv(train_file, sep=r"\s+", header=None, names=cols)
13 # compute RUL
14 max_cycle = [Link]('engine_id')['cycle'].max().reset_index().rename(c
olumns={'cycle':'max_cycle'})
File ~\anaconda3\envs\explain_env\lib\site-packages\pandas\io\parsers\[Link]:
1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, u
secols, dtype, engine, converters, true_values, false_values, skipinitialspace, s
kiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_
blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date
_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, deci
mal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encodi
ng, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_
map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 [Link](kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~\anaconda3\envs\explain_env\lib\site-packages\pandas\io\parsers\[Link]:
620, in _read(filepath_or_buffer, kwds)
617 _validate_names([Link]("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File ~\anaconda3\envs\explain_env\lib\site-packages\pandas\io\parsers\[Link]:
1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 [Link]["has_index_names"] = kwds["has_index_names"]
1619 [Link]: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, [Link])
File ~\anaconda3\envs\explain_env\lib\site-packages\pandas\io\parsers\[Link]:
1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 [Link] = get_handle(
1881 f,
1882 mode,
1883 encoding=[Link]("encoding", None),
1884 compression=[Link]("compression", None),
1885 memory_map=[Link]("memory_map", False),
1886 is_text=is_text,
1887 errors=[Link]("encoding_errors", "strict"),
1888 storage_options=[Link]("storage_options", None),
1889 )
1890 assert [Link] is not None
1891 f = [Link]
[Link] 3/11
12/2/25, 1:10 AM SHAP_Assignment
File ~\anaconda3\envs\explain_env\lib\site-packages\pandas\io\[Link], in g
et_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors,
storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if [Link] and "b" not in [Link]:
872 # Encoding
--> 873 handle = open(
874 handle,
875 [Link],
876 encoding=[Link],
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, [Link])
FileNotFoundError: [Errno 2] No such file or directory: 'turbofan_data\\CMAPSSDat
a\\train_FD001.txt'
In [7]: import os
data_dir = "turbofan_data" # same as before
for root, dirs, files in [Link](data_dir):
for f in files:
if [Link]() == "train_fd001.txt":
print("FOUND:", [Link](root, f))
In [8]: import os, zipfile, [Link]
data_dir = "turbofan_data"
[Link](data_dir, exist_ok=True)
zip_url = "[Link]
zip_path = "[Link]"
print("Downloading NASA Turbofan dataset...")
[Link](zip_url, zip_path)
print("Download complete.")
print("Extracting...")
with [Link](zip_path, "r") as zf:
[Link](data_dir)
print("Extraction finished.")
Downloading NASA Turbofan dataset...
Download complete.
Extracting...
Extraction finished.
In [9]: import os
for root, dirs, files in [Link]("turbofan_data"):
print(root)
for f in files:
print(" ", f)
[Link] 4/11
12/2/25, 1:10 AM SHAP_Assignment
turbofan_data
turbofan_data\6. Turbofan Engine Degradation Simulation Data Set
[Link]
In [10]: import os, zipfile
outer_dir = [Link]("turbofan_data", "6. Turbofan Engine Degradation Simula
inner_zip = [Link](outer_dir, "[Link]")
extract_dir = [Link]("turbofan_data", "CMAPSSData")
[Link](extract_dir, exist_ok=True)
print("Inner zip path:", inner_zip)
with [Link](inner_zip, "r") as zf:
[Link](extract_dir)
print("Extracted to:", extract_dir)
# quick check
for root, dirs, files in [Link](extract_dir):
print(root)
for f in files:
print(" ", f)
Inner zip path: turbofan_data\6. Turbofan Engine Degradation Simulation Data Set
\[Link]
Extracted to: turbofan_data\CMAPSSData
turbofan_data\CMAPSSData
Damage Propagation [Link]
[Link]
RUL_FD001.txt
RUL_FD002.txt
RUL_FD003.txt
RUL_FD004.txt
test_FD001.txt
test_FD002.txt
test_FD003.txt
test_FD004.txt
train_FD001.txt
train_FD002.txt
train_FD003.txt
train_FD004.txt
In [11]: import pandas as pd
from sklearn.model_selection import train_test_split
from [Link] import StandardScaler
from [Link] import RandomForestRegressor
# 1. path to FD001 training file
train_file = [Link]("turbofan_data", "CMAPSSData", "train_FD001.txt")
print("Using train file:", train_file)
# 2. load with column names (1 id, 1 cycle, 3 settings, 21 sensors)
cols = ['engine_id', 'cycle'] + [f'setting_{i}' for i in range(1,4)] + [f's_{i}'
df = pd.read_csv(train_file, sep=r"\s+", header=None, names=cols)
# 3. compute Remaining Useful Life (RUL) label
max_cycle = [Link]('engine_id')['cycle'].max().reset_index().rename(columns=
df = [Link](max_cycle, on='engine_id')
df['RUL'] = df['max_cycle'] - df['cycle']
[Link] 5/11
12/2/25, 1:10 AM SHAP_Assignment
[Link](columns=['max_cycle'], inplace=True)
# 4. split features / target
feature_cols = [c for c in [Link] if c not in ['engine_id', 'cycle', 'RUL']]
X = df[feature_cols].values
y = df['RUL'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_st
# 5. scale and train RF
scaler = StandardScaler().fit(X_train)
X_train_s = [Link](X_train)
X_val_s = [Link](X_val)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
[Link](X_train_s, y_train)
print("RF val R^2:", [Link](X_val_s, y_val))
Using train file: turbofan_data\CMAPSSData\train_FD001.txt
RF val R^2: 0.6248712754209377
In [2]: from lime import lime_tabular
import shap
idx = 10 # choose any validation index you like
# LIME
explainer_lime = lime_tabular.LimeTabularExplainer(
training_data=X_train_s,
feature_names=feature_cols,
mode='regression'
)
lime_exp = explainer_lime.explain_instance(X_val_s[idx], [Link], num_feature
print("LIME explanation (top 8 features):")
for feat, contrib in lime_exp.as_list():
print(f"{feat}: {contrib:.3f}")
# SHAP
import numpy as np
import shap
# use a smaller random subset for SHAP (e.g. 500 rows)
n_shap = min(500, X_val_s.shape[0])
idxs = [Link](X_val_s.shape[0], size=n_shap, replace=False)
X_shap = X_val_s[idxs]
explainer_shap = [Link](rf)
# this is now much faster
shap_values = explainer_shap.shap_values(X_shap)
print("Computed SHAP values for", n_shap, "rows")
# global feature importance summary
shap.summary_plot(shap_values, X_shap, feature_names=feature_cols)
[Link] 6/11
12/2/25, 1:10 AM SHAP_Assignment
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[2], line 8
4 idx = 10 # choose any validation index you like
6 # LIME
7 explainer_lime = lime_tabular.LimeTabularExplainer(
----> 8 training_data=X_train_s,
9 feature_names=feature_cols,
10 mode='regression'
11 )
12 lime_exp = explainer_lime.explain_instance(X_val_s[idx], [Link], num_
features=8)
13 print("LIME explanation (top 8 features):")
NameError: name 'X_train_s' is not defined
In [3]: # ONE-CELL: data download/extract -> train RF -> LIME -> FAST SHAP
# Paste and run this in a fresh kernel cell.
import os, zipfile, [Link], random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from [Link] import StandardScaler
from [Link] import RandomForestRegressor
# reproducible
[Link](0)
[Link](0)
# ----------------- 1) ensure dataset is available and extracted ---------------
data_dir = "turbofan_data"
[Link](data_dir, exist_ok=True)
outer_zip = [Link](data_dir, "[Link]")
# download outer zip if not present
zip_url = "[Link]
if not [Link](outer_zip):
print("Downloading turbofan dataset (will take a short while)...")
[Link](zip_url, outer_zip)
print("Download complete:", outer_zip)
else:
print("Outer zip already present:", outer_zip)
# extract outer zip (it contains an inner [Link])
with [Link](outer_zip, "r") as zf:
[Link](data_dir)
# find inner zip path (it may have a subfolder)
inner_zip = None
for root, dirs, files in [Link](data_dir):
for f in files:
if [Link]().endswith("[Link]"):
inner_zip = [Link](root, f)
break
if inner_zip:
break
if inner_zip is None:
raise FileNotFoundError("Could not find inner [Link] inside download
print("Found inner zip:", inner_zip)
[Link] 7/11
12/2/25, 1:10 AM SHAP_Assignment
# extract inner zip into turbofan_data/CMAPSSData (idempotent)
extract_dir = [Link](data_dir, "CMAPSSData")
[Link](extract_dir, exist_ok=True)
with [Link](inner_zip, "r") as zf:
[Link](extract_dir)
print("Extracted inner zip to:", extract_dir)
# Find FD001 train file
train_file = None
for root, dirs, files in [Link](extract_dir):
for f in files:
if [Link]() == "train_fd001.txt":
train_file = [Link](root, f)
break
if train_file:
break
if train_file is None:
raise FileNotFoundError("train_FD001.txt not found inside extracted CMAPSSDa
print("Using training file:", train_file)
# ----------------- 2) Load data and build RUL labels -----------------
cols = ['engine_id', 'cycle'] + [f'setting_{i}' for i in range(1,4)] + [f's_{i}'
df = pd.read_csv(train_file, sep=r"\s+", header=None, names=cols)
# compute RUL
max_cycle = [Link]('engine_id')['cycle'].max().reset_index().rename(columns=
df = [Link](max_cycle, on='engine_id')
df['RUL'] = df['max_cycle'] - df['cycle']
[Link](columns=['max_cycle'], inplace=True)
# features & target
feature_cols = [c for c in [Link] if c not in ['engine_id','cycle','RUL']]
X = df[feature_cols].values
y = df['RUL'].values
# train/val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_s
# scale
scaler = StandardScaler().fit(X_train)
X_train_s = [Link](X_train)
X_val_s = [Link](X_val)
# ----------------- 3) Train a RandomForest (fast) -----------------
rf = RandomForestRegressor(n_estimators=150, max_depth=12, random_state=42, n_jo
print("Training RandomForest ...")
[Link](X_train_s, y_train)
print("RF val R^2:", [Link](X_val_s, y_val))
# ----------------- 4) LIME explanation for one sample -----------------
print("\n--- LIME (local) ---")
from lime import lime_tabular
idx = 10 if X_val_s.shape[0] > 10 else 0
explainer_lime = lime_tabular.LimeTabularExplainer(
training_data=X_train_s,
feature_names=feature_cols,
mode='regression',
discretize_continuous=True
)
[Link] 8/11
12/2/25, 1:10 AM SHAP_Assignment
lime_exp = explainer_lime.explain_instance(X_val_s[idx], [Link], num_feature
print("LIME explanation (top 8 features):")
for feat, contrib in lime_exp.as_list():
print(f"{feat}: {contrib:.3f}")
# ----------------- 5) FAST SHAP (subsampled) -----------------
print("\n--- SHAP (fast subsample) ---")
import shap
n_shap = min(200, X_val_s.shape[0]) # small and fast
shap_idxs = [Link](X_val_s.shape[0], size=n_shap, replace=False)
X_shap = X_val_s[shap_idxs]
explainer_shap = [Link](rf)
shap_values = explainer_shap.shap_values(X_shap) # fast because n_shap small
print(f"Computed SHAP values for {n_shap} rows")
# summary plot (will show inline if notebook supports plots)
shap.summary_plot(shap_values, X_shap, feature_names=feature_cols, show=True)
# optional: local SHAP for same sample idx
shap_single = explainer_shap.shap_values(X_val_s[idx:idx+1])
print("\nLocal SHAP for sample idx", idx)
print("SHAP values (feature -> value):")
for f_name, val in zip(feature_cols, shap_single[0].flatten()):
# only print non-negligible contributions
if abs(val) > 1e-6:
print(f"{f_name}: {val:.4f}")
print("\nDone. If plots don't display, ensure your notebook supports inline plot
Downloading turbofan dataset (will take a short while)...
Download complete: turbofan_data\[Link]
Found inner zip: turbofan_data\6. Turbofan Engine Degradation Simulation Data Set
\[Link]
Extracted inner zip to: turbofan_data\CMAPSSData
Using training file: turbofan_data\CMAPSSData\train_FD001.txt
Training RandomForest ...
RF val R^2: 0.6306293029798791
--- LIME (local) ---
LIME explanation (top 8 features):
s_12 > 0.74: 11.778
-0.21 < s_9 <= 0.19: 11.520
-0.74 < s_4 <= -0.10: 7.690
-0.72 < s_11 <= -0.09: 4.218
s_20 > 0.74: 3.040
-0.73 < s_15 <= -0.08: 2.392
s_2 <= -0.71: 2.231
0.01 < setting_1 <= 0.69: 2.200
--- SHAP (fast subsample) ---
Computed SHAP values for 200 rows
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_1032\<cell>.py: FutureWarnin
g: The NumPy global RNG was seeded by calling `np.random.seed`. In a future versi
on this function will no longer use the global RNG. Pass `rng` explicitly to opt-
in to the new behaviour and silence this warning.
shap.summary_plot(shap_values, X_shap, feature_names=feature_cols, show=True)
[Link] 9/11
12/2/25, 1:10 AM SHAP_Assignment
[Link] 10/11
12/2/25, 1:10 AM SHAP_Assignment
Local SHAP for sample idx 10
SHAP values (feature -> value):
setting_1: -0.2128
setting_2: -0.3015
s_2: 0.5282
s_3: 0.2771
s_4: 8.7374
s_6: -0.0157
s_7: 0.1358
s_8: -0.0640
s_9: 6.4077
s_11: 9.8663
s_12: 12.1784
s_13: 3.5527
s_14: -3.4699
s_15: 1.1906
s_17: -0.4829
s_20: 2.5888
s_21: -0.6387
Done. If plots don't display, ensure your notebook supports inline plotting (matp
lotlib).
In [ ]:
[Link] 11/11