import tkinter as tk
from tkinter import ttk, filedialog, messagebox, scrolledtext
import pandas as pd
import numpy as np
import os
import re
class ExcelProcessor:
def __init__(self, root):
self.root = root
self.root.title("Excel数据处理器")
self.root.geometry("900x600")
self.df = None
self.create_widgets()
def create_widgets(self):
# 顶部控制区域
control_frame = ttk.Frame(self.root, padding=10)
control_frame.pack(fill=tk.X)
# 文件选择按钮
ttk.Button(control_frame, text="选择Excel文件", command=self.load_excel).grid(row=0, column=0, padx=5)
# 处理方式选择
self.process_var = tk.StringVar(value="点击选择")
ttk.Label(control_frame, text="处理方式:").grid(row=0, column=1, padx=5)
process_combo = ttk.Combobox(control_frame, textvariable=self.process_var, width=15)
process_combo['values'] = ( '统计', '预处理','数据排序', '数据切分','保存结果')
process_combo.grid(row=0, column=2, padx=5)
# 执行按钮
ttk.Button(control_frame, text="执行", command=self.process_data).grid(row=0, column=3, padx=5)
# 数据展示区域
self.notebook = ttk.Notebook(self.root)
self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 预览标签页
self.preview_frame = ttk.Frame(self.notebook)
self.notebook.add(self.preview_frame, text="数据展示")
# 统计标签页
self.stats_frame = ttk.Frame(self.notebook)
self.notebook.add(self.stats_frame, text="统计信息")
# 状态栏
self.status_var = tk.StringVar(value="就绪")
ttk.Label(self.root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W).pack(fill=tk.X,
side=tk.BOTTOM)
def load_excel(self):
"""加载Excel文件"""
file_path = filedialog.askopenfilename(
title="选择Excel文件",
filetypes=[("Excel文件", "*.xlsx *.xls"), ("所有文件", "*.*")]
)
if not file_path:
return
try:
self.status_var.set(f"正在加载: {os.path.basename(file_path)}...")
self.root.update() # 更新界面显示状态
# 读取Excel文件
self.df = pd.read_excel(file_path)
# 显示预览数据
self.show_preview()
self.status_var.set(
f"已加载: {os.path.basename(file_path)} | 行数: {len(self.df)} | 列数: {len(self.df.columns)}")
except Exception as e:
messagebox.showerror("加载错误", f"无法读取Excel文件:\n{str(e)}")
self.status_var.set("加载失败")
def preprocess_data(self):
"""数据预处理对话框 - 整合全部预处理功能"""
if self.df is None:
messagebox.showwarning("警告", "请先选择Excel文件")
return
preprocess_window = tk.Toplevel(self.root)
preprocess_window.title("数据预处理")
preprocess_window.geometry("650x800")
main_frame = ttk.Frame(preprocess_window)
main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# ================ 新增:功能启用复选框 ================
enable_frame = ttk.LabelFrame(main_frame, text="启用功能", padding=10)
enable_frame.pack(fill=tk.X, pady=5)
# 创建启用变量
self.enable_missing = tk.BooleanVar(value=True)
self.enable_outlier = tk.BooleanVar(value=True)
self.enable_datetime = tk.BooleanVar(value=True)
self.enable_lag = tk.BooleanVar(value=True)
ttk.Checkbutton(enable_frame, text="执行缺失值处理", variable=self.enable_missing).pack(anchor=tk.W)
ttk.Checkbutton(enable_frame, text="执行异常值处理", variable=self.enable_outlier).pack(anchor=tk.W)
ttk.Checkbutton(enable_frame, text="执行时间列转换", variable=self.enable_datetime).pack(anchor=tk.W)
ttk.Checkbutton(enable_frame, text="添加滞后特征", variable=self.enable_lag).pack(anchor=tk.W)
# =================================================
# 1. 缺失值处理部分
missing_frame = ttk.LabelFrame(main_frame, text="缺失值处理", padding=10)
missing_frame.pack(fill=tk.X, pady=5)
# 缺失值统计显示
missing_stats = self.df.isnull().sum()
missing_text = scrolledtext.ScrolledText(missing_frame, height=4)
missing_text.pack(fill=tk.X)
for col, count in missing_stats.items():
if count > 0:
missing_text.insert(tk.END, f"{col}: {count}个缺失值\n")
missing_text.config(state=tk.DISABLED)
# 缺失值处理方法选择
ttk.Label(missing_frame, text="处理方法:").pack(anchor=tk.W)
missing_method_var = tk.StringVar(value="fill")
missing_method_frame = ttk.Frame(missing_frame)
missing_method_frame.pack(fill=tk.X, pady=5)
ttk.Radiobutton(missing_method_frame, text="删除缺失行", variable=missing_method_var, value="drop").pack(
side=tk.LEFT)
ttk.Radiobutton(missing_method_frame, text="固定值填充", variable=missing_method_var, value="fill").pack(
side=tk.LEFT)
ttk.Radiobutton(missing_method_frame, text="插值法", variable=missing_method_var, value="interpolate").pack(
side=tk.LEFT)
# 填充选项
fill_options_frame = ttk.Frame(missing_frame)
fill_options_frame.pack(fill=tk.X, pady=5)
ttk.Label(fill_options_frame, text="填充值:").pack(side=tk.LEFT)
fill_value_entry = ttk.Entry(fill_options_frame, width=10)
fill_value_entry.pack(side=tk.LEFT, padx=5)
fill_value_entry.insert(0, "0")
ttk.Label(fill_options_frame, text="或选择:").pack(side=tk.LEFT, padx=5)
fill_type_var = tk.StringVar(value="fixed")
ttk.Radiobutton(fill_options_frame, text="前值填充", variable=fill_type_var, value="ffill").pack(side=tk.LEFT)
ttk.Radiobutton(fill_options_frame, text="后值填充", variable=fill_type_var, value="bfill").pack(side=tk.LEFT)
ttk.Radiobutton(fill_options_frame, text="均值填充", variable=fill_type_var, value="mean").pack(side=tk.LEFT)
# 2. 异常值处理部分
outlier_frame = ttk.LabelFrame(main_frame, text="异常值处理", padding=10)
outlier_frame.pack(fill=tk.X, pady=5)
# 异常值检测方法
ttk.Label(outlier_frame, text="检测方法:").pack(anchor=tk.W)
outlier_method_var = tk.StringVar(value="3sigma")
outlier_method_frame = ttk.Frame(outlier_frame)
outlier_method_frame.pack(fill=tk.X)
ttk.Radiobutton(outlier_method_frame, text="3σ原则", variable=outlier_method_var, value="3sigma").pack(
side=tk.LEFT)
ttk.Radiobutton(outlier_method_frame, text="IQR方法", variable=outlier_method_var, value="iqr").pack(
side=tk.LEFT)
# 异常值处理方式
ttk.Label(outlier_frame, text="处理方式:").pack(anchor=tk.W)
outlier_action_var = tk.StringVar(value="remove")
outlier_action_frame = ttk.Frame(outlier_frame)
outlier_action_frame.pack(fill=tk.X)
ttk.Radiobutton(outlier_action_frame, text="删除", variable=outlier_action_var, value="remove").pack(
side=tk.LEFT)
ttk.Radiobutton(outlier_action_frame, text="用中位数替换", variable=outlier_action_var, value="median").pack(
side=tk.LEFT)
ttk.Radiobutton(outlier_action_frame, text="用前后均值替换", variable=outlier_action_var,
value="neighbor").pack(side=tk.LEFT)
# 3. 数据类型转换部分
type_frame = ttk.LabelFrame(main_frame, text="数据类型转换", padding=10)
type_frame.pack(fill=tk.X, pady=5)
# 时间列转换
ttk.Label(type_frame, text="时间列转换:").pack(anchor=tk.W)
time_col_var = tk.StringVar()
time_col_combo = ttk.Combobox(type_frame, textvariable=time_col_var, width=20)
time_col_combo['values'] = tuple(self.df.columns)
time_col_combo.pack(anchor=tk.W, pady=5)
# === 新增:时间单位选择 ===
time_units_frame = ttk.Frame(type_frame)
time_units_frame.pack(fill=tk.X, pady=5)
ttk.Label(time_units_frame, text="提取时间单位:").pack(side=tk.LEFT)
# 创建时间单位变量
self.extract_year = tk.BooleanVar(value=True)
self.extract_month = tk.BooleanVar(value=True)
self.extract_day = tk.BooleanVar(value=True)
self.extract_hour = tk.BooleanVar(value=False)
self.extract_minute = tk.BooleanVar(value=False)
self.extract_second = tk.BooleanVar(value=False)
# 添加复选框
ttk.Checkbutton(time_units_frame, text="年", variable=self.extract_year).pack(side=tk.LEFT, padx=5)
ttk.Checkbutton(time_units_frame, text="月", variable=self.extract_month).pack(side=tk.LEFT, padx=5)
ttk.Checkbutton(time_units_frame, text="日", variable=self.extract_day).pack(side=tk.LEFT, padx=5)
ttk.Checkbutton(time_units_frame, text="时", variable=self.extract_hour).pack(side=tk.LEFT, padx=5)
ttk.Checkbutton(time_units_frame, text="分", variable=self.extract_minute).pack(side=tk.LEFT, padx=5)
ttk.Checkbutton(time_units_frame, text="秒", variable=self.extract_second).pack(side=tk.LEFT, padx=5)
# === 修改时间转换逻辑 ===
if self.enable_datetime.get():
time_col = time_col_var.get()
if time_col and time_col in self.df.columns:
try:
# 统一处理不同日期格式
self.df[time_col] = self.df[time_col].apply(
lambda x: pd.to_datetime(x, errors='coerce', format='mixed')
)
# 强制显示完整时间格式
pd.set_option('display.datetime_format', '%Y-%m-%d %H:%M:%S')
# 根据用户选择提取时间单位
if self.extract_year.get():
self.df['year'] = self.df[time_col].dt.year
if self.extract_month.get():
self.df['month'] = self.df[time_col].dt.month
if self.extract_day.get():
self.df['day'] = self.df[time_col].dt.day
if self.extract_hour.get():
self.df['hour'] = self.df[time_col].dt.hour
if self.extract_minute.get():
self.df['minute'] = self.df[time_col].dt.minute
if self.extract_second.get():
self.df['second'] = self.df[time_col].dt.second
# 新增:确保时间部分显示
self.df['full_datetime'] = self.df[time_col].dt.strftime('%Y-%m-%d %H:%M:%S')
# 时间周期特征
if self.extract_hour.get() or self.extract_minute.get():
self.df['time_of_day'] = self.df[time_col].dt.hour + self.df[time_col].dt.minute / 60.0
if self.extract_second.get():
self.df['time_of_day'] += self.df[time_col].dt.second / 3600.0
except Exception as e:
messagebox.showerror("时间转换错误", f"时间列转换失败: {str(e)}")
# 4. 特征工程部分
feature_frame = ttk.LabelFrame(main_frame, text="特征工程", padding=10)
feature_frame.pack(fill=tk.X, pady=5)
# 添加滞后特征
ttk.Label(feature_frame, text="滞后特征:").pack(anchor=tk.W)
lag_frame = ttk.Frame(feature_frame)
lag_frame.pack(fill=tk.X)
ttk.Label(lag_frame, text="选择列:").pack(side=tk.LEFT)
lag_col_var = tk.StringVar()
lag_col_combo = ttk.Combobox(lag_frame, textvariable=lag_col_var, width=15)
lag_col_combo['values'] = tuple(self.df.select_dtypes(include=['number']).columns)
lag_col_combo.pack(side=tk.LEFT, padx=5)
ttk.Label(lag_frame, text="滞后步数:").pack(side=tk.LEFT)
lag_steps_entry = ttk.Entry(lag_frame, width=5)
lag_steps_entry.pack(side=tk.LEFT)
lag_steps_entry.insert(0, "1")
# 执行预处理按钮
def apply_preprocessing():
try:
original_shape = self.df.shape
# 1. 处理缺失值 (如果启用)
if self.enable_missing.get():
missing_method = missing_method_var.get()
if missing_method == "drop":
self.df = self.df.dropna()
elif missing_method == "fill":
fill_type = fill_type_var.get()
if fill_type == "fixed":
fill_value = fill_value_entry.get()
self.df = self.df.fillna(
float(fill_value) if self.df.select_dtypes(include=['number']).shape[
1] > 0 else fill_value)
elif fill_type == "ffill":
self.df = self.df.ffill()
elif fill_type == "bfill":
self.df = self.df.bfill()
elif fill_type == "mean":
self.df = self.df.fillna(self.df.mean())
elif missing_method == "interpolate":
self.df = self.df.interpolate()
# 2. 处理异常值 (如果启用)
if self.enable_outlier.get():
outlier_method = outlier_method_var.get()
outlier_action = outlier_action_var.get()
numeric_cols = self.df.select_dtypes(include=['number']).columns
for col in numeric_cols:
if outlier_method == "3sigma":
mean, std = self.df[col].mean(), self.df[col].std()
lower, upper = mean - 3 * std, mean + 3 * std
else: # iqr
q1, q3 = self.df[col].quantile(0.25), self.df[col].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
if outlier_action == "remove":
self.df = self.df[(self.df[col] >= lower) & (self.df[col] <= upper)]
elif outlier_action == "median":
self.df.loc[(self.df[col] < lower) | (self.df[col] > upper), col] = self.df[col].median()
elif outlier_action == "neighbor":
mask = (self.df[col] < lower) | (self.df[col] > upper)
self.df.loc[mask, col] = self.df[col].rolling(2, min_periods=1).mean()[mask]
# 3. 时间列转换 (如果启用)
if self.enable_datetime.get():
time_col = time_col_var.get()
if time_col and time_col in self.df.columns:
try:
self.df[time_col] = pd.to_datetime(self.df[time_col])
self.df['year'] = self.df[time_col].dt.year
self.df['month'] = self.df[time_col].dt.month
self.df['day'] = self.df[time_col].dt.day
except Exception as e:
messagebox.showwarning("时间转换警告", f"时间列转换失败: {str(e)}")
# 4. 添加滞后特征 (如果启用)
if self.enable_lag.get():
lag_col = lag_col_var.get()
if lag_col and lag_col in self.df.columns:
try:
lag_steps = int(lag_steps_entry.get())
self.df[f'{lag_col}_lag{lag_steps}'] = self.df[lag_col].shift(lag_steps)
except Exception as e:
messagebox.showwarning("滞后特征警告", f"创建滞后特征失败: {str(e)}")
# =========================================================
# 更新显示
self.show_preview()
preprocess_window.destroy()
new_shape = self.df.shape
self.status_var.set(f"预处理完成 | 原形状: {original_shape} | 新形状: {new_shape}")
except Exception as e:
messagebox.showerror("预处理错误", f"预处理过程中发生错误:\n{str(e)}")
ttk.Button(main_frame, text="执行预处理", command=apply_preprocessing).pack(pady=10)
def show_preview(self):
"""在表格中分页显示数据预览"""
# 清除旧内容
for widget in self.preview_frame.winfo_children():
widget.destroy()
# 创建主容器
container = ttk.Frame(self.preview_frame)
container.pack(fill=tk.BOTH, expand=True)
# 创建表格
columns = list(self.df.columns)
self.tree = ttk.Treeview(container, columns=columns, show="headings")
# 添加列标题
for col in columns:
self.tree.heading(col, text=col)
self.tree.column(col, width=100, anchor=tk.W)
# 添加滚动条
scrollbar = ttk.Scrollbar(container, orient=tk.VERTICAL, command=self.tree.yview)
self.tree.configure(yscroll=scrollbar.set)
# 布局表格和滚动条
self.tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
# 创建分页控制面板
pagination_frame = ttk.Frame(self.preview_frame)
pagination_frame.pack(fill=tk.X, pady=5)
# 分页参数
self.current_page = 1
self.rows_per_page = 1000 # 每页显示的行数
self.total_pages = max(1, (len(self.df) + self.rows_per_page - 1) // self.rows_per_page)
# 分页标签
self.page_label = ttk.Label(pagination_frame, text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页")
self.page_label.pack(side=tk.LEFT, padx=10)
# 分页按钮
ttk.Button(pagination_frame, text="首页", command=lambda: self.change_page(1)).pack(side=tk.LEFT)
ttk.Button(pagination_frame, text="上一页", command=lambda: self.change_page(self.current_page - 1)).pack(
side=tk.LEFT)
ttk.Button(pagination_frame, text="下一页", command=lambda: self.change_page(self.current_page + 1)).pack(
side=tk.LEFT)
ttk.Button(pagination_frame, text="末页", command=lambda: self.change_page(self.total_pages)).pack(side=tk.LEFT)
# 跳转输入框
ttk.Label(pagination_frame, text="跳转到:").pack(side=tk.LEFT, padx=(10, 0))
self.page_entry = ttk.Entry(pagination_frame, width=5)
self.page_entry.pack(side=tk.LEFT)
ttk.Button(pagination_frame, text="跳转", command=self.jump_to_page).pack(side=tk.LEFT, padx=(5, 10))
# 显示第一页数据
self.load_page_data()
def load_page_data(self):
"""加载当前页的数据"""
# 清空现有数据
for item in self.tree.get_children():
self.tree.delete(item)
# 计算起始和结束索引
start_idx = (self.current_page - 1) * self.rows_per_page
end_idx = start_idx + self.rows_per_page
# 添加当前页的数据行
for i, row in self.df.iloc[start_idx:end_idx].iterrows():
self.tree.insert("", tk.END, values=list(row))
# 更新分页标签
self.page_label.config(text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页")
self.page_entry.delete(0, tk.END)
self.page_entry.insert(0, str(self.current_page))
def change_page(self, new_page):
"""切换页面"""
# 确保新页码在有效范围内
new_page = max(1, min(new_page, self.total_pages))
if new_page != self.current_page:
self.current_page = new_page
self.load_page_data()
def jump_to_page(self):
"""跳转到指定页码"""
try:
page_num = int(self.page_entry.get())
self.change_page(page_num)
except ValueError:
messagebox.showerror("错误", "请输入有效的页码数字")
def process_data(self):
"""根据选择的处理方式处理数据"""
if self.df is None:
messagebox.showwarning("警告", "请先选择Excel文件")
return
process_type = self.process_var.get()
if process_type == "统计":
self.show_statistics()
self.notebook.select(1)
elif process_type == "数据排序":
self.sort_data()
elif process_type == "保存结果":
self.save_data()
elif process_type == "预处理":
self.preprocess_data()
elif process_type == "数据切分":
self.divide_data()
def show_statistics(self):
"""显示数据统计信息"""
# 清除旧内容
for widget in self.stats_frame.winfo_children():
widget.destroy()
# 计算统计信息
stats = self.df.describe(include='all').fillna('-')
# 创建表格显示统计信息
columns = ['统计项'] + list(stats.columns)
tree = ttk.Treeview(self.stats_frame, columns=columns, show="headings")
# 添加列标题
for col in columns:
tree.heading(col, text=col)
tree.column(col, width=100, anchor=tk.W)
# 添加数据行
for index, row in stats.iterrows():
tree.insert("", tk.END, values=[index] + list(row))
# 添加滚动条
scrollbar = ttk.Scrollbar(self.stats_frame, orient=tk.VERTICAL, command=tree.yview)
tree.configure(yscroll=scrollbar.set)
# 布局
tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
# 添加数据类型信息
type_frame = ttk.LabelFrame(self.stats_frame, text="数据类型")
type_frame.pack(fill=tk.X, padx=5, pady=5)
type_text = scrolledtext.ScrolledText(type_frame, height=5)
type_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
dtypes = self.df.dtypes.apply(lambda x: x.name).to_dict()
type_info = "\n".join([f"{col}: {dtype}" for col, dtype in dtypes.items()])
type_text.insert(tk.END, type_info)
type_text.config(state=tk.DISABLED)
def sort_data(self):
"""数据排序对话框"""
sort_window = tk.Toplevel(self.root)
sort_window.title("数据排序")
sort_window.geometry("400x300")
ttk.Label(sort_window, text="选择排序列:").pack(pady=10)
# 列选择
col_var = tk.StringVar()
col_combo = ttk.Combobox(sort_window, textvariable=col_var, width=20)
col_combo['values'] = tuple(self.df.columns)
col_combo.pack(pady=5)
# 排序方式
ttk.Label(sort_window, text="排序方式:").pack(pady=10)
order_var = tk.StringVar(value="ascending")
ttk.Radiobutton(sort_window, text="升序", variable=order_var, value="ascending").pack()
ttk.Radiobutton(sort_window, text="降序", variable=order_var, value="descending").pack()
def apply_sort():
if not col_var.get():
messagebox.showwarning("警告", "请选择排序列")
return
try:
ascending = (order_var.get() == "ascending")
self.df = self.df.sort_values(by=col_var.get(), ascending=ascending)
self.show_preview()
sort_window.destroy()
self.status_var.set(f"数据已按 {col_var.get()} {'升序' if ascending else '降序'} 排序")
except Exception as e:
messagebox.showerror("排序错误", f"排序失败:\n{str(e)}")
ttk.Button(sort_window, text="应用排序", command=apply_sort).pack(pady=20)
def save_data(self):
"""保存处理结果"""
if self.df is None or self.df.empty:
messagebox.showwarning("警告", "没有可保存的数据")
return
save_path = filedialog.asksaveasfilename(
defaultextension=".xlsx",
filetypes=[("Excel文件", "*.xlsx"), ("CSV文件", "*.csv")]
)
if not save_path:
return
try:
if save_path.endswith('.xlsx'):
self.df.to_excel(save_path, index=False)
else:
self.df.to_csv(save_path, index=False)
self.status_var.set(f"文件已保存至: {os.path.basename(save_path)}")
messagebox.showinfo("保存成功", f"文件已成功保存至:\n{save_path}")
except Exception as e:
messagebox.showerror("保存错误", f"保存文件失败:\n{str(e)}")
# 创建并运行程序
if __name__ == "__main__":
root = tk.Tk()
app = ExcelProcessor(root)
root.mainloop()
请你用相同的风格也一个excel处理器类的一个方法:可以对数据进行筛选
最新发布