reduce_mem_usage: a function that shrinks a DataFrame's memory footprint

This post introduces reduce_mem_usage, a memory-optimization function commonly used in Kaggle competitions. By downcasting columns to narrower numeric dtypes (the various int and float precisions), it lowers a DataFrame's memory usage and makes downstream processing faster. The post shows how the appropriate integer or floating-point type is chosen from each column's minimum and maximum values, which can cut memory consumption substantially.


While working on a Kaggle competition recently, I came across a function called reduce_mem_usage that compresses the data and reduces memory consumption, so I am recording it here:

import numpy as np
import pandas as pd


def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that can hold their values."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Pick the narrowest integer type whose range covers [c_min, c_max]
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Pick the narrowest float type; float16 trades precision for memory
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} MB ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
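To see the effect, here is a minimal usage sketch. The column names and sizes are made up for illustration, and it assumes the function above is already defined in the same session:

import numpy as np
import pandas as pd

# Pandas defaults these columns to int64 / float64.
df = pd.DataFrame({
    "user_id": np.arange(100_000, dtype=np.int64),      # values fit in int32
    "age": np.random.randint(18, 80, size=100_000),     # values fit in int8
    "score": np.random.rand(100_000),                   # values fit in float16
})

print(df.dtypes)
print(f"before: {df.memory_usage().sum() / 1024**2:.2f} MB")

df = reduce_mem_usage(df)

print(df.dtypes)
print(f"after:  {df.memory_usage().sum() / 1024**2:.2f} MB")

In this sketch the integer columns drop to int32 and int8 and the float column to float16, giving a sizeable reduction. Note that only columns whose dtype appears in the numerics list are touched, so object and datetime columns pass through unchanged, and that float16 keeps only about three significant decimal digits, so columns that need full precision are better left at float32 or float64.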
