['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Pool QC', 'Fence', 'Misc Feature', 'Misc Val', 'Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition', 'SalePrice']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 13
     11 cols_to_fill_none = ['Pool QC', 'Misc Feature', 'Alley', 'Fence', 'Fireplace Qu', 'Garage Type', 'Garage Cond']
     12 # 检查并填充分类特征
---> 13 check_and_fill_columns(df, cols_to_fill_none, 'None')
     15 # (2)缺失值填充:车库数值特征
     16 garage_numeric_cols = ['Garage Yr Blt', 'Garage Area', 'Garage Cars']

NameError: name 'check_and_fill_columns' is not defined
时间: 2025-06-15 15:42:08 浏览: 14
感谢你的反馈。看起来我们在调用 `check_and_fill_columns` 函数之前忘记定义它了。我们需要确保这个辅助函数定义在调用它的位置之前。
此外,我们还需要确保所有列名与实际数据集中的列名完全匹配。以下是修正后的完整代码,确保所有辅助函数定义正确,并且列名与实际数据集匹配。
### 完整修正代码
```python
import pandas as pd
import numpy as np
# Load the housing data set from the CSV file in the working directory
df = pd.read_csv('AmesHousing.csv')
# Echo the exact column labels so later column references can be checked
print(list(df.columns))
# Helper: fill missing values in the requested columns, skipping absent ones
def check_and_fill_columns(df, columns, fill_value='None'):
    """Fill missing values in each listed column of ``df`` in place.

    Args:
        df: DataFrame to modify; each filled column is assigned back onto it.
        columns: Iterable of column labels to process.
        fill_value: Replacement for missing entries (defaults to the
            string 'None').

    Returns:
        List of the requested labels that are not columns of ``df``. A
        warning is printed for each such label and it is otherwise skipped.
    """
    missing = []
    for col in columns:
        if col not in df.columns:
            # Warn and carry on instead of raising, so one misspelled
            # label does not abort the whole fill pass.
            print(f"警告: 列 '{col}' 不存在于数据集中")
            missing.append(col)
            continue
        df[col] = df[col].fillna(fill_value)
    return missing
# (1) Missing-value fill: categorical features, filled with the string 'None'
# (labels must match the data set's column names exactly)
cols_to_fill_none = [
    'Pool QC', 'Misc Feature', 'Alley', 'Fence',
    'Fireplace Qu', 'Garage Type', 'Garage Cond',
]
check_and_fill_columns(df, cols_to_fill_none, fill_value='None')

# (2) Missing-value fill: numeric garage features, filled with 0
# (labels must match the data set's column names exactly)
garage_numeric_cols = ['Garage Yr Blt', 'Garage Area', 'Garage Cars']
check_and_fill_columns(df, garage_numeric_cols, fill_value=0)
# (3) Missing-value fill: Lot Frontage
def impute_lot_frontage(df):
    """Fill missing 'Lot Frontage' values using neighborhood medians.

    Each missing value is replaced by the median 'Lot Frontage' of the rows
    sharing its 'Neighborhood'. When a neighborhood has no observed value
    at all (its median is NaN), the median over the whole data set is used
    instead so those rows do not remain missing.

    Args:
        df: DataFrame with 'Neighborhood' and 'Lot Frontage' columns. It is
            modified in place.

    Returns:
        The same DataFrame object, for call-site convenience.
    """
    # Take the global median before filling so it reflects observed data only.
    overall_median = df['Lot Frontage'].median()
    neighborhood_median = (
        df.groupby('Neighborhood')['Lot Frontage'].transform('median')
    )
    filled = df['Lot Frontage'].fillna(neighborhood_median)
    # If the whole column is missing there is nothing to fall back on.
    if pd.notna(overall_median):
        filled = filled.fillna(overall_median)
    df['Lot Frontage'] = filled
    return df
df = impute_lot_frontage(df)

# (4) Age-related features, all measured relative to the year of sale
df['HouseAge'] = df['Yr Sold'] - df['Year Built']
df['RemodelAge'] = df['Yr Sold'] - df['Year Remod/Add']
df['IsRemodeled'] = (df['Year Built'] != df['Year Remod/Add']).astype(int)

# (5) Total area and total bathroom count.
# Missing basement figures are treated as 0 so a single NaN does not turn
# the whole sum into NaN.
# NOTE(review): presumably a missing basement value means "no basement";
# confirm against the data dictionary.
basement_sf = df['Total Bsmt SF'].fillna(0)
basement_full_bath = df['Bsmt Full Bath'].fillna(0)
basement_half_bath = df['Bsmt Half Bath'].fillna(0)
df['TotalSF'] = basement_sf + df['1st Flr SF'] + df['2nd Flr SF']
df['TotalBath'] = (
    df['Full Bath']
    + 0.5 * df['Half Bath']
    + basement_full_bath
    + 0.5 * basement_half_bath
)

# (6) Drop the source columns that the derived features replace;
# errors='ignore' skips any label that is not present.
cols_to_drop = [
    'Year Built', 'Year Remod/Add', 'Total Bsmt SF', '1st Flr SF',
    '2nd Flr SF', 'Full Bath', 'Half Bath', 'Bsmt Full Bath',
    'Bsmt Half Bath',
]
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
# (7) Convert the quality ratings to numbers
quality_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
quality_cols = ['Exter Qual', 'Kitchen Qual', 'Heating QC', 'Bsmt Qual']
# map() sends every label outside the mapping (NaN, 'None', anything
# unexpected) to NaN, which then becomes 0. The result is a true integer
# column, so the mean and correlation below never see leftover strings.
# This expects the raw rating labels: re-running it on already converted
# columns would zero them out.
for col in quality_cols:
    df[col] = df[col].map(quality_mapping).fillna(0).astype(int)

# (8) Composite quality score: row-wise mean of the four ratings
df['OverallQualScore'] = df[quality_cols].mean(axis=1)

# (9) Pearson correlation between the composite score and the sale price
correlation = df[['OverallQualScore', 'SalePrice']].corr().iloc[0, 1]
print(f"皮尔逊相关系数: {correlation}")

# Report whether the correlation clears the 0.7 threshold
if correlation > 0.7:
    print("相关性较强,合理的。")
else:
    print("相关性较弱,可能需要进一步分析。")
# (10) Bucket houses by age: [0, 10] New, (10, 40] Mid-Age, (40, inf) Old
bins = [0, 10, 40, float('inf')]
labels = ['New', 'Mid-Age', 'Old']
# include_lowest=True closes the first interval on the left; without it a
# house sold in the year it was built (HouseAge == 0) falls outside every
# bin and gets NaN.
df['AgeCategory'] = pd.cut(
    df['HouseAge'], bins=bins, labels=labels, include_lowest=True
)

# (11) Median sale price per neighborhood and age category.
# observed=False keeps all three categories as columns even when a
# combination has no rows (those cells are NaN).
median_prices = (
    df.groupby(['Neighborhood', 'AgeCategory'], observed=False)['SalePrice']
    .median()
    .unstack()
)
print(median_prices)
# (12) Median sale price per neighborhood, separately for new and old
# houses, ordered from most to least expensive
is_new_house = df['AgeCategory'] == 'New'
is_old_house = df['AgeCategory'] == 'Old'
new_house_median = (
    df[is_new_house]
    .groupby('Neighborhood')['SalePrice']
    .median()
    .sort_values(ascending=False)
)
old_house_median = (
    df[is_old_house]
    .groupby('Neighborhood')['SalePrice']
    .median()
    .sort_values(ascending=False)
)
# The first index label of each sorted series is the top neighborhood
print(f"新房价格中位数最高的街区: {new_house_median.index[0]}")
print(f"老房价格中位数最高的街区: {old_house_median.index[0]}")
# (13) Compare how price varies with age in two neighborhoods
expensive_neighborhood = 'NridgHt'  # assumed to be the expensive area
affordable_neighborhood = 'CollgCr'  # assumed to be the affordable area
# observed=False guarantees that 'New', 'Mid-Age' and 'Old' are all present
# in the result index (NaN where a neighborhood has no such houses), so the
# label lookups below cannot raise KeyError.
expensive_prices = (
    df[df['Neighborhood'] == expensive_neighborhood]
    .groupby('AgeCategory', observed=False)['SalePrice']
    .median()
)
affordable_prices = (
    df[df['Neighborhood'] == affordable_neighborhood]
    .groupby('AgeCategory', observed=False)['SalePrice']
    .median()
)
print(f"{expensive_neighborhood} 房价随房龄变化: \n{expensive_prices}")
print(f"{affordable_neighborhood} 房价随房龄变化: \n{affordable_prices}")

# Contrast the two patterns; a comparison involving a NaN median is False,
# so nothing is printed for a category that has no data.
if expensive_prices['New'] > affordable_prices['New']:
    print(f"{expensive_neighborhood} 的新房价格更高,可能因为该地区整体环境和设施更好。")
if expensive_prices['Old'] > affordable_prices['Old']:
    print(f"{expensive_neighborhood} 的老房更保值,可能因为该地区的房产品质较高。")
# (14) Investment advice
target_neighborhoods = ['NridgHt', 'CollgCr']

# Median price of old and new houses per target neighborhood. The pair is
# stored per neighborhood so the recommendation loop below judges each
# neighborhood on its own figures rather than on whatever the last
# iteration of this loop left behind.
median_by_neighborhood = {}
for neighborhood in target_neighborhoods:
    in_neighborhood = df['Neighborhood'] == neighborhood
    old_median_price = df[in_neighborhood & (df['AgeCategory'] == 'Old')]['SalePrice'].median()
    new_median_price = df[in_neighborhood & (df['AgeCategory'] == 'New')]['SalePrice'].median()
    median_by_neighborhood[neighborhood] = (old_median_price, new_median_price)
    print(f"街区: {neighborhood}")
    print(f"老房价格中位数: {old_median_price}")
    print(f"新房价格中位数: {new_median_price}")
    print(f"增值潜力: {new_median_price - old_median_price}\n")

# Recommend a neighborhood only when its own new-house median exceeds its
# own old-house median; a NaN median makes the comparison False.
print("根据以上分析,建议投资者优先考虑以下街区:")
for neighborhood in target_neighborhoods:
    old_median_price, new_median_price = median_by_neighborhood[neighborhood]
    if new_median_price > old_median_price:
        print(f"- {neighborhood}: 该街区的老房子翻新后的增值潜力较大,且房价基数较高,适合投资。")
```
### 关键点总结
1. **定义辅助函数**:确保 `check_and_fill_columns` 函数在调用之前已经定义。
2. **列名一致性**:确保所有列名与实际数据集中的列名完全一致,特别注意空格和斜杠。
3. **辅助函数的作用**:使用辅助函数 `check_and_fill_columns` 来检查并填充缺失值,提高代码的健壮性。
4. **删除列时使用 `errors='ignore'`**:在删除列时,使用 `errors='ignore'` 参数以防止由于列不存在而导致的错误。
通过这些修正,你应该能够顺利运行代码并完成数据分析任务。如果有更多问题或需要进一步的帮助,请随时告知!
阅读全文
相关推荐
















