import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.impute import SimpleImputer import numpy as np # 数据输入:手动创建数据集 data = { "日期": ["2025/4/1"] * 24, "时间": [f"{i}:00:00" for i in range(24)], "气温℃": [3.58, 3.70, 3.22, 2.23, 1.94, 2.42, 2.23, 2.87, 3.31, 10.53, 11.32, 11.63, 14.11, 14.18, 14.08, 14.69, 14.66, 11.51, 11.39, 12.41, 11.57, 7.35, 5.35], "湿度%": [11.26, 13.23, 14.41, 18.66, 17.96, 21.34, 21.91, 20.31, 19.76, 10.60, 9.40, 9.15, 6.93, 6.34, 6.19, 5.95, 6.04, 7.34, 5.95, 4.26, 4.57, 9.14, 9.58], "气压hPa": [867.53, 867.39, 867.19, 866.97, 866.44, 866.28, 867.03, 867.62, 867.86, 868.11, 868.08, 868.01, 867.15, 866.47, 865.33, 865.03, 864.78, 865.46, 864.93, 865.26, 866.01, 866.25, 866.10], "风向°": [275.53, 276.13, 279.62, 281.28, 284.63, 296.73, 294.89, 298.60, 300.94, 304.77, 312.40, 318.26, 318.40, 324.85, 330.32, 334.99, 335.53, 340.04, 5.06, 48.04, 74.37, 105.08, 106.24], "输出功率(MW)": [0.31, 0.48, 0.49, 0.47, 0.43, 0.42, 0.34, 0.28, 0.18, 0.11, 0.17, 0.10, 0.06, 0.05, 0.04, 0.04, 0.02, 0, 0, 0.0037, 0.013, 0.030, 0.017], "风速(m/s)": [6.92, 7.99, 8.06, 7.96, 7.66, 7.61, 7.12, 6.68, 5.81, 5.11, 5.73, 4.88, 4.28, 3.98, 3.81, 3.68, 3.20, 1.88, 1.39, 2.39, 3.09, 3.51, 3.51] } # 转换为DataFrame df = pd.DataFrame(data) # 检查是否有缺失值 if df.isnull().values.any(): print("警告: 数据中存在缺失值,将使用均值填充。") # 使用SimpleImputer填充缺失值 imputer = SimpleImputer(strategy='mean') # 使用均值填充 df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns) # 选择预测目标:假设我们想预测“输出功率(MW)”基于其他变量 X = df_imputed[["气温℃", "湿度%", "气压hPa", "风向°", "风速(m/s)"]] # 特征 y = df_imputed["输出功率(MW)"] # 目标变量 # 拆分训练数据和测试数据 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 创建并训练线性回归模型 model = LinearRegression() model.fit(X_train, y_train) # 在测试集上预测 y_pred = model.predict(X_test) # 评估模型 mse = mean_squared_error(y_test, y_pred) print(f"均方误差(MSE): {mse}") # 
输出预测结果 print(f"预测值: {y_pred}") print(f"实际值: {y_test.values}") 以上程序产生以下错误 (most recent call last): File "C:\Users\22564\PycharmProjects\pythonProject26\1.py", line 21, in <module> df = pd.DataFrame(data) ^^^^^^^^^^^^^^^^^^ File "C:\Users\22564\PycharmProjects\pythonProject26\.venv\Lib\site-packages\pandas\core\frame.py", line 778, in __init__ mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\22564\PycharmProjects\pythonProject26\.venv\Lib\site-packages\pandas\core\internals\construction.py", line 503, in dict_to_mgr return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\22564\PycharmProjects\pythonProject26\.venv\Lib\site-packages\pandas\core\internals\construction.py", line 114, in arrays_to_mgr index = _extract_index(arrays) ^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\22564\PycharmProjects\pythonProject26\.venv\Lib\site-packages\pandas\core\internals\construction.py", line 677, in _extract_index raise ValueError("All arrays must be of the same length") ValueError: All arrays must be of the same length
时间: 2025-05-31 21:56:01 浏览: 26
这个错误的原因是传入 `pd.DataFrame` 的数据字典中,某些列的长度不一致。具体来说,`pandas` 在创建 `DataFrame` 时要求所有列的长度必须相同,否则会抛出 `ValueError`。
### 解决方法
我们需要检查数据字典中的每一列,确保它们的长度一致。如果发现长度不一致的列,需要找出问题并修复。
以下是修正代码和解决方法:
```python
import pandas as pd
# Data input: hand-built hourly dataset for 2025/4/1.
# NOTE: the date/time columns hold 24 entries while each of the six
# measurement columns below holds only 23. That mismatch is what makes
# pd.DataFrame(data) raise "All arrays must be of the same length".
data = {
    "日期": ["2025/4/1"] * 24,
    "时间": [f"{i}:00:00" for i in range(24)],
    "气温℃": [3.58, 3.70, 3.22, 2.23, 1.94, 2.42, 2.23, 2.87, 3.31, 10.53, 11.32, 11.63, 14.11, 14.18, 14.08, 14.69, 14.66, 11.51, 11.39, 12.41, 11.57, 7.35, 5.35],
    "湿度%": [11.26, 13.23, 14.41, 18.66, 17.96, 21.34, 21.91, 20.31, 19.76, 10.60, 9.40, 9.15, 6.93, 6.34, 6.19, 5.95, 6.04, 7.34, 5.95, 4.26, 4.57, 9.14, 9.58],
    "气压hPa": [867.53, 867.39, 867.19, 866.97, 866.44, 866.28, 867.03, 867.62, 867.86, 868.11, 868.08, 868.01, 867.15, 866.47, 865.33, 865.03, 864.78, 865.46, 864.93, 865.26, 866.01, 866.25, 866.10],
    "风向°": [275.53, 276.13, 279.62, 281.28, 284.63, 296.73, 294.89, 298.60, 300.94, 304.77, 312.40, 318.26, 318.40, 324.85, 330.32, 334.99, 335.53, 340.04, 5.06, 48.04, 74.37, 105.08, 106.24],
    "输出功率(MW)": [0.31, 0.48, 0.49, 0.47, 0.43, 0.42, 0.34, 0.28, 0.18, 0.11, 0.17, 0.10, 0.06, 0.05, 0.04, 0.04, 0.02, 0, 0, 0.0037, 0.013, 0.030, 0.017],
    "风速(m/s)": [6.92, 7.99, 8.06, 7.96, 7.66, 7.61, 7.12, 6.68, 5.81, 5.11, 5.73, 4.88, 4.28, 3.98, 3.81, 3.68, 3.20, 1.88, 1.39, 2.39, 3.09, 3.51, 3.51],
}


def pad_columns(columns, fill_value=None):
    """Return a copy of *columns* with every list padded to the longest length.

    Args:
        columns: Mapping of column name to a list of values.
        fill_value: Value appended to the shorter lists. Defaults to
            ``None``, which pandas stores as a missing value.

    Returns:
        A new dict with the same keys in the same order. Each value is a
        new list as long as the longest input list; the input mapping and
        its lists are not modified.
    """
    target = max((len(values) for values in columns.values()), default=0)
    return {
        # Pad by the full gap, not by a single element, so columns that are
        # short by more than one entry are also brought up to length.
        name: list(values) + [fill_value] * (target - len(values))
        for name, values in columns.items()
    }


# Report each column's length so the mismatched columns are visible.
lengths = {key: len(value) for key, value in data.items()}
print("各列长度:", lengths)

# Pad every shorter column up to the LONGEST column length.
# NOTE(review): padding at the end assumes the missing reading belongs to
# the last hour; confirm against the data source, otherwise values end up
# aligned with the wrong timestamps.
data = pad_columns(data)

# Convert to a DataFrame.
df = pd.DataFrame(data)
print(df)
```
### 给出解释
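在原始数据中,`日期` 和 `时间` 两列的长度为 24,而 `气温℃`、`湿度%`、`气压hPa`、`风向°`、`输出功率(MW)`、`风速(m/s)` 这六列的长度都只有 23(各少一个小时的数据)。由于 `pandas` 要求所有列的长度一致,因此会抛出 `ValueError`。通过检查每列的长度,并使用 `None` 将较短的列补齐到最长列的长度,可以解决这个问题。需要注意的是,在末尾补 `None` 相当于假设缺失的是最后一个小时(23:00)的数据;如果实际缺失的是其他时刻,应在对应位置补上真实值或缺失标记,否则数据会与时间错位。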
#### 相关注意事项
- 如果数据是从外部文件(如 CSV)读取的,确保文件中的每一行都有相同的列数。
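- 在实际应用中,可能需要根据业务逻辑选择合适的填充方式(如均值、中位数或插值)。另外,原程序中的 `SimpleImputer(strategy='mean')` 只能处理数值列,而 `日期` 和 `时间` 是字符串列,直接对整个 `df` 调用 `fit_transform` 会报错;应只对数值列进行填充,例如先用 `df.select_dtypes(include="number")` 选出数值列再做均值填充。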
阅读全文